In [None]:
!pip install pyspark findspark

import findspark #para que Python encuentre PySpark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import os

#Configurar Spark para que encuentre el entorno
findspark.init()

#Crear sesión

spark=SparkSession.builder \
    .appName("AnalisisVentas") \
    .getOrCreate()

#Verificar que la sesión se haya creado exitosamente
print("SparkSesion creada con éxito")

data=[("Producto A", 100, 15.50, "2023-01-01"),
      ("Producto B", 50, 20.00, "2023-01-01"),
      ("Producto C", 150, 5.00, "2023-01-02"),
      ("Producto A", 20, 15.50, "2023-01-02"),
      ("Producto B", 75, 20.00, "2023-01-03"),
      ("Producto A", 125, 15.50, "2023-01-04"),
      ("Producto C", 200, 5.00, "2023-01-04")
       ]

columnas=["producto", "cantidad", "precio", "fecha"]

#Convertir en un DF de PySpark

df_ventas=spark.createDataFrame(data, columnas)

#Mostrar el contenido del DF
print("Contenido del DF de ventas")
df_ventas.show()

#Filtrado y Agrupamiento
#Filtrar solo las filas donde la columna 'producto' es igual a 'Producto A
df_producto_a=df_ventas.filter(df_ventas.producto=="Producto A")

print("DataFrame filtado para 'Producto A':")
df_producto_a.show()

#Agrupar los datos del 'Producto A' por columna 'fecha' y sumar la columna 'cantidad'
df_agrupado = df_producto_a.groupBy('fecha').sum('cantidad')

print('Ventas totales del "Producto A" por fecha: ')
df_agrupado.show()

#Crear columna ingresos
df_ingresos = df_ventas.withColumn('ingresos', col('cantidad') * col('precio'))

print('DF con la columna "ingresos"')
df_ingresos.show()

#Agrupar el DF por la columna 'producto' y sumar la nueva columna ingresos
df_ingresos_totales = df_ingresos.groupBy('producto').sum('ingresos')

print('Ingresos totales por cada producto')
df_ingresos_totales.show()

#Renombrar la columna de suma
df_ingresos_totales = df_ingresos_totales.withColumnRenamed('sum(ingresos)', 'ingresos totales')

print('DF con el nuevo nombre de la columna')
df_ingresos_totales.show()

#Guardar el DF df_ingresos_totales en un archivo csv
df_ingresos_totales.write.mode('overwrite').option('header', 'true').csv('ingresos_totales_por_producto.csv')

print('Resultados guardados exitosamente')

SparkSesion creada con éxito
Contenido del DF de ventas
+----------+--------+------+----------+
|  producto|cantidad|precio|     fecha|
+----------+--------+------+----------+
|Producto A|     100|  15.5|2023-01-01|
|Producto B|      50|  20.0|2023-01-01|
|Producto C|     150|   5.0|2023-01-02|
|Producto A|      20|  15.5|2023-01-02|
|Producto B|      75|  20.0|2023-01-03|
|Producto A|     125|  15.5|2023-01-04|
|Producto C|     200|   5.0|2023-01-04|
+----------+--------+------+----------+

DataFrame filtado para 'Producto A':
+----------+--------+------+----------+
|  producto|cantidad|precio|     fecha|
+----------+--------+------+----------+
|Producto A|     100|  15.5|2023-01-01|
|Producto A|      20|  15.5|2023-01-02|
|Producto A|     125|  15.5|2023-01-04|
+----------+--------+------+----------+

Ventas totales del "Producto A" por fecha: 
+----------+-------------+
|     fecha|sum(cantidad)|
+----------+-------------+
|2023-01-01|          100|
|2023-01-04|          125|
|2023-

In [None]:
import findspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg

findspark.init()

spark = SparkSession.builder.appName('AnalisisAppMovil').getOrCreate()

print('SparkSession creada con exito.')

SparkSession creada con exito.


In [None]:
data = [('user_1', 'Android', 120),
        ('user_2', 'iOS', 300),
        ('user_3', 'Android', 150),
        ('user_4', 'iOS', 250),
        ('user_5', 'Android', 90),
        ('user_6', 'iOS', 400),
        ('user_7', 'Android', 180),
        ('user_8', 'iOS', 220)]

columns = ['user_ID', 'OS', 'session_duration']

df_app = spark.createDataFrame(data, columns)

Contenido del DF inicial


DataFrame[user_ID: string, OS: string, session_duration: bigint]

### Ejercicio

In [None]:
!pip install pyspark findspark

import findspark
from pyspark.sql import SparkSession

findspark.init()
spark = SparkSession.builder.appName('AnalisisTransacciones').getOrCreate()
spark_context = spark.sparkContext

print('Spark y SparkContext creados exitosamente')

Spark y SparkContext creados exitosamente


In [None]:
datos_csv = spark_context.parallelize([
    'id,usuario,monto,fecha,categoria',
    '1,Juan,1500,2025-03-10,Supermercado',
    '2,Ana,2000,2025-03-12,Tecnología',
    '3,Carlos,500,2025-03-12,Supermercado',
    '4,Juan,3000,2025-03-13,Tecnología',
    '5,Ana,700,2025-03-14,Restaurante'
])

header = datos_csv.first()
header

'id,usuario,monto,fecha,categoria'

In [None]:
rdd_without_header = datos_csv.filter(lambda x: x != header)
rdd_without_header.collect()

['1,Juan,1500,2025-03-10,Supermercado',
 '2,Ana,2000,2025-03-12,Tecnología',
 '3,Carlos,500,2025-03-12,Supermercado',
 '4,Juan,3000,2025-03-13,Tecnología',
 '5,Ana,700,2025-03-14,Restaurante']

In [None]:
rdd_transactions = rdd_without_header.map(lambda x: x.split(',')).map(lambda x: (x[1], float(x[2])))
rdd_transactions.collect()

[('Juan', 1500.0),
 ('Ana', 2000.0),
 ('Carlos', 500.0),
 ('Juan', 3000.0),
 ('Ana', 700.0)]

In [None]:
rdd_filter = rdd_transactions.filter(lambda x: x[1] > 1000)
rdd_filter.collect()

[('Juan', 1500.0), ('Ana', 2000.0), ('Juan', 3000.0)]

In [None]:
rdd_spend = rdd_transactions.reduceByKey(lambda x, y: x + y)
rdd_spend.collect()

[('Juan', 4500.0), ('Ana', 2700.0), ('Carlos', 500.0)]

In [None]:
rdd_top_3 = rdd_spend.sortBy(lambda x: x[1], ascending = False).take(3)
print('Los 3 usuarios con mayores gastos son:')
rdd_top_3

Los 3 usuarios con mayores gastos son:


[('Juan', 4500.0), ('Ana', 2700.0), ('Carlos', 500.0)]

In [None]:
rdd_amounts = rdd_transactions.map(lambda x: x[1])
sum = rdd_amounts.sum()
mean = rdd_amounts.mean()
std = rdd_amounts.stdev()
print(f'La de los montos totales: {sum:.2f}. \nLa media de los montos totales: {mean:.2f}.\nLa desviacion estandar de los montos: {std:.2f}')

La de los montos totales: 7700.00. 
La media de los montos totales: 1540.00.
La desviacion estandar de los montos: 909.07


### Ejercicio

In [None]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("arshid/iris-flower-dataset")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'iris-flower-dataset' dataset.
Path to dataset files: /kaggle/input/iris-flower-dataset


In [None]:
import os
import pandas as pd

os.listdir(path)

dataset_path = os.path.join(path, 'IRIS.csv')

In [None]:
findspark.init()

spark = SparkSession.builder.appName('MLlib_Iris').getOrCreate()

data = spark.read.csv(dataset_path, header = True, inferSchema = True)
data.show(5)

+------------+-----------+------------+-----------+-----------+
|sepal_length|sepal_width|petal_length|petal_width|    species|
+------------+-----------+------------+-----------+-----------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|
+------------+-----------+------------+-----------+-----------+
only showing top 5 rows



In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import when, col

data.select('species').distinct().show()

+---------------+
|        species|
+---------------+
| Iris-virginica|
|    Iris-setosa|
|Iris-versicolor|
+---------------+



In [None]:
data = data.withColumn('label', when(col('species') == 'Iris-setosa', 0).when(col('species') == 'Iris-versicolor', 1).otherwise(2))
data.show(5)

+------------+-----------+------------+-----------+-----------+-----+
|sepal_length|sepal_width|petal_length|petal_width|    species|label|
+------------+-----------+------------+-----------+-----------+-----+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|    0|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|    0|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|    0|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|    0|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|    0|
+------------+-----------+------------+-----------+-----------+-----+
only showing top 5 rows



In [None]:
features_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
assembler = VectorAssembler(inputCols = features_columns, outputCol = 'features')
data = assembler.transform(data)
data = data.select('features', 'label')
data.show(5)

+-----------------+-----+
|         features|label|
+-----------------+-----+
|[5.1,3.5,1.4,0.2]|    0|
|[4.9,3.0,1.4,0.2]|    0|
|[4.7,3.2,1.3,0.2]|    0|
|[4.6,3.1,1.5,0.2]|    0|
|[5.0,3.6,1.4,0.2]|    0|
+-----------------+-----+
only showing top 5 rows



In [None]:
from pyspark.ml.classification import LogisticRegression

train, test = data.randomSplit([0.8, 0.2], seed = 42)

lr = LogisticRegression(featuresCol = 'features', labelCol = 'label', maxIter = 10)
model = lr.fit(train)
pred = model.transform(test)
pred.select('label', 'prediction').show()

+-----+----------+
|label|prediction|
+-----+----------+
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    1|       1.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    1|       1.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    1|       1.0|
|    1|       1.0|
|    1|       1.0|
|    2|       2.0|
|    1|       1.0|
|    2|       2.0|
|    2|       2.0|
+-----+----------+
only showing top 20 rows



In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol = 'label', metricName = 'accuracy')
accuracy = evaluator.evaluate(pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 1.00


In [None]:
from pyspark.ml.clustering import KMeans

kmeans = KMeans(featuresCol = 'features', k = 3)
model = kmeans.fit(train)
pred = model.transform(test)
pred.show()

+-----------------+-----+----------+
|         features|label|prediction|
+-----------------+-----+----------+
|[4.4,3.0,1.3,0.2]|    0|         1|
|[4.6,3.2,1.4,0.2]|    0|         1|
|[4.6,3.6,1.0,0.2]|    0|         1|
|[4.8,3.1,1.6,0.2]|    0|         1|
|[4.9,3.1,1.5,0.1]|    0|         1|
|[5.0,2.3,3.3,1.0]|    1|         2|
|[5.0,3.5,1.3,0.3]|    0|         1|
|[5.1,3.5,1.4,0.2]|    0|         1|
|[5.3,3.7,1.5,0.2]|    0|         1|
|[5.4,3.0,4.5,1.5]|    1|         2|
|[5.4,3.4,1.5,0.4]|    0|         1|
|[5.4,3.7,1.5,0.2]|    0|         1|
|[5.4,3.9,1.7,0.4]|    0|         1|
|[5.5,2.5,4.0,1.3]|    1|         2|
|[5.6,2.9,3.6,1.3]|    1|         2|
|[5.7,2.9,4.2,1.3]|    1|         2|
|[5.8,2.7,5.1,1.9]|    2|         2|
|[6.3,2.5,4.9,1.5]|    1|         2|
|[6.4,3.1,5.5,1.8]|    2|         0|
|[6.5,3.0,5.2,2.0]|    2|         0|
+-----------------+-----+----------+
only showing top 20 rows



In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("rajyellow46/wine-quality")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/rajyellow46/wine-quality?dataset_version_number=1...


100%|██████████| 98.0k/98.0k [00:00<00:00, 44.5MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/rajyellow46/wine-quality/versions/1





In [3]:
import os
import pandas as pd

os.listdir(path)

dataset_path = os.path.join(path, 'winequalityN.csv')

In [16]:
!pip install pyspark findspark

import findspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, col
from pyspark.ml.feature import VectorAssembler, StandardScaler, CountVectorizer, IDF, StopWordsRemover, StringIndexer, OneHotEncoder
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.clustering import KMeans, LDA
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline



In [5]:
findspark.init()

spark_session = SparkSession.builder.appName('WineQualityML').getOrCreate()

print('La sesion se he ejecutado exitosamente')

La sesion se he ejecutado exitosamente


In [43]:
df = spark_session.read.csv(dataset_path, header = True, inferSchema = True)
df.printSchema()

root
 |-- type: string (nullable = true)
 |-- fixed acidity: double (nullable = true)
 |-- volatile acidity: double (nullable = true)
 |-- citric acid: double (nullable = true)
 |-- residual sugar: double (nullable = true)
 |-- chlorides: double (nullable = true)
 |-- free sulfur dioxide: double (nullable = true)
 |-- total sulfur dioxide: double (nullable = true)
 |-- density: double (nullable = true)
 |-- pH: double (nullable = true)
 |-- sulphates: double (nullable = true)
 |-- alcohol: double (nullable = true)
 |-- quality: integer (nullable = true)



In [44]:
df.show(5)

+-----+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+
| type|fixed acidity|volatile acidity|citric acid|residual sugar|chlorides|free sulfur dioxide|total sulfur dioxide|density|  pH|sulphates|alcohol|quality|
+-----+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+
|white|          7.0|            0.27|       0.36|          20.7|    0.045|               45.0|               170.0|  1.001| 3.0|     0.45|    8.8|      6|
|white|          6.3|             0.3|       0.34|           1.6|    0.049|               14.0|               132.0|  0.994| 3.3|     0.49|    9.5|      6|
|white|          8.1|            0.28|        0.4|           6.9|     0.05|               30.0|                97.0| 0.9951|3.26|     0.44|   10.1|      6|
|white|          7.2|            0.23|       0.32|           8.5

In [45]:
df.withColumnRenamed('quality', 'label').show(5)

+-----+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-----+
| type|fixed acidity|volatile acidity|citric acid|residual sugar|chlorides|free sulfur dioxide|total sulfur dioxide|density|  pH|sulphates|alcohol|label|
+-----+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-----+
|white|          7.0|            0.27|       0.36|          20.7|    0.045|               45.0|               170.0|  1.001| 3.0|     0.45|    8.8|    6|
|white|          6.3|             0.3|       0.34|           1.6|    0.049|               14.0|               132.0|  0.994| 3.3|     0.49|    9.5|    6|
|white|          8.1|            0.28|        0.4|           6.9|     0.05|               30.0|                97.0| 0.9951|3.26|     0.44|   10.1|    6|
|white|          7.2|            0.23|       0.32|           8.5|    0.058| 

In [46]:
for column in df.columns:
  count = df.filter(col(column).isNull()).count()
  print(f'La columna {column} tiene {count} valores nulos')

La columna type tiene 0 valores nulos
La columna fixed acidity tiene 10 valores nulos
La columna volatile acidity tiene 8 valores nulos
La columna citric acid tiene 3 valores nulos
La columna residual sugar tiene 2 valores nulos
La columna chlorides tiene 2 valores nulos
La columna free sulfur dioxide tiene 0 valores nulos
La columna total sulfur dioxide tiene 0 valores nulos
La columna density tiene 0 valores nulos
La columna pH tiene 9 valores nulos
La columna sulphates tiene 4 valores nulos
La columna alcohol tiene 0 valores nulos
La columna quality tiene 0 valores nulos


In [47]:
df = df.dropna()

for column in df.columns:
  count = df.filter(col(column).isNull()).count()
  print(f'La columna {column} tiene {count} valores nulos')

La columna type tiene 0 valores nulos
La columna fixed acidity tiene 0 valores nulos
La columna volatile acidity tiene 0 valores nulos
La columna citric acid tiene 0 valores nulos
La columna residual sugar tiene 0 valores nulos
La columna chlorides tiene 0 valores nulos
La columna free sulfur dioxide tiene 0 valores nulos
La columna total sulfur dioxide tiene 0 valores nulos
La columna density tiene 0 valores nulos
La columna pH tiene 0 valores nulos
La columna sulphates tiene 0 valores nulos
La columna alcohol tiene 0 valores nulos
La columna quality tiene 0 valores nulos


In [48]:
indexer = StringIndexer(inputCol = 'type', outputCol = 'type_index')
df = indexer.fit(df).transform(df)
df.select('type', 'type_index').show(5)
ohe = OneHotEncoder(inputCol = 'type_index', outputCol = 'type_vector')
df = ohe.fit(df).transform(df)
df.select('type', 'type_index', 'type_vector').show(5)

+-----+----------+
| type|type_index|
+-----+----------+
|white|       0.0|
|white|       0.0|
|white|       0.0|
|white|       0.0|
|white|       0.0|
+-----+----------+
only showing top 5 rows

+-----+----------+-------------+
| type|type_index|  type_vector|
+-----+----------+-------------+
|white|       0.0|(1,[0],[1.0])|
|white|       0.0|(1,[0],[1.0])|
|white|       0.0|(1,[0],[1.0])|
|white|       0.0|(1,[0],[1.0])|
|white|       0.0|(1,[0],[1.0])|
+-----+----------+-------------+
only showing top 5 rows



In [49]:
first_col = df.columns[0]
df = df.drop(first_col)
df = df.withColumnRenamed('quality', 'label')
features_columns = df.columns
features_columns

['fixed acidity',
 'volatile acidity',
 'citric acid',
 'residual sugar',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol',
 'label',
 'type_index',
 'type_vector']

In [50]:
assembler = VectorAssembler(inputCols = features_columns, outputCol = 'features')
df = assembler.transform(df)
df.select('features').show(5, truncate = False)

+--------------------------------------------------------------------+
|features                                                            |
+--------------------------------------------------------------------+
|[7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6.0,0.0,1.0]|
|[6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6.0,0.0,1.0]  |
|[8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6.0,0.0,1.0] |
|[7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6.0,0.0,1.0]|
|[7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6.0,0.0,1.0]|
+--------------------------------------------------------------------+
only showing top 5 rows



In [51]:
ss = StandardScaler(inputCol = 'features', outputCol = 'features_scaled', withStd = True, withMean = False)
model_ss = ss.fit(df)
df_scaled = model_ss.transform(df)
print('Esquema y datos procesados')
df_scaled.select('features', 'features_scaled', 'label').show(5, truncate = False)

Esquema y datos procesados
+--------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|features                                                            |features_scaled                                                                                                                                                                                                                                        |label|
+--------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----

In [52]:
train, test = df_scaled.randomSplit([0.8, 0.2], seed = 42)
size_train = train.count()
size_test = test.count()
print(f'Tamaño set de entrenamiento: {size_train}.\nTamaño de set de prueba: {size_test}')

Tamaño set de entrenamiento: 5228.
Tamaño de set de prueba: 1235
