In [1]:
!pip install -q pyspark

In [2]:
# 2 Paso - Conectarme y crear la sesión en Apache Spark (SparkSession)
from pyspark.sql import SparkSession

# Crear la sesión
spark = SparkSession.builder.appName("ProyectoRetail").config("spark.executor.memory", "1g").config("spark.driver.memory", "1g").config("spark.sql.shuffle.partitions","2").getOrCreate()


In [4]:
#carga de archivo
from google.colab import files
uploaded = files.upload()

Saving online_retail_II.csv to online_retail_II.csv


In [5]:
df = spark.read.option("header", "true") \
               .option("inferSchema", "true") \
               .csv("online_retail_II.csv")

df.printSchema()
df.show(5)

root
 |-- Invoice: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- Price: double (nullable = true)
 |-- Customer ID: double (nullable = true)
 |-- Country: string (nullable = true)

+-------+---------+--------------------+--------+-------------------+-----+-----------+--------------+
|Invoice|StockCode|         Description|Quantity|        InvoiceDate|Price|Customer ID|       Country|
+-------+---------+--------------------+--------+-------------------+-----+-----------+--------------+
| 489434|    85048|15CM CHRISTMAS GL...|      12|2009-12-01 07:45:00| 6.95|    13085.0|United Kingdom|
| 489434|   79323P|  PINK CHERRY LIGHTS|      12|2009-12-01 07:45:00| 6.75|    13085.0|United Kingdom|
| 489434|   79323W| WHITE CHERRY LIGHTS|      12|2009-12-01 07:45:00| 6.75|    13085.0|United Kingdom|
| 489434|    22041|"RECORD FRAME 7""...|      4

In [7]:
# se crea variable artificialVamos a predecir si un cliente hará una compra grande (label = 1) o no (label = 0)
#según el valor de su factura (Quantity * Price).
from pyspark.sql import functions as F
# Crear columna 'Total' por ítem
df = df.withColumn("Total", F.col("Quantity") * F.col("Price"))

# Agregar por cliente
df_clientes = df.groupBy("Customer ID").agg(
    F.countDistinct("Invoice").alias("num_invoices"),
    F.sum("Quantity").alias("total_qty"),
    F.sum("Total").alias("total_spent"),
    F.countDistinct("Country").alias("num_countries")
)

# Etiqueta: 1 si el cliente gastó más de 1000, 0 en caso contrario
df_clientes = df_clientes.withColumn(
    "label",
    F.when(F.col("total_spent") > 1000, 1).otherwise(0)
)

df_clientes.show(5)

+-----------+------------+---------+------------------+-------------+-----+
|Customer ID|num_invoices|total_qty|       total_spent|num_countries|label|
+-----------+------------+---------+------------------+-------------+-----+
|    12682.0|          24|     5434|11657.589999999998|            1|    1|
|    18087.0|          17|     3857|           10641.5|            1|    1|
|    13635.0|           5|     1192|1877.1899999999996|            1|    1|
|    14110.0|          22|     3370| 7287.949999999995|            1|    1|
|    17519.0|          11|     2177|3263.7300000000005|            1|    1|
+-----------+------------+---------+------------------+-------------+-----+
only showing top 5 rows



In [8]:
#vectorizar
from pyspark.ml.feature import VectorAssembler

feature_cols = ["num_invoices","total_qty","total_spent","num_countries"]

assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

df_ready = assembler.transform(df_clientes).select("features","label")

In [9]:
#regresion logistica
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(featuresCol="features", labelCol="label")

In [10]:
#division entretamiento y prueba
train, test = df_ready.randomSplit([0.8, 0.2], seed=42)

In [11]:
#ajuste del modelo
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline

paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.0, 0.01, 0.1])
             .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
             .build())

evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC")

pipeline = Pipeline(stages=[lr])

cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=3)

cv_model = cv.fit(train)

In [12]:
pred = cv_model.transform(test)

auc = evaluator.evaluate(pred)
print("AUC-ROC:", auc)

tp = pred.filter("label=1 and prediction=1").count()
fp = pred.filter("label=0 and prediction=1").count()
precision = tp / (tp + fp) if tp+fp>0 else 0
print("Precisión:", precision)

AUC-ROC: 1.0
Precisión: 0.9970674486803519


In [13]:
from pyspark.sql import functions as F

# Contar cada combinación de etiqueta real y predicha
cm = (pred.groupBy("label", "prediction")
          .count()
          .orderBy("label", "prediction"))

cm.show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    0|       0.0|  492|
|    0|       1.0|    1|
|    1|       1.0|  340|
+-----+----------+-----+



El modelo obtuvo un AUC-ROC de 1.0 y una precisión cercana al 99.7%, lo que indica un desempeño prácticamente perfecto al distinguir entre clientes de alto y bajo gasto. Esto sugiere que las características seleccionadas (número de compras, cantidad de productos y gasto total) son muy predictivas para esta tarea. Es posible que el dataset esté bien separado entre clases, por lo que sería recomendable evaluar el balance de clases y probar con nuevos datos para confirmar la capacidad de generalización del modelo.