## 1. Configuraci√≥n de Spark y Carga de Datos

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window

# Crear sesi√≥n de Spark con configuraci√≥n optimizada
spark = SparkSession.builder \
    .appName("Olist Exploratory Analysis") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g") \
    .config("spark.sql.shuffle.partitions", "200") \
    .config("spark.sql.adaptive.enabled", "true") \
    .getOrCreate()

print("‚úì Sesi√≥n de Spark creada")
print(f"Versi√≥n de Spark: {spark.version}")

‚úì Sesi√≥n de Spark creada
Versi√≥n de Spark: 3.5.0


In [2]:
# Cargar el dataset unificado
df = spark.read.parquet("Data/olist_unified_dataset.parquet")

# Cachear para an√°lisis m√°s r√°pido
df.cache()

print("=" * 70)
print("DATASET CARGADO")
print("=" * 70)
print(f"Registros: {df.count():,}")
print(f"Columnas: {len(df.columns)}")
print(f"Particiones: {df.rdd.getNumPartitions()}")

DATASET CARGADO
Registros: 112,650
Columnas: 52
Particiones: 6


In [3]:
# Ver esquema del dataset
print("Esquema del dataset:")
df.printSchema()

Esquema del dataset:
root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp_ntz (nullable = true)
 |-- order_approved_at: timestamp_ntz (nullable = true)
 |-- order_delivered_carrier_date: timestamp_ntz (nullable = true)
 |-- order_delivered_customer_date: timestamp_ntz (nullable = true)
 |-- order_estimated_delivery_date: timestamp_ntz (nullable = true)
 |-- customer_unique_id: string (nullable = true)
 |-- customer_zip_code_prefix: long (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)
 |-- order_item_id: double (nullable = true)
 |-- product_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- shipping_limit_date: string (nullable = true)
 |-- price: double (nullable = true)
 |-- freight_value: double (nullable = true)
 |-- product_category_name: string (nullable = true)
 |-- product_nam

In [4]:
# Primeras filas
print("Primeras 10 filas:")
df.show(10, truncate=True)

Primeras 10 filas:
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+--------------------+------------------------+--------------------+--------------+-------------+--------------------+--------------------+-------------------+------+-------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+-----------------------------+----------------------+--------------------+------------+------------------+------------+--------------------+-------------+----------------+------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+-------------------+--------------------+----------------+-------------------+------------------------+----------+-----------+-----------------

## 2. Estad√≠sticas B√°sicas del Negocio

In [5]:
print("=" * 70)
print("M√âTRICAS PRINCIPALES DEL NEGOCIO")
print("=" * 70)

# Contar entidades √∫nicas
metrics = df.agg(
    F.countDistinct('order_id').alias('ordenes_unicas'),
    F.countDistinct('customer_id').alias('clientes_unicos'),
    F.countDistinct('seller_id').alias('vendedores_unicos'),
    F.countDistinct('product_id').alias('productos_unicos'),
    F.sum('price').alias('revenue_total'),
    F.avg('price').alias('ticket_promedio'),
    F.avg('freight_value').alias('costo_envio_promedio')
).collect()[0]

print(f"\n√ìrdenes √∫nicas: {metrics['ordenes_unicas']:,}")
print(f"Clientes √∫nicos: {metrics['clientes_unicos']:,}")
print(f"Vendedores √∫nicos: {metrics['vendedores_unicos']:,}")
print(f"Productos √∫nicos: {metrics['productos_unicos']:,}")
print(f"\nRevenue total: R$ {metrics['revenue_total']:,.2f}")
print(f"Ticket promedio: R$ {metrics['ticket_promedio']:,.2f}")
print(f"Costo env√≠o promedio: R$ {metrics['costo_envio_promedio']:,.2f}")

M√âTRICAS PRINCIPALES DEL NEGOCIO

√ìrdenes √∫nicas: 98,666
Clientes √∫nicos: 98,666
Vendedores √∫nicos: 3,095
Productos √∫nicos: 32,951

Revenue total: R$ 13,591,643.70
Ticket promedio: R$ 120.65
Costo env√≠o promedio: R$ 19.99


## 3. An√°lisis Temporal

In [6]:
print("=" * 70)
print("AN√ÅLISIS TEMPORAL")
print("=" * 70)

# Rango de fechas
date_range = df.agg(
    F.min('order_purchase_timestamp').alias('fecha_min'),
    F.max('order_purchase_timestamp').alias('fecha_max')
).collect()[0]

print(f"\nRango de fechas: {date_range['fecha_min']} a {date_range['fecha_max']}")

# Ventas por a√±o
print("\nVentas por a√±o:")
ventas_year = df.groupBy('order_year') \
    .agg(
        F.countDistinct('order_id').alias('ordenes'),
        F.sum('price').alias('revenue'),
        F.avg('price').alias('ticket_promedio')
    ) \
    .orderBy('order_year')

ventas_year.show()

AN√ÅLISIS TEMPORAL

Rango de fechas: 2016-09-04 21:15:19 a 2018-09-03 09:06:57

Ventas por a√±o:
+----------+-------+------------------+------------------+
|order_year|ordenes|           revenue|   ticket_promedio|
+----------+-------+------------------+------------------+
|      2016|    312|49785.919999999955|134.55654054054042|
|      2017|  44579| 6155806.979999649|121.02483052846118|
|      2018|  53775| 7386050.799999507|   120.26264816985|
+----------+-------+------------------+------------------+



In [7]:
# Ventas por mes (√∫ltimos 12 meses con datos)
print("Ventas mensuales:")
ventas_mensual = df.groupBy('order_year', 'order_month') \
    .agg(
        F.countDistinct('order_id').alias('ordenes'),
        F.sum('price').alias('revenue')
    ) \
    .orderBy('order_year', 'order_month', ascending=False)

ventas_mensual.show(12)

Ventas mensuales:
+----------+-----------+-------+------------------+
|order_year|order_month|ordenes|           revenue|
+----------+-----------+-------+------------------+
|      2018|          9|      1|             145.0|
|      2018|          8|   6452| 854686.3299999912|
|      2018|          7|   6273|  895507.219999992|
|      2018|          6|   6160| 865124.3099999918|
|      2018|          5|   6853| 996517.6799999913|
|      2018|          4|   6934| 996647.7499999902|
|      2018|          3|   7188| 983213.4399999892|
|      2018|          2|   6694| 844178.7099999895|
|      2018|          1|   7220| 950030.3599999885|
|      2017|         12|   5624| 743914.1699999925|
|      2017|         11|   7451|1010271.3699999888|
|      2017|         10|   4568| 664219.4299999953|
+----------+-----------+-------+------------------+
only showing top 12 rows



In [8]:
# Ventas por d√≠a de la semana
print("Ventas por d√≠a de la semana (0=Lunes, 6=Domingo):")
ventas_dow = df.groupBy('order_day_of_week') \
    .agg(
        F.countDistinct('order_id').alias('ordenes'),
        F.sum('price').alias('revenue')
    ) \
    .orderBy('order_day_of_week')

ventas_dow.show()

Ventas por d√≠a de la semana (0=Lunes, 6=Domingo):
+-----------------+-------+------------------+
|order_day_of_week|ordenes|           revenue|
+-----------------+-------+------------------+
|                0|  16068| 2230812.510000029|
|                1|  15831|2172647.8200000282|
|                2|  15425|2113843.5900000297|
|                3|  14639|2018615.7800000238|
|                4|  14002|1962426.7500000182|
|                5|  10813|1504018.3600000008|
|                6|  11888| 1589278.890000009|
+-----------------+-------+------------------+



## 4. An√°lisis de Productos

In [9]:
print("=" * 70)
print("AN√ÅLISIS DE PRODUCTOS")
print("=" * 70)

# Top 20 categor√≠as por ventas
print("\nTop 20 categor√≠as por revenue:")
top_categorias = df.groupBy('product_category_name_english') \
    .agg(
        F.countDistinct('order_id').alias('ordenes'),
        F.sum('price').alias('revenue'),
        F.avg('price').alias('precio_promedio'),
        F.avg('avg_review_score').alias('review_promedio')
    ) \
    .orderBy(F.col('revenue').desc()) \
    .limit(20)

top_categorias.show(truncate=False)

AN√ÅLISIS DE PRODUCTOS

Top 20 categor√≠as por revenue:
+-----------------------------+-------+------------------+------------------+------------------+
|product_category_name_english|ordenes|revenue           |precio_promedio   |review_promedio   |
+-----------------------------+-------+------------------+------------------+------------------+
|health_beauty                |8836   |1258681.3399999873|130.1635305067205 |4.141826588791545 |
|watches_gifts                |5624   |1205005.6799999953|201.13598397596317|4.018855218855219 |
|bed_bath_table               |9417   |1036988.6799999864|93.2963274853789  |3.898333636860317 |
|sports_leisure               |7720   |988048.9699999891 |114.3442853836349 |4.10703880666589  |
|computers_accessories        |6689   |911954.3199999917 |116.51390315574187|3.9326008738113596|
|furniture_decor              |6449   |729762.4899999945 |87.5644936405081  |3.9073333333333333|
|cool_stuff                   |3632   |635290.8500000003 |167.357968914

In [10]:
# Estad√≠sticas de dimensiones de productos
print("\nEstad√≠sticas de dimensiones de productos:")
product_stats = df.select(
    'product_weight_g',
    'product_length_cm',
    'product_height_cm',
    'product_width_cm'
).describe()

product_stats.show()


Estad√≠sticas de dimensiones de productos:
+-------+------------------+------------------+------------------+------------------+
|summary|  product_weight_g| product_length_cm| product_height_cm|  product_width_cm|
+-------+------------------+------------------+------------------+------------------+
|  count|            112632|            112632|            112632|            112632|
|   mean| 2093.672047020385|30.153668584416508|16.593765537325094|22.996546274593367|
| stddev|3751.5968835138533|16.153449156461008| 13.44348317091501| 11.70726831380731|
|    min|               0.0|               7.0|               2.0|               6.0|
|    max|           40425.0|             105.0|             105.0|             118.0|
+-------+------------------+------------------+------------------+------------------+



## 5. An√°lisis Geogr√°fico

In [11]:
print("=" * 70)
print("AN√ÅLISIS GEOGR√ÅFICO")
print("=" * 70)

# Top 15 estados por ventas (clientes)
print("\nTop 15 estados por ventas (clientes):")
top_estados = df.groupBy('customer_state') \
    .agg(
        F.countDistinct('order_id').alias('ordenes'),
        F.countDistinct('customer_id').alias('clientes'),
        F.sum('price').alias('revenue'),
        F.avg('distance_km').alias('distancia_promedio_km')
    ) \
    .orderBy(F.col('revenue').desc()) \
    .limit(15)

top_estados.show()

AN√ÅLISIS GEOGR√ÅFICO

Top 15 estados por ventas (clientes):
+--------------+-------+--------+------------------+---------------------+
|customer_state|ordenes|clientes|           revenue|distancia_promedio_km|
+--------------+-------+--------+------------------+---------------------+
|            SP|  41375|   41375| 5202955.050001498|    247.2153393563246|
|            RJ|  12762|   12762| 1824092.669999808|   487.30909598640284|
|            MG|  11544|   11544| 1585308.029999873|    533.4248103704546|
|            RS|   5432|    5432| 750304.0200000231|    868.0454619301702|
|            PR|   4998|    4998| 683083.7600000206|    487.3515751197135|
|            SC|   3612|    3612| 520553.3400000074|    572.9596323779759|
|            BA|   3358|    3358| 511349.9900000073|   1345.0605015485894|
|            DF|   2125|    2125| 302603.9399999978|    829.8576880979157|
|            GO|   2007|    2007| 294591.9499999972|    775.1183799385743|
|            ES|   2025|    2025|275037

In [12]:
# Top 15 ciudades por ventas
print("\nTop 15 ciudades por ventas:")
top_ciudades = df.groupBy('customer_city', 'customer_state') \
    .agg(
        F.countDistinct('order_id').alias('ordenes'),
        F.sum('price').alias('revenue')
    ) \
    .orderBy(F.col('revenue').desc()) \
    .limit(15)

top_ciudades.show(truncate=False)


Top 15 ciudades por ventas:
+---------------------+--------------+-------+------------------+
|customer_city        |customer_state|ordenes|revenue           |
+---------------------+--------------+-------+------------------+
|sao paulo            |SP            |15402  |1914924.540000007 |
|rio de janeiro       |RJ            |6834   |992538.8599999903 |
|belo horizonte       |MG            |2750   |355611.1300000004 |
|brasilia             |DF            |2116   |301920.25000000047|
|curitiba             |PR            |1510   |211738.0600000005 |
|porto alegre         |RS            |1372   |190562.08000000037|
|campinas             |SP            |1429   |187844.53000000038|
|salvador             |BA            |1238   |181104.4200000004 |
|guarulhos            |SP            |1178   |144268.3900000003 |
|niteroi              |RJ            |845    |117907.12000000008|
|goiania              |GO            |687    |106111.17000000004|
|sao bernardo do campo|SP            |928    |1

In [None]:
# An√°lisis de distancia entre cliente y vendedor
print("\nEstad√≠sticas de distancia cliente-vendedor:")
distance_stats = df.select('distance_km').describe()
distance_stats.show()

# Distribuci√≥n por rangos de distancia
print("\nDistribuci√≥n por rangos de distancia:")
distance_ranges = df.withColumn(
    'rango_distancia',
    F.when(F.col('distance_km') < 100, '0-100 km')
     .when(F.col('distance_km') < 500, '100-500 km')
     .when(F.col('distance_km') < 1000, '500-1000 km')
     .when(F.col('distance_km') < 2000, '1000-2000 km')
     .otherwise('2000+ km')
).groupBy('rango_distancia') \
 .agg(
     F.count('*').alias('ordenes'),
     F.avg('freight_value').alias('costo_envio_promedio')
 ) \
 .orderBy('rango_distancia')

distance_ranges.show()

## 6. An√°lisis de Pagos

In [None]:
print("=" * 70)
print("AN√ÅLISIS DE PAGOS")
print("=" * 70)

# Distribuci√≥n de tipos de pago
print("\nDistribuci√≥n de tipos de pago:")
payment_types = df.groupBy('payment_type') \
    .agg(
        F.countDistinct('order_id').alias('ordenes'),
        F.sum('payment_value').alias('valor_total'),
        F.avg('payment_value').alias('valor_promedio')
    ) \
    .orderBy(F.col('ordenes').desc())

payment_types.show()

In [None]:
# An√°lisis de cuotas (installments)
print("\nAn√°lisis de cuotas de pago:")
installments = df.groupBy('payment_installments') \
    .agg(
        F.countDistinct('order_id').alias('ordenes'),
        F.avg('payment_value').alias('valor_promedio')
    ) \
    .orderBy('payment_installments') \
    .limit(15)

installments.show()

## 7. An√°lisis de Reviews y Satisfacci√≥n

In [None]:
print("=" * 70)
print("AN√ÅLISIS DE REVIEWS Y SATISFACCI√ìN")
print("=" * 70)

# Distribuci√≥n de scores de review
print("\nDistribuci√≥n de review scores:")
review_distribution = df.filter(F.col('avg_review_score').isNotNull()) \
    .withColumn('review_score_rounded', F.round('avg_review_score')) \
    .groupBy('review_score_rounded') \
    .agg(
        F.count('*').alias('ordenes'),
        F.avg('payment_value').alias('valor_promedio')
    ) \
    .orderBy('review_score_rounded')

review_distribution.show()

# Estad√≠sticas generales de reviews
review_stats = df.filter(F.col('avg_review_score').isNotNull()) \
    .agg(
        F.avg('avg_review_score').alias('review_promedio_general'),
        F.count('*').alias('total_ordenes_con_review')
    ).collect()[0]

print(f"\nReview promedio general: {review_stats['review_promedio_general']:.2f}")
print(f"√ìrdenes con review: {review_stats['total_ordenes_con_review']:,}")

In [None]:
# Categor√≠as con mejores y peores reviews
print("\nTop 10 categor√≠as con MEJORES reviews:")
best_reviews = df.filter(F.col('avg_review_score').isNotNull()) \
    .groupBy('product_category_name_english') \
    .agg(
        F.avg('avg_review_score').alias('review_promedio'),
        F.count('*').alias('cantidad_ordenes')
    ) \
    .filter(F.col('cantidad_ordenes') >= 100) \
    .orderBy(F.col('review_promedio').desc()) \
    .limit(10)

best_reviews.show(truncate=False)

print("\nTop 10 categor√≠as con PEORES reviews:")
worst_reviews = df.filter(F.col('avg_review_score').isNotNull()) \
    .groupBy('product_category_name_english') \
    .agg(
        F.avg('avg_review_score').alias('review_promedio'),
        F.count('*').alias('cantidad_ordenes')
    ) \
    .filter(F.col('cantidad_ordenes') >= 100) \
    .orderBy(F.col('review_promedio').asc()) \
    .limit(10)

worst_reviews.show(truncate=False)

## 8. An√°lisis de Entregas

In [None]:
print("=" * 70)
print("AN√ÅLISIS DE ENTREGAS")
print("=" * 70)

# Estad√≠sticas de tiempos de entrega
print("\nEstad√≠sticas de tiempo de entrega:")
delivery_stats = df.filter(F.col('total_delivery_time_days').isNotNull()) \
    .select('total_delivery_time_days', 'delivery_delay_days') \
    .describe()

delivery_stats.show()

# Porcentaje de entregas a tiempo vs retrasadas
delivery_performance = df.filter(F.col('delivery_delay_days').isNotNull()) \
    .withColumn(
        'status_entrega',
        F.when(F.col('delivery_delay_days') <= 0, 'A tiempo o anticipada')
         .otherwise('Retrasada')
    ) \
    .groupBy('status_entrega') \
    .agg(
        F.count('*').alias('ordenes'),
        F.avg('avg_review_score').alias('review_promedio')
    )

print("\nPerformance de entregas:")
delivery_performance.show()

In [None]:
# Relaci√≥n entre distancia y tiempo de entrega
print("\nRelaci√≥n entre distancia y tiempo de entrega:")
distance_delivery = df.filter(
    (F.col('distance_km').isNotNull()) & 
    (F.col('total_delivery_time_days').isNotNull())
).select(
    F.corr('distance_km', 'total_delivery_time_days').alias('correlacion')
)

distance_delivery.show()

# Tiempo de entrega por rango de distancia
print("\nTiempo promedio de entrega por rango de distancia:")
delivery_by_distance = df.filter(
    (F.col('distance_km').isNotNull()) & 
    (F.col('total_delivery_time_days').isNotNull())
).withColumn(
    'rango_distancia',
    F.when(F.col('distance_km') < 100, '0-100 km')
     .when(F.col('distance_km') < 500, '100-500 km')
     .when(F.col('distance_km') < 1000, '500-1000 km')
     .when(F.col('distance_km') < 2000, '1000-2000 km')
     .otherwise('2000+ km')
).groupBy('rango_distancia') \
 .agg(
     F.avg('total_delivery_time_days').alias('dias_entrega_promedio'),
     F.avg('delivery_delay_days').alias('retraso_promedio_dias'),
     F.count('*').alias('ordenes')
 ) \
 .orderBy('rango_distancia')

delivery_by_distance.show()

## 9. An√°lisis de Vendedores

In [None]:
print("=" * 70)
print("AN√ÅLISIS DE VENDEDORES")
print("=" * 70)

# Top 20 vendedores por ventas
print("\nTop 20 vendedores por revenue:")
top_sellers = df.groupBy('seller_id') \
    .agg(
        F.countDistinct('order_id').alias('ordenes'),
        F.sum('price').alias('revenue'),
        F.avg('price').alias('ticket_promedio'),
        F.avg('avg_review_score').alias('review_promedio')
    ) \
    .orderBy(F.col('revenue').desc()) \
    .limit(20)

top_sellers.show(truncate=False)

In [None]:
# Distribuci√≥n de vendedores por estado
print("\nVendedores por estado:")
sellers_by_state = df.groupBy('seller_state') \
    .agg(
        F.countDistinct('seller_id').alias('cantidad_vendedores'),
        F.sum('price').alias('revenue_total')
    ) \
    .orderBy(F.col('revenue_total').desc()) \
    .limit(15)

sellers_by_state.show()

## 10. Correlaciones entre Variables Num√©ricas

In [None]:
print("=" * 70)
print("CORRELACIONES ENTRE VARIABLES")
print("=" * 70)

# Variables num√©ricas clave
numeric_cols = [
    'price',
    'freight_value',
    'payment_value',
    'avg_review_score',
    'distance_km',
    'total_delivery_time_days',
    'delivery_delay_days'
]

# Calcular correlaciones importantes
print("\nCorrelaciones importantes:")

correlations = [
    ('price', 'avg_review_score', 'Precio vs Review'),
    ('distance_km', 'freight_value', 'Distancia vs Costo Env√≠o'),
    ('distance_km', 'total_delivery_time_days', 'Distancia vs Tiempo Entrega'),
    ('delivery_delay_days', 'avg_review_score', 'Retraso vs Review'),
    ('payment_value', 'avg_review_score', 'Valor Pago vs Review'),
    ('freight_value', 'avg_review_score', 'Costo Env√≠o vs Review')
]

for col1, col2, label in correlations:
    corr = df.filter(
        (F.col(col1).isNotNull()) & (F.col(col2).isNotNull())
    ).select(
        F.corr(col1, col2).alias('correlacion')
    ).collect()[0]['correlacion']
    
    print(f"{label}: {corr:.4f}")

## 11. Resumen Ejecutivo

In [None]:
print("=" * 70)
print("RESUMEN EJECUTIVO - INSIGHTS CLAVE")
print("=" * 70)

# Recopilar insights clave
total_orders = df.select(F.countDistinct('order_id')).collect()[0][0]
total_revenue = df.select(F.sum('price')).collect()[0][0]
avg_review = df.filter(F.col('avg_review_score').isNotNull()).select(F.avg('avg_review_score')).collect()[0][0]
on_time_pct = df.filter(F.col('delivery_delay_days').isNotNull()).filter(F.col('delivery_delay_days') <= 0).count() / df.filter(F.col('delivery_delay_days').isNotNull()).count() * 100

print(f"\nüìä M√âTRICAS GENERALES")
print(f"  ‚Ä¢ Total de √≥rdenes: {total_orders:,}")
print(f"  ‚Ä¢ Revenue total: R$ {total_revenue:,.2f}")
print(f"  ‚Ä¢ Review promedio: {avg_review:.2f}/5.0")
print(f"  ‚Ä¢ % Entregas a tiempo: {on_time_pct:.1f}%")

print(f"\nüéØ INSIGHTS CLAVE PARA MODELADO PREDICTIVO:")
print(f"  1. La satisfacci√≥n del cliente (reviews) est√° influenciada por:")
print(f"     - Tiempo de entrega y retrasos")
print(f"     - Categor√≠a de producto")
print(f"     - Distancia entre cliente y vendedor")
print(f"\n  2. Factores que impactan el tiempo de entrega:")
print(f"     - Distancia geogr√°fica (correlaci√≥n significativa)")
print(f"     - Estado de origen y destino")
print(f"\n  3. Patrones de compra:")
print(f"     - Variaci√≥n por mes y d√≠a de la semana")
print(f"     - Concentraci√≥n en estados espec√≠ficos (SP, RJ, MG)")
print(f"     - Preferencia por tipos de pago espec√≠ficos")
print(f"\n  4. Oportunidades para modelos predictivos:")
print(f"     - Predicci√≥n de review score (clasificaci√≥n/regresi√≥n)")
print(f"     - Predicci√≥n de tiempo de entrega (regresi√≥n)")
print(f"     - Segmentaci√≥n de clientes (clustering)")
print(f"     - Detecci√≥n de riesgo de retraso (clasificaci√≥n)")
print(f"     - Recomendaci√≥n de productos (collaborative filtering)")

print("\n" + "=" * 70)
print("‚úì An√°lisis exploratorio completado")
print("=" * 70)

---

## Pr√≥ximos Pasos

Con este an√°lisis exploratorio completado, ahora podemos proceder a:

1. **Feature Engineering avanzado**: Crear features adicionales basadas en los insights descubiertos
2. **Modelado Predictivo**: Implementar modelos de ML usando MLlib de Spark
3. **Evaluaci√≥n y Optimizaci√≥n**: Ajustar hiperpar√°metros y evaluar performance
4. **Deployment**: Preparar el modelo para producci√≥n