# Práctica 1

## Importaciones

In [2]:
import os
import sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import month, when, col
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
import plotly.graph_objects as go

## Inicio de sesión en Spark

In [3]:
os.environ['JAVA_HOME'] = r"C:\Program Files\Java\jdk-22"
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

spark = SparkSession.builder \
    .config("spark.driver.memory", "8g") \
    .getOrCreate()

## Visualización inicial de los datos

### Carga del dataset

In [4]:
# Cargar el dataset.
df = spark.read.csv("run_ww_2020_d.csv", header=True, inferSchema=True)

### Muestra de las cinco primeras filas y del esquema

In [5]:
# Mostrar las cinco primeras filas.
df.show(5, truncate=False)

# Mostrar esquema de columnas.
df.printSchema()

+---+----------+-------+--------+------------------+------+---------+--------------+-----------------------+
|_c0|datetime  |athlete|distance|duration          |gender|age_group|country       |major                  |
+---+----------+-------+--------+------------------+------+---------+--------------+-----------------------+
|0  |2020-01-01|0      |0.0     |0.0               |F     |18 - 34  |United States |CHICAGO 2019           |
|1  |2020-01-01|1      |5.72    |31.633333333333333|M     |35 - 54  |Germany       |BERLIN 2016            |
|2  |2020-01-01|2      |0.0     |0.0               |M     |35 - 54  |United Kingdom|LONDON 2018,LONDON 2019|
|3  |2020-01-01|3      |0.0     |0.0               |M     |18 - 34  |United Kingdom|LONDON 2017            |
|4  |2020-01-01|4      |8.07    |38.61666666666667 |M     |35 - 54  |United States |BOSTON 2017            |
+---+----------+-------+--------+------------------+------+---------+--------------+-----------------------+
only showing top 5 

### Muestra de filas y variables y principales estadísticas del dataset

In [6]:
# Número de filas y variables.
print(f"Filas: {df.count()}, Variables: {len(df.columns)}")

# Estadísticas descriptivas de variables numéricas.
df.describe(['distance', 'duration']).show()

# Distribución por género y grupo de edad.
df.groupBy("gender").count().show()
df.groupBy("age_group").count().show()

Filas: 13326792, Variables: 9
+-------+------------------+------------------+
|summary|          distance|          duration|
+-------+------------------+------------------+
|  count|          13326792|          13326792|
|   mean|3.8647184738824425|21.391975435649456|
| stddev| 6.661547347662418| 39.27358918572197|
|    min|               0.0|               0.0|
|    max|            347.95|2299.9666666666667|
+-------+------------------+------------------+

+------+--------+
|gender|   count|
+------+--------+
|     F| 3253374|
|     M|10073418|
+------+--------+

+---------+-------+
|age_group|  count|
+---------+-------+
|     55 +| 940254|
|  35 - 54|7905966|
|  18 - 34|4480572|
+---------+-------+



## Preparación del dataset

### Filtrado de filas

Como tenemos un gran número de filas, vamos a filtrar. Primero, nos quedamos solo con los registros que no estén vacíos, es decir, aquellos cuya duración y distancia sea mayor a 0.

In [7]:
# Filtrado de filas.
df_filtered = df.filter((df.duration > 0) & (df.distance > 0))

# Comprobar tamaño.
print(f"Filas tras filtrar: {df_filtered.count()}")

Filas tras filtrar: 4581764


### Eliminamos las filas con valores nulos

In [8]:
df_filtered = df_filtered.dropna()

# Comprobar tamaño.
print(f"Filas tras filtrar: {df_filtered.count()}")

Filas tras filtrar: 4555724


Como seguimos teniendo gran número de filas, volvemos a filtrar. Ahora vamos a filtrar por número de atletas. Inicialmente tenemos 36.7k atletas. Vamos a probar con cuantos nos quedamos con el número de filas más adecuado.

In [9]:
# Filtrar atletas con ID < 5000.
df_filtered_5000 = df_filtered.filter(df_filtered.athlete < 5000)
print(f"Filas con athlete < 5000: {df_filtered_5000.count()}")

# Filtrar atletas con ID < 8000.
df_filtered_8000 = df_filtered.filter(df_filtered.athlete < 8000)
print(f"Filas con athlete < 8000: {df_filtered_8000.count()}")

# Filtrar atletas con ID < 10000.
df_filtered_10000 = df_filtered.filter(df_filtered.athlete < 10000)
print(f"Filas con athlete < 10000: {df_filtered_10000.count()}")

# Filtrar atletas con ID < 15000.
df_final = df_filtered.filter(df_filtered.athlete < 15000)
print(f"Filas con athlete < 15000: {df_final.count()}")

# Filtrar atletas con ID < 20000.
df_filtered_20000 = df_filtered.filter(df_filtered.athlete < 20000)
print(f"Filas con athlete < 20000: {df_filtered_20000.count()}")

Filas con athlete < 5000: 625635
Filas con athlete < 8000: 996749
Filas con athlete < 10000: 1249180
Filas con athlete < 15000: 1869208
Filas con athlete < 20000: 2471216


Nos quedamos con 15000 atletas, ya que creemos que es el valor más adecuado para lograr un equilibrio entre cantidad para un correcto aprendizaje y velocidad de procesamiento. 

### Creación de la variable "season" a partir de la variable datetime

In [10]:
df_final = df_final.withColumn(
    "season",
    when((month("datetime") >= 3) & (month("datetime") <= 5), "spring")
    .when((month("datetime") >= 6) & (month("datetime") <= 8), "summer")
    .when((month("datetime") >= 9) & (month("datetime") <= 11), "autumn")
    .otherwise("winter"))

### Seleccionamos solo las columnas relevantes

In [11]:
# Seleccionamos las columnas que no son datetime o major.
df_final = df_final.select("athlete", "distance", "duration", "gender", "age_group", "country", "season")

### Muestra de las cinco primeras filas y del esquema del dataset

In [12]:
# Mostrar las cinco primeras filas.
df_final.show(5, truncate=False)

# Mostrar esquema de columnas.
df_final.printSchema()

+-------+--------+------------------+------+---------+--------------+------+
|athlete|distance|duration          |gender|age_group|country       |season|
+-------+--------+------------------+------+---------+--------------+------+
|1      |5.72    |31.633333333333333|M     |35 - 54  |Germany       |winter|
|4      |8.07    |38.61666666666667 |M     |35 - 54  |United States |winter|
|5      |10.09   |43.56666666666667 |F     |35 - 54  |United States |winter|
|6      |9.82    |50.53333333333333 |M     |55 +     |United States |winter|
|8      |10.05   |59.05             |M     |35 - 54  |United Kingdom|winter|
+-------+--------+------------------+------+---------+--------------+------+
only showing top 5 rows
root
 |-- athlete: integer (nullable = true)
 |-- distance: double (nullable = true)
 |-- duration: double (nullable = true)
 |-- gender: string (nullable = true)
 |-- age_group: string (nullable = true)
 |-- country: string (nullable = true)
 |-- season: string (nullable = false)



### Muestra de filas y variables y principales estadísticas del dataset

In [13]:
# Número de filas y variables.
print(f"Filas: {df_final.count()}, Variables: {len(df_final.columns)}")

# Estadísticas descriptivas de variables numéricas.
df_final.describe(['distance', 'duration']).show()

# Distribución por género, grupo de edad y estación.
df_final.groupBy("gender").count().show()
df_final.groupBy("age_group").count().show()
df_final.groupBy("season").count().show()

Filas: 1869208, Variables: 7
+-------+-----------------+--------------------+
|summary|         distance|            duration|
+-------+-----------------+--------------------+
|  count|          1869208|             1869208|
|   mean|11.34178614258103|  62.651047279560196|
| stddev|6.851907901598767|   44.45724057163191|
|    min|             0.01|0.016666666666666666|
|    max|           263.37|              2202.0|
+-------+-----------------+--------------------+

+------+-------+
|gender|  count|
+------+-------+
|     F| 463958|
|     M|1405250|
+------+-------+

+---------+-------+
|age_group|  count|
+---------+-------+
|     55 +| 138448|
|  35 - 54|1117572|
|  18 - 34| 613188|
+---------+-------+

+------+------+
|season| count|
+------+------+
|winter|458894|
|spring|507774|
|summer|473599|
|autumn|428941|
+------+------+



### Eliminar valores atípicos (distancia, duración y ritmo medio)

In [14]:
# Filtramos primero por distancia. Eliminamos actividades con distancia < 0.5 km o > 100 km.
df_final = df_final.filter((col("distance") > 0.5) & (col("distance") < 100)) 

# Luego filtramos por duración. Eliminamos actividades con duración < 5 min o > 300 min.
df_final = df_final.filter((col("duration") > 5) & (col("duration") < 300)) 

# Hacemos un filtrado adicional por ritmo medio.
# Primero añadimos la columna pace_min_km (min/km) como el resultado de la duración entre la distancia.
df_final = df_final.withColumn("pace_min_km", col("duration") / col("distance")) 

# Filtramos actividades con ritmo medio < 1 min/km o > 9 min/km.
df_final = df_final.filter((col("pace_min_km") >= 1.0) & (col("pace_min_km") <= 9.0))

### Muestra de las cinco primeras filas y del esquema del dataset final

In [15]:
# Mostrar las cinco primeras filas.
df_final.show(5, truncate=False)

# Mostrar esquema de columnas.
df_final.printSchema()

+-------+--------+------------------+------+---------+--------------+------+-----------------+
|athlete|distance|duration          |gender|age_group|country       |season|pace_min_km      |
+-------+--------+------------------+------+---------+--------------+------+-----------------+
|1      |5.72    |31.633333333333333|M     |35 - 54  |Germany       |winter|5.53030303030303 |
|4      |8.07    |38.61666666666667 |M     |35 - 54  |United States |winter|4.785212722015696|
|5      |10.09   |43.56666666666667 |F     |35 - 54  |United States |winter|4.317806408985795|
|6      |9.82    |50.53333333333333 |M     |55 +     |United States |winter|5.145960624575696|
|8      |10.05   |59.05             |M     |35 - 54  |United Kingdom|winter|5.875621890547263|
+-------+--------+------------------+------+---------+--------------+------+-----------------+
only showing top 5 rows
root
 |-- athlete: integer (nullable = true)
 |-- distance: double (nullable = true)
 |-- duration: double (nullable = tr

### Muestra de filas y variables y principales estadísticas del dataset final

In [16]:
# Número de filas y variables.
print(f"Filas: {df_final.count()}, Variables: {len(df_final.columns)}")

# Estadísticas descriptivas de variables numéricas.
df_final.describe(['distance', 'duration', 'pace_min_km']).show()

# Distribución por género, grupo de edad y estación.
df_final.groupBy("gender").count().show()
df_final.groupBy("age_group").count().show()
df_final.groupBy("season").count().show()

Filas: 1806322, Variables: 8
+-------+------------------+-----------------+------------------+
|summary|          distance|         duration|       pace_min_km|
+-------+------------------+-----------------+------------------+
|  count|           1806322|          1806322|           1806322|
|   mean|11.326626417106976|60.57503812535387| 5.441138410356235|
| stddev| 6.204587315338015|33.55932521375634|0.8935773227352968|
|    min|              0.57|5.016666666666667|1.6812481773111694|
|    max|             88.53|           299.95|               9.0|
+-------+------------------+-----------------+------------------+

+------+-------+
|gender|  count|
+------+-------+
|     F| 440092|
|     M|1366230|
+------+-------+

+---------+-------+
|age_group|  count|
+---------+-------+
|     55 +| 129521|
|  35 - 54|1078807|
|  18 - 34| 597994|
+---------+-------+

+------+------+
|season| count|
+------+------+
|winter|446493|
|spring|491814|
|summer|454843|
|autumn|413172|
+------+------+



## Preparación de los datos para el entrenamiento

### Convertimos los datos categóricos en vectores one-hot

In [17]:
# Convertimos las columnas de tipo categórico en índices numéricos. 
gender_indexer = StringIndexer(inputCol="gender", outputCol="gender_index")
age_indexer = StringIndexer(inputCol="age_group", outputCol="age_index")
country_indexer = StringIndexer(inputCol="country", outputCol="country_index")
season_indexer = StringIndexer(inputCol="season", outputCol="season_index")

# Convertimos los índices numéricos del paso anterior en vectores one-hot.
encoder = OneHotEncoder(inputCols=["gender_index", "age_index", "country_index", "season_index"],
                        outputCols=["gender_vec", "age_vec", "country_vec", "season_vec"])

### Combinamos las columnas numéricas en un solo vector

In [18]:
# Creamos un vector features que combine todas las variables numéricas.
assembler = VectorAssembler(inputCols=["distance", "gender_vec", "age_vec", "country_vec", "season_vec"],
                            outputCol="features")

### Creamos un pipeline para aplicar todas las transformaciones

In [19]:
# Creamos el Pipeline con todas las etapas.
pipeline = Pipeline(stages=[gender_indexer, age_indexer, country_indexer, season_indexer, encoder, assembler])

# Ajustamos el pipeline y transformamos los datos.
df_prepared = pipeline.fit(df_final).transform(df_final)

### División de los datos en entrenamiento y test

In [20]:
# Obtenemos los atletas.
athletes = df_final.select("athlete").distinct()

# Asignamos aleatoriamente el 80% de atletas a train y el 20% a test.
train_athletes, test_athletes = athletes.randomSplit([0.8, 0.2], seed=42)

# Filtramos filas según atletas asignados.
train_df = df_prepared.join(train_athletes, on="athlete", how="inner")
test_df = df_prepared.join(test_athletes, on="athlete", how="inner")

# Mostramos el número de filas en train y test.
print(f"Train: {train_df.count()} filas, Test: {test_df.count()} filas")

Train: 1455983 filas, Test: 350339 filas


### Definición de la variable objetivo a predecir

In [21]:
# Renombramos la columna 'duration' a 'label', ya que es la variable a predecir.
train_df = train_df.withColumnRenamed("duration", "label")  
test_df = test_df.withColumnRenamed("duration", "label")    

## Modelos

### Función para calcular ritmos medios 

In [22]:
# Función para añadir columnas de ritmo (min/km) al DataFrame de predicciones.
def add_paces(pred_df, distance_col="distance", y_true_col="label", y_pred_col="prediction"):
  
    return (pred_df
            .withColumn("pace_real_min_km",  col(y_true_col) / col(distance_col))  
            .withColumn("pace_pred_min_km",  col(y_pred_col) / col(distance_col))) 

### Modelo Regresión Lineal

#### Creación del modelo

In [23]:
# Preparar evaluadores
evaluator_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
evaluator_r2   = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")

# Definir el modelo base
lr = LinearRegression(featuresCol="features", labelCol="label", maxIter=50)  

# Definir la grid de hiperparámetros
lr_grid = (ParamGridBuilder()
           .addGrid(lr.regParam, [0.0, 0.001, 0.01, 0.1])        
           .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) 
           .build())

# Configurar CrossValidator
lr_cv = CrossValidator(
    estimator=lr,
    estimatorParamMaps=lr_grid,
    evaluator=evaluator_r2, 
    numFolds=3,             
    parallelism=1,          
    seed=42,
    collectSubModels=False)

# Entrenar directamente el mejor modelo sobre train
print("Buscando los mejores hiperparámetros con CrossValidator...")
lr_cv_model = lr_cv.fit(train_df)

# Evaluación sobre test
lr_preds = lr_cv_model.transform(test_df).cache()

lr_rmse = evaluator_rmse.evaluate(lr_preds)
lr_r2   = evaluator_r2.evaluate(lr_preds)

print(f"\n=== Linear Regression (Mejor Modelo) ===")
print(f"RMSE: {lr_rmse:.4f}")
print(f"R²: {lr_r2:.4f}")

# Mejor modelo e hiperparámetros
best_lr = lr_cv_model.bestModel
best_params = {p.name: best_lr.getOrDefault(p) for p in best_lr.extractParamMap()}

print("[LR] Mejores hiperparámetros:")
for k in ["regParam", "elasticNetParam", "maxIter"]:
    if k in best_params:
        print(f"   {k}: {best_params[k]}")

print(f"[LR] Número de coeficientes: {len(best_lr.coefficients)}")

Buscando los mejores hiperparámetros con CrossValidator...

=== Linear Regression (Mejor Modelo) ===
RMSE: 10.5110
R²: 0.9015
[LR] Mejores hiperparámetros:
   regParam: 0.001
   elasticNetParam: 1.0
   maxIter: 50
[LR] Número de coeficientes: 113


#### Añadir ritmos y ver resultados

In [24]:
# Añadir columnas de ritmo (min/km)
lr_preds = add_paces(lr_preds)

# Mostrar algunas predicciones con ritmos
lr_preds.select("athlete", "distance", "label", "prediction", 
                "pace_real_min_km", "pace_pred_min_km").show(10, truncate=False)

+-------+------------------+------------------+------------------+------------------+------------------+
|athlete|distance          |label             |prediction        |pace_real_min_km  |pace_pred_min_km  |
+-------+------------------+------------------+------------------+------------------+------------------+
|1580   |11.54             |63.0              |59.306256071333564|5.459272097053726 |5.139190300808801 |
|5518   |13.200000000000001|57.45             |68.56728011690655 |4.3522727272727275|5.194490917947465 |
|9852   |10.18             |46.166666666666664|48.408025626862944|4.535036018336608 |4.755208804210506 |
|10623  |20.6              |107.0             |118.49682114573461|5.194174757281553 |5.752272871152165 |
|11141  |8.45              |48.25             |50.306698198278305|5.710059171597633 |5.9534554080802735|
|11858  |16.54             |124.61666666666667|87.06601033869286 |7.534260378879485 |5.263966767756521 |
|1580   |10.24             |62.0              |52.62069

#### Resultados gráficos

In [25]:
# Tomamos una pequeña muestra para visualización
sample_pd = lr_preds.select("pace_real_min_km", "pace_pred_min_km") \
                          .sample(False, 0.03, seed=42).toPandas()

# Eliminamos posibles valores extremos
sample_pd = sample_pd[
    (sample_pd["pace_real_min_km"].between(2, 8)) &
    (sample_pd["pace_pred_min_km"].between(2, 8))
]

# Creamos la figura interactiva
fig = go.Figure()

# Añadimos los puntos de dispersión
fig.add_trace(go.Scatter(
    x=sample_pd["pace_real_min_km"],
    y=sample_pd["pace_pred_min_km"],
    mode="markers",
    marker=dict(size=6, opacity=0.4, color="blue"),
    name="Predicciones",
    hovertemplate="<b>Ritmo real:</b> %{x:.2f} min/km<br><b>Ritmo predicho:</b> %{y:.2f} min/km"
))

# Línea ideal (y = x)
fig.add_trace(go.Scatter(
    x=[2, 8], y=[2, 8],
    mode="lines",
    line=dict(color="red", dash="dash"),
    name="Ideal (y = x)"
))

# Configuración del layout
fig.update_layout(
    title="Ritmo real vs. predicho (Linear Regression)",
    xaxis_title="Ritmo real (min/km)",
    yaxis_title="Ritmo predicho (min/km)",
    xaxis=dict(range=[2, 8], gridcolor="lightgray"),
    yaxis=dict(range=[2, 8], gridcolor="lightgray"),
    width=800,
    height=600,
    legend=dict(x=0.02, y=0.98),
    template="plotly_white"
)

# Mostrar
fig.show()

fig.write_html("grafico1.html")



### Modelo de Regresión basado en Árboles de Decisión

#### Creación del modelo

In [None]:
# Preparar evaluadores
evaluator_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
evaluator_r2   = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")

# Definir el modelo base
dt = DecisionTreeRegressor(featuresCol="features", labelCol="label", seed=42)

# Definir la grid de hiperparámetros
dt_grid = (ParamGridBuilder()
           .addGrid(dt.maxDepth, [5, 10, 15])                
           .addGrid(dt.minInstancesPerNode, [1, 5, 20])     
           .build())

# Configurar CrossValidator
dt_cv = CrossValidator(
    estimator=dt,
    estimatorParamMaps=dt_grid,
    evaluator=evaluator_r2, 
    numFolds=3,
    parallelism=2,           
    seed=42,
    collectSubModels=False)

# Entrenar directamente el mejor modelo sobre train
print("Buscando los mejores hiperparámetros con CrossValidator para Decision Tree...")
dt_cv_model = dt_cv.fit(train_df)

# Evaluación sobre test
dt_preds = dt_cv_model.transform(test_df).cache()

dt_rmse = evaluator_rmse.evaluate(dt_preds)
dt_r2   = evaluator_r2.evaluate(dt_preds)

print(f"\n=== Decision Tree Regressor (Mejor Modelo) ===")
print(f"RMSE: {dt_rmse:.4f}")
print(f"R²: {dt_r2:.4f}")

# Mejor modelo e hiperparámetros
best_dt = dt_cv_model.bestModel
best_dt_params = {
    "maxDepth": best_dt.getOrDefault(best_dt.maxDepth),
    "minInstancesPerNode": best_dt.getOrDefault(best_dt.minInstancesPerNode)}

print("[DT] Mejores hiperparámetros:", best_dt_params)
print(f"[DT] Profundidad del árbol: {best_dt.depth}")

Buscando los mejores hiperparámetros con CrossValidator para Decision Tree...


#### Añadir ritmos y ver resultados

In [None]:
# Añadir columnas de ritmo (min/km)
dt_preds = add_paces(dt_preds)

# Mostrar algunas predicciones con ritmos
dt_preds.select("athlete", "distance", "label", "prediction", 
                "pace_real_min_km", "pace_pred_min_km").show(10, truncate=False)

+-------+------------------+------------------+------------------+------------------+-----------------+
|athlete|distance          |label             |prediction        |pace_real_min_km  |pace_pred_min_km |
+-------+------------------+------------------+------------------+------------------+-----------------+
|1580   |11.54             |63.0              |59.37502536856212 |5.459272097053726 |5.145149512007117|
|5518   |13.200000000000001|57.45             |70.1823876805548  |4.3522727272727275|5.316847551557181|
|9852   |10.18             |46.166666666666664|50.158694632132146|4.535036018336608 |4.927180219266419|
|10623  |20.6              |107.0             |132.89422572178478|5.194174757281553 |6.451176005911882|
|11141  |8.45              |48.25             |47.968574181117525|5.710059171597633 |5.676754340960654|
|11858  |16.54             |124.61666666666667|87.71721119796887 |7.534260378879485 |5.303338040989654|
|1580   |10.24             |62.0              |53.66815704030261

#### Resultados gráficos

In [None]:
# Tomamos una pequeña muestra para visualización
sample_pd = dt_preds.select("pace_real_min_km", "pace_pred_min_km") \
                          .sample(False, 0.03, seed=42).toPandas()

# Eliminamos posibles valores extremos (por seguridad)
sample_pd = sample_pd[
    (sample_pd["pace_real_min_km"].between(2, 8)) &
    (sample_pd["pace_pred_min_km"].between(2, 8))
]

# Figura interactiva
fig = go.Figure()

# Puntos de dispersión
fig.add_trace(go.Scatter(
    x=sample_pd["pace_real_min_km"],
    y=sample_pd["pace_pred_min_km"],
    mode="markers",
    marker=dict(size=6, opacity=0.4, color="green"),
    name="Predicciones",
    hovertemplate="<b>Ritmo real:</b> %{x:.2f} min/km<br><b>Ritmo predicho:</b> %{y:.2f} min/km"
))

# Línea ideal (y = x)
fig.add_trace(go.Scatter(
    x=[2, 8], y=[2, 8],
    mode="lines",
    line=dict(color="red", dash="dash"),
    name="Ideal (y = x)"
))

# Configuración visual
fig.update_layout(
    title="Ritmo real vs. predicho (Decision Tree Regressor)",
    xaxis_title="Ritmo real (min/km)",
    yaxis_title="Ritmo predicho (min/km)",
    xaxis=dict(range=[2, 8], gridcolor="lightgray"),
    yaxis=dict(range=[2, 8], gridcolor="lightgray"),
    width=800,
    height=600,
    legend=dict(x=0.02, y=0.98),
    template="plotly_white"
)

# Mostrar figura interactiva
fig.show()


### Modelo de Regresión basado en un Bosque Aleatorio de Árboles de Decisión

#### Creación del modelo

In [None]:
# Preparar evaluadores
evaluator_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
evaluator_r2   = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")

# Definir el modelo base
rf = RandomForestRegressor(featuresCol="features", labelCol="label", seed=42)

# Definir la grid de hiperparámetros
rf_grid = (ParamGridBuilder()
           .addGrid(rf.numTrees, [2, 5, 10])                     
           .addGrid(rf.maxDepth, [3, 5, 10])                    
           .addGrid(rf.featureSubsetStrategy, ["auto", "sqrt"]) 
           .build())

# Configurar CrossValidator
rf_cv = CrossValidator(
    estimator=rf,
    estimatorParamMaps=rf_grid,
    evaluator=evaluator_r2,  
    numFolds=3,
    parallelism=2,           
    seed=42,
    collectSubModels=False)

# Entrenar directamente el mejor modelo sobre train
print("Buscando los mejores hiperparámetros con CrossValidator para Random Forest...")
rf_cv_model = rf_cv.fit(train_df)

# Evaluación sobre test
rf_preds = rf_cv_model.transform(test_df).cache()

rf_rmse = evaluator_rmse.evaluate(rf_preds)
rf_r2   = evaluator_r2.evaluate(rf_preds)

print(f"\n=== Random Forest Regressor (Mejor Modelo) ===")
print(f"RMSE: {rf_rmse:.4f}")
print(f"R²: {rf_r2:.4f}")

# Mejor modelo e hiperparámetros
best_rf = rf_cv_model.bestModel
best_rf_params = {
    "numTrees": best_rf.getNumTrees,
    "maxDepth": best_rf.getOrDefault(best_rf.maxDepth),
    "featureSubsetStrategy": best_rf.getOrDefault(best_rf.featureSubsetStrategy),}

print("[RF] Mejores hiperparámetros:", best_rf_params)

# Importancia de features (Top-10)
print("[RF] Top-10 importancias:", list(best_rf.featureImportances.toArray())[:10])

Buscando los mejores hiperparámetros con CrossValidator para Random Forest...

=== Random Forest Regressor (Mejor Modelo) ===
RMSE: 13.4586
R²: 0.8385
[RF] Mejores hiperparámetros: {'numTrees': 10, 'maxDepth': 10, 'featureSubsetStrategy': 'auto'}
[RF] Top-10 importancias: [0.9808886816449259, 0.0035216388881679855, 0.002011204539472217, 0.00511115388976862, 0.0006734818466692576, 0.0033781870591683694, 0.0006948370038978162, 6.398334661911931e-05, 8.60044855248726e-05, 1.3533457455006735e-05]


#### Añadir ritmos y ver resultados

In [None]:
# Añadir columnas de ritmo (min/km)
rf_preds = add_paces(rf_preds)

# Mostrar algunas predicciones con ritmos
rf_preds.select("athlete", "distance", "label", "prediction", 
                "pace_real_min_km", "pace_pred_min_km").show(10, truncate=False)

+-------+------------------+------------------+------------------+------------------+------------------+
|athlete|distance          |label             |prediction        |pace_real_min_km  |pace_pred_min_km  |
+-------+------------------+------------------+------------------+------------------+------------------+
|1580   |11.54             |63.0              |56.214063236550224|5.459272097053726 |4.871235982370037 |
|5518   |13.200000000000001|57.45             |61.71832781635184 |4.3522727272727275|4.675630895178169 |
|9852   |10.18             |46.166666666666664|52.921150120519584|4.535036018336608 |5.198541269206246 |
|10623  |20.6              |107.0             |125.37528675996575|5.194174757281553 |6.086178968930375 |
|11141  |8.45              |48.25             |44.73745254667521 |5.710059171597633 |5.2943730824467705|
|11858  |16.54             |124.61666666666667|89.56592042069715 |7.534260378879485 |5.415110061710831 |
|1580   |10.24             |62.0              |54.60829

#### Resultados gráficos

In [None]:
# Muestra para visualización
sample_pd = rf_preds.select("pace_real_min_km", "pace_pred_min_km") \
                          .sample(False, 0.03, seed=42).toPandas()

# Filtrado simple de outliers por rango visual
sample_pd = sample_pd[
    (sample_pd["pace_real_min_km"].between(2, 8)) &
    (sample_pd["pace_pred_min_km"].between(2, 8))
]

# Figura interactiva
fig = go.Figure()

# Dispersión
fig.add_trace(go.Scatter(
    x=sample_pd["pace_real_min_km"],
    y=sample_pd["pace_pred_min_km"],
    mode="markers",
    marker=dict(size=6, opacity=0.4, color="purple"),
    name="Predicciones",
    hovertemplate="<b>Ritmo real:</b> %{x:.2f} min/km<br><b>Ritmo predicho:</b> %{y:.2f} min/km"
))

# Línea ideal (y = x)
fig.add_trace(go.Scatter(
    x=[2, 8], y=[2, 8],
    mode="lines",
    line=dict(color="red", dash="dash"),
    name="Ideal (y = x)"
))

# Layout
fig.update_layout(
    title="Ritmo real vs. predicho (Random Forest Regressor)",
    xaxis_title="Ritmo real (min/km)",
    yaxis_title="Ritmo predicho (min/km)",
    xaxis=dict(range=[2, 8], gridcolor="lightgray"),
    yaxis=dict(range=[2, 8], gridcolor="lightgray"),
    width=800,
    height=600,
    legend=dict(x=0.02, y=0.98),
    template="plotly_white"
)

fig.show()
