# Práctica 1

## Importaciones

In [1]:
import os
import sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import month, when
from pyspark.sql.functions import col, when
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

## Inicio de sesión en Spark

In [None]:
os.environ['JAVA_HOME'] = "C:\Program Files\Java\jdk-17"
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

spark = SparkSession.builder \
    .appName("AA1") \
    .config("spark.driver.memory", "8g") \
    .getOrCreate()
"""# Inicio de la sesión de Spark.
spark = SparkSession.builder \
    .appName("MarathonRegression") \
    .getOrCreate()"""

'# Inicio de la sesión de Spark.\nspark = SparkSession.builder     .appName("MarathonRegression")     .getOrCreate()'

## Visualización inicial de los datos

### Carga del dataset

In [3]:
# Cargar el dataset.
df = spark.read.csv(r"C:\Users\Usuario\Desktop\ALEX\GCED\7º cuatri\aprendizaje automático a gran escala\run_ww_2020_d.csv", header=True, inferSchema=True)

### Muestra de las cinco primeras filas y del esquema

In [4]:
# Mostrar las cinco primeras filas.
df.show(5, truncate=False)

# Mostrar esquema de columnas.
df.printSchema()

+---+----------+-------+--------+------------------+------+---------+--------------+-----------------------+
|_c0|datetime  |athlete|distance|duration          |gender|age_group|country       |major                  |
+---+----------+-------+--------+------------------+------+---------+--------------+-----------------------+
|0  |2020-01-01|0      |0.0     |0.0               |F     |18 - 34  |United States |CHICAGO 2019           |
|1  |2020-01-01|1      |5.72    |31.633333333333333|M     |35 - 54  |Germany       |BERLIN 2016            |
|2  |2020-01-01|2      |0.0     |0.0               |M     |35 - 54  |United Kingdom|LONDON 2018,LONDON 2019|
|3  |2020-01-01|3      |0.0     |0.0               |M     |18 - 34  |United Kingdom|LONDON 2017            |
|4  |2020-01-01|4      |8.07    |38.61666666666667 |M     |35 - 54  |United States |BOSTON 2017            |
+---+----------+-------+--------+------------------+------+---------+--------------+-----------------------+
only showing top 5 

### Muestra de filas y variables y principales estadísticas del dataset

In [5]:
# Número de filas y variables.
print(f"Filas: {df.count()}, Variables: {len(df.columns)}")

# Estadísticas descriptivas de variables numéricas.
df.describe(['distance', 'duration']).show()

# Distribución por género y grupo de edad.
df.groupBy("gender").count().show()
df.groupBy("age_group").count().show()

Filas: 13326792, Variables: 9
+-------+-----------------+------------------+
|summary|         distance|          duration|
+-------+-----------------+------------------+
|  count|         13326792|          13326792|
|   mean|3.864718473881684| 21.39197543564929|
| stddev|6.661547347662435| 39.27358918572176|
|    min|              0.0|               0.0|
|    max|           347.95|2299.9666666666667|
+-------+-----------------+------------------+

+------+--------+
|gender|   count|
+------+--------+
|     F| 3253374|
|     M|10073418|
+------+--------+

+---------+-------+
|age_group|  count|
+---------+-------+
|     55 +| 940254|
|  35 - 54|7905966|
|  18 - 34|4480572|
+---------+-------+



## Preparación del dataset

### Filtrado de filas

Como tenemos un gran número de filas, vamos a filtrar. Primero, nos quedamos solo con los registros que no estén vacíos, es decir, aquellos cuya duración y distancia sea mayor a 0.

In [6]:
# Filtrado de filas.
df_filtered = df.filter((df.duration > 0) & (df.distance > 0))

# Comprobar tamaño.
print(f"Filas tras filtrar: {df_filtered.count()}")

Filas tras filtrar: 4581764


### Eliminamos las filas con valores nulos

In [None]:
df_filtered = df_filtered.dropna()

# Comprobar tamaño.
print(f"Filas tras filtrar: {df_filtered.count()}")

Como seguimos teniendo gran número de filas, volvemos a filtrar. Ahora vamos a filtrar por número de atletas. Inicialmente tenemos 36.7k atletas. Vamos a probar con cuantos nos quedamos con el número de filas más adecuado.

In [8]:
# Filtrar atletas con ID < 5000.
df_filtered_5000 = df_filtered.filter(df_filtered.athlete < 5000)
print(f"Filas con athlete < 5000: {df_filtered_5000.count()}")

# Filtrar atletas con ID < 8000.
df_filtered_8000 = df_filtered.filter(df_filtered.athlete < 8000)
print(f"Filas con athlete < 8000: {df_filtered_8000.count()}")

# Filtrar atletas con ID < 10000.
df_filtered_10000 = df_filtered.filter(df_filtered.athlete < 10000)
print(f"Filas con athlete < 10000: {df_filtered_10000.count()}")

# Filtrar atletas con ID < 15000.
df_final = df_filtered.filter(df_filtered.athlete < 15000)
print(f"Filas con athlete < 15000: {df_final.count()}")

# Filtrar atletas con ID < 20000.
df_filtered_20000 = df_filtered.filter(df_filtered.athlete < 20000)
print(f"Filas con athlete < 20000: {df_filtered_20000.count()}")

Filas con athlete < 5000: 625635
Filas con athlete < 8000: 996749
Filas con athlete < 10000: 1249180
Filas con athlete < 15000: 1869208
Filas con athlete < 20000: 2471216


Nos quedamos con 15000 atletas, ya que creemos que es el valor más adecuado para lograr un equilibrio entre cantidad para un correcto aprendizaje y velocidad de procesamiento. 

### Creación de la variable "season" a partir de la variable datetime

In [9]:
df_final = df_final.withColumn(
    "season",
    when((month("datetime") >= 3) & (month("datetime") <= 5), "spring")
    .when((month("datetime") >= 6) & (month("datetime") <= 8), "summer")
    .when((month("datetime") >= 9) & (month("datetime") <= 11), "autumn")
    .otherwise("winter"))

### Seleccionamos solo las columnas relevantes

In [10]:
# Seleccionamos las columnas que no son datetime o major.
df_final = df_final.select("athlete", "distance", "duration", "gender", "age_group", "country", "season")

### Muestra de las cinco primeras filas y del esquema del dataset final 

In [11]:
# Mostrar las cinco primeras filas.
df_final.show(5, truncate=False)

# Mostrar esquema de columnas.
df_final.printSchema()

+-------+--------+------------------+------+---------+--------------+------+
|athlete|distance|duration          |gender|age_group|country       |season|
+-------+--------+------------------+------+---------+--------------+------+
|1      |5.72    |31.633333333333333|M     |35 - 54  |Germany       |winter|
|4      |8.07    |38.61666666666667 |M     |35 - 54  |United States |winter|
|5      |10.09   |43.56666666666667 |F     |35 - 54  |United States |winter|
|6      |9.82    |50.53333333333333 |M     |55 +     |United States |winter|
|8      |10.05   |59.05             |M     |35 - 54  |United Kingdom|winter|
+-------+--------+------------------+------+---------+--------------+------+
only showing top 5 rows
root
 |-- athlete: integer (nullable = true)
 |-- distance: double (nullable = true)
 |-- duration: double (nullable = true)
 |-- gender: string (nullable = true)
 |-- age_group: string (nullable = true)
 |-- country: string (nullable = true)
 |-- season: string (nullable = false)



### Muestra de filas y variables y principales estadísticas del dataset final

In [12]:
# Número de filas y variables.
print(f"Filas: {df_final.count()}, Variables: {len(df_final.columns)}")

# Estadísticas descriptivas de variables numéricas.
df_final.describe(['distance', 'duration']).show()

# Distribución por género, grupo de edad y estación.
df_final.groupBy("gender").count().show()
df_final.groupBy("age_group").count().show()
df_final.groupBy("season").count().show()

Filas: 1869208, Variables: 7
+-------+-----------------+--------------------+
|summary|         distance|            duration|
+-------+-----------------+--------------------+
|  count|          1869208|             1869208|
|   mean| 11.3417861425814|   62.65104727956109|
| stddev|6.851907901598776|   44.45724057163195|
|    min|             0.01|0.016666666666666666|
|    max|           263.37|              2202.0|
+-------+-----------------+--------------------+

+------+-------+
|gender|  count|
+------+-------+
|     F| 463958|
|     M|1405250|
+------+-------+

+---------+-------+
|age_group|  count|
+---------+-------+
|     55 +| 138448|
|  35 - 54|1117572|
|  18 - 34| 613188|
+---------+-------+

+------+------+
|season| count|
+------+------+
|winter|458894|
|spring|507774|
|summer|473599|
|autumn|428941|
+------+------+



## Eliminar valores atípicos

In [13]:
# === Eliminación de valores atípicos ===
# Queremos eliminar actividades con duraciones o distancias poco realistas.

from pyspark.sql.functions import col

# Filtro básico: evitar actividades imposibles (duración o distancia muy baja/alta)
df_final = df_final.filter((col("distance") > 0.5) & (col("distance") < 100)) \
                   .filter((col("duration") > 5) & (col("duration") < 300))

# --- Filtrado adicional por ritmo medio ---
# Calculamos el ritmo medio (min/km) y filtramos actividades con ritmo fuera de [2,6] min/km
df_final = df_final.withColumn("pace_min_km", col("duration") / col("distance")) \
                   .filter((col("pace_min_km") >= 1.0) & (col("pace_min_km") <= 9.0))

# Comprobamos el tamaño tras limpiar outliers
print(f"Filas tras eliminar outliers: {df_final.count()}")

# (Opcional) Revisar estadísticas del nuevo ritmo
df_final.select("pace_min_km").describe().show()

df_final.summary()


Filas tras eliminar outliers: 1806322
+-------+------------------+
|summary|       pace_min_km|
+-------+------------------+
|  count|           1806322|
|   mean| 5.441138410356253|
| stddev| 0.893577322735295|
|    min|1.6812481773111694|
|    max|               9.0|
+-------+------------------+



DataFrame[summary: string, athlete: string, distance: string, duration: string, gender: string, age_group: string, country: string, season: string, pace_min_km: string]

## Preparación de los datos para el entrenamiento

### Convertimos los datos categóricos en vectores one-hot.

In [14]:
# Convertimos las columnas de tipo categórico en índices numéricos. 
gender_indexer = StringIndexer(inputCol="gender", outputCol="gender_index")
age_indexer = StringIndexer(inputCol="age_group", outputCol="age_index")
country_indexer = StringIndexer(inputCol="country", outputCol="country_index")
season_indexer = StringIndexer(inputCol="season", outputCol="season_index")

# Convertimos los índices numéricos del paso anterior en vectores one-hot.
encoder = OneHotEncoder(inputCols=["gender_index", "age_index", "country_index", "season_index"],
                        outputCols=["gender_vec", "age_vec", "country_vec", "season_vec"])

### Combinamos las columnas numéricas en un solo vector.

In [15]:
# Creamos un vector features que combine todas las variables numéricas.
assembler = VectorAssembler(inputCols=["distance", "gender_vec", "age_vec", "country_vec", "season_vec"],
                            outputCol="features")


### Creamos un pipeline para aplicar todas las transformaciones

In [16]:
# Creamos el Pipeline con todas las etapas.
pipeline = Pipeline(stages=[gender_indexer, age_indexer, country_indexer, season_indexer, encoder, assembler])

# Ajustamos el pipeline y transformamos los datos.
df_prepared = pipeline.fit(df_final).transform(df_final)

### División de los datos en entrenamiento y test

In [17]:
# Obtenemos los atletas.
athletes = df_final.select("athlete").distinct()

# Asignamos aleatoriamente el 80% de atletas a train y el 20% a test.
train_athletes, test_athletes = athletes.randomSplit([0.8, 0.2], seed=42)

# Filtramos filas según atletas asignados.
train_df = df_prepared.join(train_athletes, on="athlete", how="inner")
test_df = df_prepared.join(test_athletes, on="athlete", how="inner")

# Mostramos el número de filas en train y test.
print(f"Train: {train_df.count()} filas, Test: {test_df.count()} filas")

Train: 1455983 filas, Test: 350339 filas


## Creación de los modelos

In [18]:
from pyspark.sql.functions import col

def add_paces(pred_df, distance_col="distance", y_true_col="label", y_pred_col="prediction"):
    """
    Añade:
      - pace_real_min_km = label / distance
      - pace_pred_min_km = prediction / distance
    Asume duration/label/prediction en MINUTOS y distance en KM.
    """
    return (pred_df
            .withColumn("pace_real_min_km",  col(y_true_col) / col(distance_col))
            .withColumn("pace_pred_min_km",  col(y_pred_col) / col(distance_col)))


### Modelo Regresión Lineal

In [19]:
# Definir la columna objetivo
df_model = df_prepared.withColumnRenamed("duration", "label")

In [20]:

# Asegurarnos de que la columna 'label' existe
train_df = train_df.withColumnRenamed("duration", "label")
test_df = test_df.withColumnRenamed("duration", "label")

# Crear el modelo de regresión lineal
lr = LinearRegression(featuresCol="features", labelCol="label")

# Entrenar el modelo
lr_model = lr.fit(train_df)

# Generar predicciones sobre el conjunto de test
lr_predictions = lr_model.transform(test_df)

# Evaluador para regresión
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction")

# Calcular métricas
rmse = evaluator.setMetricName("rmse").evaluate(lr_predictions)
r2 = evaluator.setMetricName("r2").evaluate(lr_predictions)

# Mostrar resultados
print("=== Linear Regression ===")
print(f"RMSE: {rmse:.2f}")
print(f"R²: {r2:.4f}")

# (Opcional) Mostrar algunas predicciones reales vs. predichas
lr_predictions.select("athlete", "label", "prediction").show(10, truncate=False)


=== Linear Regression ===
RMSE: 10.51
R²: 0.9015
+-------+------------------+------------------+
|athlete|label             |prediction        |
+-------+------------------+------------------+
|1580   |63.0              |59.29363910423041 |
|5518   |57.45             |68.56256260068136 |
|9852   |46.166666666666664|48.39707813619842 |
|10623  |107.0             |118.51505242802655|
|11141  |48.25             |50.30567996209459 |
|11858  |124.61666666666667|87.08665259588511 |
|1580   |62.0              |52.60782553721557 |
|6397   |47.583333333333336|39.09607233576694 |
|6620   |94.0              |86.13238178973782 |
|9852   |57.53333333333333 |50.86568622248082 |
+-------+------------------+------------------+
only showing top 10 rows


In [21]:
lr_predictions = add_paces(lr_predictions)
lr_predictions.select("athlete", "distance", "label", "prediction", 
                      "pace_real_min_km", "pace_pred_min_km").show(10, truncate=False)


+-------+------------------+------------------+------------------+------------------+------------------+
|athlete|distance          |label             |prediction        |pace_real_min_km  |pace_pred_min_km  |
+-------+------------------+------------------+------------------+------------------+------------------+
|1580   |11.54             |63.0              |59.29363910423041 |5.459272097053726 |5.138096976103156 |
|5518   |13.200000000000001|57.45             |68.56256260068136 |4.3522727272727275|5.194133530354648 |
|9852   |10.18             |46.166666666666664|48.39707813619842 |4.535036018336608 |4.754133412200238 |
|10623  |20.6              |107.0             |118.51505242802655|5.194174757281553 |5.753157884855657 |
|11141  |8.45              |48.25             |50.30567996209459 |5.710059171597633 |5.953334906756757 |
|11858  |16.54             |124.61666666666667|87.08665259588511 |7.534260378879485 |5.265214788142994 |
|1580   |10.24             |62.0              |52.60782

In [1]:
import plotly.graph_objects as go

# Tomamos una pequeña muestra para visualización
sample_pd = lr_predictions.select("pace_real_min_km", "pace_pred_min_km") \
                          .sample(False, 0.03, seed=42).toPandas()

# Eliminamos posibles valores extremos
sample_pd = sample_pd[
    (sample_pd["pace_real_min_km"].between(2, 8)) &
    (sample_pd["pace_pred_min_km"].between(2, 8))
]

# Creamos la figura interactiva
fig = go.Figure()

# Añadimos los puntos de dispersión
fig.add_trace(go.Scatter(
    x=sample_pd["pace_real_min_km"],
    y=sample_pd["pace_pred_min_km"],
    mode="markers",
    marker=dict(size=6, opacity=0.4, color="blue"),
    name="Predicciones",
    hovertemplate="<b>Ritmo real:</b> %{x:.2f} min/km<br><b>Ritmo predicho:</b> %{y:.2f} min/km"
))

# Línea ideal (y = x)
fig.add_trace(go.Scatter(
    x=[2, 8], y=[2, 8],
    mode="lines",
    line=dict(color="red", dash="dash"),
    name="Ideal (y = x)"
))

# Configuración del layout
fig.update_layout(
    title="Ritmo real vs. predicho (Linear Regression)",
    xaxis_title="Ritmo real (min/km)",
    yaxis_title="Ritmo predicho (min/km)",
    xaxis=dict(range=[2, 8], gridcolor="lightgray"),
    yaxis=dict(range=[2, 8], gridcolor="lightgray"),
    width=800,
    height=600,
    legend=dict(x=0.02, y=0.98),
    template="plotly_white"
)

# Mostrar
fig.show()


NameError: name 'lr_predictions' is not defined

### Modelo de regresión basado en árboles de decisión

In [23]:


# Crear el modelo de árbol de decisión
dt = DecisionTreeRegressor(featuresCol="features", labelCol="label")

# Entrenar el modelo
dt_model = dt.fit(train_df)

# Generar predicciones sobre el conjunto de test
dt_predictions = dt_model.transform(test_df)

# Evaluar el modelo
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction")

rmse = evaluator.setMetricName("rmse").evaluate(dt_predictions)
r2 = evaluator.setMetricName("r2").evaluate(dt_predictions)

# Mostrar resultados
print("=== Decision Tree Regressor ===")
print(f"RMSE: {rmse:.2f}")
print(f"R²: {r2:.4f}")

# Mostrar algunas predicciones
dt_predictions.select("athlete", "label", "prediction").show(10, truncate=False)

=== Decision Tree Regressor ===
RMSE: 12.36
R²: 0.8638
+-------+------------------+------------------+
|athlete|label             |prediction        |
+-------+------------------+------------------+
|1580   |63.0              |58.675084711205095|
|5518   |57.45             |65.04588047390659 |
|9852   |46.166666666666664|52.027873300318575|
|10623  |107.0             |117.47675821889301|
|11141  |48.25             |46.48827375081197 |
|11858  |124.61666666666667|82.88784767860683 |
|1580   |62.0              |52.027873300318575|
|6397   |47.583333333333336|42.93124998401063 |
|6620   |94.0              |90.74458580256983 |
|9852   |57.53333333333333 |58.675084711205095|
+-------+------------------+------------------+
only showing top 10 rows


In [24]:
dt_predictions = add_paces(lr_predictions)
dt_predictions.select("athlete", "distance", "label", "prediction", 
                      "pace_real_min_km", "pace_pred_min_km").show(10, truncate=False)


+-------+------------------+------------------+------------------+------------------+------------------+
|athlete|distance          |label             |prediction        |pace_real_min_km  |pace_pred_min_km  |
+-------+------------------+------------------+------------------+------------------+------------------+
|1580   |11.54             |63.0              |59.29363910423041 |5.459272097053726 |5.138096976103156 |
|5518   |13.200000000000001|57.45             |68.56256260068136 |4.3522727272727275|5.194133530354648 |
|9852   |10.18             |46.166666666666664|48.39707813619842 |4.535036018336608 |4.754133412200238 |
|10623  |20.6              |107.0             |118.51505242802655|5.194174757281553 |5.753157884855657 |
|11141  |8.45              |48.25             |50.30567996209459 |5.710059171597633 |5.953334906756757 |
|11858  |16.54             |124.61666666666667|87.08665259588511 |7.534260378879485 |5.265214788142994 |
|1580   |10.24             |62.0              |52.60782

In [25]:
import plotly.graph_objects as go

# Tomamos una pequeña muestra para visualización
sample_pd = dt_predictions.select("pace_real_min_km", "pace_pred_min_km") \
                          .sample(False, 0.03, seed=42).toPandas()

# Eliminamos posibles valores extremos (por seguridad)
sample_pd = sample_pd[
    (sample_pd["pace_real_min_km"].between(2, 8)) &
    (sample_pd["pace_pred_min_km"].between(2, 8))
]

# Figura interactiva
fig = go.Figure()

# Puntos de dispersión
fig.add_trace(go.Scatter(
    x=sample_pd["pace_real_min_km"],
    y=sample_pd["pace_pred_min_km"],
    mode="markers",
    marker=dict(size=6, opacity=0.4, color="green"),
    name="Predicciones",
    hovertemplate="<b>Ritmo real:</b> %{x:.2f} min/km<br><b>Ritmo predicho:</b> %{y:.2f} min/km"
))

# Línea ideal (y = x)
fig.add_trace(go.Scatter(
    x=[2, 8], y=[2, 8],
    mode="lines",
    line=dict(color="red", dash="dash"),
    name="Ideal (y = x)"
))

# Configuración visual
fig.update_layout(
    title="Ritmo real vs. predicho (Decision Tree Regressor)",
    xaxis_title="Ritmo real (min/km)",
    yaxis_title="Ritmo predicho (min/km)",
    xaxis=dict(range=[2, 8], gridcolor="lightgray"),
    yaxis=dict(range=[2, 8], gridcolor="lightgray"),
    width=800,
    height=600,
    legend=dict(x=0.02, y=0.98),
    template="plotly_white"
)

# Mostrar figura interactiva
fig.show()


ModuleNotFoundError: No module named 'plotly'

### Modelo de regresión basado en un bosque aleatorio de árboles de decisión

In [26]:
# Crear el modelo de Random Forest
rf = RandomForestRegressor(featuresCol="features", labelCol="label", numTrees=70, maxDepth=8, seed=42)



# Entrenar el modelo
rf_model = rf.fit(train_df)

# Generar predicciones sobre el conjunto de test
rf_predictions = rf_model.transform(test_df)

# Evaluador
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction")

# Calcular métricas
rmse = evaluator.setMetricName("rmse").evaluate(rf_predictions)
r2 = evaluator.setMetricName("r2").evaluate(rf_predictions)

# Mostrar resultados
print("=== Random Forest Regressor ===")
print(f"RMSE: {rmse:.2f}")
print(f"R²: {r2:.4f}")

# Mostrar algunas predicciones
rf_predictions.select("athlete", "label", "prediction").show(10, truncate=False)

=== Random Forest Regressor ===
RMSE: 13.54
R²: 0.8365
+-------+------------------+------------------+
|athlete|label             |prediction        |
+-------+------------------+------------------+
|1580   |63.0              |57.11344550162917 |
|5518   |57.45             |63.10709948784832 |
|9852   |46.166666666666664|52.97158820476551 |
|10623  |107.0             |119.48795765479507|
|11141  |48.25             |43.291078561717825|
|11858  |124.61666666666667|93.2150388384905  |
|1580   |62.0              |55.78131323760753 |
|6397   |47.583333333333336|42.36493431469606 |
|6620   |94.0              |90.24799191192383 |
|9852   |57.53333333333333 |54.48272527797078 |
+-------+------------------+------------------+
only showing top 10 rows


In [27]:
rf_predictions = add_paces(lr_predictions)
rf_predictions.select("athlete", "distance", "label", "prediction", 
                      "pace_real_min_km", "pace_pred_min_km").show(10, truncate=False)


+-------+------------------+------------------+------------------+------------------+------------------+
|athlete|distance          |label             |prediction        |pace_real_min_km  |pace_pred_min_km  |
+-------+------------------+------------------+------------------+------------------+------------------+
|1580   |11.54             |63.0              |59.29363910423041 |5.459272097053726 |5.138096976103156 |
|5518   |13.200000000000001|57.45             |68.56256260068136 |4.3522727272727275|5.194133530354648 |
|9852   |10.18             |46.166666666666664|48.39707813619842 |4.535036018336608 |4.754133412200238 |
|10623  |20.6              |107.0             |118.51505242802655|5.194174757281553 |5.753157884855657 |
|11141  |8.45              |48.25             |50.30567996209459 |5.710059171597633 |5.953334906756757 |
|11858  |16.54             |124.61666666666667|87.08665259588511 |7.534260378879485 |5.265214788142994 |
|1580   |10.24             |62.0              |52.60782

In [None]:
import plotly.graph_objects as go

# Muestra para visualización
sample_pd = rf_predictions.select("pace_real_min_km", "pace_pred_min_km") \
                          .sample(False, 0.03, seed=42).toPandas()

# Filtrado simple de outliers por rango visual
sample_pd = sample_pd[
    (sample_pd["pace_real_min_km"].between(2, 8)) &
    (sample_pd["pace_pred_min_km"].between(2, 8))
]

# Figura interactiva
fig = go.Figure()

# Dispersión
fig.add_trace(go.Scatter(
    x=sample_pd["pace_real_min_km"],
    y=sample_pd["pace_pred_min_km"],
    mode="markers",
    marker=dict(size=6, opacity=0.4, color="purple"),
    name="Predicciones",
    hovertemplate="<b>Ritmo real:</b> %{x:.2f} min/km<br><b>Ritmo predicho:</b> %{y:.2f} min/km"
))

# Línea ideal (y = x)
fig.add_trace(go.Scatter(
    x=[2, 8], y=[2, 8],
    mode="lines",
    line=dict(color="red", dash="dash"),
    name="Ideal (y = x)"
))

# Layout
fig.update_layout(
    title="Ritmo real vs. predicho (Random Forest Regressor)",
    xaxis_title="Ritmo real (min/km)",
    yaxis_title="Ritmo predicho (min/km)",
    xaxis=dict(range=[2, 8], gridcolor="lightgray"),
    yaxis=dict(range=[2, 8], gridcolor="lightgray"),
    width=800,
    height=600,
    legend=dict(x=0.02, y=0.98),
    template="plotly_white"
)

fig.show()


## Busqueda de hiperparametros

In [28]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegression

# Evaluador propio
evaluator_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
evaluator_r2 = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")



# Modelo base
lr = LinearRegression(featuresCol="features", labelCol="label", maxIter=50)

# Grid propio
lr_grid = (ParamGridBuilder()
           .addGrid(lr.regParam, [0.0, 0.01, 0.1])
           .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])  # Ridge / Mixto / Lasso
           .build())

# CV propio
lr_cv = CrossValidator(
    estimator=lr,
    estimatorParamMaps=lr_grid,
    evaluator=evaluator_r2,
    numFolds=3,              # baja de 5/10 a 3
    parallelism=1,           # ¡clave para no reventar memoria!
    seed=42,
    collectSubModels=False   # evita guardar todos los submodelos
)


print("CV -> LinearRegression")
lr_cv_model = lr_cv.fit(train_df)

# Evaluación test
lr_preds = lr_cv_model.transform(test_df).cache()
lr_rmse  = evaluator_rmse.evaluate(lr_preds)
lr_r2    = evaluator_r2.evaluate(lr_preds)

# Mejores hiperparámetros
best_lr = lr_cv_model.bestModel
best_lr_params = {p.name: best_lr.getOrDefault(p) for p in best_lr.extractParamMap()}

print(f"[LR] RMSE: {lr_rmse:.4f} | R2: {lr_r2:.4f}")
print("[LR] Mejores hiperparámetros:", {k: best_lr_params[k] for k in ["regParam","elasticNetParam","maxIter"] if k in best_lr_params})

# (Opcional) Coeficientes
print("[LR] #Coef:", len(best_lr.coefficients))


CV -> LinearRegression
[LR] RMSE: 10.5104 | R2: 0.9015
[LR] Mejores hiperparámetros: {'regParam': 0.0, 'elasticNetParam': 0.5, 'maxIter': 50}
[LR] #Coef: 113


In [30]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import DecisionTreeRegressor



# Modelo base
dt = DecisionTreeRegressor(featuresCol="features", labelCol="label", seed=42)

# Grid propio
dt_grid = (ParamGridBuilder()
           .addGrid(dt.maxDepth, [5, 10, 15])
           .addGrid(dt.minInstancesPerNode, [1, 5, 20])
           .build())

# CV propio
dt_cv = CrossValidator(
    estimator=dt,
    estimatorParamMaps=dt_grid,
    evaluator=evaluator_r2,
    numFolds=3,
    parallelism=2,
    seed=42
)

print("CV -> DecisionTree")
dt_cv_model = dt_cv.fit(train_df)

# Evaluación test
dt_preds = dt_cv_model.transform(test_df).cache()
dt_rmse  = evaluator_rmse.evaluate(dt_preds)
dt_r2    = evaluator_r2.evaluate(dt_preds)

# Mejores hiperparámetros
best_dt = dt_cv_model.bestModel
best_dt_params = {
    "maxDepth": best_dt.getOrDefault(best_dt.maxDepth),
    "minInstancesPerNode": best_dt.getOrDefault(best_dt.minInstancesPerNode)
}

print(f"[DT] RMSE: {dt_rmse:.4f} | R2: {dt_r2:.4f}")
print("[DT] Mejores hiperparámetros:", best_dt_params)

# (Opcional) Importancias no aplican; es un árbol único. Puedes inspeccionar la profundidad:
print("[DT] Profundidad:", best_dt.depth)


CV -> DecisionTree
[DT] RMSE: 11.8483 | R2: 0.8749
[DT] Mejores hiperparámetros: {'maxDepth': 15, 'minInstancesPerNode': 5}
[DT] Profundidad: 15


In [31]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import RandomForestRegressor




# Modelo base
rf = RandomForestRegressor(featuresCol="features", labelCol="label", seed=42)

# Grid propio
rf_grid = (ParamGridBuilder()
           .addGrid(rf.numTrees, [2, 5, 10]) #Bajar mucho (2, 5, 10)
           .addGrid(rf.maxDepth, [3, 5, 10]) #Bajar
           .addGrid(rf.featureSubsetStrategy, ["auto", "sqrt"])
           .build())

# CV propio
rf_cv = CrossValidator(
    estimator=rf,
    estimatorParamMaps=rf_grid,
    evaluator=evaluator_r2,
    numFolds=3,
    parallelism=2,
    seed=42
)

print("CV -> RandomForest")
rf_cv_model = rf_cv.fit(train_df)

# Evaluación test
rf_preds = rf_cv_model.transform(test_df).cache()
rf_rmse  = evaluator_rmse.evaluate(rf_preds)
rf_r2    = evaluator_r2.evaluate(rf_preds)

# Mejores hiperparámetros
best_rf = rf_cv_model.bestModel
best_rf_params = {
    "numTrees": best_rf.getNumTrees,
    "maxDepth": best_rf.getOrDefault(best_rf.maxDepth),
    "featureSubsetStrategy": best_rf.getOrDefault(best_rf.featureSubsetStrategy),
}

print(f"[RF] RMSE: {rf_rmse:.4f} | R2: {rf_r2:.4f}")
print("[RF] Mejores hiperparámetros:", best_rf_params)

# (Opcional) Importancias
print("[RF] Top-10 importancias:", list(best_rf.featureImportances.toArray())[:10])


CV -> RandomForest
[RF] RMSE: 13.4586 | R2: 0.8385
[RF] Mejores hiperparámetros: {'numTrees': 10, 'maxDepth': 10, 'featureSubsetStrategy': 'auto'}
[RF] Top-10 importancias: [0.9808886816449259, 0.0035216388881679855, 0.002011204539472217, 0.00511115388976862, 0.0006734818466692576, 0.0033781870591683694, 0.0006948370038978162, 6.398334661911931e-05, 8.60044855248726e-05, 1.3533457455006735e-05]
