In [59]:
# Importa el meta modelo (El ensamble secuencial)
from meta_model import SequentialEnsembleRegressor

In [60]:
import itertools

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, RegressorMixin, clone
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.utils import check_random_state

In [61]:
# Load both regression datasets (paths are relative to the notebook location).
house_prices_df, parkinsons_df = (
    pd.read_csv(ruta_csv)
    for ruta_csv in ("../data/house_prices.csv", "../data/parkinsons.csv")
)

In [62]:
# Discard every row that contains at least one missing value.
# The frames keep their original names because the later cells read them.
house_prices_df = house_prices_df.dropna(axis=0, how="any")
parkinsons_df = parkinsons_df.dropna(axis=0, how="any")

In [63]:
# House Prices preprocessing: separate the target column from the features.
objetivo_house = house_prices_df["SalePrice"].copy()
atributos_house = house_prices_df.drop(columns=["SalePrice"]).copy()

# Replace every object-dtype column with its ordinal encoding.
columnas_categoricas = atributos_house.select_dtypes(include="object").columns
codigos_ordinales = OrdinalEncoder().fit_transform(atributos_house[columnas_categoricas])
atributos_house[columnas_categoricas] = codigos_ordinales

In [64]:
# Parkinson preprocessing: separate the target column from the features.
objetivo_parkinson = parkinsons_df["total_UPDRS"].copy()
atributos_parkinson = parkinsons_df.drop(columns=["total_UPDRS"]).copy()

In [65]:
# Explora combinaciones de hiperparámetros para el ensamble secuencial de modelos. Para la experimentación.

def explorar_hiperparametros(estimator_class, param_grid, X, y, cv, random_state=42):
    """
    Cross-validate a SequentialEnsembleRegressor over a hyperparameter grid.

    Every combination of ``n_estimators``, ``lr``, ``sample_size`` and
    ``max_depth`` taken from ``param_grid`` is scored with ``cross_val_score``
    (R² scoring) and the mean score of each combination is recorded.

    Parameters:
    - estimator_class: class of the base estimator (e.g. DecisionTreeRegressor);
      it is instantiated with no arguments for each combination.
    - param_grid: dict mapping hyperparameter names to lists of values. Keys that
      are absent fall back to a single default value
      (n_estimators=50, lr=0.1, sample_size=0.8, max_depth=None).
    - X, y: training data.
    - cv: cross-validation splitter (e.g. KFold) forwarded to cross_val_score.
    - random_state: seed passed to the ensemble and, when the base estimator
      exposes a ``random_state`` parameter, to the base estimator as well.

    Returns:
    - DataFrame with one row per combination and the columns n_estimators, lr,
      sample_size, max_depth and r2_mean (mean R² rounded to 4 decimals),
      sorted by r2_mean in descending order. When any grid axis is an empty
      list the result is an empty DataFrame that still carries those columns.
    """
    columnas = ["n_estimators", "lr", "sample_size", "max_depth", "r2_mean"]

    # One axis per hyperparameter; itertools.product iterates them in the same
    # order as four nested loops (the last axis varies fastest).
    ejes = (
        param_grid.get("n_estimators", [50]),
        param_grid.get("lr", [0.1]),
        param_grid.get("sample_size", [0.8]),
        param_grid.get("max_depth", [None]),
    )

    resultados = []
    for n_estimators, lr, sample_size, max_depth in itertools.product(*ejes):
        base_estimator = estimator_class()
        if max_depth is not None:
            base_estimator.set_params(max_depth=max_depth)

        # Seed the base estimator when it supports it. An unseeded decision tree
        # can break ties between equally good splits differently on every run,
        # which makes repeated sweeps report different scores.
        # NOTE(review): assumes the ensemble does not already reseed its base
        # estimator internally; verify against meta_model.
        if hasattr(base_estimator, "get_params") and "random_state" in base_estimator.get_params():
            base_estimator.set_params(random_state=random_state)

        model = SequentialEnsembleRegressor(
            base_estimator=base_estimator,
            n_estimators=n_estimators,
            lr=lr,
            sample_size=sample_size,
            random_state=random_state,
        )

        r2_scores = cross_val_score(model, X, y, cv=cv, scoring='r2')
        resultados.append({
            "n_estimators": n_estimators,
            "lr": lr,
            "sample_size": sample_size,
            "max_depth": max_depth,
            "r2_mean": np.round(r2_scores.mean(), 4)
        })

    # Passing the columns explicitly keeps "r2_mean" present even when no
    # combination was evaluated, so sort_values cannot raise KeyError.
    df_resultados = pd.DataFrame(resultados, columns=columnas)
    return df_resultados.sort_values(by="r2_mean", ascending=False)

In [66]:
# K-Fold cross-validation splitter: 5 shuffled partitions with a fixed seed
# so the folds are the same on every run.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [67]:
# Baseline model: a single seeded decision tree limited to depth 5.
modelo_base = DecisionTreeRegressor(
    max_depth=5,
    random_state=42,
)

In [68]:
### DATASET HOUSE PRICES ###

In [69]:
# Baseline decision tree cross-validated on the House Prices dataset.
r2_base_house = cross_val_score(
    modelo_base,
    atributos_house,
    objetivo_house,
    scoring='r2',
    cv=kf,
)
print("Modelo base House prices - R² medio:", np.round(r2_base_house.mean(), 4))

Modelo base House prices - R² medio: 0.5256


In [30]:
# Train and cross-validate the sequential ensemble on the House Prices dataset.
# The base tree is seeded like `modelo_base`: an unseeded DecisionTreeRegressor
# can break ties between equally good splits differently on each run, so the
# reported R² would not be reproducible.
# NOTE(review): assumes the ensemble does not reseed its base estimator itself.
model = SequentialEnsembleRegressor(
    base_estimator=DecisionTreeRegressor(max_depth=5, random_state=42),
    n_estimators=100,
    sample_size=0.8,
    lr=0.1,
    random_state=42,
)

scores = cross_val_score(model, atributos_house.values, objetivo_house.values, cv=kf, scoring='r2')

print("R² medio House Prices:", np.round(scores.mean(), 4))

R² medio House Prices: 0.7403


In [40]:
# Train and cross-validate the sequential ensemble with early stopping on House Prices.
# Cross-validation is done by hand so that the number of fitted sub-models per fold
# can be inspected after early stopping.
# The base tree is seeded like `modelo_base`: an unseeded DecisionTreeRegressor can
# break ties between equally good splits differently on each run, which would make
# both the scores and the stopping points irreproducible.
# NOTE(review): assumes the ensemble does not reseed its base estimator itself.

# Convert to NumPy once instead of on every fold.
X_house = atributos_house.values
y_house = objetivo_house.values

scores = []
n_iters = []  # number of fitted sub-models in each fold

for train_idx, test_idx in kf.split(X_house):
    X_train, X_test = X_house[train_idx], X_house[test_idx]
    y_train, y_test = y_house[train_idx], y_house[test_idx]

    model = SequentialEnsembleRegressor(
        base_estimator=DecisionTreeRegressor(max_depth=5, random_state=42),
        n_estimators=100,
        sample_size=0.8,
        lr=0.1,
        random_state=42,
        early_stopping=True,
        patience=10
    )

    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    scores.append(r2_score(y_test, pred))
    # presumably one entry of models_ per executed iteration; verify against meta_model
    n_iters.append(len(model.models_))

print("R² medio House Prices:", np.round(np.mean(scores), 4))
print("Iteraciones por fold:", n_iters)

R² medio House Prices: 0.7396
Iteraciones por fold: [62, 100, 68, 85, 100]


In [39]:
# Hyperparameter values to explore on House Prices.
param_grid = dict(
    n_estimators=[50, 100],
    lr=[0.05, 0.1],
    sample_size=[0.8],
    max_depth=[3, 5, 10],
)

# Cross-validated sweep over the grid on the House Prices dataset.
df_resultados = explorar_hiperparametros(
    cv=kf,
    X=atributos_house.values,
    y=objetivo_house.values,
    estimator_class=DecisionTreeRegressor,
    param_grid=param_grid,
)

# Show the ranked results.
df_resultados

Unnamed: 0,n_estimators,lr,sample_size,max_depth,r2_mean
4,50,0.1,0.8,5,0.7613
7,100,0.05,0.8,5,0.7558
10,100,0.1,0.8,5,0.7448
1,50,0.05,0.8,5,0.7433
9,100,0.1,0.8,3,0.7424
6,100,0.05,0.8,3,0.7419
3,50,0.1,0.8,3,0.7402
0,50,0.05,0.8,3,0.7313
11,100,0.1,0.8,10,0.7222
2,50,0.05,0.8,10,0.7163


In [41]:
# Hyperparameter values to explore on House Prices (adds a larger learning rate).
param_grid = dict(
    n_estimators=[50, 100],
    lr=[0.05, 0.1, 0.2],
    sample_size=[0.8],
    max_depth=[3, 5, 10],
)

# Cross-validated sweep over the grid on the House Prices dataset.
df_resultados = explorar_hiperparametros(
    cv=kf,
    X=atributos_house.values,
    y=objetivo_house.values,
    estimator_class=DecisionTreeRegressor,
    param_grid=param_grid,
)

# Show the ranked results.
df_resultados

Unnamed: 0,n_estimators,lr,sample_size,max_depth,r2_mean
10,100,0.05,0.8,5,0.7634
4,50,0.1,0.8,5,0.7519
13,100,0.1,0.8,5,0.7485
7,50,0.2,0.8,5,0.7465
1,50,0.05,0.8,5,0.7429
9,100,0.05,0.8,3,0.7416
12,100,0.1,0.8,3,0.7305
0,50,0.05,0.8,3,0.7301
16,100,0.2,0.8,5,0.728
3,50,0.1,0.8,3,0.7279


In [42]:
# Hyperparameter values to explore on House Prices (varies the sample size).
param_grid = dict(
    n_estimators=[50, 100],
    lr=[0.05, 0.1, 0.2],
    sample_size=[0.6, 0.7, 0.8],
    max_depth=[3, 5],
)

# Cross-validated sweep over the grid on the House Prices dataset.
df_resultados = explorar_hiperparametros(
    cv=kf,
    X=atributos_house.values,
    y=objetivo_house.values,
    estimator_class=DecisionTreeRegressor,
    param_grid=param_grid,
)

# Show the ranked results.
df_resultados

Unnamed: 0,n_estimators,lr,sample_size,max_depth,r2_mean
23,100,0.05,0.8,5,0.7648
7,50,0.1,0.6,5,0.7594
29,100,0.1,0.8,5,0.7588
11,50,0.1,0.8,5,0.7574
17,50,0.2,0.8,5,0.7531
25,100,0.1,0.6,5,0.7528
19,100,0.05,0.6,5,0.7518
21,100,0.05,0.7,5,0.7485
22,100,0.05,0.8,3,0.7421
35,100,0.2,0.8,5,0.7421


In [43]:
# Hyperparameter values to explore on House Prices (adds an intermediate depth).
param_grid = dict(
    n_estimators=[50, 100],
    lr=[0.05, 0.1],
    sample_size=[0.6, 0.8],
    max_depth=[3, 5, 7],
)

# Cross-validated sweep over the grid on the House Prices dataset.
df_resultados = explorar_hiperparametros(
    cv=kf,
    X=atributos_house.values,
    y=objetivo_house.values,
    estimator_class=DecisionTreeRegressor,
    param_grid=param_grid,
)

# Show the ranked results.
df_resultados

Unnamed: 0,n_estimators,lr,sample_size,max_depth,r2_mean
16,100,0.05,0.8,5,0.7606
7,50,0.1,0.6,5,0.7595
2,50,0.05,0.6,7,0.7522
13,100,0.05,0.6,5,0.751
1,50,0.05,0.6,5,0.7503
21,100,0.1,0.8,3,0.7488
4,50,0.05,0.8,5,0.7482
10,50,0.1,0.8,5,0.7452
14,100,0.05,0.6,7,0.7436
20,100,0.1,0.6,7,0.7422


In [None]:
### DATASET PARKINSON ###

In [45]:
# Baseline decision tree cross-validated on the Parkinson dataset.
# Stored under its own name so the House Prices baseline (`r2_base_house`)
# is not overwritten by the Parkinson scores.
r2_base_parkinson = cross_val_score(modelo_base, atributos_parkinson, objetivo_parkinson, cv=kf, scoring='r2')
print("Modelo base Parkinson - R² medio:", np.round(r2_base_parkinson.mean(), 4))

Modelo base Parkinson - R² medio: 0.4939


In [46]:
# Train and cross-validate the sequential ensemble on the Parkinson dataset.
# The base tree is seeded like `modelo_base`: an unseeded DecisionTreeRegressor
# can break ties between equally good splits differently on each run, so the
# reported R² would not be reproducible.
# NOTE(review): assumes the ensemble does not reseed its base estimator itself.
model = SequentialEnsembleRegressor(
    base_estimator=DecisionTreeRegressor(max_depth=5, random_state=42),
    n_estimators=100,
    sample_size=0.8,
    lr=0.1,
    random_state=42,
)

scores = cross_val_score(model, atributos_parkinson.values, objetivo_parkinson.values, cv=kf, scoring='r2')

print("R² medio Parkinson:", np.round(scores.mean(), 4))

R² medio Parkinson: 0.8862


In [54]:
# Train and cross-validate the sequential ensemble with early stopping on Parkinson.
# Cross-validation is done by hand so that the number of fitted sub-models per fold
# can be inspected after early stopping.
# The base tree is seeded like `modelo_base`: an unseeded DecisionTreeRegressor can
# break ties between equally good splits differently on each run, which would make
# both the scores and the stopping points irreproducible.
# NOTE(review): assumes the ensemble does not reseed its base estimator itself.

# Convert to NumPy once instead of on every fold.
X_parkinson = atributos_parkinson.values
y_parkinson = objetivo_parkinson.values

scores = []
n_iters = []  # number of fitted sub-models in each fold

for train_idx, test_idx in kf.split(X_parkinson):
    X_train, X_test = X_parkinson[train_idx], X_parkinson[test_idx]
    y_train, y_test = y_parkinson[train_idx], y_parkinson[test_idx]

    model = SequentialEnsembleRegressor(
        base_estimator=DecisionTreeRegressor(max_depth=5, random_state=42),
        n_estimators=100,
        sample_size=0.8,
        lr=0.1,
        random_state=42,
        early_stopping=True,
        patience=6
    )

    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    scores.append(r2_score(y_test, pred))
    # presumably one entry of models_ per executed iteration; verify against meta_model
    n_iters.append(len(model.models_))

print("R² medio Parkinson:", np.round(np.mean(scores), 4))
print("Iteraciones por fold:", n_iters)

R² medio Parkinson: 0.861
Iteraciones por fold: [62, 63, 97, 35, 100]


In [55]:
# Hyperparameter values to explore by hand on Parkinson.
param_grid = dict(
    n_estimators=[50, 100],
    lr=[0.05, 0.1],
    sample_size=[0.8],
    max_depth=[3, 5, 10],
)

# Cross-validated sweep over the grid on the Parkinson dataset.
df_resultados = explorar_hiperparametros(
    cv=kf,
    X=atributos_parkinson.values,
    y=objetivo_parkinson.values,
    estimator_class=DecisionTreeRegressor,
    param_grid=param_grid,
)

# Show the ranked results.
df_resultados

Unnamed: 0,n_estimators,lr,sample_size,max_depth,r2_mean
8,100,0.05,0.8,10,0.9371
11,100,0.1,0.8,10,0.9355
5,50,0.1,0.8,10,0.934
2,50,0.05,0.8,10,0.9207
10,100,0.1,0.8,5,0.8883
7,100,0.05,0.8,5,0.8505
4,50,0.1,0.8,5,0.8459
1,50,0.05,0.8,5,0.7769
9,100,0.1,0.8,3,0.76
3,50,0.1,0.8,3,0.669


In [58]:
# Hyperparameter values to explore by hand on Parkinson (deeper trees, two sample sizes).
param_grid = dict(
    n_estimators=[50, 100],
    lr=[0.05, 0.1, 0.2],
    sample_size=[0.6, 0.8],
    max_depth=[7, 10],
)

# Cross-validated sweep over the grid on the Parkinson dataset.
df_resultados = explorar_hiperparametros(
    cv=kf,
    X=atributos_parkinson.values,
    y=objetivo_parkinson.values,
    estimator_class=DecisionTreeRegressor,
    param_grid=param_grid,
)

# Show the ranked results.
df_resultados

Unnamed: 0,n_estimators,lr,sample_size,max_depth,r2_mean
15,100,0.05,0.8,10,0.938
19,100,0.1,0.8,10,0.9364
7,50,0.1,0.8,10,0.9353
13,100,0.05,0.6,10,0.9329
18,100,0.1,0.8,7,0.9297
23,100,0.2,0.8,10,0.9278
17,100,0.1,0.6,10,0.9272
11,50,0.2,0.8,10,0.9254
10,50,0.2,0.8,7,0.9246
5,50,0.1,0.6,10,0.9245


In [57]:
# Hyperparameter values to explore by hand on Parkinson (probes depth beyond 10).
param_grid = dict(
    n_estimators=[50, 100],
    lr=[0.05, 0.1, 0.2],
    sample_size=[0.8],
    max_depth=[10, 13],
)

# Cross-validated sweep over the grid on the Parkinson dataset.
df_resultados = explorar_hiperparametros(
    cv=kf,
    X=atributos_parkinson.values,
    y=objetivo_parkinson.values,
    estimator_class=DecisionTreeRegressor,
    param_grid=param_grid,
)

# Show the ranked results.
df_resultados

Unnamed: 0,n_estimators,lr,sample_size,max_depth,r2_mean
8,100,0.1,0.8,10,0.9388
6,100,0.05,0.8,10,0.9382
7,100,0.05,0.8,13,0.9371
2,50,0.1,0.8,10,0.9369
9,100,0.1,0.8,13,0.9345
3,50,0.1,0.8,13,0.9343
10,100,0.2,0.8,10,0.9275
4,50,0.2,0.8,10,0.9272
11,100,0.2,0.8,13,0.9261
5,50,0.2,0.8,13,0.9252
