In [1]:
# Importa el meta modelo (El ensamble secuencial)
from meta_model import SequentialEnsembleRegressor

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.base import BaseEstimator, RegressorMixin, clone
from sklearn.utils import check_random_state
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score

In [3]:
# Load both datasets from CSV (paths are relative to the notebook's working directory)
house_prices_df = pd.read_csv("../data/house_prices.csv")
parkinsons_df = pd.read_csv("../data/parkinsons.csv")

In [4]:
# Drop every row that contains at least one missing value
house_prices_df = house_prices_df.dropna()
parkinsons_df = parkinsons_df.dropna()

In [5]:
# House Prices preprocessing: separate the attributes from the "SalePrice" target
atributos_house = house_prices_df.drop(columns="SalePrice").copy()
objetivo_house = house_prices_df["SalePrice"].copy()

# Encode the object-dtype (categorical) columns as numbers with OrdinalEncoder
# NOTE(review): the encoder is fitted on the whole dataset, before any cross-validation split
columnas_categoricas = atributos_house.select_dtypes(include="object").columns
atributos_house[columnas_categoricas] = OrdinalEncoder().fit_transform(atributos_house[columnas_categoricas])

In [6]:
# Parkinson preprocessing: separate the attributes from the "total_UPDRS" target
atributos_parkinson = parkinsons_df.drop(columns="total_UPDRS").copy()
objetivo_parkinson = parkinsons_df["total_UPDRS"].copy()

In [7]:
# Scaler used to normalize the attributes for kNN
normalizador = MinMaxScaler(
    # Each attribute is rescaled to the interval [0, 1]
    feature_range=(0, 1)
)

In [8]:
# Normalize the House Prices attributes.
# The scaled values are floats, so a new DataFrame is built from them (keeping the
# original column names and index) instead of writing them into a copy with `[:] = ...`,
# which would have to silently upcast any integer column (deprecated in recent pandas).
atributos_house_norm = pd.DataFrame(
    normalizador.fit_transform(atributos_house),
    columns=atributos_house.columns,
    index=atributos_house.index,
)

In [9]:
# Normalize the Parkinson attributes.
# fit_transform re-fits the shared scaler on this dataset. The float result is wrapped in a
# new DataFrame (same column names and index) instead of being written into a copy with
# `[:] = ...`, which would have to silently upcast any integer column (deprecated in
# recent pandas).
atributos_parkinson_norm = pd.DataFrame(
    normalizador.fit_transform(atributos_parkinson),
    columns=atributos_parkinson.columns,
    index=atributos_parkinson.index,
)

In [27]:
# Explore hyperparameter combinations for the sequential model ensemble (experimentation helper).
def explorar_hiperparametros_knn(param_grid, X, y, cv):
    """
    Cross-validate a SequentialEnsembleRegressor (with a KNeighborsRegressor base
    estimator) for every combination of the given hyperparameter values.

    Parameters:
    - param_grid: dict mapping hyperparameter names to lists of values to explore.
      Recognised keys and the default used when a key is missing:
      "n_estimators" ([50]), "lr" ([0.1]), "sample_size" ([0.8]),
      "n_neighbors" ([5]), "metric" (['euclidean']).
    - X, y: training data, passed unchanged to cross_val_score.
    - cv: cross-validation generator (e.g. KFold), passed to cross_val_score.

    Returns:
    - DataFrame with one row per combination (columns: n_estimators, lr, sample_size,
      n_neighbors, metric, r2_mean), sorted by r2_mean in descending order. r2_mean is
      the mean R² over the folds, rounded to 4 decimals. If any list in param_grid is
      empty there is nothing to evaluate and an empty DataFrame with those columns is
      returned.
    """
    # Local import keeps this cell runnable on its own.
    from itertools import product

    columnas = ["n_estimators", "lr", "sample_size", "n_neighbors", "metric", "r2_mean"]

    # product() iterates with the last argument varying fastest, i.e. the same order
    # as nesting the loops n_estimators > lr > sample_size > n_neighbors > metric.
    combinaciones = product(
        param_grid.get("n_estimators", [50]),
        param_grid.get("lr", [0.1]),
        param_grid.get("sample_size", [0.8]),
        param_grid.get("n_neighbors", [5]),
        param_grid.get("metric", ['euclidean']),
    )

    resultados = []
    for n_estimators, lr, sample_size, n_neighbors, metric in combinaciones:
        base_estimator = KNeighborsRegressor(
            n_neighbors=n_neighbors,
            metric=metric
        )

        model = SequentialEnsembleRegressor(
            base_estimator=base_estimator,
            n_estimators=n_estimators,
            lr=lr,
            sample_size=sample_size,
            random_state=42,  # fixed seed so every combination is evaluated reproducibly
        )

        r2_scores = cross_val_score(model, X, y, cv=cv, scoring='r2')
        resultados.append({
            "n_estimators": n_estimators,
            "lr": lr,
            "sample_size": sample_size,
            "n_neighbors": n_neighbors,
            "metric": metric,
            "r2_mean": np.round(r2_scores.mean(), 4)
        })

    # Passing the column list guarantees "r2_mean" exists even when no combination ran,
    # so sort_values does not fail on an empty result.
    df_resultados = pd.DataFrame(resultados, columns=columnas)
    return df_resultados.sort_values(by="r2_mean", ascending=False)


# Backward-compatible alias: other cells of this notebook call the helper by this name.
explorar_hiperparametros = explorar_hiperparametros_knn

In [11]:
# K-Fold cross-validation definition
# Splits the data into 5 shuffled folds, with a fixed seed for reproducibility
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [12]:
# Baseline model: a plain KNeighborsRegressor with 7 neighbours
modelo_base = KNeighborsRegressor(n_neighbors=7)

In [23]:
### DATASET HOUSE PRICES ###

In [13]:
# Baseline model evaluated on the House Prices dataset (R² on each fold of kf)
r2_base_house = cross_val_score(modelo_base, atributos_house_norm, objetivo_house, cv=kf, scoring='r2')
print("Modelo base House prices - R² medio:", np.round(r2_base_house.mean(), 4))

Modelo base House prices - R² medio: 0.6707


In [22]:
# Training and cross-validation with the sequential model ensemble. House Prices dataset
model = SequentialEnsembleRegressor(
    base_estimator=KNeighborsRegressor(n_neighbors=7),
    n_estimators=500,
    sample_size=0.8,
    lr=0.005,
    random_state=42,
)

# .values hands plain NumPy arrays (no pandas labels) to cross_val_score
scores = cross_val_score(model, atributos_house_norm.values, objetivo_house.values, cv=kf, scoring='r2')

print("R² medio House Prices:", np.round(scores.mean(), 4))

R² medio House Prices: 0.6836


In [24]:
# Training and cross-validation with the sequential model ensemble using early stopping. House Prices dataset
# The cross-validation loop is written by hand so the number of iterations run in each fold can be inspected
# NOTE(review): the stored output of this cell reports a mean R² of 0.117, far below the 0.6836 recorded
# for the same configuration without early stopping, with only 19-33 iterations per fold; patience=10
# may stop training too early for lr=0.005 - verify.

scores = []
n_iters = []  # iterations run in each fold

for train_idx, test_idx in kf.split(atributos_house_norm):
    # Positional indices from KFold are applied to the underlying NumPy arrays
    X_train, X_test = atributos_house_norm.values[train_idx], atributos_house_norm.values[test_idx]
    y_train, y_test = objetivo_house.values[train_idx], objetivo_house.values[test_idx]

    # A fresh model is built for every fold
    model = SequentialEnsembleRegressor(
        base_estimator=KNeighborsRegressor(n_neighbors=7),
        n_estimators=500,
        sample_size=0.8,
        lr=0.005,
        random_state=42,
        early_stopping=True,
        patience=10
    )

    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    scores.append(r2_score(y_test, pred))
    n_iters.append(len(model.models_))  # how many iterations were run (presumably one fitted model per iteration)

print("R² medio House Prices:", np.round(np.mean(scores), 4))
print("Iteraciones por fold:", n_iters)

R² medio House Prices: 0.117
Iteraciones por fold: [33, 23, 24, 19, 19]


In [26]:
# Values to explore. House Prices
param_grid = {
    "n_estimators": [200, 400],
    "lr": [0.005, 0.01],
    "sample_size": [0.7, 0.8],
    "n_neighbors": [5, 7],
    "metric": ["euclidean", "manhattan"]
}

# Cross-validated training on the House Prices dataset for every hyperparameter combination.
# Uses explorar_hiperparametros_knn, the helper defined earlier in this notebook.
df_resultados = explorar_hiperparametros_knn(
    param_grid=param_grid,
    X=atributos_house_norm.values,
    y=objetivo_house.values,
    cv=kf,
)

# Show the results (sorted by mean R², best first)
df_resultados

Unnamed: 0,n_estimators,lr,sample_size,n_neighbors,metric,r2_mean
19,400,0.005,0.7,7,manhattan,0.6976
11,200,0.01,0.7,7,manhattan,0.6972
23,400,0.005,0.8,7,manhattan,0.6938
15,200,0.01,0.8,7,manhattan,0.693
10,200,0.01,0.7,7,euclidean,0.6863
18,400,0.005,0.7,7,euclidean,0.6862
17,400,0.005,0.7,5,manhattan,0.6856
9,200,0.01,0.7,5,manhattan,0.6849
27,400,0.01,0.7,7,manhattan,0.681
22,400,0.005,0.8,7,euclidean,0.679


In [28]:
# Values to explore. House Prices
param_grid = {
    "n_estimators": [300],
    "lr": [0.005, 0.01],
    "sample_size": [0.7, 0.8],
    "n_neighbors": [7, 9],
    "metric": ["manhattan"]
}

# Cross-validated training on the House Prices dataset for every hyperparameter combination.
# Uses explorar_hiperparametros_knn, the helper defined earlier in this notebook.
df_resultados = explorar_hiperparametros_knn(
    param_grid=param_grid,
    X=atributos_house_norm.values,
    y=objetivo_house.values,
    cv=kf,
)

# Show the results (sorted by mean R², best first)
df_resultados

Unnamed: 0,n_estimators,lr,sample_size,n_neighbors,metric,r2_mean
5,300,0.01,0.7,9,manhattan,0.7052
7,300,0.01,0.8,9,manhattan,0.7025
4,300,0.01,0.7,7,manhattan,0.7017
6,300,0.01,0.8,7,manhattan,0.692
0,300,0.005,0.7,7,manhattan,0.6676
3,300,0.005,0.8,9,manhattan,0.6666
2,300,0.005,0.8,7,manhattan,0.6664
1,300,0.005,0.7,9,manhattan,0.666


In [29]:
# Values to explore. House Prices
param_grid = {
    "n_estimators": [300],
    "lr": [0.01, 0.015],
    "sample_size": [0.7, 0.8],
    "n_neighbors": [7, 9],
    "metric": ["manhattan"]
}

# Cross-validated training on the House Prices dataset for every hyperparameter combination.
# Uses explorar_hiperparametros_knn, the helper defined earlier in this notebook.
df_resultados = explorar_hiperparametros_knn(
    param_grid=param_grid,
    X=atributos_house_norm.values,
    y=objetivo_house.values,
    cv=kf,
)

# Show the results (sorted by mean R², best first)
df_resultados

Unnamed: 0,n_estimators,lr,sample_size,n_neighbors,metric,r2_mean
1,300,0.01,0.7,9,manhattan,0.7052
3,300,0.01,0.8,9,manhattan,0.7025
0,300,0.01,0.7,7,manhattan,0.7017
2,300,0.01,0.8,7,manhattan,0.692
5,300,0.015,0.7,9,manhattan,0.6726
7,300,0.015,0.8,9,manhattan,0.6655
4,300,0.015,0.7,7,manhattan,0.6625
6,300,0.015,0.8,7,manhattan,0.6397


In [30]:
### DATASET PARKINSON ###

In [31]:
# Baseline model evaluated on the Parkinson dataset (R² on each fold of kf)
r2_base_parkinson = cross_val_score(modelo_base, atributos_parkinson_norm, objetivo_parkinson, cv=kf, scoring='r2')
print("Modelo base Parkinson - R² medio:", np.round(r2_base_parkinson.mean(), 4))

Modelo base Parkinson - R² medio: 0.5155


In [37]:
# Training and cross-validation with the sequential model ensemble. Parkinson dataset
model = SequentialEnsembleRegressor(
    base_estimator=KNeighborsRegressor(n_neighbors=7),
    n_estimators=200,
    sample_size=0.8,
    lr=0.01,
    random_state=42,
)

# .values hands plain NumPy arrays (no pandas labels) to cross_val_score
scores = cross_val_score(model, atributos_parkinson_norm.values, objetivo_parkinson.values, cv=kf, scoring='r2')

print("R² medio Parkinson:", np.round(scores.mean(), 4))

R² medio Parkinson: 0.5042


In [39]:
# Training and cross-validation with the sequential model ensemble using early stopping. Parkinson dataset
# The cross-validation loop is written by hand so the number of iterations run in each fold can be inspected

scores = []
n_iters = []  # iterations run in each fold

for train_idx, test_idx in kf.split(atributos_parkinson_norm):
    # Positional indices from KFold are applied to the underlying NumPy arrays
    X_train, X_test = atributos_parkinson_norm.values[train_idx], atributos_parkinson_norm.values[test_idx]
    y_train, y_test = objetivo_parkinson.values[train_idx], objetivo_parkinson.values[test_idx]

    # A fresh model is built for every fold
    model = SequentialEnsembleRegressor(
        base_estimator=KNeighborsRegressor(n_neighbors=7),
        n_estimators=200,
        sample_size=0.8,
        lr=0.01,
        random_state=42,
        early_stopping=True,
        patience=20
    )

    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    scores.append(r2_score(y_test, pred))
    n_iters.append(len(model.models_))  # how many iterations were run (presumably one fitted model per iteration)

print("R² medio Parkinson:", np.round(np.mean(scores), 4))
print("Iteraciones por fold:", n_iters)

R² medio Parkinson: 0.4366
Iteraciones por fold: [111, 177, 77, 107, 63]


In [42]:
# Values to explore. Parkinson
param_grid = {
    "n_estimators": [200, 400],
    "lr": [0.005, 0.01],
    "sample_size": [0.7, 0.8],
    "n_neighbors": [5, 7],
    "metric": ["euclidean", "manhattan"]
}

# Cross-validated training on the Parkinson dataset for every hyperparameter combination.
# Uses explorar_hiperparametros_knn, the helper defined earlier in this notebook.
df_resultados = explorar_hiperparametros_knn(
    param_grid=param_grid,
    X=atributos_parkinson_norm.values,
    y=objetivo_parkinson.values,
    cv=kf,
)

# Show the results (sorted by mean R², best first)
df_resultados

Unnamed: 0,n_estimators,lr,sample_size,n_neighbors,metric,r2_mean
9,200,0.01,0.7,5,manhattan,0.5763
17,400,0.005,0.7,5,manhattan,0.5754
21,400,0.005,0.8,5,manhattan,0.5716
13,200,0.01,0.8,5,manhattan,0.5714
11,200,0.01,0.7,7,manhattan,0.57
19,400,0.005,0.7,7,manhattan,0.5698
23,400,0.005,0.8,7,manhattan,0.5688
15,200,0.01,0.8,7,manhattan,0.5688
5,200,0.005,0.8,5,manhattan,0.5103
16,400,0.005,0.7,5,euclidean,0.507


In [43]:
# Values to explore. Parkinson
param_grid = {
    "n_estimators": [300],
    "lr": [0.005, 0.01],
    "sample_size": [0.7, 0.8],
    "n_neighbors": [3, 5],
    "metric": ["manhattan"]
}

# Cross-validated training on the Parkinson dataset for every hyperparameter combination.
# Uses explorar_hiperparametros_knn, the helper defined earlier in this notebook.
df_resultados = explorar_hiperparametros_knn(
    param_grid=param_grid,
    X=atributos_parkinson_norm.values,
    y=objetivo_parkinson.values,
    cv=kf,
)

# Show the results (sorted by mean R², best first)
df_resultados

Unnamed: 0,n_estimators,lr,sample_size,n_neighbors,metric,r2_mean
0,300,0.005,0.7,3,manhattan,0.5716
2,300,0.005,0.8,3,manhattan,0.565
3,300,0.005,0.8,5,manhattan,0.5631
1,300,0.005,0.7,5,manhattan,0.5607
5,300,0.01,0.7,5,manhattan,0.541
7,300,0.01,0.8,5,manhattan,0.5181
4,300,0.01,0.7,3,manhattan,0.517
6,300,0.01,0.8,3,manhattan,0.4848


In [45]:
# Values to explore. Parkinson
param_grid = {
    "n_estimators": [200, 300],
    "lr": [0.005, 0.01],
    "sample_size": [0.7, 0.8],
    "n_neighbors": [3],
    "metric": ["manhattan"]
}

# Cross-validated training on the Parkinson dataset for every hyperparameter combination.
# Uses explorar_hiperparametros_knn, the helper defined earlier in this notebook.
df_resultados = explorar_hiperparametros_knn(
    param_grid=param_grid,
    X=atributos_parkinson_norm.values,
    y=objetivo_parkinson.values,
    cv=kf,
)

# Show the results (sorted by mean R², best first)
df_resultados

Unnamed: 0,n_estimators,lr,sample_size,n_neighbors,metric,r2_mean
2,200,0.01,0.7,3,manhattan,0.5755
4,300,0.005,0.7,3,manhattan,0.5716
5,300,0.005,0.8,3,manhattan,0.565
3,200,0.01,0.8,3,manhattan,0.5612
0,200,0.005,0.7,3,manhattan,0.5237
1,200,0.005,0.8,3,manhattan,0.5237
6,300,0.01,0.7,3,manhattan,0.517
7,300,0.01,0.8,3,manhattan,0.4848


In [51]:
# Values to explore. Parkinson
param_grid = {
    "n_estimators": [200],
    "lr": [0.008],
    "sample_size": [0.7, 0.8],
    "n_neighbors": [3, 5],
    "metric": ["manhattan"]
}

# Cross-validated training on the Parkinson dataset for every hyperparameter combination.
# Uses explorar_hiperparametros_knn, the helper defined earlier in this notebook.
df_resultados = explorar_hiperparametros_knn(
    param_grid=param_grid,
    X=atributos_parkinson_norm.values,
    y=objetivo_parkinson.values,
    cv=kf,
)

# Show the results (sorted by mean R², best first)
df_resultados

Unnamed: 0,n_estimators,lr,sample_size,n_neighbors,metric,r2_mean
0,200,0.008,0.7,3,manhattan,0.5755
3,200,0.008,0.8,5,manhattan,0.5678
2,200,0.008,0.8,3,manhattan,0.5673
1,200,0.008,0.7,5,manhattan,0.567


In [46]:
# Values to explore. Parkinson
param_grid = {
    "n_estimators": [200],
    "lr": [0.005, 0.01],
    "sample_size": [0.7, 0.8, 0.9],
    "n_neighbors": [3, 5],
    "metric": ["manhattan"]
}

# Cross-validated training on the Parkinson dataset for every hyperparameter combination.
# Uses explorar_hiperparametros_knn, the helper defined earlier in this notebook.
df_resultados = explorar_hiperparametros_knn(
    param_grid=param_grid,
    X=atributos_parkinson_norm.values,
    y=objetivo_parkinson.values,
    cv=kf,
)

# Show the results (sorted by mean R², best first)
df_resultados

Unnamed: 0,n_estimators,lr,sample_size,n_neighbors,metric,r2_mean
7,200,0.01,0.7,5,manhattan,0.5763
6,200,0.01,0.7,3,manhattan,0.5755
9,200,0.01,0.8,5,manhattan,0.5714
11,200,0.01,0.9,5,manhattan,0.5646
8,200,0.01,0.8,3,manhattan,0.5612
10,200,0.01,0.9,3,manhattan,0.5369
0,200,0.005,0.7,3,manhattan,0.5237
2,200,0.005,0.8,3,manhattan,0.5237
4,200,0.005,0.9,3,manhattan,0.5197
5,200,0.005,0.9,5,manhattan,0.5162
