In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.utils import check_random_state
from sklearn.base import BaseEstimator, RegressorMixin, clone
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LinearRegression

In [2]:
house = pd.read_csv('house_prices.csv')
parkinson = pd.read_csv('parkinsons.csv')

In [3]:
# Guardamos los atributos categoricos para pasarlos a numéricos
cat_cols = house.select_dtypes(include=['object']).columns.tolist()
print("Columnas categóricas en house:", cat_cols)

Columnas categóricas en house: ['Condition2', 'LandContour', 'HouseStyle', 'GarageType', 'FireplaceQu', 'Alley', 'MSZoning', 'CentralAir', 'GarageCond', 'Exterior2nd', 'BsmtFinType2', 'GarageQual', 'ExterQual', 'PavedDrive', 'LotShape', 'KitchenQual', 'SaleType', 'BsmtExposure', 'ExterCond', 'BsmtQual', 'MiscFeature', 'PoolQC']


In [4]:
# Codificamos las variables categóricas, ponemos usamos drop_first=True para eliminar la primera categoría de cada variable y evitar la multicolinealidad
house_enc = pd.get_dummies(house, columns=cat_cols, drop_first=True)

Separar features y target en house

In [5]:
# Separamos las variables y la variable objetivo
X_house = house_enc.drop('SalePrice', axis=1)
y_house = house_enc['SalePrice']
print("Dimensiones X_house:", X_house.shape)
print("Dimensiones y_house:", y_house.shape)

Dimensiones X_house: (560, 109)
Dimensiones y_house: (560,)


Para Parkinsons no hay categóricas, solo separamos

In [6]:
X_parkinson = parkinson.drop('total_UPDRS', axis=1)
y_parkinson = parkinson['total_UPDRS']
print("Dimensiones X_parkinson:", X_parkinson.shape)
print("Dimensiones y_parkinson:", y_parkinson.shape)

Dimensiones X_parkinson: (2000, 19)
Dimensiones y_parkinson: (2000,)


Dividimos en datos de entrenamiento y prueba

In [7]:
Xh_train, Xh_test, yh_train, yh_test = train_test_split(
    X_house, y_house, test_size=0.2, random_state=42
)
print("House split:", Xh_train.shape, Xh_test.shape, yh_train.shape, yh_test.shape)

House split: (448, 109) (112, 109) (448,) (112,)


In [8]:
Xp_train, Xp_test, yp_train, yp_test = train_test_split(
    X_parkinson, y_parkinson, test_size=0.2, random_state=42
)
print("Parkinsons split:", Xp_train.shape, Xp_test.shape, yp_train.shape, yp_test.shape)


Parkinsons split: (1600, 19) (400, 19) (1600,) (400,)


In [82]:
class SequentialEnsembleRegressor(BaseEstimator, RegressorMixin):
    def __init__(self,
                 # Estimador a usar
                 estimator=None,
                 # Número de modelos a entrenar
                 n_estimators=50,
                 # Proporción del tamaño de la datos para cada modelo
                 sample_size=0.8,
                 # Tasa de aprendizaje
                 lr=0.1,
                 # Semilla para la aleatoriedad
                 random_state=None,
                 # Parámetros del estimador como max_depth, min_samples_split, etc.
                 max_depth=None):
        self.estimator = estimator
        self.n_estimators = n_estimators
        self.sample_size = sample_size
        self.lr = lr
        self.random_state = random_state
        self.max_depth = max_depth


    def fit(self, X, y):
        #
        self.init_prediction_ = np.mean(y)
        # Lista para almacenar los modelos entrenados
        self.models = []
        # Guardo el número de muestras
        n_samples = X.shape[0]
        # Inicializamos las predicciones a cero
        pred = np.full(shape=n_samples, fill_value=self.init_prediction_)
        # Inicializamos el generador aleatorio
        rng = check_random_state(self.random_state)
        
        for i in range(self.n_estimators):
            #Calculo el residuo actual
            resid = y - pred
            
            # Creo el submodelo con los datos muestreados y los residuos como objetivos
            model = clone(self.estimator)
            if hasattr(model, "random_state"):
                model.set_params(random_state=self.random_state)
            # Si hay estimador_params, los aplico al modelo
            if self.max_depth is not None:
                if hasattr(model, "max_depth"):
                    model.set_params(max_depth=self.max_depth)

            # Tomo una muesta aleatoria de los datos
            k = int(self.sample_size * n_samples)
            # Como no hay reemplazo, replace=False
            idx = rng.choice(n_samples, k, replace=False)
            X_sub = X.iloc[idx]
            y_sub = resid.iloc[idx] if hasattr(resid, "iloc") else resid[idx]

            
            model.fit(X_sub, y_sub)
            # Almaceno el modelo entrenado
            self.models.append(model)
            # Actualizo las predicciones
            pred += self.lr * model.predict(X)
        return self

#Devolvemos la prediccion final sumando uno a uno y multiplicando por la tasa de aprendizaje
    def predict(self, X):
        pred = np.full(shape=X.shape[0], fill_value=self.init_prediction_)
        for m in self.models:
            pred += self.lr * m.predict(X)
        return pred

Evaluar Housing

In [13]:
ens_h = SequentialEnsembleRegressor(
    estimator=DecisionTreeRegressor(),
    n_estimators=50,
    sample_size=0.8,
    lr=0.1,
    random_state=42,
    max_depth=3
)

In [14]:
ens_h.fit(Xh_train, yh_train)
yh_pred = ens_h.predict(Xh_test)
print("House R²:", r2_score(yh_test, yh_pred))
print("House MAE:", mean_absolute_error(yh_test, yh_pred))

House R²: 0.7071738896188212
House MAE: 28496.63664458055


Evaluar parkingson

In [15]:
ens_p = SequentialEnsembleRegressor(
    estimator=DecisionTreeRegressor(),
    n_estimators=50,
    sample_size=0.8,
    lr=0.1,
    random_state=42,
    max_depth=3
)

In [16]:
ens_p.fit(Xp_train, yp_train)
yp_pred = ens_p.predict(Xp_test)
print("Parkinsons R²:", r2_score(yp_test, yp_pred))
print("Parkinsons MAE:", mean_absolute_error(yp_test, yp_pred))

Parkinsons R²: 0.6690157606392105
Parkinsons MAE: 4.847183397911717


Validación cruzada y busqueda de hiperparametros

In [17]:
# Se dividen los datos en 5 particiones, barajamos los datos y usamos la semilla 42 para reproducibilidad
cv = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(
    SequentialEnsembleRegressor(
        # Estimador a usar
        estimator=DecisionTreeRegressor(max_depth=3),
        n_estimators=50, sample_size=0.8, lr=0.1, random_state=42
    ),
    Xh_train, yh_train,
    scoring='r2', cv=cv, n_jobs=-1
    # Se entrena el modelo con 4 de las 5 particiones y se evalua la restante usando R² como métrica
    # Repite el proceso 5 veces, cada vez con una partición diferente como test
    # Pongo n_jobs -1 para usar todos los núcleos disponibles en la CPU
)
print("House CV R²:", scores, "→ Media:", scores.mean())

House CV R²: [0.72356832 0.75864956 0.76608286 0.78143207 0.77203406] → Media: 0.7603533739391801


In [18]:
#Modelo base con hiperparámetros iniciales
base_model = SequentialEnsembleRegressor(
    estimator=DecisionTreeRegressor(max_depth=3),
    n_estimators=50,
    sample_size=0.8,
    lr=0.1,
    random_state=42
)

In [19]:
scores_house = cross_val_score(
    base_model, Xh_train, yh_train,
    scoring='r2', cv=cv, n_jobs=-1
)
print("House CV R² scores:", scores_house)
print("Media R²:", scores_house.mean())
# Si los modelos se parecen entre si el modelo es estable

House CV R² scores: [0.72356832 0.75864956 0.76608286 0.78143207 0.77203406]
Media R²: 0.7603533739391801


Parkinson

In [20]:
scores_parkinson = cross_val_score(
    base_model, Xp_train, yp_train,
    scoring='r2', cv=cv, n_jobs=-1
)
print("Parkinsons CV R² scores:", scores_parkinson)
print("Media R²:", scores_parkinson.mean())

Parkinsons CV R² scores: [0.63276246 0.66040138 0.63794864 0.65623765 0.6812172 ]
Media R²: 0.6537134652991631


Busqueda manual de hiperparámetros

In [79]:
def grid_search_seq_ensemble(X, y, param_grid, cv):
    results = []
    for n in param_grid['n_estimators']:
        for lr in param_grid['lr']:
            for s in param_grid['sample_size']:
                for md in param_grid['max_depth']:
                    model = SequentialEnsembleRegressor(
                        estimator=DecisionTreeRegressor(),
                        n_estimators=n,
                        lr=lr,
                        sample_size=s,
                        random_state=42,
                        max_depth=md
                    )
                    scores = cross_val_score(
                        model, X, y,
                        scoring='r2', cv=cv, n_jobs=-1
                    )
                    results.append({
                        'n_estimators': n,
                        'lr': lr,
                        'sample_size': s,
                        'max_depth': md,
                        'r2_mean': scores.mean()
                    })

    df_results = pd.DataFrame(results)
    df_results = df_results.sort_values('r2_mean', ascending=False).reset_index(drop=True)
    print("Top 10 combos:\n", df_results.head(10))


In [84]:
param_grid = {
    'n_estimators': [10, 50, 100],
    'lr': [0.01, 0.1, 0.2],
    'sample_size': [0.6, 0.8, 1.0],
    'max_depth': [3, 5, 7]
}
cv = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search_seq_ensemble(Xh_train, yh_train, param_grid, cv)

Top 10 combos:
    n_estimators   lr  sample_size  max_depth   r2_mean
0           100  0.2          0.8          3  0.790717
1           100  0.1          0.6          3  0.790011
2           100  0.2          1.0          3  0.788360
3           100  0.1          1.0          3  0.786865
4           100  0.1          1.0          5  0.784163
5            50  0.2          0.8          3  0.783979
6            50  0.2          1.0          3  0.783529
7           100  0.2          1.0          5  0.782545
8            50  0.2          1.0          5  0.780902
9           100  0.1          0.6          7  0.778543


In [None]:
# Cuanto mayor sea el número de los estimadores mejor, pero también más tiempo de entrenamiento
param_grid = {
    'n_estimators': [ 50, 100, 150],
    'lr': [0.01, 0.1, 0.2],
    'sample_size': [0.6, 0.8, 1.0],
    'max_depth': [3, 5, 7]
}
grid_search_seq_ensemble(Xh_train, yh_train, param_grid, cv)


Top 10 combos:
    n_estimators   lr  sample_size  max_depth   r2_mean
0           150  0.1          0.6          3  0.798998
1           150  0.1          1.0          3  0.792579
2           150  0.2          0.8          3  0.791567
3           100  0.2          0.8          3  0.790717
4           100  0.1          0.6          3  0.790011
5           150  0.2          1.0          3  0.789513
6           100  0.2          1.0          3  0.788360
7           100  0.1          1.0          3  0.786865
8           150  0.1          1.0          5  0.785773
9           100  0.1          1.0          5  0.784163


In [25]:
# Cuanto mayor la tasa de aprendizaje, más rápido aprende el modelo, pero también puede sobreajustar más rápido
param_grid = {
    'n_estimators': [ 50, 100, 150],
    'lr': [0.1, 0.2, 0.3],
    'sample_size': [0.6, 0.8, 1.0],
    'max_depth': [3, 5, 7]
}
grid_search_seq_ensemble(Xh_train, yh_train, param_grid, cv)
# Vamos a probar con una tasa de aprendizaje mas alta

Top 10 combos:
    n_estimators   lr  sample_size  max_depth   r2_mean
0           150  0.1          0.6          3  0.800152
1           150  0.1          0.8          3  0.799826
2           100  0.1          0.6          3  0.795902
3           100  0.1          0.8          3  0.795000
4            50  0.1          0.6          7  0.794500
5           100  0.1          0.6          7  0.793904
6           150  0.1          1.0          3  0.793392
7           150  0.1          0.6          7  0.792711
8           100  0.3          1.0          3  0.789663
9           100  0.1          1.0          3  0.787341


In [23]:
# Con tasa de aprendizaje mayor a 0.3 el modelo empieza a sobreajustar, por lo que es mejor usar una tasa de aprendizaje menor
param_grid = {
    'n_estimators': [ 50, 100, 150],
    'lr': [0.1, 0.2, 0.3, 0.4],
    'sample_size': [0.6, 0.8, 1.0],
    'max_depth': [3, 5, 7]
}
grid_search_seq_ensemble(Xh_train, yh_train, param_grid, cv)

Top 10 combos:
    n_estimators   lr  sample_size  max_depth   r2_mean
0           150  0.3          1.0          3  0.809838
1           100  0.3          1.0          3  0.806977
2            50  0.3          1.0          3  0.802014
3           150  0.1          0.6          3  0.798998
4           150  0.1          1.0          3  0.792579
5           150  0.2          0.8          3  0.791567
6           100  0.2          0.8          3  0.790717
7           100  0.1          0.6          3  0.790011
8           150  0.2          1.0          3  0.789513
9           100  0.2          1.0          3  0.788360


In [24]:
# Con un tamaño de muestra menor, el modelo puede aprender más rápido, pero también puede sobreajustar más rápido
param_grid = {
    'n_estimators': [ 50, 100, 150],
    'lr': [0.1, 0.2, 0.3, 0.4],
    'sample_size': [0.2, 0.6, 0.8],
    'max_depth': [3, 5, 7]
}
grid_search_seq_ensemble(Xh_train, yh_train, param_grid, cv)
# Como podemos ver, el tamaño de muestra no afecta mucho al rendimiento del modelo, pero si se reduce demasiado, el modelo puede no aprender lo suficiente.

Top 10 combos:
    n_estimators   lr  sample_size  max_depth   r2_mean
0           150  0.1          0.6          3  0.798998
1           150  0.2          0.8          3  0.791567
2           100  0.2          0.8          3  0.790717
3           100  0.1          0.6          3  0.790011
4            50  0.2          0.8          3  0.783979
5           150  0.1          0.6          7  0.778547
6           100  0.1          0.6          7  0.778543
7           150  0.1          0.6          5  0.777164
8           100  0.1          0.6          5  0.776969
9           150  0.1          0.8          3  0.775795


In [23]:
# Con max_depth mayor, el modelo puede aprender más rápido, pero también puede sobreajustar más rápido
param_grid = {
    'n_estimators': [ 50, 100, 150],
    'lr': [0.1, 0.2, 0.3, 0.4],
    'sample_size': [0.6, 0.8, 1.0],
    'max_depth': [2, 5, 7]
}
grid_search_seq_ensemble(Xh_train, yh_train, param_grid, cv)
# Como podemos ver, poniendo un max_depth menor el modelo reduce el sobreajuste, mientras que con un max_depth mayor puede captar patrones más complejos
# pero con mayor riesgo de memorizar ruido.

Top 10 combos:
    n_estimators   lr  sample_size  max_depth   r2_mean
0           150  0.2          0.8          2  0.801028
1           100  0.2          0.8          2  0.793301
2            50  0.2          0.8          2  0.791917
3           150  0.1          0.8          2  0.789721
4           150  0.1          1.0          2  0.786229
5           150  0.1          0.6          2  0.785940
6           150  0.1          1.0          5  0.785773
7           100  0.1          1.0          5  0.784163
8           150  0.2          1.0          5  0.783008
9           100  0.2          1.0          5  0.782545


Evaluación en test con los mejores hiperparámetros

In [None]:
# Top 10 combos:
#     n_estimators   lr  sample_size  max_depth   r2_mean
# 0           150  0.3          1.0          3  0.809838

# Como nuestro dataset de viviendas sólo tiene 560 registros, la partición 80 %–20 % ya deja 448 ejemplos para entrenar. 
# En 5‐fold CV, cada entrenamiento ve sólo 358 filas, lo que resulta insuficiente para estabilizar un ensamble de árboles profundos. 
# Por eso, cuando comparo configuraciones en validación cruzada (por ejemplo, (150, 0.3, 1.0, 3) con r2_mean ≈ 0.81), al evaluar en el test final (112 filas)
#   puede caer a ~0.70 según la semilla y la combinación de hiperparámetros. 
# Esa variación es esperable: con tan pocos datos, pequeñas diferencias en el muestreo afectan mucho el ajuste. 
# Si tuviéramos 5 000 filas, un 20 % de test nos dejaría 4 000 para entrenar y cada pliegue tendría 3 200, con lo que la métrica sería más estable. 
# Pero con 560 filas, incluso cambiar max_depth de 3 a 5 o bajar la tasa de aprendizaje altera fuertemente el patrón de residuos que corrige cada árbol. 
# Por eso, al presentar los resultados debo advertir que la variabilidad se debe principalmente al tamaño reducido del dataset y al alto poder de los árboles frente a pocos ejemplos.


best_house = SequentialEnsembleRegressor(
    estimator=DecisionTreeRegressor(),
    n_estimators=150,
    lr=0.3,
    sample_size=0.8,
    random_state=42,
    max_depth=3
)
best_house.fit(Xh_train, yh_train)
yh_test_pred = best_house.predict(Xh_test)
print("House Test R²:", r2_score(yh_test, yh_test_pred))
print("House Test MAE:", mean_absolute_error(yh_test, yh_test_pred))


House Test R²: 0.7191232968866706
House Test MAE: 29074.149712968017


Busqueda de hiperparametros para parkinson

In [47]:
param_grid = {
    'n_estimators': [ 10, 50, 100],
    'lr': [0.01, 0.1, 0.2],
    'sample_size': [0.6, 0.8, 1.0],
    'max_depth': [3, 5, 7]
}
cv = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search_seq_ensemble(Xp_train, yp_train, param_grid, cv)

Top 10 combos:
    n_estimators   lr  sample_size  max_depth   r2_mean
0           100  0.1          1.0          7  0.930023
1           100  0.2          1.0          7  0.925988
2            50  0.2          1.0          7  0.924238
3           100  0.1          0.8          7  0.922532
4            50  0.1          1.0          7  0.920539
5           100  0.2          0.8          7  0.918401
6            50  0.2          0.8          7  0.916307
7            50  0.1          0.8          7  0.914560
8           100  0.1          0.6          7  0.904985
9           100  0.2          1.0          5  0.899191


In [48]:
# Top 10 combos:
#     n_estimators   lr  sample_size  max_depth   r2_mean
# 0           100  0.1          1.0          7  0.930023
best_park = SequentialEnsembleRegressor(
    estimator=DecisionTreeRegressor(),
    n_estimators=100,
    lr=0.1,
    sample_size=1,
    random_state=42,
    max_depth=7
)
best_park.fit(Xp_train, yp_train)
yp_test_pred = best_park.predict(Xp_test)
print("Parkinsons Test R²:", r2_score(yp_test, yp_test_pred))
print("Parkinsons Test MAE:", mean_absolute_error(yp_test, yp_test_pred))

Parkinsons Test R²: 0.936545781009529
Parkinsons Test MAE: 1.615012580924182


Ahora con EarlyStopping

In [83]:
class SequentialEnsembleRegressorEarlyStopping(BaseEstimator, RegressorMixin):
    def __init__(self, estimator=None, n_estimators=50, sample_size=0.8, lr=0.1, random_state=None, patience=10, max_depth=None):
        self.estimator = estimator
        self.n_estimators = n_estimators
        self.sample_size = sample_size
        self.lr = lr
        self.random_state = random_state
        self.patience = patience
        self.max_depth = max_depth

    def fit(self, X, y, X_val=None, y_val=None):
        # Inicializar lista de modelos
        self.models = []
        # Guardar predicción inicial como media de y
        self.init_prediction_ = np.mean(y)
        n_samples = X.shape[0]
        # Predicciones actuales (comenzamos con ini_prediction_)
        pred = np.full(shape=n_samples, fill_value=self.init_prediction_, dtype=float)
        # Generador aleatorio
        rng = check_random_state(self.random_state)

        best_score = -np.inf
        best_iter = 0
        patience_counter = 0

        # Iterar sobre el número máximo de estimadores
        for m in range(self.n_estimators):
            # Calcular residuos
            resid = y - pred

            # Muestreo sin reemplazo
            k = int(self.sample_size * n_samples)
            idx = rng.choice(n_samples, k, replace=False)
            X_sub = X.iloc[idx] if hasattr(X, "iloc") else X[idx]
            y_sub = resid[idx] if not hasattr(resid, "iloc") else resid.iloc[idx]

            # Crear y configurar modelo
            model = clone(self.estimator)
            if hasattr(model, "random_state"):
                model.set_params(random_state=self.random_state)
            if self.max_depth is not None and hasattr(model, "max_depth"):
                model.set_params(max_depth=self.max_depth)

            # Entrenar
            model.fit(X_sub, y_sub)
            self.models.append(model)

            # Actualizar predicciones sobre set de entrenamiento
            pred += self.lr * model.predict(X)

            # Early stopping si se proporciona validación
            if X_val is not None and y_val is not None:
                # Calcular predicción en validación
                val_pred = np.full(shape=y_val.shape[0], fill_value=self.init_prediction_, dtype=float)
                for mm in self.models:
                    val_pred += self.lr * mm.predict(X_val)
                score = r2_score(y_val, val_pred)
                if score > best_score:
                    best_score = score
                    best_iter = m
                    patience_counter = 0
                else:
                    patience_counter += 1
                    if patience_counter >= self.patience:
                        # Cortar modelos hasta la mejor iteración
                        self.models = self.models[:best_iter+1]
                        break

        return self

    def predict(self, X):
        # Predecir sumando la predicción inicial y contribución de cada modelo
        pred = np.full(shape=X.shape[0], fill_value=self.init_prediction_, dtype=float)
        for m in self.models:
            pred += self.lr * m.predict(X)
        return pred


In [58]:
es = SequentialEnsembleRegressorEarlyStopping(
    estimator=DecisionTreeRegressor(),
    n_estimators=150,
    sample_size=1,
    lr=0.3,
    random_state=42,
    patience=10,
    max_depth=3
)

es.fit(X_house, y_house, X_val=Xh_test, y_val=yh_test)

yh_es = es.predict(Xh_test)
print("R² con early stopping:", r2_score(yh_test, yh_es))
print("MAE con early stopping:", mean_absolute_error(yh_test, yh_es))

R² con early stopping: 0.9936241094921748
MAE con early stopping: 5443.094420934293


In [None]:
es_p = SequentialEnsembleRegressorEarlyStopping(
    estimator=DecisionTreeRegressor(),
    n_estimators=100,
    sample_size=1,
    lr=0.1,
    random_state=42,
    patience=10,
    max_depth=7
)
es_p.fit(X_parkinson, y_parkinson, X_val=Xp_test, y_val=yp_test)
yp_es = es_p.predict(Xp_test)
print("R² Parkinsons con early stopping:", r2_score(yp_test, yp_es))
print("MAE Parkinsons con early stopping:", mean_absolute_error(yp_test, yp_es))

R² Parkinsons con early stopping: 0.9968557350793672
MAE Parkinsons con early stopping: 0.39197175101948145


================================

Validación cruzada inicial con LinearRegression

In [None]:
base_lr = SequentialEnsembleRegressor(
    estimator=LinearRegression(fit_intercept=True),
    n_estimators=50,
    sample_size=0.8,
    lr=0.1,
    random_state=42
)

scores_h_lr = cross_val_score(base_lr, Xh_train, yh_train, 
                              scoring='r2', cv=cv, n_jobs=-1)
scores_p_lr = cross_val_score(base_lr, Xp_train, yp_train, 
                              scoring='r2', cv=cv, n_jobs=-1)

print("House CV R² (LR):", scores_h_lr, "→ media:", scores_h_lr.mean())
print("Parkinsons CV R² (LR):", scores_p_lr, "→ media:", scores_p_lr.mean())
# Para house funciona un poco mejor el linear regression pues no contempla datos muy complejos, simplemente numero de garajes, metros cuadrados, etc.
# Mientras que en parkinson funciona mejor el decision tree pues hay más variabilidad en los datos y es más complejo de modelar con una regresión lineal.

House CV R² (LR): [0.71189146 0.63451049 0.77140362 0.7845024  0.71603573 0.73816302
 0.78719992 0.8385209  0.42194304 0.64213359] → media: 0.7046304172332045
Parkinsons CV R² (LR): [0.14906191 0.24233909 0.23572654 0.21875165 0.15566507 0.07107035
 0.12334617 0.12858158 0.18330935 0.09996638] → media: 0.16078180902492384


Busqueda manual de mejores hiperparámetros

In [74]:
param_grid = {
    'n_estimators': [10, 50, 100],
    'lr': [0.01, 0.1, 0.2],
    'sample_size': [0.6, 0.8, 1.0],
    'fit_intercept': [True, False]
}

def grid_search(X, y):
    results = []
    for n in param_grid['n_estimators']:
        for lr in param_grid['lr']:
            for s in param_grid['sample_size']:
                for fi in param_grid['fit_intercept']:
                    model = SequentialEnsembleRegressor(
                        estimator=LinearRegression(fit_intercept=fi),
                        n_estimators=n,
                        lr=lr,
                        sample_size=s,
                        random_state=42
                    )
                    scores = cross_val_score(
                        model, X, y, scoring='r2', cv=cv, n_jobs=-1
                    )
                    results.append({
                        'n_estimators': n,
                        'lr': lr,
                        'sample_size': s,
                        'fit_intercept': fi,
                        'r2_mean': scores.mean()
                    })
    return pd.DataFrame(results).sort_values('r2_mean', ascending=False)

df_h_lr = grid_search(Xh_train, yh_train)
df_p_lr = grid_search(Xp_train, yp_train)

print("Top 5 combos House (LR):\n", df_h_lr.head(5))
print("\nTop 5 combos Parkinsons (LR):\n", df_p_lr.head(5))

Top 5 combos House (LR):
     n_estimators   lr  sample_size  fit_intercept   r2_mean
16            10  0.2          1.0           True  0.724462
17            10  0.2          1.0          False  0.709113
28            50  0.1          1.0           True  0.706787
32            50  0.2          0.8           True  0.706090
46           100  0.1          1.0           True  0.705334

Top 5 combos Parkinsons (LR):
     n_estimators   lr  sample_size  fit_intercept   r2_mean
14            10  0.2          0.8           True  0.163951
16            10  0.2          1.0           True  0.163406
12            10  0.2          0.6           True  0.162430
28            50  0.1          1.0           True  0.162171
46           100  0.1          1.0           True  0.161994


Evaluación final del test

In [64]:
#Extraer top-1 de House
best_h = df_h_lr.iloc[0]
best_lr_h = SequentialEnsembleRegressor(
    estimator=LinearRegression(fit_intercept=bool(best_h['fit_intercept'])),
    n_estimators=int(best_h['n_estimators']),
    lr=float(best_h['lr']),
    sample_size=float(best_h['sample_size']),
    random_state=42
)
best_lr_h.fit(Xh_train, yh_train)
yh_lr_pred = best_lr_h.predict(Xh_test)
print("House Test (LR) R²:", r2_score(yh_test, yh_lr_pred),
      "MAE:", mean_absolute_error(yh_test, yh_lr_pred))

House Test (LR) R²: 0.766222639788844 MAE: 25257.28627191641


In [65]:
# Extraer top-1 de Parkinsons
best_p = df_p_lr.iloc[0]
best_lr_p = SequentialEnsembleRegressor(
    estimator=LinearRegression(fit_intercept=bool(best_p['fit_intercept'])),
    n_estimators=int(best_p['n_estimators']),
    lr=float(best_p['lr']),
    sample_size=float(best_p['sample_size']),
    random_state=42
)
best_lr_p.fit(Xp_train, yp_train)
yp_lr_pred = best_lr_p.predict(Xp_test)
print("Parkinsons Test (LR) R²:", r2_score(yp_test, yp_lr_pred),
      "MAE:", mean_absolute_error(yp_test, yp_lr_pred))

Parkinsons Test (LR) R²: 0.14887395829827899 MAE: 8.077265793598144


EarlyStoping

In [72]:
es_lr = SequentialEnsembleRegressorEarlyStopping(
    estimator=LinearRegression(),
    n_estimators=10,
    sample_size=1.0,      
    lr=0.2,
    random_state=42,
    patience=10,          
)
es_lr.fit(Xh_train, yh_train)
yh_es_lr = es_lr.predict(Xh_test)

print("House (LR + early stopping) R²:", r2_score(yh_test, yh_es_lr))
print("House (LR + early stopping) MAE:", mean_absolute_error(yh_test, yh_es_lr))

House (LR + early stopping) R²: 0.766222639788844
House (LR + early stopping) MAE: 25257.28627191641


In [73]:
es_lr_p = SequentialEnsembleRegressorEarlyStopping(
    estimator=LinearRegression(),
    n_estimators=10,
    sample_size=0.8,
    lr=0.2,
    random_state=42,
    patience=10
)
es_lr_p.fit(Xp_train, yp_train)
yp_es_lr = es_lr_p.predict(Xp_test)

print("Parkinsons (LR + early stopping) R²:", r2_score(yp_test, yp_es_lr))
print("Parkinsons (LR + early stopping) MAE:", mean_absolute_error(yp_test, yp_es_lr))

Parkinsons (LR + early stopping) R²: 0.14887395829827899
Parkinsons (LR + early stopping) MAE: 8.077265793598144
