In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.utils import check_random_state
from sklearn.base import BaseEstimator, RegressorMixin, clone
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LinearRegression

In [42]:
house = pd.read_csv('house_prices.csv')
parkinson = pd.read_csv('parkinsons.csv')

In [43]:
# Guardamos los atributos categoricos para pasarlos a numéricos
cat_cols = house.select_dtypes(include=['object']).columns.tolist()
print("Columnas categóricas en house:", cat_cols)

Columnas categóricas en house: ['Condition2', 'LandContour', 'HouseStyle', 'GarageType', 'FireplaceQu', 'Alley', 'MSZoning', 'CentralAir', 'GarageCond', 'Exterior2nd', 'BsmtFinType2', 'GarageQual', 'ExterQual', 'PavedDrive', 'LotShape', 'KitchenQual', 'SaleType', 'BsmtExposure', 'ExterCond', 'BsmtQual', 'MiscFeature', 'PoolQC']


In [44]:
# Codificamos las variables categóricas, ponemos usamos drop_first=True para eliminar la primera categoría de cada variable y evitar la multicolinealidad
house_enc = pd.get_dummies(house, columns=cat_cols, drop_first=True)

Separar features y target en house

In [45]:
# Separamos las variables y la variable objetivo
X_house = house_enc.drop('SalePrice', axis=1)
y_house = house_enc['SalePrice']
print("Dimensiones X_house:", X_house.shape)
print("Dimensiones y_house:", y_house.shape)

Dimensiones X_house: (560, 109)
Dimensiones y_house: (560,)


Para Parkinsons no hay categóricas, solo separamos

In [46]:
X_parkinson = parkinson.drop('total_UPDRS', axis=1)
y_parkinson = parkinson['total_UPDRS']
print("Dimensiones X_parkinson:", X_parkinson.shape)
print("Dimensiones y_parkinson:", y_parkinson.shape)

Dimensiones X_parkinson: (2000, 19)
Dimensiones y_parkinson: (2000,)


Dividimos en datos de entrenamiento y prueba

In [47]:
Xh_train, Xh_test, yh_train, yh_test = train_test_split(
    X_house, y_house, test_size=0.2, random_state=42
)
print("House split:", Xh_train.shape, Xh_test.shape, yh_train.shape, yh_test.shape)

House split: (448, 109) (112, 109) (448,) (112,)


In [48]:
Xp_train, Xp_test, yp_train, yp_test = train_test_split(
    X_parkinson, y_parkinson, test_size=0.2, random_state=42
)
print("Parkinsons split:", Xp_train.shape, Xp_test.shape, yp_train.shape, yp_test.shape)


Parkinsons split: (1600, 19) (400, 19) (1600,) (400,)


In [92]:
class SequentialEnsembleRegressor(BaseEstimator, RegressorMixin):
    def __init__(self,
                 # Estimador a usar
                 estimator=None,
                 # Número de modelos a entrenar
                 n_estimators=50,
                 # Proporción del tamaño de la datos para cada modelo
                 sample_size=0.8,
                 # Tasa de aprendizaje
                 lr=0.1,
                 # Semilla para la aleatoriedad
                 random_state=None,
                 # Parámetros del estimador como max_depth, min_samples_split, etc.
                 max_depth=None):
        self.estimator = estimator
        self.n_estimators = n_estimators
        self.sample_size = sample_size
        self.lr = lr
        self.random_state = random_state
        self.max_depth = max_depth


    def fit(self, X, y):
        #
        self.init_prediction_ = np.mean(y)
        # Lista para almacenar los modelos entrenados
        self.models = []
        # Guardo el número de muestras
        n_samples = X.shape[0]
        # Inicializamos las predicciones a cero
        pred = np.full(shape=n_samples, fill_value=self.init_prediction_)
        # Inicializamos el generador aleatorio
        rng = check_random_state(self.random_state)
        
        for i in range(self.n_estimators):
            #Calculo el residuo actual
            resid = y - pred
            
            # Creo el submodelo con los datos muestreados y los residuos como objetivos
            model = clone(self.estimator)
            if hasattr(model, "random_state"):
                model.set_params(random_state=self.random_state)
            # Si hay estimador_params, los aplico al modelo
            if self.max_depth is not None:
                model.set_params(max_depth=self.max_depth)

            # Tomo una muesta aleatoria de los datos
            k = int(self.sample_size * n_samples)
            # Como no hay reemplazo, replace=False
            idx = rng.choice(n_samples, k, replace=False)
            X_sub = X.iloc[idx]
            y_sub = resid.iloc[idx] if hasattr(resid, "iloc") else resid[idx]

            
            model.fit(X_sub, y_sub)
            # Almaceno el modelo entrenado
            self.models.append(model)
            # Actualizo las predicciones
            pred += self.lr * model.predict(X)
        return self

#Devolvemos la prediccion final sumando uno a uno y multiplicando por la tasa de aprendizaje
    def predict(self, X):
        pred = np.full(shape=X.shape[0], fill_value=self.init_prediction_)
        for m in self.models:
            pred += self.lr * m.predict(X)
        return pred

Evaluar Housing

In [10]:
ens_h = SequentialEnsembleRegressor(
    estimator=DecisionTreeRegressor(),
    n_estimators=50,
    sample_size=0.8,
    lr=0.1,
    random_state=42,
    max_depth=3
)

In [11]:
ens_h.fit(Xh_train, yh_train)
yh_pred = ens_h.predict(Xh_test)
print("House R²:", r2_score(yh_test, yh_pred))
print("House MAE:", mean_absolute_error(yh_test, yh_pred))

House R²: 0.6951221002784407
House MAE: 29233.46451825404


Evaluar parkingson

In [12]:
ens_p = SequentialEnsembleRegressor(
    estimator=DecisionTreeRegressor(),
    n_estimators=50,
    sample_size=0.8,
    lr=0.1,
    random_state=42,
    max_depth=3
)

In [13]:
ens_p.fit(Xp_train, yp_train)
yp_pred = ens_p.predict(Xp_test)
print("Parkinsons R²:", r2_score(yp_test, yp_pred))
print("Parkinsons MAE:", mean_absolute_error(yp_test, yp_pred))

Parkinsons R²: 0.6699741811362365
Parkinsons MAE: 4.825034392514188


Validación cruzada y busqueda de hiperparametros

In [88]:
# Se dividen los datos en 5 particiones, barajamos los datos y usamos la semilla 42 para reproducibilidad
cv = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(
    SequentialEnsembleRegressor(
        # Estimador a usar
        estimator=DecisionTreeRegressor(max_depth=3),
        n_estimators=50, sample_size=0.8, lr=0.1, random_state=42
    ),
    Xh_train, yh_train,
    scoring='r2', cv=cv, n_jobs=-1
    # Se entrena el modelo con 4 de las 5 particiones y se evalua la restante usando R² como métrica
    # Repite el proceso 5 veces, cada vez con una partición diferente como test
    # Pongo n_jobs -1 para usar todos los núcleos disponibles en la CPU
)
print("House CV R²:", scores, "→ Media:", scores.mean())

House CV R²: [0.72356832 0.75864956 0.76608286 0.78143207 0.77203406] → Media: 0.7603533739391801


In [68]:
#Modelo base con hiperparámetros iniciales
base_model = SequentialEnsembleRegressor(
    estimator=DecisionTreeRegressor(max_depth=3),
    n_estimators=50,
    sample_size=0.8,
    lr=0.1,
    random_state=42
)

In [16]:
scores_house = cross_val_score(
    base_model, Xh_train, yh_train,
    scoring='r2', cv=cv, n_jobs=-1
)
print("House CV R² scores:", scores_house)
print("Media R²:", scores_house.mean())
# Si los modelos se parecen entre si el modelo es estable

House CV R² scores: [0.72353283 0.77689812 0.77236508 0.78746294 0.77894553]
Media R²: 0.7678408998095679


Parkinson

In [17]:
scores_parkinson = cross_val_score(
    base_model, Xp_train, yp_train,
    scoring='r2', cv=cv, n_jobs=-1
)
print("Parkinsons CV R² scores:", scores_parkinson)
print("Media R²:", scores_parkinson.mean())

Parkinsons CV R² scores: [0.64242144 0.65989829 0.63023465 0.65740071 0.67936077]
Media R²: 0.6538631707613632


Busqueda manual de hiperparámetros

In [None]:
def grid_search_seq_ensemble(X, y, param_grid, cv):
    results = []
    for n in param_grid['n_estimators']:
        for lr in param_grid['lr']:
            for s in param_grid['sample_size']:
                for md in param_grid['max_depth']:
                    model = SequentialEnsembleRegressor(
                        estimator=DecisionTreeRegressor(),
                        n_estimators=n,
                        lr=lr,
                        sample_size=s,
                        random_state=42,
                        max_depth=md
                    )
                    scores = cross_val_score(
                        model, X, y,
                        scoring='r2', cv=cv, n_jobs=-1
                    )
                    results.append({
                        'n_estimators': n,
                        'lr': lr,
                        'sample_size': s,
                        'max_depth': md,
                        'r2_mean': scores.mean()
                    })

    df_results = pd.DataFrame(results)
    df_results = df_results.sort_values('r2_mean', ascending=False).reset_index(drop=True)
    print("Top 10 combos:\n", df_results.head(10))


In [None]:
param_grid = {
    'n_estimators': [10, 50, 100],
    'lr': [0.01, 0.1, 0.2],
    'sample_size': [0.6, 0.8, 1.0],
    'max_depth': [3, 5, 7]
}
cv = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search_seq_ensemble(Xh_train, yh_train, param_grid, cv)

Top 10 combos:
    n_estimators   lr  sample_size  max_depth   r2_mean
0           100  0.2          0.8          3  0.790717
1           100  0.1          0.6          3  0.790011
2           100  0.2          1.0          3  0.788360
3           100  0.1          1.0          3  0.786865
4           100  0.1          1.0          5  0.784163
5            50  0.2          0.8          3  0.783979
6            50  0.2          1.0          3  0.783529
7           100  0.2          1.0          5  0.782545
8            50  0.2          1.0          5  0.780902
9           100  0.1          0.6          7  0.778543


In [None]:
param_grid = {
    'n_estimators': [ 50, 100, 150],
    'lr': [0.01, 0.1, 0.2],
    'sample_size': [0.6, 0.8, 1.0],
    'max_depth': [3, 5, 7]
}
grid_search_seq_ensemble(Xh_train, yh_train, param_grid, cv)
# Cuanto mas número de estimadores mejor, pero también más tiempo de entrenamiento


Top 10 combos:
    n_estimators   lr  sample_size  max_depth   r2_mean
0           150  0.1          0.6          3  0.798998
1           150  0.1          1.0          3  0.792579
2           150  0.2          0.8          3  0.791567
3           100  0.2          0.8          3  0.790717
4           100  0.1          0.6          3  0.790011
5           150  0.2          1.0          3  0.789513
6           100  0.2          1.0          3  0.788360
7           100  0.1          1.0          3  0.786865
8           150  0.1          1.0          5  0.785773
9           100  0.1          1.0          5  0.784163


In [None]:
param_grid = {
    'n_estimators': [ 50, 100, 150],
    'lr': [0.1, 0.2, 0.3],
    'sample_size': [0.6, 0.8, 1.0],
    'max_depth': [3, 5, 7]
}
grid_search_seq_ensemble(Xh_train, yh_train, param_grid, cv)
# Cuanto mayor la tasa de aprendizaje, más rápido aprende el modelo, pero también puede sobreajustar más rápido
# Vamos a probar con una tasa de aprendizaje mas alta

Top 10 combos:
    n_estimators   lr  sample_size  max_depth   r2_mean
0           150  0.3          1.0          3  0.809838
1           100  0.3          1.0          3  0.806977
2            50  0.3          1.0          3  0.802014
3           150  0.1          0.6          3  0.798998
4           150  0.1          1.0          3  0.792579
5           150  0.2          0.8          3  0.791567
6           100  0.2          0.8          3  0.790717
7           100  0.1          0.6          3  0.790011
8           150  0.2          1.0          3  0.789513
9           100  0.2          1.0          3  0.788360


In [None]:
param_grid = {
    'n_estimators': [ 50, 100, 150],
    'lr': [0.1, 0.2, 0.3, 0.4],
    'sample_size': [0.6, 0.8, 1.0],
    'max_depth': [3, 5, 7]
}
grid_search_seq_ensemble(Xh_train, yh_train, param_grid, cv)
#  Con tasa de aprendizaje mayor a 0.3 el modelo empieza a sobreajustar, por lo que es mejor usar una tasa de aprendizaje menor

Top 10 combos:
    n_estimators   lr  sample_size  max_depth   r2_mean
0           150  0.3          1.0          3  0.809838
1           100  0.3          1.0          3  0.806977
2            50  0.3          1.0          3  0.802014
3           150  0.1          0.6          3  0.798998
4           150  0.1          1.0          3  0.792579
5           150  0.2          0.8          3  0.791567
6           100  0.2          0.8          3  0.790717
7           100  0.1          0.6          3  0.790011
8           150  0.2          1.0          3  0.789513
9           100  0.2          1.0          3  0.788360


Evaluación en test con los mejores hiperparámetros

In [54]:
best_house = SequentialEnsembleRegressor(
    estimator=DecisionTreeRegressor(),
    n_estimators=100,
    lr=0.1,
    sample_size=0.6,
    random_state=42,
    max_depth=5
)
best_house.fit(Xh_train, yh_train)
yh_test_pred = best_house.predict(Xh_test)
print("House Test R²:", r2_score(yh_test, yh_test_pred))
print("House Test MAE:", mean_absolute_error(yh_test, yh_test_pred))


House Test R²: 0.6860433508904503
House Test MAE: 28189.542855420998


In [55]:
best_park = SequentialEnsembleRegressor(
    estimator=DecisionTreeRegressor(),
    n_estimators=100,
    lr=0.1,
    sample_size=0.8,
    random_state=42,
    max_depth=7
)
best_park.fit(Xp_train, yp_train)
yp_test_pred = best_park.predict(Xp_test)
print("Parkinsons Test R²:", r2_score(yp_test, yp_test_pred))
print("Parkinsons Test MAE:", mean_absolute_error(yp_test, yp_test_pred))

Parkinsons Test R²: 0.9203707106305425
Parkinsons Test MAE: 1.9512256226341753


In [None]:
class SequentialEnsembleRegressorEarlyStopping(BaseEstimator, RegressorMixin):
    def __init__(self, estimator=None, n_estimators=50, sample_size=0.8, lr=0.1, random_state=None, patience=10, **estimator_params):
        self.estimator = estimator
        self.n_estimators = n_estimators
        self.sample_size = sample_size
        self.lr = lr
        self.random_state = random_state
        self.patience = patience
        self.estimator_params = estimator_params

    def fit(self, X, y, X_val=None, y_val=None):
        self.models = []
        pred = np.zeros_like(y, dtype=float)
        rng = np.random.RandomState(self.random_state)
        n_samples = X.shape[0]
        best_score = -np.inf
        best_iter = 0
        patience_counter = 0
        val_scores = []
        for m in range(self.n_estimators):
            resid = y - pred
            k = int(self.sample_size * n_samples)
            idx = rng.choice(n_samples, k, replace=False)
            X_sub = X.iloc[idx]
            y_sub = resid.iloc[idx] if hasattr(resid, "iloc") else resid[idx]
            model = clone(self.estimator)
            if self.estimator_params:
                model.set_params(**self.estimator_params)
            model.fit(X_sub, y_sub)
            self.models.append(model)
            pred += self.lr * model.predict(X)
            # Early stopping logic
            if X_val is not None and y_val is not None:
                val_pred = self.predict(X_val)
                score = r2_score(y_val, val_pred)
                val_scores.append(score)
                if score > best_score:
                    best_score = score
                    best_iter = m
                    patience_counter = 0
                else:
                    patience_counter += 1
                if patience_counter >= self.patience:
                    # Keep only the best models
                    self.models = self.models[:best_iter+1]
                    break
        return self

    def predict(self, X):
        pred = np.zeros(X.shape[0], dtype=float)
        for m in self.models:
            pred += self.lr * m.predict(X)
        return pred

# Split a validation set from the training data for early stopping
Xh_tr, Xh_val, yh_tr, yh_val = train_test_split(Xh_train, yh_train, test_size=0.2, random_state=42)

es = SequentialEnsembleRegressorEarlyStopping(
    estimator=DecisionTreeRegressor(max_depth=5),
    n_estimators=100,
    sample_size=0.6,
    lr=0.1,
    random_state=42,
    patience=10
)
es.fit(Xh_tr, yh_tr, X_val=Xh_val, y_val=yh_val)

yh_es = es.predict(Xh_test)
print("R² con early stopping:", r2_score(yh_test, yh_es))
print("MAE con early stopping:", mean_absolute_error(yh_test, yh_es))

R² con early stopping: 0.6568492083061743
MAE con early stopping: 29666.000529760913


================================

Validación cruzada inicial con LinearRegression

In [58]:
base_lr = SequentialEnsembleRegressor(
    estimator=LinearRegression(),
    n_estimators=50,
    sample_size=0.8,
    lr=0.1,
    random_state=42
)

scores_h_lr = cross_val_score(base_lr, Xh_train, yh_train, 
                              scoring='r2', cv=cv, n_jobs=-1)
scores_p_lr = cross_val_score(base_lr, Xp_train, yp_train, 
                              scoring='r2', cv=cv, n_jobs=-1)

print("House CV R² (LR):", scores_h_lr, "→ media:", scores_h_lr.mean())
print("Parkinsons CV R² (LR):", scores_p_lr, "→ media:", scores_p_lr.mean())

House CV R² (LR): [0.61091079 0.77326417 0.65353687 0.82132503 0.44813318] → media: 0.661434009385928
Parkinsons CV R² (LR): [0.20771185 0.22932094 0.12436079 0.12846975 0.13593885] → media: 0.16516043553660498


Busqueda manual de mejores hiperparámetros

In [59]:
param_grid = {
    'n_estimators': [10, 50, 100],
    'lr': [0.01, 0.1, 0.2],
    'sample_size': [0.6, 0.8, 1.0],
    'fit_intercept': [True, False]
}

def grid_search(X, y):
    results = []
    for n in param_grid['n_estimators']:
        for lr in param_grid['lr']:
            for s in param_grid['sample_size']:
                for fi in param_grid['fit_intercept']:
                    model = SequentialEnsembleRegressor(
                        estimator=LinearRegression(),
                        n_estimators=n,
                        lr=lr,
                        sample_size=s,
                        random_state=42,
                        fit_intercept=fi
                    )
                    scores = cross_val_score(
                        model, X, y, scoring='r2', cv=cv, n_jobs=-1
                    )
                    results.append({
                        'n_estimators': n,
                        'lr': lr,
                        'sample_size': s,
                        'fit_intercept': fi,
                        'r2_mean': scores.mean()
                    })
    return pd.DataFrame(results).sort_values('r2_mean', ascending=False)

df_h_lr = grid_search(Xh_train, yh_train)
df_p_lr = grid_search(Xp_train, yp_train)

print("Top 5 combos House (LR):\n", df_h_lr.head(5))
print("\nTop 5 combos Parkinsons (LR):\n", df_p_lr.head(5))

Top 5 combos House (LR):
     n_estimators   lr  sample_size  fit_intercept   r2_mean
29            50  0.1          1.0          False  0.703009
28            50  0.1          1.0           True  0.703009
47           100  0.1          1.0          False  0.701917
46           100  0.1          1.0           True  0.701917
35            50  0.2          1.0          False  0.701913

Top 5 combos Parkinsons (LR):
     n_estimators   lr  sample_size  fit_intercept   r2_mean
33            50  0.2          0.8          False  0.165678
32            50  0.2          0.8           True  0.165678
27            50  0.1          0.8          False  0.165160
26            50  0.1          0.8           True  0.165160
51           100  0.2          0.8          False  0.165142


Evaluación final del test

In [60]:
#Extraer top-1 de House
best_h = df_h_lr.iloc[0]
best_lr_h = SequentialEnsembleRegressor(
    estimator=LinearRegression(),
    n_estimators=int(best_h['n_estimators']),
    lr=float(best_h['lr']),
    sample_size=float(best_h['sample_size']),
    random_state=42,
    fit_intercept=bool(best_h['fit_intercept'])
)
best_lr_h.fit(Xh_train, yh_train)
yh_lr_pred = best_lr_h.predict(Xh_test)
print("House Test (LR) R²:", r2_score(yh_test, yh_lr_pred),
      "MAE:", mean_absolute_error(yh_test, yh_lr_pred))

House Test (LR) R²: 0.7874562047260576 MAE: 25036.264220148114


In [61]:
# Extraer top-1 de Parkinsons
best_p = df_p_lr.iloc[0]
best_lr_p = SequentialEnsembleRegressor(
    estimator=LinearRegression(),
    n_estimators=int(best_p['n_estimators']),
    lr=float(best_p['lr']),
    sample_size=float(best_p['sample_size']),
    random_state=42,
    fit_intercept=bool(best_p['fit_intercept'])
)
best_lr_p.fit(Xp_train, yp_train)
yp_lr_pred = best_lr_p.predict(Xp_test)
print("Parkinsons Test (LR) R²:", r2_score(yp_test, yp_lr_pred),
      "MAE:", mean_absolute_error(yp_test, yp_lr_pred))

Parkinsons Test (LR) R²: 0.10175040386290357 MAE: 8.249848609060109


EarlyStoping

In [62]:
es_lr = SequentialEnsembleRegressorEarlyStopping(
    estimator=LinearRegression(),
    n_estimators=100,
    sample_size=1.0,      # en tu grid buscaste sample_size=1.0
    lr=0.1,
    random_state=42,
    patience=10,          # tolera 10 iteraciones sin mejora
    fit_intercept=True    # o False, según tu mejor combo
)
es_lr.fit(Xh_train, yh_train)
yh_es_lr = es_lr.predict(Xh_test)

print("House (LR + early stopping) R²:", r2_score(yh_test, yh_es_lr))
print("House (LR + early stopping) MAE:", mean_absolute_error(yh_test, yh_es_lr))

House (LR + early stopping) R²: 0.7914746899568399
House (LR + early stopping) MAE: 24716.9139875049


In [63]:
es_lr_p = SequentialEnsembleRegressorEarlyStopping(
    estimator=LinearRegression(),
    n_estimators=100,
    sample_size=0.8,
    lr=0.2,
    random_state=42,
    patience=10,
    fit_intercept=True
)
es_lr_p.fit(Xp_train, yp_train)
yp_es_lr = es_lr_p.predict(Xp_test)

print("Parkinsons (LR + early stopping) R²:", r2_score(yp_test, yp_es_lr))
print("Parkinsons (LR + early stopping) MAE:", mean_absolute_error(yp_test, yp_es_lr))

Parkinsons (LR + early stopping) R²: 0.1430420079518817
Parkinsons (LR + early stopping) MAE: 8.11477793461102
