In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.base import BaseEstimator, RegressorMixin, clone
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LinearRegression

In [35]:
house = pd.read_csv('house_prices.csv')
parkinson = pd.read_csv('parkinsons.csv')

In [36]:
cat_cols = house.select_dtypes(include=['object']).columns.tolist()
print("Columnas categóricas en house:", cat_cols)

Columnas categóricas en house: ['Condition2', 'LandContour', 'HouseStyle', 'GarageType', 'FireplaceQu', 'Alley', 'MSZoning', 'CentralAir', 'GarageCond', 'Exterior2nd', 'BsmtFinType2', 'GarageQual', 'ExterQual', 'PavedDrive', 'LotShape', 'KitchenQual', 'SaleType', 'BsmtExposure', 'ExterCond', 'BsmtQual', 'MiscFeature', 'PoolQC']


In [37]:
house_enc = pd.get_dummies(house, columns=cat_cols, drop_first=True)

#Separar features y target en house

In [38]:
X_house = house_enc.drop('SalePrice', axis=1)
y_house = house_enc['SalePrice']
print("Dimensiones X_house:", X_house.shape)
print("Dimensiones y_house:", y_house.shape)

Dimensiones X_house: (560, 109)
Dimensiones y_house: (560,)


#Para Parkinsons no hay categóricas, solo separamos

In [39]:
X_parkinson = parkinson.drop('total_UPDRS', axis=1)
y_parkinson = parkinson['total_UPDRS']
print("Dimensiones X_parkinson:", X_parkinson.shape)
print("Dimensiones y_parkinson:", y_parkinson.shape)

Dimensiones X_parkinson: (2000, 19)
Dimensiones y_parkinson: (2000,)


#Train test split para los dos

In [40]:
Xh_train, Xh_test, yh_train, yh_test = train_test_split(
    X_house, y_house, test_size=0.2, random_state=42
)
print("House split:", Xh_train.shape, Xh_test.shape, yh_train.shape, yh_test.shape)

House split: (448, 109) (112, 109) (448,) (112,)


In [41]:
Xp_train, Xp_test, yp_train, yp_test = train_test_split(
    X_parkinson, y_parkinson, test_size=0.2, random_state=42
)
print("Parkinsons split:", Xp_train.shape, Xp_test.shape, yp_train.shape, yp_test.shape)


Parkinsons split: (1600, 19) (400, 19) (1600,) (400,)


In [None]:
class SequentialEnsembleRegressor(BaseEstimator, RegressorMixin):
    def __init__(self,
                 estimator=None,
                 n_estimators=50,
                 sample_size=0.8,
                 lr=0.1,
                 random_state=None,
                 **estimator_params):
        self.estimator = estimator
        self.n_estimators = n_estimators
        self.sample_size = sample_size
        self.lr = lr
        self.random_state = random_state
        self.estimator_params = estimator_params

    def fit(self, X, y):
        self.models = []
        pred = np.zeros_like(y, dtype=float)
        rng = np.random.RandomState(self.random_state)
        n_samples = X.shape[0]
        for m in range(self.n_estimators):
            resid = y - pred
            k = int(self.sample_size * n_samples)
            idx = rng.choice(n_samples, k, replace=False)
            X_sub = X.iloc[idx]
            y_sub = resid.iloc[idx] if hasattr(resid, "iloc") else resid[idx]
            model = clone(self.estimator)
            if self.estimator_params:
                model.set_params(**self.estimator_params)
            model.fit(X_sub, y_sub)
            self.models.append(model)
            pred += self.lr * model.predict(X)
        return self

    def predict(self, X):
        pred = np.zeros(X.shape[0], dtype=float)
        for m in self.models:
            pred += self.lr * m.predict(X)
        return pred

Evaluar Housing

In [43]:
ens_h = SequentialEnsembleRegressor(
    estimator=DecisionTreeRegressor(),
    n_estimators=50,
    sample_size=0.8,
    lr=0.1,
    random_state=42,
    max_depth=3
)

In [44]:
ens_h.fit(Xh_train, yh_train)
yh_pred = ens_h.predict(Xh_test)
print("House R²:", r2_score(yh_test, yh_pred))
print("House MAE:", mean_absolute_error(yh_test, yh_pred))

House R²: 0.6760382492932894
House MAE: 28603.289133548788


Evaluar parkingson

In [45]:
ens_p = SequentialEnsembleRegressor(
    estimator=DecisionTreeRegressor(),
    n_estimators=50,
    sample_size=0.8,
    lr=0.1,
    random_state=42,
    max_depth=3
)

In [46]:
ens_p.fit(Xp_train, yp_train)
yp_pred = ens_p.predict(Xp_test)
print("Parkinsons R²:", r2_score(yp_test, yp_pred))
print("Parkinsons MAE:", mean_absolute_error(yp_test, yp_pred))

Parkinsons R²: 0.6666546286487727
Parkinsons MAE: 4.844044073631054


Validación cruzada y busqueda de hiperparametros

In [None]:
cv = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(
    SequentialEnsembleRegressor(
        estimator=DecisionTreeRegressor(max_depth=3),
        n_estimators=50, sample_size=0.8, lr=0.1, random_state=42
    ),
    Xh_train, yh_train,
    scoring='r2', cv=cv, n_jobs=-1
)
print("House CV R²:", scores, "→ Media:", scores.mean())

House CV R²: [0.73142608 0.77377505 0.7757943  0.78769204 0.79441317] → Media: 0.7726201277897204


In [48]:
#Modelo base con hiperparámetros iniciales
base_model = SequentialEnsembleRegressor(
    estimator=DecisionTreeRegressor(max_depth=3),
    n_estimators=50,
    sample_size=0.8,
    lr=0.1,
    random_state=42
)

In [49]:
scores_house = cross_val_score(
    base_model, Xh_train, yh_train,
    scoring='r2', cv=cv, n_jobs=-1
)
print("House CV R² scores:", scores_house)
print("Media R²:", scores_house.mean())

House CV R² scores: [0.73193394 0.77791167 0.76817274 0.77689717 0.78820782]
Media R²: 0.7686246652244594


Parkinson

In [50]:
scores_parkinson = cross_val_score(
    base_model, Xp_train, yp_train,
    scoring='r2', cv=cv, n_jobs=-1
)
print("Parkinsons CV R² scores:", scores_parkinson)
print("Media R²:", scores_parkinson.mean())

Parkinsons CV R² scores: [0.63117863 0.6606221  0.58556199 0.65776949 0.6785615 ]
Media R²: 0.6427387400343975


Busqueda manual de hiperparámetros

In [51]:
param_grid = {
    'n_estimators': [10, 50, 100],
    'lr': [0.01, 0.1, 0.2],
    'sample_size': [0.6, 0.8, 1.0],
    'max_depth': [3, 5, 7]
}

results = []
for n in param_grid['n_estimators']:
    for lr in param_grid['lr']:
        for s in param_grid['sample_size']:
            for md in param_grid['max_depth']:
                model = SequentialEnsembleRegressor(
                    estimator=DecisionTreeRegressor(),
                    n_estimators=n,
                    lr=lr,
                    sample_size=s,
                    random_state=42,
                    max_depth=md
                )
                scores = cross_val_score(
                    model, Xh_train, yh_train,
                    scoring='r2', cv=cv, n_jobs=-1
                )
                results.append({
                    'n_estimators': n,
                    'lr': lr,
                    'sample_size': s,
                    'max_depth': md,
                    'r2_mean': scores.mean()
                })

df_results = pd.DataFrame(results)
df_results = df_results.sort_values('r2_mean', ascending=False).reset_index(drop=True)
print("Top 10 combos House:\n", df_results.head(10))


Top 10 combos House:
    n_estimators   lr  sample_size  max_depth   r2_mean
0           100  0.1          0.6          3  0.771539
1           100  0.1          0.8          7  0.766776
2            50  0.1          0.6          3  0.766628
3           100  0.2          0.6          3  0.761672
4           100  0.1          0.8          3  0.761471
5           100  0.1          0.6          5  0.760652
6            50  0.1          0.8          7  0.759123
7            50  0.1          0.6          5  0.757473
8           100  0.1          0.6          7  0.756403
9            50  0.2          0.8          5  0.755908


In [52]:
# Mejor combinación encontrada:
# n_estimators=100
# lr=0.1
# sample_size=0.6
# max_depth=5
# r2_mean≈0.7721
#CV (5-fold) sobre train → media R² ≈ 0.7639

# Observaciones rápidas:
# Aumentar n_estimators a 100 sigue mejorando ligeramente el R².
# Un sample_size más pequeño (0.6) favorece la diversidad de los modelos débiles.
# Profundidades intermedias (max_depth=5) suelen dar mejores resultados que muy superficiales o muy profundas.

In [53]:
results_p = []
for n in param_grid['n_estimators']:
    for lr in param_grid['lr']:
        for s in param_grid['sample_size']:
            for md in param_grid['max_depth']:
                model = SequentialEnsembleRegressor(
                    estimator=DecisionTreeRegressor(),
                    n_estimators=n,
                    lr=lr,
                    sample_size=s,
                    random_state=42,
                    max_depth=md
                )
                scores = cross_val_score(
                    model, Xp_train, yp_train,
                    scoring='r2', cv=cv, n_jobs=-1
                )
                results_p.append({
                    'n_estimators': n,
                    'lr': lr,
                    'sample_size': s,
                    'max_depth': md,
                    'r2_mean': scores.mean()
                })

df_p = pd.DataFrame(results_p)
df_p = df_p.sort_values('r2_mean', ascending=False).reset_index(drop=True)
print("Top 5 combos Parkinsons:\n", df_p.head(5))

Top 5 combos Parkinsons:
    n_estimators   lr  sample_size  max_depth   r2_mean
0            50  0.1          0.8          5  0.917478
1           100  0.1          0.8          7  0.916439
2           100  0.1          0.8          5  0.916409
3            50  0.1          0.8          3  0.916370
4           100  0.1          0.8          3  0.916237


Evaluación en test con los mejores hiperparámetros

In [54]:
best_house = SequentialEnsembleRegressor(
    estimator=DecisionTreeRegressor(),
    n_estimators=100,
    lr=0.1,
    sample_size=0.6,
    random_state=42,
    max_depth=5
)
best_house.fit(Xh_train, yh_train)
yh_test_pred = best_house.predict(Xh_test)
print("House Test R²:", r2_score(yh_test, yh_test_pred))
print("House Test MAE:", mean_absolute_error(yh_test, yh_test_pred))


House Test R²: 0.6860433508904503
House Test MAE: 28189.542855420998


In [55]:
best_park = SequentialEnsembleRegressor(
    estimator=DecisionTreeRegressor(),
    n_estimators=100,
    lr=0.1,
    sample_size=0.8,
    random_state=42,
    max_depth=7
)
best_park.fit(Xp_train, yp_train)
yp_test_pred = best_park.predict(Xp_test)
print("Parkinsons Test R²:", r2_score(yp_test, yp_test_pred))
print("Parkinsons Test MAE:", mean_absolute_error(yp_test, yp_test_pred))

Parkinsons Test R²: 0.9203707106305425
Parkinsons Test MAE: 1.9512256226341753


In [None]:
class SequentialEnsembleRegressorEarlyStopping(BaseEstimator, RegressorMixin):
    def __init__(self, estimator=None, n_estimators=50, sample_size=0.8, lr=0.1, random_state=None, patience=10, **estimator_params):
        self.estimator = estimator
        self.n_estimators = n_estimators
        self.sample_size = sample_size
        self.lr = lr
        self.random_state = random_state
        self.patience = patience
        self.estimator_params = estimator_params

    def fit(self, X, y, X_val=None, y_val=None):
        self.models = []
        pred = np.zeros_like(y, dtype=float)
        rng = np.random.RandomState(self.random_state)
        n_samples = X.shape[0]
        best_score = -np.inf
        best_iter = 0
        patience_counter = 0
        val_scores = []
        for m in range(self.n_estimators):
            resid = y - pred
            k = int(self.sample_size * n_samples)
            idx = rng.choice(n_samples, k, replace=False)
            X_sub = X.iloc[idx]
            y_sub = resid.iloc[idx] if hasattr(resid, "iloc") else resid[idx]
            model = clone(self.estimator)
            if self.estimator_params:
                model.set_params(**self.estimator_params)
            model.fit(X_sub, y_sub)
            self.models.append(model)
            pred += self.lr * model.predict(X)
            # Early stopping logic
            if X_val is not None and y_val is not None:
                val_pred = self.predict(X_val)
                score = r2_score(y_val, val_pred)
                val_scores.append(score)
                if score > best_score:
                    best_score = score
                    best_iter = m
                    patience_counter = 0
                else:
                    patience_counter += 1
                if patience_counter >= self.patience:
                    # Keep only the best models
                    self.models = self.models[:best_iter+1]
                    break
        return self

    def predict(self, X):
        pred = np.zeros(X.shape[0], dtype=float)
        for m in self.models:
            pred += self.lr * m.predict(X)
        return pred

# Split a validation set from the training data for early stopping
Xh_tr, Xh_val, yh_tr, yh_val = train_test_split(Xh_train, yh_train, test_size=0.2, random_state=42)

es = SequentialEnsembleRegressorEarlyStopping(
    estimator=DecisionTreeRegressor(max_depth=5),
    n_estimators=100,
    sample_size=0.6,
    lr=0.1,
    random_state=42,
    patience=10
)
es.fit(Xh_tr, yh_tr, X_val=Xh_val, y_val=yh_val)

yh_es = es.predict(Xh_test)
print("R² con early stopping:", r2_score(yh_test, yh_es))
print("MAE con early stopping:", mean_absolute_error(yh_test, yh_es))

R² con early stopping: 0.6568492083061743
MAE con early stopping: 29666.000529760913


================================

Validación cruzada inicial con LinearRegression

In [58]:
base_lr = SequentialEnsembleRegressor(
    estimator=LinearRegression(),
    n_estimators=50,
    sample_size=0.8,
    lr=0.1,
    random_state=42
)

scores_h_lr = cross_val_score(base_lr, Xh_train, yh_train, 
                              scoring='r2', cv=cv, n_jobs=-1)
scores_p_lr = cross_val_score(base_lr, Xp_train, yp_train, 
                              scoring='r2', cv=cv, n_jobs=-1)

print("House CV R² (LR):", scores_h_lr, "→ media:", scores_h_lr.mean())
print("Parkinsons CV R² (LR):", scores_p_lr, "→ media:", scores_p_lr.mean())

House CV R² (LR): [0.61091079 0.77326417 0.65353687 0.82132503 0.44813318] → media: 0.661434009385928
Parkinsons CV R² (LR): [0.20771185 0.22932094 0.12436079 0.12846975 0.13593885] → media: 0.16516043553660498


Busqueda manual de mejores hiperparámetros

In [59]:
param_grid = {
    'n_estimators': [10, 50, 100],
    'lr': [0.01, 0.1, 0.2],
    'sample_size': [0.6, 0.8, 1.0],
    'fit_intercept': [True, False]
}

def grid_search(X, y):
    results = []
    for n in param_grid['n_estimators']:
        for lr in param_grid['lr']:
            for s in param_grid['sample_size']:
                for fi in param_grid['fit_intercept']:
                    model = SequentialEnsembleRegressor(
                        estimator=LinearRegression(),
                        n_estimators=n,
                        lr=lr,
                        sample_size=s,
                        random_state=42,
                        fit_intercept=fi
                    )
                    scores = cross_val_score(
                        model, X, y, scoring='r2', cv=cv, n_jobs=-1
                    )
                    results.append({
                        'n_estimators': n,
                        'lr': lr,
                        'sample_size': s,
                        'fit_intercept': fi,
                        'r2_mean': scores.mean()
                    })
    return pd.DataFrame(results).sort_values('r2_mean', ascending=False)

df_h_lr = grid_search(Xh_train, yh_train)
df_p_lr = grid_search(Xp_train, yp_train)

print("Top 5 combos House (LR):\n", df_h_lr.head(5))
print("\nTop 5 combos Parkinsons (LR):\n", df_p_lr.head(5))

Top 5 combos House (LR):
     n_estimators   lr  sample_size  fit_intercept   r2_mean
29            50  0.1          1.0          False  0.703009
28            50  0.1          1.0           True  0.703009
47           100  0.1          1.0          False  0.701917
46           100  0.1          1.0           True  0.701917
35            50  0.2          1.0          False  0.701913

Top 5 combos Parkinsons (LR):
     n_estimators   lr  sample_size  fit_intercept   r2_mean
33            50  0.2          0.8          False  0.165678
32            50  0.2          0.8           True  0.165678
27            50  0.1          0.8          False  0.165160
26            50  0.1          0.8           True  0.165160
51           100  0.2          0.8          False  0.165142


Evaluación final del test

In [60]:
#Extraer top-1 de House
best_h = df_h_lr.iloc[0]
best_lr_h = SequentialEnsembleRegressor(
    estimator=LinearRegression(),
    n_estimators=int(best_h['n_estimators']),
    lr=float(best_h['lr']),
    sample_size=float(best_h['sample_size']),
    random_state=42,
    fit_intercept=bool(best_h['fit_intercept'])
)
best_lr_h.fit(Xh_train, yh_train)
yh_lr_pred = best_lr_h.predict(Xh_test)
print("House Test (LR) R²:", r2_score(yh_test, yh_lr_pred),
      "MAE:", mean_absolute_error(yh_test, yh_lr_pred))

House Test (LR) R²: 0.7874562047260576 MAE: 25036.264220148114


In [61]:
# Extraer top-1 de Parkinsons
best_p = df_p_lr.iloc[0]
best_lr_p = SequentialEnsembleRegressor(
    estimator=LinearRegression(),
    n_estimators=int(best_p['n_estimators']),
    lr=float(best_p['lr']),
    sample_size=float(best_p['sample_size']),
    random_state=42,
    fit_intercept=bool(best_p['fit_intercept'])
)
best_lr_p.fit(Xp_train, yp_train)
yp_lr_pred = best_lr_p.predict(Xp_test)
print("Parkinsons Test (LR) R²:", r2_score(yp_test, yp_lr_pred),
      "MAE:", mean_absolute_error(yp_test, yp_lr_pred))

Parkinsons Test (LR) R²: 0.10175040386290357 MAE: 8.249848609060109


EarlyStoping

In [62]:
es_lr = SequentialEnsembleRegressorEarlyStopping(
    estimator=LinearRegression(),
    n_estimators=100,
    sample_size=1.0,      # en tu grid buscaste sample_size=1.0
    lr=0.1,
    random_state=42,
    patience=10,          # tolera 10 iteraciones sin mejora
    fit_intercept=True    # o False, según tu mejor combo
)
es_lr.fit(Xh_train, yh_train)
yh_es_lr = es_lr.predict(Xh_test)

print("House (LR + early stopping) R²:", r2_score(yh_test, yh_es_lr))
print("House (LR + early stopping) MAE:", mean_absolute_error(yh_test, yh_es_lr))

House (LR + early stopping) R²: 0.7914746899568399
House (LR + early stopping) MAE: 24716.9139875049


In [63]:
es_lr_p = SequentialEnsembleRegressorEarlyStopping(
    estimator=LinearRegression(),
    n_estimators=100,
    sample_size=0.8,
    lr=0.2,
    random_state=42,
    patience=10,
    fit_intercept=True
)
es_lr_p.fit(Xp_train, yp_train)
yp_es_lr = es_lr_p.predict(Xp_test)

print("Parkinsons (LR + early stopping) R²:", r2_score(yp_test, yp_es_lr))
print("Parkinsons (LR + early stopping) MAE:", mean_absolute_error(yp_test, yp_es_lr))

Parkinsons (LR + early stopping) R²: 0.1430420079518817
Parkinsons (LR + early stopping) MAE: 8.11477793461102
