In [None]:
import numpy as np
import pandas as pd
from pandas import DataFrame
from pandas.api.types import is_string_dtype

from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.preprocessing import OrdinalEncoder
from sklearn import clone
#from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers import Dense, Input
from scikeras.wrappers import KerasRegressor

In [4]:
# DOCUMENTATION CELL
SecuencialEnsembleDoc = '''
    Ensembles the different models to create a meta-model with a much better accuracy (of course at a higher resource demand). It's similar to Gradient Boosting
    This is done through using the same training algorithm (it must be supervised automatic/machine learning, that is, 
    one must know the expected solution beforehand, unlike other techniques such as reinforcement learning) with different data in each iteration, 
    so that the i model tries to compensate the i-1 one flaws (the number of models comes from n_estimators). This could also be achived through 
    using different algorithms over the same data, but that was not the case of this project. Also, it can be done sequentally or in parallel, 
    with this project treating the first case. The meta-model bases the compensation of each model's flaws through their erorr's gradient minimization.ç
    In this case, the error chosen was the cuadratic error. As it can be seen, this is an academic project that has every decision taken, 
    so many more implementations and combinations can be taken into account to develop different ensemble algorithms, with this being one good 
    possible solution.
    The algorithms that train data must be Regressors, since we need continuous predictions for both clasification and regression problems
    '''

In [None]:
# De los csv solo deben convertirse las variables categóricas (texto) a numéricas, usando el método que considere más adecuado.
np.random.seed(357823)

In [None]:
tree = DecisionTreeRegressor(max_depth=3)  # Avoid overadjustment (memorizing the training data) with small trees

In [None]:
'''
# Step 1: Initialize initial predictions (pred_0) 

# Step 2: Iterate through all estimators
    # 1. Compute residuals (error between true values and current predictions)
    remainder_i = y - actual_pred  
    # 2. Train a new model (estimator_i) to predict remainders
    estimator_i.fit(X, remainder_i)  
    # 3. Get predictions from the new model
    pred_i = estimator_i.predict(X)  
    # 4. Update predictions with learning rate (lr)
    actual_pred = actual_pred + lr * pred_i
    # 5. OPTIONAL -> Early Stopping
    obtain a specified % of the 80% of the original data which was used in step 2 (training) and use the leftover to evaluate and the % to ¿re-train?

# Step 3: Return all trained models
'''

In [48]:

def transform_csv(csv:str) -> DataFrame:
    '''
    Objective variable must be at the end! Reads a csv and encodes the string values with an sklearn's OrdinalEncoder, excluding the last column.
    '''
    data = pd.read_csv(csv)
    dicrete_atributes = []
    continous_atributes = []
    discrete_atributes_encoder = OrdinalEncoder()
    for column_name in data.columns:
        if (is_string_dtype(data[column_name])):
            dicrete_atributes.append(column_name)
        else:
            continous_atributes.append(column_name)
    atributes = data.iloc[:, 0:-1]
    discrete_atributes_encoder.fit(atributes[dicrete_atributes])
    atributes[dicrete_atributes] = discrete_atributes_encoder.transform(atributes[dicrete_atributes])
    return (atributes,data.iloc[:, -1])

class SequentialEnsemble(BaseEstimator, RegressorMixin):
    __doc__ = SecuencialEnsembleDoc # can be read through help(SecuencialEnsemble)

    def __init__(self, trainingModel, csv:str, n_estimators:int = 15, lr:float = 0.01, sample_size:float = 0.75, task_type: str = "regression") -> None:  
        self.models = []
        self.n_estimators = n_estimators
        self.trainingModel = trainingModel  
        self.lr = lr # regulates the importance of each model's prediction
        self.sample_size = sample_size  
        #self.originalData = transform_csv(csv)[0]
        #self.trainingData = []
        #self.individualObjective = transform_csv(csv)[1]
        self.X_train = None
        self.y_train = None
        self.task_type = task_type.lower()

    def fit(self, X, y) -> None: # I pass the attributes to keep the usual format
        #pred_actual = np.mean(self.individualObjective) * np.ones_like(self.individualObjective)  # Predicción inicial  
        self.X_train = X.values
        self.y_train = y.values.ravel()
        pred_actual = np.mean(self.y_train) * np.ones(len(self.y_train))
        for i in range(self.n_estimators):
            # 1. Calculate remainder (gradient of the error (cuadratic here) of all models (pred_actual defines the prediction of all models until i))  
            #remainder = self.individualObjective - pred_actual
            remainder = self.y_train - pred_actual # it is needed to use values because without it remainder would be a pandar series with unproper indicies.
            # 0. Random sampling  
            #n_samples = int(len(self.originalData) * self.sample_size)
            #idx = np.random.choice(len(self.originalData), n_samples, replace=False)
            #X_sample = self.originalData.iloc[idx]
            #remainder_sample = remainder[idx]
            n_samples = int(len(self.X_train) * self.sample_size)
            idx = np.random.choice(len(self.X_train), n_samples, replace=False)
            X_sample = self.X_train[idx]
            remainder_sample = remainder[idx]
            # 2. Train base model
            if callable(self.trainingModel):  # A function to create the neural network
                input_shape = X_sample.shape[1]
                model = self.trainingModel(input_shape)  # Create a new network in each iteration
            else:
                model = clone(self.trainingModel) # clone to be able to save each trained model. If self.trainingModel were to be directly trained (using fit) it would keep being constantly overwritten (thus overwriting the models saved in self.models), since we would be modifying the reference to the object, not creating a new one.
            model.fit(X_sample, remainder_sample)  
            # 3. Update predictions  
            pred_actual[idx] += self.lr * model.predict(X_sample).flatten()  
            # 5. Save model  
            self.models.append(model)

    def predict(self, X, y) -> float:
        '''
        Returns the prediction of the previously trained (fit) meta-model for an objective variable of a given data file compatible with pandas.
        ''' 
        X_array = X.values
        y_true = y.values
        #pred = np.zeros(len(X))
        pred = np.mean(y) * np.ones(len(X_array))  
        for model in self.models:
            pred += self.lr * model.predict(X_array)
        if self.task_type == "classification":
            # Sigmoide to escalate to [0, 1] for classification tasks.
            pred = 1 / (1 + np.exp(-pred))  
            pred = (pred >= 0.5).astype(int)
        r2 = r2_score(y_true, pred)  
        return (pred, r2)

In [19]:
original_data, individual_objective = transform_csv("csv/house_prices.csv")

# 2. Dividir en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(
    original_data, individual_objective,
    test_size=0.2, random_state=42
)

In [None]:
ensemble_tree = SequentialEnsemble(
    trainingModel=DecisionTreeRegressor(max_depth=5), # # Avoid overadjustment (memorizing the training data) with small trees
    csv="csv/house_prices.csv",
    n_estimators = 1509
)
ensemble_tree.fit(X_train, y_train)
predicciones = ensemble_tree.predict(X_test, y_test)
print('R2 = ', predicciones[1])

R2 =  0.6832876726447409


In [None]:
def create_network(input_shape):
    model = Sequential([
        Input(shape=(input_shape,)),  # Capa de entrada explícita
        Dense(4, activation='relu'),
        Dense(1)  # Salida lineal para regresión
    ])
    model.compile(optimizer='adam', loss='mse')
    return model  

def create_Regressor_network(input_shape) -> KerasRegressor:
# Wrap the network in a Scikit-learn compatible estimator
    return KerasRegressor(
        model = lambda: create_network(input_shape),
        epochs=30,
        batch_size=32,
        verbose=0
    )

ensemble_neural_network = SequentialEnsemble(
    trainingModel=create_Regressor_network,
    csv="csv/house_prices.csv"
)
ensemble_neural_network.fit(X_train, y_train)
predicciones = ensemble_neural_network.predict(X_test, y_test)
print('R2 = ', predicciones[1])

R2 =  0.00026047084289626543


In [140]:
ensemble_Bayes = SequentialEnsemble(
    trainingModel=BayesianRidge(),
    csv="csv/house_prices.csv",
    n_estimators = 150
)
ensemble_Bayes.fit(X_train, y_train)
predicciones = ensemble_Bayes.predict(X_test, y_test)
print('R2 = ', predicciones[1])

R2 =  0.44485507067085095


In [4]:
transform_csv('csv/house_prices.csv')[0]

Unnamed: 0,GarageCars,Condition2,YearBuilt,GarageYrBlt,LandContour,LowQualFinSF,HouseStyle,GarageType,MSSubClass,WoodDeckSF,...,SaleType,MiscVal,BsmtExposure,OpenPorchSF,ExterCond,Fireplaces,FullBath,BsmtQual,MiscFeature,PoolQC
0,2,2.0,1962,1977.0,3.0,0,2.0,5.0,20,0,...,7.0,0,3.0,0,3.0,0,1,3.0,4.0,2.0
1,0,2.0,1914,0.0,3.0,0,4.0,6.0,75,0,...,7.0,0,3.0,291,3.0,1,2,3.0,4.0,2.0
2,2,2.0,1999,1999.0,3.0,0,2.0,1.0,20,0,...,7.0,0,0.0,35,3.0,0,2,2.0,4.0,2.0
3,1,2.0,1948,1948.0,0.0,0,5.0,1.0,20,103,...,7.0,0,3.0,0,1.0,0,3,3.0,4.0,2.0
4,2,2.0,1950,1950.0,3.0,0,2.0,5.0,20,0,...,7.0,0,3.0,29,3.0,0,1,4.0,4.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
555,1,2.0,1959,1959.0,3.0,0,7.0,1.0,80,86,...,7.0,0,0.0,0,1.0,1,1,3.0,4.0,2.0
556,1,2.0,1934,1939.0,1.0,0,5.0,5.0,70,0,...,7.0,0,3.0,0,3.0,1,1,3.0,4.0,2.0
557,2,2.0,1882,1925.0,3.0,0,5.0,1.0,70,0,...,7.0,0,3.0,169,1.0,1,1,3.0,4.0,2.0
558,1,2.0,1953,1953.0,3.0,0,2.0,1.0,20,0,...,7.0,0,3.0,18,3.0,0,1,3.0,4.0,2.0


In [None]:
# TESTING CELL
originalData = pd.read_csv('csv/house_prices.csv')
#data.iloc[0]
#data.iloc[0]['GarageCars']
originalData.columns
#data.iloc[0,1]
#key = data.iloc[0,:1]
#key
#is_string_dtype(data[data.columns[4]])
#is_string_dtype(data[data.columns[4]])
#data.columns[0]
originalData["PoolQC"]

0      none
1      none
2      none
3      none
4      none
       ... 
555    none
556    none
557    none
558    none
559    none
Name: PoolQC, Length: 560, dtype: object