In [36]:
import numpy as np
import pandas as pd
from pandas import DataFrame
from pandas.api.types import is_string_dtype

from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.preprocessing import OrdinalEncoder
from sklearn import clone
from sklearn.model_selection import train_test_split
#from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score

from keras.models import Sequential
from keras.layers import Dense, Input
from scikeras.wrappers import KerasRegressor

In [3]:
# DOCUMENTATION CELL
SecuencialEnsembleDoc = '''
    Ensembles the different models to create a meta-model with a much better accuracy (of course at a higher resource demand). It's similar to Gradient Boosting
    This is done through using the same training algorithm (it must be supervised automatic/machine learning, that is, 
    one must know the expected solution beforehand, unlike other techniques such as reinforcement learning) with different data in each iteration, 
    so that the i model tries to compensate the i-1 one flaws (the number of models comes from n_estimators). This could also be achived through 
    using different algorithms over the same data, but that was not the case of this project. Also, it can be done sequentally or in parallel, 
    with this project treating the first case. The meta-model bases the compensation of each model's flaws through their erorr's gradient minimization.ç
    In this case, the error chosen was the cuadratic error. As it can be seen, this is an academic project that has every decision taken, 
    so many more implementations and combinations can be taken into account to develop different ensemble algorithms, with this being one good 
    possible solution.
    The algorithms that train data must be Regressors, since we need continuous predictions for both clasification and regression problems
    ATTRIBUTES:
    trainingModel: the Regressor or algorithm to train the data 
    csv:str
    n_estimators:int
    lr:float
    sample_size:float
    task_type:str
    early_stopping_patience:int
    '''

In [2]:
# De los csv solo deben convertirse las variables categóricas (texto) a numéricas, usando el método que considere más adecuado.
np.random.seed(357823)

In [4]:
'''
# Step 1: Initialize initial predictions (pred_0) 

# Step 2: Iterate through all estimators
    # 1. Compute residuals (error between true values and current predictions)
    remainder_i = y - actual_pred  
    # 2. Train a new model (estimator_i) to predict remainders
    estimator_i.fit(X, remainder_i)  
    # 3. Get predictions from the new model
    pred_i = estimator_i.predict(X)  
    # 4. Update predictions with learning rate (lr)
    actual_pred = actual_pred + lr * pred_i
    # 5. OPTIONAL -> Early Stopping
    obtain a specified % of the 80% of the original data which was used in step 2 (training) and use the leftover to evaluate and the % to ¿re-train?

# Step 3: Return all trained models
'''

'\n# Step 1: Initialize initial predictions (pred_0) \n\n# Step 2: Iterate through all estimators\n    # 1. Compute residuals (error between true values and current predictions)\n    remainder_i = y - actual_pred  \n    # 2. Train a new model (estimator_i) to predict remainders\n    estimator_i.fit(X, remainder_i)  \n    # 3. Get predictions from the new model\n    pred_i = estimator_i.predict(X)  \n    # 4. Update predictions with learning rate (lr)\n    actual_pred = actual_pred + lr * pred_i\n    # 5. OPTIONAL -> Early Stopping\n    obtain a specified % of the 80% of the original data which was used in step 2 (training) and use the leftover to evaluate and the % to ¿re-train?\n\n# Step 3: Return all trained models\n'

In [258]:
def transform_csv(csv:str) -> DataFrame:
    '''
    Objective variable must be at the end! Reads a csv and encodes the string values with an sklearn's OrdinalEncoder, excluding the last column.
    '''
    data = pd.read_csv(csv)
    dicrete_atributes = []
    continous_atributes = []
    discrete_atributes_encoder = OrdinalEncoder()
    for column_name in data.columns:
        if (is_string_dtype(data[column_name])):
            dicrete_atributes.append(column_name)
        else:
            continous_atributes.append(column_name)
    atributes = data.iloc[:, 0:-1]
    discrete_atributes_encoder.fit(atributes[dicrete_atributes])
    atributes[dicrete_atributes] = discrete_atributes_encoder.transform(atributes[dicrete_atributes])
    return (atributes,data.iloc[:, -1])

class SequentialEnsemble(BaseEstimator, RegressorMixin):
    ''' Documentation can be read through help(SequentialEnsemble)'''
    __doc__ = SecuencialEnsembleDoc 
    #task_type: str = "regression"
    def __init__(self, trainingModel,objective,n_estimators:int = 15, lr:float = 0.01, sample_size:float = 0.75, epsilon = 10**(-4), early_stopping_patience: int = 6) -> None:  
        self.models = []
        self.trainingModel = trainingModel  
        self.objective = objective
        self.n_estimators = n_estimators
        self.lr = lr # regulates the importance of each model's prediction
        self.sample_size = sample_size
        self.epsilon = epsilon
        self.early_stopping_patience = early_stopping_patience
        # self.task_type = task_type.lower()

    def fit(self, X, y) -> None: # I pass the attributes to keep the usual format  
        pred_actual = np.mean(y) * np.ones(len(y))
        last_r2 = 0
        for i in range(self.n_estimators):
            # 1. Calculate remainder (gradient of the error (cuadratic here) of all models (pred_actual defines the prediction of all models until i))  
            remainder = y - pred_actual 
            # 0. Random sampling -> Check idx explanations
            n_samples = int(len(X) * self.sample_size)
            idx = np.random.choice(len(X), n_samples, replace=False)
            X_sample = X[idx]
            remainder_sample = remainder[idx]
            # 2. Train base model
            if callable(self.trainingModel):  # A function to create the neural network
                input_shape = X_sample.shape[1]
                model = self.trainingModel(input_shape)  # Create a new network in each iteration
            else:
                model = clone(self.trainingModel) # clone to be able to save each trained model. If self.trainingModel were to be directly trained (using fit) it would keep being constantly overwritten (thus overwriting the models saved in self.models), since we would be modifying the reference to the object, not creating a new one.
            model.fit(X_sample, remainder_sample)
            # 3. Update predictions  
            pred_actual[idx] += self.lr * model.predict(X_sample).flatten()

            # # 5. EARLY STOPPING
            actual_r2 = r2_score(y,pred_actual)
            print(i)
            if(abs(last_r2-actual_r2)<=self.epsilon):
                self.early_stopping_patience -=1
            if (self.early_stopping_patience == 0):
                break
            # 6. Save model  
            self.models.append(model)
            last_r2 = actual_r2

    def predict(self, X) -> float:
        '''
        Returns the prediction of the previously trained (fit) meta-model for an objective variable of a given data file compatible with pandas.
        ''' 
        pred = np.mean(self.objective) * np.ones(len(X))
        
        for model in self.models:
            pred += self.lr * model.predict(X)
        # if self.task_type == "classification":
        #     # Sigmoide to escalate to [0, 1] for classification tasks.
        #     pred = 1 / (1 + np.exp(-pred))  
        #     pred = (pred >= 0.5).astype(int)
        # r2 = r2_score(y, pred)  
        return (pred)#, r2)

In [260]:
#self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(original_data, individual_objective,test_size=test_size, random_state=42)
#self.X_train, self.X_test, self.y_train, self.y_test = self.X_train.values, self.X_test.values, self.y_train.values.ravel(), self.y_test.values.ravel()
#csv/house_prices.csv
original_data, individual_objective = transform_csv("csv/parkinsons.csv")
X_train, X_test, y_train, y_test = train_test_split(original_data, individual_objective,test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = X_train.values, X_test.values, y_train.values.ravel(), y_test.values.ravel() # it is needed to use values or to_numpy() because without it remainder would be a pandas series with unproper indicies.
original_data = original_data.to_numpy()
individual_objective =  individual_objective.to_numpy()

In [261]:

res = cross_val_score(SequentialEnsemble(trainingModel=DecisionTreeRegressor(max_depth=5),objective=individual_objective,lr=0.05, n_estimators=500),original_data,individual_objective,scoring="r2",cv=10, n_jobs=-1)
print(np.mean(res))
print(res)
print("################################")

0.8785073947339809
[0.89891031 0.89862913 0.87416121 0.88223271 0.85119609 0.86562274
 0.88882665 0.88666877 0.87142208 0.86740425]
################################


In [248]:
res = cross_val_score(SequentialEnsemble(trainingModel=BayesianRidge(),objective=individual_objective,lr=0.06, n_estimators=70),original_data,individual_objective,scoring="r2",cv=10)
print(np.mean(res))
print(res)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
0
1
2
3
4
5
6
7
8
9
10
11


In [250]:
res = cross_val_score(SequentialEnsemble(trainingModel=KNeighborsRegressor(n_neighbors=5, weights='distance'),objective=individual_objective,lr=0.06, n_estimators=200),original_data,individual_objective,scoring="r2",cv=10, n_jobs=-1)
print(np.mean(res))
print(res)

0.32216599861615197
[0.43818253 0.41106517 0.19958813 0.38054645 0.28213282 0.23946371
 0.31387679 0.36352024 0.22844963 0.36483452]


In [None]:
res = cross_val_score(SequentialEnsemble(trainingModel=GaussianProcessRegressor(),objective=individual_objective,lr=0.06, n_estimators=70),original_data,individual_objective,scoring="r2",cv=10, n_jobs=-1)
np.mean(res)
print(res)

KeyboardInterrupt: 

In [232]:
ensemble_tree = SequentialEnsemble(
    trainingModel=DecisionTreeRegressor(max_depth=3), # Avoid overadjustment (memorizing the training data) with small trees
    objective=individual_objective,
    n_estimators = 500,
    lr=0.4,
)
ensemble_tree.fit(original_data, individual_objective)
predicciones = ensemble_tree.predict(X=original_data)
print('R2 = ', r2_score(individual_objective,predicciones))

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
R2 =  0.8668889571402048


In [254]:
def create_network(input_shape) -> Sequential:
    model = Sequential([
        Input(shape=(input_shape,)),  # Data input layer
        Dense(4, activation='relu'),
        Dense(1)  # Linear output for regresion
    ])
    model.compile(optimizer='adam', loss='mse')
    return model  

def create_Regressor_network(input_shape) -> KerasRegressor:
# Wrap the network in a Scikit-learn compatible estimator
    return KerasRegressor(
        model = lambda: create_network(input_shape),
        epochs=30,
        batch_size=32,
        verbose=0
    )

ensemble_neural_network = SequentialEnsemble(
    trainingModel=create_Regressor_network,
    objective=individual_objective,
    lr=0.05
)
ensemble_neural_network.fit(original_data,individual_objective)
predicciones = ensemble_neural_network.predict(original_data)
print(r2_score(individual_objective,predicciones))

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
0.03714450584340845


In [31]:
ensemble_Bayes = SequentialEnsemble(
    trainingModel=BayesianRidge(),
    csv="csv/house_prices.csv",
    n_estimators = 90,
    lr=0.05
)
ensemble_Bayes.fit(original_data, individual_objective)
predicciones = ensemble_Bayes.predict(original_data, individual_objective)
print('R2 = ', predicciones[1])

R2 =  0.5726266767888912


In [33]:
ensemble_kNN = SequentialEnsemble(
    trainingModel=KNeighborsRegressor(n_neighbors=5, weights='distance'),
    csv="csv/house_prices.csv",
    n_estimators = 90,
    lr=0.05
)
ensemble_kNN.fit(original_data, individual_objective)
predicciones = ensemble_kNN.predict(original_data, individual_objective)
print('R2 = ', predicciones[1])

R2 =  0.9397858072321665


In [34]:
ensemble_Gaussian = SequentialEnsemble(
    trainingModel=GaussianProcessRegressor(),
    csv="csv/house_prices.csv",
    n_estimators = 90,
    lr=0.05
)
ensemble_Gaussian.fit(original_data, individual_objective)
predicciones = ensemble_Gaussian.predict(original_data, individual_objective)
print('R2 = ', predicciones[1])

R2 =  0.9988619263553329


In [4]:
transform_csv('csv/house_prices.csv')[0]

Unnamed: 0,GarageCars,Condition2,YearBuilt,GarageYrBlt,LandContour,LowQualFinSF,HouseStyle,GarageType,MSSubClass,WoodDeckSF,...,SaleType,MiscVal,BsmtExposure,OpenPorchSF,ExterCond,Fireplaces,FullBath,BsmtQual,MiscFeature,PoolQC
0,2,2.0,1962,1977.0,3.0,0,2.0,5.0,20,0,...,7.0,0,3.0,0,3.0,0,1,3.0,4.0,2.0
1,0,2.0,1914,0.0,3.0,0,4.0,6.0,75,0,...,7.0,0,3.0,291,3.0,1,2,3.0,4.0,2.0
2,2,2.0,1999,1999.0,3.0,0,2.0,1.0,20,0,...,7.0,0,0.0,35,3.0,0,2,2.0,4.0,2.0
3,1,2.0,1948,1948.0,0.0,0,5.0,1.0,20,103,...,7.0,0,3.0,0,1.0,0,3,3.0,4.0,2.0
4,2,2.0,1950,1950.0,3.0,0,2.0,5.0,20,0,...,7.0,0,3.0,29,3.0,0,1,4.0,4.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
555,1,2.0,1959,1959.0,3.0,0,7.0,1.0,80,86,...,7.0,0,0.0,0,1.0,1,1,3.0,4.0,2.0
556,1,2.0,1934,1939.0,1.0,0,5.0,5.0,70,0,...,7.0,0,3.0,0,3.0,1,1,3.0,4.0,2.0
557,2,2.0,1882,1925.0,3.0,0,5.0,1.0,70,0,...,7.0,0,3.0,169,1.0,1,1,3.0,4.0,2.0
558,1,2.0,1953,1953.0,3.0,0,2.0,1.0,20,0,...,7.0,0,3.0,18,3.0,0,1,3.0,4.0,2.0


In [None]:
# TESTING PLAYGROUND
originalData = pd.read_csv('csv/house_prices.csv')
#data.iloc[0]
#data.iloc[0]['GarageCars']
originalData.columns
#data.iloc[0,1]
#key = data.iloc[0,:1]
#key
#is_string_dtype(data[data.columns[4]])
#is_string_dtype(data[data.columns[4]])
#data.columns[0]
originalData["PoolQC"]

0      none
1      none
2      none
3      none
4      none
       ... 
555    none
556    none
557    none
558    none
559    none
Name: PoolQC, Length: 560, dtype: object