# 12. Scikit-Learn Machine Learning Pipeline

## Required steps
### Pre-process
    - [X] Categorical to Numeric
    - [X] Clean Numeric Data
    - [X] Outlier Detection
    - [X] Data Transformation
    - [X] Data Normalization
    - [X] Dimensionality Reduction
    
### Training steps - models
    - [X] Kernel Ridge
    - [X] XGBoost Regression
    - [X] Lasso Regression
    - [X] ANN Regression
    - [X] Random Forest Regression
    - [X] ElasticNet
    - [X] Bayesian Ridge Regression
    - [X] LassoLarsIC Regression
    - [X] Gradient Boosting Regression
    
### Stacking
    - [ ] Picking one of the traning models as Meta-model
    - [ ] Use the rest for training the evaluation set and test on the test set.
    - [ ] Use Meta-model to predict the test set based on the trained models results.
    
### Ensembling [?]

---------------------

Connect to Google Drive - mahdi.shooshtari@gmail.com

In [1]:
from google.colab import drive
drive.mount('/content/drive')
main_folder = '/content/drive/My Drive/Kaggle_House_Prices'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


---------------

## Implementation
## Pre-process classes definition

In [0]:
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin

### Convert Categorical columns to Numeric

In [3]:
! python -m pip install --upgrade pip
! pip install category_encoders
import json 
import category_encoders as ce
# categorical missing value imputer
class CatToNum(BaseEstimator, TransformerMixin):

    def __init__(self, dict_address, continuesVars, descreteVars_Ordinal, descreteVars_Nominal):
        self.continuesVars = continuesVars
        self.descreteVars_Ordinal = descreteVars_Ordinal
        self.descreteVars_Nominal = descreteVars_Nominal
        
        fileName = dict_address
        f = open(fileName,'r')
        self.conversion_dict = json.loads(f.read())
        

    def fit(self, X, y=None):
        # we need the fit statement to accomodate the sklearn pipeline
        des_nom_DF = X[self.descreteVars_Nominal]
        # Map Nominal Categorical data to Numerical
        cat_nom_DF = des_nom_DF.fillna('NULL').astype(str)
        self.ce_binary = ce.BinaryEncoder()
        self.ce_binary.fit(cat_nom_DF)
        return self

    def transform(self, X):
        totalDF = X.copy()
        conDF = X[self.continuesVars]
        des_ord_DF = X[self.descreteVars_Ordinal]
        des_nom_DF = X[self.descreteVars_Nominal]
        
        # Map Ordinal Categorical data to Numerical
        cat_ord_DF_numerical = des_ord_DF.copy()
        for feature in self.conversion_dict:
            temp_dict = self.conversion_dict[feature]
            if ('NA' in temp_dict): # Replace 'NA' with np.nan
                temp_dict[np.nan] = temp_dict.pop('NA')
            cat_ord_DF_numerical[feature] = des_ord_DF[feature].map(temp_dict)
            
        totalDF[self.descreteVars_Ordinal] = cat_ord_DF_numerical
            
        # Map Nominal Categorical data to Numerical
        cat_nom_DF = des_nom_DF.fillna('NULL').astype(str)
        cat_nom_DF_numerical = self.ce_binary.transform(cat_nom_DF)
        totalDF = pd.concat([totalDF, cat_nom_DF_numerical], axis=1)
        totalDF.drop(self.descreteVars_Nominal, axis=1, inplace=True)
        cols_to_drop = [x for x in list(cat_nom_DF_numerical) if ('_0' in x)]
        totalDF.drop(cols_to_drop, axis=1, inplace=True)

        return totalDF


Requirement already up-to-date: pip in /usr/local/lib/python3.6/dist-packages (19.3.1)


### Clean Numeric Data

In [0]:
class CleanNum(BaseEstimator, TransformerMixin):

    def __init__(self, measure='mean', drop_thresh=0.8):
        self.measure = measure
        self.drop_thresh = drop_thresh

    def fit(self, X, y=None):
        train_df = X.copy().astype(float)
        # Remove columns with dominance bigger than a threshold
        self.drop_list = []
        for feature in train_df:
            col_df = train_df[feature]
            count_nan = col_df.isnull().sum()
            nan_ratio = count_nan/len(col_df)   
            repeats = train_df.pivot_table(index=[feature], aggfunc='size').sort_values()
            max_repeat_ratio = repeats.max()/len(col_df)
            if (nan_ratio>self.drop_thresh or max_repeat_ratio>self.drop_thresh):
                self.drop_list.append(feature)

        # Replace null values with average (or mode) of the train columns
        self.cols_average = train_df.mean(axis = 0)
        self.cols_mode = train_df.mode(axis = 0)
        
        return self

    def transform(self, X):
        totalDF = X.copy().astype(float)
        
        # Replace null values with average (or mode) of the train columns
        for col in totalDF:
            if (self.measure=='mean'):
                totalDF[col] = totalDF[col].fillna(self.cols_average[col])
            if (self.measure=='mode'):
                totalDF[col] = totalDF[col].fillna(self.cols_mode[col])
        
        # Remove columns with dominance bigger than a threshold
        totalDF.drop(self.drop_list, axis=1, inplace=True)

        return totalDF

### Outlier Detection

In [0]:
class OutlierDetection(BaseEstimator, TransformerMixin):

    def __init__(self):
        pass

        # Function to Detection Outlier on one-dimentional datasets.
    def find_anomalies(self,data):
        anomalies_idx = []
        # Set upper and lower limit to 3 standard deviation
        data_std = data.std()
        data_mean = data.mean()
        anomaly_cut_off = data_std * 3

        lower_limit  = data_mean - anomaly_cut_off 
        upper_limit = data_mean + anomaly_cut_off
        # Generate outliers
        for idx in range(len(data)):
            outlier = data[idx]
            if outlier > upper_limit or outlier < lower_limit:
                anomalies_idx.append(idx)
        return anomalies_idx

    def fit(self, X, y=None):
        train_df = X.copy()
        outliers_full_list = []
        for feature in train_df:
            outliers_full_list = outliers_full_list + self.find_anomalies(train_df[feature].values)

        self.outliers_unique = list(set(int(outliers_full_list)))
        return self

    def transform(self, X):
        totalDF = X.copy()
        totalDF.drop(self.outliers_unique, inplace=True)
        return totalDF

### Data Transformation (Box-Cox transformation)

In [0]:
from scipy.special import boxcox1p
from scipy.stats import norm, skew

class DataTransformation(BaseEstimator, TransformerMixin):

    def __init__(self):
        pass
    
    def fit(self, X, y=None, thresh=0.75, lam=0.15):
        self.lam = lam
        train_df = X.copy()
        # Check the skew of all numerical features
#         print(type(skew(train_df)))
        skewed_feats = train_df.apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
#         print(type(skewed_feats ))
        skewness = pd.DataFrame({'Skew' :skewed_feats})
        skewness = skewness[abs(skewness) > thresh]
        self.skewed_features_idx_list = skewness.index
        return self
    
    def transform(self, X):
        totalDF = X.copy()
#         skewed_features = totalDF[self.skewed_features_list]
        for feat_idx in self.skewed_features_idx_list:
            #all_data[feat] += 1
            totalDF[feat_idx] = boxcox1p(totalDF[feat_idx], self.lam)
        return totalDF

### Data Normalization
There is no need to define a specific class for data normalization.

### Dimensionality Reduction
I am thinking there is no need for dimensionality reduction at the moment. But I may change my mind later...

----

## Artificial Neural Networks

In [7]:
import keras
from keras.models import Sequential
from keras.layers import Dense

class ANN(BaseEstimator, TransformerMixin):

    def __init__(self, optimizer='adam', loss='mean_squared_error', batch_size=100, nb_epoch=2000):
        self.optimizer = optimizer
        self.loss = loss
        self.batch_size = batch_size
        self.nb_epoch = nb_epoch
    
    def fit(self, X, y=None):
        regressor = Sequential()
        # Adding the input layer and the first hidden layer 
        # Number of nodes in each layer: Average of number of input nodes and ouput nodes
        # In our case, we have 42 input nodes and 2 classes, therfore 1 output node, so the average is 22 or 21.
        (rows, cols) = X.shape
        numberOfInputNodes = cols
        numberOfOutputNodes = 1
        numberOfMiddleNodes = int((numberOfInputNodes + numberOfOutputNodes)/2)
        regressor.add(Dense(input_dim=numberOfInputNodes, output_dim=numberOfMiddleNodes, activation='relu'))
        regressor.add(Dense(output_dim=numberOfMiddleNodes, activation='relu'))
        regressor.add(Dense(output_dim=numberOfMiddleNodes, activation='relu'))
        regressor.add(Dense(output_dim=numberOfOutputNodes, activation='linear'))

        # Compiling the ANN
        regressor.compile(optimizer=self.optimizer, loss=self.loss)
        # Fitting the ANN to the training set
        regressor.fit(X, y, batch_size=self.batch_size, nb_epoch=self.nb_epoch)
        self.regressor = regressor
        
        return self
    
    def transform(self, X):
        pass
    
    def predict(self, X):
      return self.regressor.predict(X)


Using TensorFlow backend.


-----------------------------------

In [0]:
n_folds = 5

def rmsle_cv(model, train, y_train):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(train.values)
    rmse= np.sqrt(-cross_val_score(model, train.values, y_train, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

------

## Training steps - models

In [0]:
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_squared_error

In [0]:
train_file = main_folder+'/house-prices-advanced-regression-techniques/train.csv'
train_df = pd.read_csv(train_file)
test_file = main_folder+'/house-prices-advanced-regression-techniques/test.csv'
test_df = pd.read_csv(test_file)





descreteVars_Nominal = ['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LandContour', 'LotConfig', 'Neighborhood',
                        'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
                        'Exterior2nd', 'MasVnrType', 'Foundation', 'Heating', 'CentralAir', 'Electrical',
                        'GarageType', 'MiscFeature', 'SaleType', 'SaleCondition']

descreteVars_Ordinal = ['LotShape', 'Utilities', 'LandSlope', 'ExterQual', 'ExterCond', 'BsmtQual',
                        'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC',
                        'KitchenQual','Functional',  'FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond',
                        'PavedDrive', 'PoolQC', 'Fence']

resultVar = ['SalePrice']



continuesVars = list(set(list(train_df)) - set(descreteVars_Nominal) - set(descreteVars_Ordinal) - set(resultVar))

dict_address= main_folder+'/cat_ord_dict.txt'


X = train_df.drop(resultVar, axis=1)
y = train_df[resultVar]


X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

In [0]:
models_names = ['KernelRidge', 'ElasticNet', 'Lasso', 'GradientBoostingRegressor', 'BayesianRidge', 'LassoLarsIC', 'RandomForestRegressor', 'XGBRegressor', 'ANN']
models_list = [KernelRidge(), ElasticNet(), Lasso(), GradientBoostingRegressor(), BayesianRidge(), LassoLarsIC(), RandomForestRegressor(), XGBRegressor(), ANN()]


KR_param_grid = {'alpha': 0.1, 'coef0': 100, 'degree': 1, 'gamma': None, 'kernel': 'polynomial'}
EN_param_grid = {'alpha': 0.001, 'copy_X': True, 'l1_ratio': 0.6, 'fit_intercept': True, 'normalize': False, 
                         'precompute': False, 'max_iter': 300, 'tol': 0.001, 'selection': 'random', 'random_state': None}
LASS_param_grid = {'alpha': 0.0005, 'copy_X': True, 'fit_intercept': True, 'normalize': False, 'precompute': False, 
                    'max_iter': 300, 'tol': 0.01, 'selection': 'random', 'random_state': None}
GB_param_grid = {'loss': 'huber', 'learning_rate': 0.1, 'n_estimators': 300, 'max_depth': 3, 
                                        'min_samples_split': 0.0025, 'min_samples_leaf': 5}
BR_param_grid = {'n_iter': 200, 'tol': 0.00001, 'alpha_1': 0.00000001, 'alpha_2': 0.000005, 'lambda_1': 0.000005, 
                 'lambda_2': 0.00000001, 'copy_X': True}
LL_param_grid = {'criterion': 'aic', 'normalize': True, 'max_iter': 100, 'copy_X': True, 'precompute': 'auto', 'eps': 0.000001}
RFR_param_grid = {'n_estimators': 50, 'max_features': 'auto', 'max_depth': None, 'min_samples_split': 5, 'min_samples_leaf': 2}
XGB_param_grid = {'max_depth': 3, 'learning_rate': 0.1, 'n_estimators': 300, 'booster': 'gbtree', 'gamma': 0, 'reg_alpha': 0.1,
                  'reg_lambda': 0.7, 'max_delta_step': 0, 'min_child_weight': 1, 'colsample_bytree': 0.5, 'colsample_bylevel': 0.2,
                  'scale_pos_weight': 1}
ANN_param_grid = {'optimizer': 'adam', 'loss': 'mean_squared_error'}
params_grid = [KR_param_grid, EN_param_grid, LASS_param_grid, GB_param_grid, BR_param_grid, LL_param_grid, RFR_param_grid, XGB_param_grid, ANN_param_grid]

In [0]:
def cal_rmse(model, X, y):
    return (np.sqrt(mean_squared_error(y, model.predict(X))))

In [13]:
original_test_file = main_folder+'/house-prices-advanced-regression-techniques/test.csv'
original_test_df = pd.read_csv(original_test_file)
id_col = original_test_df['Id']

for i in range(len(models_list)):
    print(models_names[i])
    model = models_list[i]
    params = params_grid[i]
    print(params)
    pipeline = make_pipeline(CatToNum(dict_address, continuesVars, descreteVars_Ordinal, descreteVars_Nominal), CleanNum(), RobustScaler(), model.set_params(**params))
    pipeline_transformed = make_pipeline(CatToNum(dict_address, continuesVars, descreteVars_Ordinal, descreteVars_Nominal), CleanNum(), DataTransformation(), RobustScaler(), model.set_params(**params))

    pipeline.fit(X_train, y_train)    
    pipeline_transformed.fit(X_train, y_train)    
    
    print('Normal ' + models_names[i])
    print('Train rmse: ')
    print(cal_rmse(pipeline, X_train, y_train))
    print('Validation rmse: ')
    print(cal_rmse(pipeline, X_val, y_val))
    print(models_names[i]+' with transformed skewed columns')
    print('Train rmse: ')
    print(cal_rmse(pipeline_transformed, X_train, y_train))
    print('Validation rmse: ')
    print(cal_rmse(pipeline_transformed, X_val, y_val))
    print('-----------------------------------------------------')
    
    Y_pred = pipeline.predict(test_df)
    result_df = pd.concat([id_col, pd.DataFrame(Y_pred, columns=['SalePrice'])], axis=1)
    result_df.to_csv(main_folder+'/Results/'+models_names[i]+'.csv', index=False)
    
    
    Y_pred_transformed = pipeline_transformed.predict(test_df)
    result_df = pd.concat([id_col, pd.DataFrame(Y_pred_transformed, columns=['SalePrice'])], axis=1)
    result_df.to_csv(main_folder+'/Results/'+models_names[i]+'_transformed.csv', index=False)
    
    


KernelRidge
{'alpha': 0.1, 'coef0': 100, 'degree': 1, 'gamma': None, 'kernel': 'polynomial'}
Normal KernelRidge
Train rmse: 
40477.34158385059
Validation rmse: 
33752.360766792684
KernelRidge with transformed skewed columns
Train rmse: 
33964.609106315176
Validation rmse: 
35648.87117435537
-----------------------------------------------------
ElasticNet
{'alpha': 0.001, 'copy_X': True, 'l1_ratio': 0.6, 'fit_intercept': True, 'normalize': False, 'precompute': False, 'max_iter': 300, 'tol': 0.001, 'selection': 'random', 'random_state': None}


  positive)
  positive)


Normal ElasticNet
Train rmse: 
41328.02318816551
Validation rmse: 
34811.95195096295
ElasticNet with transformed skewed columns
Train rmse: 
33899.324921881314
Validation rmse: 
35852.45000784021
-----------------------------------------------------
Lasso
{'alpha': 0.0005, 'copy_X': True, 'fit_intercept': True, 'normalize': False, 'precompute': False, 'max_iter': 300, 'tol': 0.01, 'selection': 'random', 'random_state': None}


  positive)
  positive)


Normal Lasso
Train rmse: 
41398.80631556008
Validation rmse: 
34913.60269676187
Lasso with transformed skewed columns
Train rmse: 
33898.31723469382
Validation rmse: 
35891.34499955318
-----------------------------------------------------
GradientBoostingRegressor
{'loss': 'huber', 'learning_rate': 0.1, 'n_estimators': 300, 'max_depth': 3, 'min_samples_split': 0.0025, 'min_samples_leaf': 5}


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Normal GradientBoostingRegressor
Train rmse: 
30253.733978778513
Validation rmse: 
33468.52294593436
GradientBoostingRegressor with transformed skewed columns
Train rmse: 
12473.763555057947
Validation rmse: 
25344.74719666767
-----------------------------------------------------
BayesianRidge
{'n_iter': 200, 'tol': 1e-05, 'alpha_1': 1e-08, 'alpha_2': 5e-06, 'lambda_1': 5e-06, 'lambda_2': 1e-08, 'copy_X': True}


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Normal BayesianRidge
Train rmse: 
39673.49024147198
Validation rmse: 
32803.3410605044
BayesianRidge with transformed skewed columns
Train rmse: 
34136.25917629524
Validation rmse: 
35605.25461046546
-----------------------------------------------------
LassoLarsIC
{'criterion': 'aic', 'normalize': True, 'max_iter': 100, 'copy_X': True, 'precompute': 'auto', 'eps': 1e-06}


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Normal LassoLarsIC
Train rmse: 
39190.286795731176
Validation rmse: 
33867.88059629625
LassoLarsIC with transformed skewed columns
Train rmse: 
36237.789598564945
Validation rmse: 
37736.43770937156
-----------------------------------------------------
RandomForestRegressor
{'n_estimators': 50, 'max_features': 'auto', 'max_depth': None, 'min_samples_split': 5, 'min_samples_leaf': 2}


  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)


Normal RandomForestRegressor
Train rmse: 
33429.609976855696
Validation rmse: 
35574.32166188438
RandomForestRegressor with transformed skewed columns
Train rmse: 
14890.528668429139
Validation rmse: 
26848.738659118437
-----------------------------------------------------
XGBRegressor
{'max_depth': 3, 'learning_rate': 0.1, 'n_estimators': 300, 'booster': 'gbtree', 'gamma': 0, 'reg_alpha': 0.1, 'reg_lambda': 0.7, 'max_delta_step': 0, 'min_child_weight': 1, 'colsample_bytree': 0.5, 'colsample_bylevel': 0.2, 'scale_pos_weight': 1}
Normal XGBRegressor
Train rmse: 
31068.919521136166
Validation rmse: 
35756.92039399908
XGBRegressor with transformed skewed columns
Train rmse: 
10877.737541056262
Validation rmse: 
24250.32936816531
-----------------------------------------------------
ANN
{'optimizer': 'adam', 'loss': 'mean_squared_error'}











Epoch 1/2000





Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Epoch 35/2000
Epoch 36/2000
Epoch 37/2000
Epoch 38/2000
Epoch 39/2000
Epoch 40/2000
Epoch 41/2000
Epoch 42/2000
Epoch 43/2000
Epoch 44/2000
Epoch 45/2000
Epoch 46/2000
Epoch 47/2000
Epoch 48/2000
Epoch 49/2000
Epoch 50/2000
Epoch 51/2000
Epoch 52/2000
Epoch 53/2000
Epoch 54/2000
Epoch 55/2000
Epoch 56/2000
Epoch 57/2000
Epoch 58/2000
Epoch 59/2000
Epoch 60/2000
Epoch 61/2000
Epoch 62/2000
Epoch 63/2000
Epoch 64/2000
Epoch 65/2000
Epoch 66/2000
Epoch 67/2000
Epoch 68/2000
Epoch 69/2000
Epoch 70/2000
Epoch 71/2000
Epoch 72



Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Epoch 35/2000
Epoch 36/2000
Epoch 37/2000
Epoch 38/2000
Epoch 39/2000
Epoch 40/2000
Epoch 41/2000
Epoch 42/2000
Epoch 43/2000
Epoch 44/2000
Epoch 45/2000
Epoch 46/2000
Epoch 47/2000
Epoch 48/2000
Epoch 49/2000
Epoch 50/2000
Epoch 51/2000
Epoch 52/2000
Epoch 53/2000
Epoch 54/2000
Epoch 55/2000
Epoch 56/2000
Epoch 57/2000
Epoch 58/2000
Epoch 59/2000
Epoch 60/2000
Epoch 61/2000
Epoch 62/2000
Epoch 63/2000
Epoch 64/2000
Epoch 65/2000
Epoch 66/2000
Epoch 67/2000
Epoch 68/2000
Epoch 69/2000
Epoch 70/2000
Epoch 71/2000
Epoch 72/2000
E