# 12. Scikit-Learn Machine Learning Pipeline

## Required steps
### Pre-process
    - [X] Categorical to Numeric
    - [X] Clean Numeric Data
    - [X] Outlier Detection
    - [X] Data Transformation
    - [X] Data Normalization
    - [X] Dimensionality Reduction
    
### Training steps - models
    - [ ] XGBoost Regression
    - [ ] Lasso Regression
    - [ ] ANN Regression
    - [ ] Random Forest Regression
    - [ ] ElasticNet
    - [ ] Bayesian Ridge Regression
    - [ ] LassoLarsIC Regression
    
### Stacking
    - [ ] Pick one of the training models as the meta-model
    - [ ] Use the rest for training the evaluation set and test on the test set.
    - [ ] Use Meta-model to predict the test set based on the trained models results.
    
### Ensembling [?]

---------------
## Implementation
## Pre-process classes definition

In [1]:
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin

### Convert Categorical columns to Numeric

In [2]:
import json 
import category_encoders as ce
# Encode ordinal and nominal categorical columns as numeric values
class CatToNum(BaseEstimator, TransformerMixin):
    """Convert categorical columns of a DataFrame to numeric columns.

    Ordinal columns are mapped through per-feature lookup tables loaded from a
    JSON file; nominal columns are replaced by the output of a
    ``category_encoders.BinaryEncoder`` fitted on the training data.

    Args:
        dict_address: Path of the JSON file holding, for each ordinal feature,
            a ``{category: number}`` mapping. A ``'NA'`` key, when present,
            gives the code used for missing values.
        continuesVars: Names of the continuous columns (stored, not modified).
        descreteVars_Ordinal: Names of the ordinal categorical columns.
        descreteVars_Nominal: Names of the nominal categorical columns.
    """

    def __init__(self, dict_address, continuesVars, descreteVars_Ordinal, descreteVars_Nominal):
        # Every constructor argument is kept under its own name so that
        # BaseEstimator.get_params() / clone() can read it back.
        self.dict_address = dict_address
        self.continuesVars = continuesVars
        self.descreteVars_Ordinal = descreteVars_Ordinal
        self.descreteVars_Nominal = descreteVars_Nominal

        # The context manager guarantees the file handle is closed.
        with open(dict_address, 'r', encoding='utf-8') as f:
            self.conversion_dict = json.load(f)

    def _nominal_as_str(self, X):
        """Return the nominal columns of X as strings, with NaN shown as 'NULL'."""
        return X[self.descreteVars_Nominal].fillna('NULL').astype(str)

    def fit(self, X, y=None):
        """Fit the binary encoder on the nominal columns of X.

        Args:
            X: DataFrame containing at least the nominal columns.
            y: Ignored; present for scikit-learn pipeline compatibility.

        Returns:
            self
        """
        self.ce_binary = ce.BinaryEncoder()
        self.ce_binary.fit(self._nominal_as_str(X))
        return self

    def transform(self, X):
        """Return a copy of X with ordinal and nominal columns made numeric.

        Args:
            X: DataFrame with the same columns that were passed to ``fit``.

        Returns:
            A new DataFrame. Ordinal columns keep their names and hold the
            mapped codes; nominal columns are removed and replaced by the
            binary-encoded columns appended at the end.
        """
        totalDF = X.copy()

        # Map ordinal categorical data to numerical codes. Ordinal columns
        # without an entry in the conversion dictionary are left unchanged.
        for feature in self.descreteVars_Ordinal:
            mapping = self.conversion_dict.get(feature)
            if mapping is None:
                continue
            encoded = X[feature].map(mapping)
            if 'NA' in mapping:
                # Missing values take the code listed under 'NA'. The loaded
                # dictionary is not mutated, so transform() can be called
                # repeatedly with identical results.
                encoded = encoded.where(X[feature].notna(), mapping['NA'])
            totalDF[feature] = encoded

        # Map nominal categorical data to binary-encoded columns.
        encoded_nominal = self.ce_binary.transform(self._nominal_as_str(X))

        # NOTE(review): the '<feature>_0' columns are dropped as in the
        # original code, presumably because they carry no information in the
        # category_encoders version used here -- confirm for the installed version.
        cols_to_drop = [col for col in encoded_nominal.columns if str(col).endswith('_0')]
        encoded_nominal = encoded_nominal.drop(columns=cols_to_drop)

        # The encoder emits a different number of columns than it receives, so
        # the original nominal columns are removed and the encoded ones joined
        # on the shared row index.
        totalDF = totalDF.drop(columns=self.descreteVars_Nominal).join(encoded_nominal)

        return totalDF


### Clean Numeric Data

In [3]:
class CleanNum(BaseEstimator, TransformerMixin):
    """Impute missing values and drop uninformative columns.

    During ``fit`` the transformer records
      * the columns whose share of NaN values, or whose share of the single
        most frequent value, exceeds ``drop_thresh``;
      * the per-column mean and per-column mode of the training data.

    During ``transform`` missing values are filled with the recorded
    statistic and the recorded columns are removed.

    Args:
        drop_thresh: Ratio (0..1) above which a column is dropped.
    """

    def __init__(self, drop_thresh=0.8):
        self.drop_thresh = drop_thresh

    def fit(self, X, y=None):
        """Learn the columns to drop and the fill values from X.

        Args:
            X: Training DataFrame.
            y: Ignored; present for scikit-learn pipeline compatibility.

        Returns:
            self
        """
        n_rows = len(X)

        # Remove columns with dominance bigger than a threshold
        self.drop_list = []
        for feature in X:
            column = X[feature]
            if n_rows == 0:
                # No rows: no ratio can be computed, keep the column.
                continue
            nan_ratio = column.isnull().sum() / n_rows
            # value_counts() ignores NaN, so this is the share of the most
            # frequent non-missing value.
            counts = column.value_counts()
            max_repeat_ratio = counts.max() / n_rows if len(counts) else 0.0
            if nan_ratio > self.drop_thresh or max_repeat_ratio > self.drop_thresh:
                self.drop_list.append(feature)

        # Per-column statistics (axis=0) used to replace null values.
        self.cols_average = X.mean(axis=0, numeric_only=True)
        modes = X.mode(axis=0)
        # mode() returns one row per tied mode; the first row is used.
        self.cols_mode = modes.iloc[0] if len(modes) else pd.Series(dtype=float)

        return self

    def transform(self, X, measure='mean'):
        """Return a cleaned copy of X.

        Args:
            X: DataFrame with the columns seen during ``fit``.
            measure: ``'mean'`` or ``'mode'`` -- which training statistic is
                used to fill missing values.

        Returns:
            A new DataFrame with missing values filled and the columns
            recorded in ``drop_list`` removed.

        Raises:
            ValueError: If ``measure`` is neither ``'mean'`` nor ``'mode'``.
        """
        if measure == 'mean':
            fill_values = self.cols_average
        elif measure == 'mode':
            fill_values = self.cols_mode
        else:
            raise ValueError("measure must be 'mean' or 'mode', got %r" % (measure,))

        # DataFrame.fillna with a Series fills each column with the value
        # stored under that column's name and returns a new frame, avoiding
        # in-place fills on column selections.
        totalDF = X.fillna(fill_values)

        # Remove columns with dominance bigger than a threshold
        totalDF = totalDF.drop(columns=self.drop_list)

        return totalDF

### Outlier Detection

In [4]:
class OutlierDetection(BaseEstimator, TransformerMixin):
    """Drop the rows flagged as outliers in the training data.

    A row is flagged when, in at least one column, its value lies more than
    three standard deviations away from that column's mean.

    NOTE(review): ``transform`` removes rows from X only. Inside a
    scikit-learn Pipeline the target ``y`` is not filtered accordingly, so X
    and y can end up with different lengths -- confirm the intended usage.
    """

    def __init__(self):
        pass

    @staticmethod
    def find_anomalies(data):
        """Detect outliers in a one-dimensional dataset.

        Args:
            data: One-dimensional array-like of numbers.

        Returns:
            List of integer positions whose value is above ``mean + 3*std`` or
            below ``mean - 3*std`` (population standard deviation).
        """
        values = np.asarray(data)
        if values.size == 0:
            return []

        # Set upper and lower limit to 3 standard deviations
        anomaly_cut_off = values.std() * 3
        data_mean = values.mean()
        lower_limit = data_mean - anomaly_cut_off
        upper_limit = data_mean + anomaly_cut_off

        is_outlier = (values > upper_limit) | (values < lower_limit)
        return np.flatnonzero(is_outlier).tolist()

    def fit(self, X, y=None):
        """Record the index labels of every row that is an outlier in any column.

        Args:
            X: Training DataFrame with numeric columns.
            y: Ignored; present for scikit-learn pipeline compatibility.

        Returns:
            self
        """
        flagged_positions = set()
        for feature in X:
            flagged_positions.update(self.find_anomalies(X[feature].values))

        # find_anomalies reports positions; DataFrame.drop works on index
        # labels, so the positions are translated before being stored.
        positions = np.asarray(sorted(flagged_positions), dtype=int)
        self.outliers_unique = list(X.index[positions])
        return self

    def transform(self, X):
        """Return a copy of X without the rows recorded during ``fit``.

        Labels that are not present in X are ignored.
        """
        return X.drop(index=self.outliers_unique, errors='ignore')

### Data Transformation (Box-Cox transformation)

In [5]:
from scipy.special import boxcox1p

class DataTransformation(BaseEstimator, TransformerMixin):
    """Apply a Box-Cox (1 + x) transformation to the skewed numeric columns.

    ``fit`` selects the numeric columns whose absolute skewness exceeds a
    threshold; ``transform`` applies ``scipy.special.boxcox1p`` to them.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None, thresh=0.75, lam=0.15):
        """Determine which columns of X are skewed.

        Args:
            X: Training DataFrame.
            y: Ignored; present for scikit-learn pipeline compatibility.
            thresh: Columns with ``abs(skew) > thresh`` are transformed.
            lam: Lambda parameter passed to ``boxcox1p`` in ``transform``.

        Returns:
            self
        """
        # Local import: the file only imports boxcox1p from scipy.
        from scipy.stats import skew

        self.lam = lam

        # Check the skew of all numerical features (NaN values are ignored).
        numeric_df = X.select_dtypes(include='number')
        skewness = pd.Series(
            {col: skew(numeric_df[col].dropna()) for col in numeric_df.columns},
            dtype=float,
        ).sort_values(ascending=False)

        # Keep the feature names (the index), ordered from most to least skewed.
        self.skewed_features_list = list(skewness[skewness.abs() > thresh].index)
        return self

    def transform(self, X):
        """Return a copy of X with the skewed columns Box-Cox transformed."""
        totalDF = X.copy()
        for feat in self.skewed_features_list:
            totalDF[feat] = boxcox1p(totalDF[feat], self.lam)
        return totalDF

### Data Normalization
There is no need to define a specific class for data normalization.

### Dimensionality Reduction
I am thinking there is no need for dimensionality reduction at the moment. But I may change my mind later...

----

In [6]:
# Number of cross-validation folds used by rmsle_cv.
n_folds = 5


def rmsle_cv(model, train, y_train):
    """Return the per-fold cross-validated RMSE of ``model``.

    The score is the square root of the mean squared error, so it is a
    root-mean-squared-*log* error only when ``y_train`` is already
    log-transformed.

    Args:
        model: Estimator or pipeline to evaluate.
        train: Feature DataFrame; its ``.values`` array is passed to the model.
        y_train: Target values.

    Returns:
        Array with one RMSE value per fold (``n_folds`` entries).
    """
    # Pass the splitter itself: KFold.get_n_splits() only returns the number
    # of folds, which would discard shuffle=True and random_state=42.
    kf = KFold(n_folds, shuffle=True, random_state=42)
    # NOTE(review): train.values drops the column names; transformers that
    # select columns by name need the DataFrame instead -- confirm with callers.
    mse_scores = -cross_val_score(model, train.values, y_train,
                                  scoring="neg_mean_squared_error", cv=kf)
    return np.sqrt(mse_scores)

------

## Training steps - models

In [7]:
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.model_selection import KFold, cross_val_score, train_test_split

In [8]:
# Load the train and test CSV files.
train_file = 'house-prices-advanced-regression-techniques/train.csv'
train_df = pd.read_csv(train_file)
test_file = 'house-prices-advanced-regression-techniques/test.csv'
test_df = pd.read_csv(test_file)

# Nominal (unordered) categorical columns.
descreteVars_Nominal = ['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LandContour', 'LotConfig', 'Neighborhood',
                        'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
                        'Exterior2nd', 'MasVnrType', 'Foundation', 'Heating', 'CentralAir', 'Electrical',
                        'GarageType', 'MiscFeature', 'SaleType', 'SaleCondition']

# Ordinal (ordered) categorical columns.
descreteVars_Ordinal = ['LotShape', 'Utilities', 'LandSlope', 'ExterQual', 'ExterCond', 'BsmtQual',
                        'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC',
                        'KitchenQual','Functional',  'FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond',
                        'PavedDrive', 'PoolQC', 'Fence']

# Target column.
resultVar = ['SalePrice']

# Every remaining column is treated as continuous. The list is built in
# DataFrame column order so it is identical on every run (a plain set
# difference yields an arbitrary order).
_non_continuous = set(descreteVars_Nominal) | set(descreteVars_Ordinal) | set(resultVar)
continuesVars = [col for col in train_df.columns if col not in _non_continuous]

# JSON file with the ordinal category -> number mappings.
dict_address='cat_ord_dict.txt'

X = train_df.drop(resultVar, axis=1)
y = train_df[resultVar]

# Hold out 30% of the rows for testing.
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [9]:
# Lasso regression preceded by the full pre-processing chain, one step per line.
lasso = make_pipeline(
    CatToNum(dict_address, continuesVars, descreteVars_Ordinal, descreteVars_Nominal),
    CleanNum(),
    OutlierDetection(),
    DataTransformation(),
    RobustScaler(),
    Lasso(alpha=0.0005, random_state=1),
)

In [10]:
# train = X
# y_train = y
# score = rmsle_cv(lasso, train, y_train)
# print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))