# ML pipeline

After the data exploration in the "House_prices" notebook, I can streamline my preprocessing and training into an ML pipeline. 
Following the blog post linked below, I write some custom transformers to preprocess the data. 
https://towardsdatascience.com/custom-transformers-and-ml-data-pipelines-with-python-20ea2a7adb65
I then combine these with one-hot encoders, imputation, scalers, etc. to build a pipeline.

Finally, I apply the combined pipeline to the Kaggle test set and make submissions. 

In [156]:
import pandas as pd
import numpy as np 

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer

from category_encoders import OneHotEncoder

# possible models
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer

from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA

# Load the Kaggle training set (first CSV column becomes the index) and split it into X, y
data = pd.read_csv(
    'C:/Users/Jacob/Dropbox/Side Projects/Kaggle/House Prices in Ames, Iowa/house-prices-advanced-regression-techniques/train.csv',
    index_col=0,
)
X = data.drop(columns='SalePrice')

# pop() also removes SalePrice from `data` itself; the target is shaped as a float column for the scaler
y = data.pop('SalePrice').to_numpy(dtype='float').reshape(-1, 1)

# Standardise the target, then flatten it back to a 1-D array for the estimators.
# scaler_y is kept so predictions can be mapped back to prices later on.
# NOTE(review): the scaler is fitted on every row before the train/test split below,
# so the hold-out target statistics leak into training -- confirm this is acceptable.
scaler_y = StandardScaler()
y = scaler_y.fit_transform(y).ravel()

# The data has 80 columns so this stops pandas from suppressing the full output
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [157]:
# Categorical pre-processing
# Custom transformer that fills every NaN in the columns it receives with the string 'None'
# and converts MSSubClass to strings so it can be one hot encoded.
# NOTE: PoolQC, Fence and MiscFeature are not dropped here even though they have many NaNs; they are filled like the rest.
class CategoricalTransformer( BaseEstimator, TransformerMixin ):
    """Prepare categorical columns for one hot encoding.

    Every missing value is replaced by the string 'None' and 'MSSubClass' is
    converted to strings so that it is encoded as a category.
    The transformer has no parameters, so no constructor is needed.
    """

    def fit( self, X, y = None  ):
        """Nothing is learned from the data; return self so the step can be chained."""
        return self

    def transform(self, X , y = None ):
        """Return the cleaned columns of X as a NumPy array.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame holding the categorical columns; must contain 'MSSubClass'.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        numpy.ndarray
            The values of the cleaned frame. X itself is left unmodified.
        """
        # fillna() without inplace returns a new frame, so the caller's data
        # (e.g. X_train or the Kaggle test frame) is never mutated.
        cleaned = X.fillna('None')

        # convert MSSubClass to string so it can be one hot encoded;
        # replacing the whole column avoids writing strings into an integer column in place
        cleaned['MSSubClass'] = cleaned['MSSubClass'].apply(str)

        return cleaned.values

# Columns handled by the categorical pipeline: MSSubClass (a non-object column that is
# treated as a category) followed by every object-dtype column of X.
categorical_features = ['MSSubClass', *X.select_dtypes(include=object)]

# Clean the columns first, then one hot encode them
categorical_pipeline = Pipeline(
    steps=[
        ('cat_transformer', CategoricalTransformer()),
        ('one_hot_encoder', OneHotEncoder()),
    ]
)


In [158]:
# Numerical preprocessing
# Custom transformer that converts to float and drops 'GarageYrBlt','1stFlrSF','TotRmsAbvGrd','GarageCars' due to strong correlation with other features.
# It does not fill NaNs itself: remaining NaNs (e.g. in LotFrontage, MasVnrArea) are filled by the SimpleImputer step below using each column's median.
class NumericalTransformer(BaseEstimator, TransformerMixin):
    """Cast the numerical columns to float64 and drop the redundant ones.

    'GarageYrBlt', '1stFlrSF', 'TotRmsAbvGrd' and 'GarageCars' are removed.
    The transformer has no parameters, so no constructor is needed.
    """

    def fit( self, X, y = None ):
        """Nothing is learned from the data; return self so the step can be chained."""
        return self 

    def transform(self, X, y = None):
        """Return the float-cast, trimmed columns of X as a NumPy array.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame holding the numerical columns, including the four columns
            that are dropped.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        numpy.ndarray
            The values of the remaining columns. X itself is left unmodified.
        """
        # DataFrame.astype has no in-place mode: it returns a new frame, so the
        # result has to be captured (the previous call discarded it or raised,
        # depending on the pandas version).
        as_float = X.astype('float64')

        # drop() without inplace returns a new frame and leaves the caller's data intact
        trimmed = as_float.drop(columns=['GarageYrBlt','1stFlrSF','TotRmsAbvGrd','GarageCars'])

        return trimmed.values

# Columns handled by the numerical pipeline: every non-object column except MSSubClass,
# which is routed to the categorical pipeline instead.
numerical_features = X.select_dtypes(exclude=object).columns.tolist()
numerical_features.remove('MSSubClass')

# Custom clean-up, then median imputation of the remaining NaNs, then standardisation
numerical_pipeline = Pipeline(
    steps=[
        ('num_transformer', NumericalTransformer()),
        ('imputer', SimpleImputer(strategy='median')),
        ('std_scaler', StandardScaler()),
    ]
)

In [159]:
# Combine pipelines: each sub-pipeline only sees the columns listed next to it
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features),
])


In [160]:
# train/test split: hold out 20% of the rows, seeded so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split( X, y , test_size = 0.2 , random_state = 42 )

# The full pipeline: the preprocessor as a step in another pipeline with an estimator as the final step.
# The model is seeded as well so that re-running the notebook reproduces the same score.
full_pipeline = Pipeline( steps = [  ( 'preprocessor', preprocessor),
                                  ( 'model', GradientBoostingRegressor(n_estimators = 750, random_state = 42) ) ] )

# Can call fit on it just like any other pipeline
full_pipeline.fit( X_train, y_train )

# Can predict with it like any other pipeline (predictions are still on the standardised target scale)
y_pred = full_pipeline.predict( X_test ) 

# def kaggle_score(y_true, y_pred):
#     return np.sqrt(mean_squared_error(np.log(y_pred), np.log(y_true)))

def kaggle_score(y_true, y_pred):
    """Root mean squared error between the logs of the true and predicted prices.

    Both inputs are expected on the standardised target scale; they are mapped
    back to prices with the module-level ``scaler_y`` before the log is taken.

    Parameters
    ----------
    y_true : array-like of standardised true targets.
    y_pred : array-like of standardised predictions.

    Returns
    -------
    float
        The RMSE of the log prices.
    """
    # scaler_y was fitted on a single column of shape (n_samples, 1), so give it
    # the same layout instead of a (1, n_samples) row that only works through broadcasting.
    true_prices = scaler_y.inverse_transform(np.asarray(y_true).reshape(-1, 1))
    pred_prices = scaler_y.inverse_transform(np.asarray(y_pred).reshape(-1, 1))
    return np.sqrt(mean_squared_error(np.log(true_prices), np.log(pred_prices)))

# Report the hold-out score for the predictions made above
print('The kaggle score on the test set is:', kaggle_score(y_true=y_test, y_pred=y_pred))

The kaggle score on the test set is: 0.13755252408330151


In [153]:
# GridSearchCV

# Wrap kaggle_score for use in model selection; lower is better for this metric,
# hence greater_is_better=False.
kaggle_scorer = make_scorer(
    kaggle_score,
    greater_is_better=False,
)

# helper function to display CV results.
def print_results(results):
    """Print the best parameters of a fitted search, then one summary line per candidate.

    Each summary line shows the mean test score and twice the standard
    deviation of the test score, both rounded to 3 decimals, followed by the
    candidate's parameters.

    Parameters
    ----------
    results : object exposing ``best_params_`` and ``cv_results_``
        ``cv_results_`` must provide the keys 'mean_test_score',
        'std_test_score' and 'params'.
    """
    print('BEST PARAMS: {}\n'.format(results.best_params_))

    scores = results.cv_results_
    rows = zip(scores['mean_test_score'], scores['std_test_score'], scores['params'])
    for avg, spread, candidate in rows:
        summary = '{} (+/-{}) for {}'.format(round(avg, 3), round(spread * 2, 3), candidate)
        print(summary)
        
# Candidate n_estimators values for the pipeline's 'model' step
parameters = {'model__n_estimators': [1000, 750, 500]}

# 5-fold grid search over the whole pipeline, scored with the Kaggle metric
cv = GridSearchCV(
    estimator=full_pipeline,
    param_grid=parameters,
    scoring=kaggle_scorer,
    cv=5,
)
cv.fit(X_train, y_train)

print_results(cv)


BEST PARAMS: {'model__n_estimators': 800}

-0.13 (+/-0.03) for {'model__n_estimators': 800}
-0.13 (+/-0.032) for {'model__n_estimators': 750}
-0.13 (+/-0.029) for {'model__n_estimators': 700}


In [163]:
### Kaggle test set
X_test_kaggle = pd.read_csv('C:/Users/Jacob/Dropbox/Side Projects/Kaggle/House Prices in Ames, Iowa/house-prices-advanced-regression-techniques/test.csv',index_col=0)
y_pred_kaggle = full_pipeline.predict(X_test_kaggle)

# Map the standardised predictions back to prices. scaler_y was fitted on a single
# (n_samples, 1) column, so it is given the same layout rather than a (1, n_samples) row.
y_pred_kaggle = scaler_y.inverse_transform(np.asarray(y_pred_kaggle).reshape(-1, 1))

# save submission
submission = pd.read_csv('C:/Users/Jacob/Dropbox/Side Projects/Kaggle/House Prices in Ames, Iowa/house-prices-advanced-regression-techniques/sample_submission.csv',index_col=0)

# The predictions are written positionally, so the template has to list the same
# ids in the same order as the test set; fail loudly rather than submit misaligned prices.
assert submission.index.equals(X_test_kaggle.index), 'sample_submission.csv ids do not match test.csv'
submission.loc[:,'SalePrice'] = y_pred_kaggle.ravel()
submission.to_csv('submission3.csv')

# sub 1
# 2778 out of 5405. Just behind halfway. Score of 0.13820
# sub 2
# didn't inverse scale the predictions, so awful score of ~11.
# sub 3
# 2393 out of 5413. Over halfway!!!! 0.13327

## Submissions:
1. Score of 0.13820. Position 2778 out of 5405. 
2. Made an error (the predictions were not inverse-scaled), so the score got way worse (~11).
3. Score of 0.13327. Position 2393 out of 5413. Big jump forwards! :)
