In [1]:
# In this notebook we attempt to build xgboost and linear regression models using the sklearn pipeline framework
# At this point nothing special yet. We simply create a class that selects columns, a class that can transform
# numerical columns, and a class that z-score normalises columns.

In [2]:
# standard libraries
import os
import pandas as pd
import numpy as np
# Transformer objects:
from sklearn.base import TransformerMixin, BaseEstimator, clone
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
import xgboost as xgb

In [3]:
# More pipeline stuff
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.cluster import KMeans

In [4]:
#os.chdir(os.path.dirname(os.getcwd()))
os.getcwd()

'/Users/maxthone/Documents/Personal/Python_Projects/kaggle_housing'

In [5]:
df_train = pd.read_csv('data/input_data/train.csv')
df_test = pd.read_csv('data/input_data/test.csv')

In [6]:
# remove some outliers (shouldn't this be part of the pipeline?)
df_train = df_train.loc[df_train['GrLivArea'] < 4000,:]

In [27]:
# Create a train/test split.
X_train,X_test,y_train,y_test = train_test_split(
    df_train.drop('SalePrice',axis = 1),
    df_train['SalePrice'],
    test_size = 0.3,
    random_state = 0)

In [15]:
class SelectColumnsTransformer(BaseEstimator,TransformerMixin):
    """A DataFrame transformer that provides column selection"""
    
    def __init__(self,columns = []):
        self.columns = columns
    
    def transform(self,X, **transform_params):
        trans = X.loc[:,self.columns].copy()
        return trans
    
    def fit(self, X,y = None, **fit_params):
        return self

In [18]:
class base_transformer(BaseEstimator,TransformerMixin):
    '''basic function to apply transformations, and where no fit is needed'''
    def __init__(self,func):
        self.func = func
    
    def transform(self,X,**transform_params):
        trans = pd.DataFrame(X).apply(self.func).copy()
        return trans
    
    def fit(self, X,y = None, **fit_params):
        return self

In [19]:
class My_Scaler(BaseEstimator,TransformerMixin):
    '''DataFrame transformer that applies normalisation scaling to 
    numerical columns. 
    '''
    def transform(self,X,**transform_params):
        trans = X.apply(lambda x: (x - self.mu_series)/self.sd_series, axis=1).copy()
        return trans
        
    def fit(self, X, y = None, **fit_params):
        self.mu_series = X.apply(lambda x: np.mean(x))
        self.sd_series = X.apply(lambda x: np.std(x))
        return self

In [None]:
# Having defined our  transfomer classes, we can now start building pipelines.
# We start with a basic linear regression and xgboost pipelines. The only thing they do
# is take two numerical variables, take the logarithm of them, and then normalise, before they enter
# the ML algorithm

In [22]:
lin_reg_pipeline = Pipeline([
    ('selector',SelectColumnsTransformer(columns = ['GrLivArea','OverallQual'])),
    ('log_transform',base_transformer(np.log)),
    ('scaler',My_Scaler()),
    ('lin_reg',LinearRegression())])

In [23]:
pipe_test = lin_reg_pipeline.fit(X_train,y_train)
pipe_test.score(X_test,y_test)

  linalg.lstsq(X, y)


0.6913712339273694

In [24]:
xg_boost_pipeline = Pipeline([
    ('selector',SelectColumnsTransformer(columns = ['GrLivArea','OverallQual'])),
    ('log_transform',base_transformer(np.log)),
    ('scaler',My_Scaler()),
    ('xgb_reg',xgb.XGBRegressor())
])

In [25]:
xgb_test = xg_boost_pipeline.fit(X_train,y_train)
xgb_test.score(X_test,y_test)

0.803562830166198

In [None]:
# Already with two variables and not doing any tuning we see that the XGBoost algorithm greatly outperforms the 
# linear regression algorithm, due to it being able to capture more non-linear relationships between the independent
# and dependent variables.

In [None]:
# Next we are going to add in a categorical variable. 
# A logical one to consider is Neighborhoods, since big determinant of the price of a house. 
# What we are going to do: At the end we need a one hot encoder: One hot encoder only takes numpy arrays
# with the shape of (length(vector),1), so we need an array reshaper to do that for us
# We also use a shelf Label Encoder that transforms the category string labels to numerical labels
# For some reason this Label Encoder does not work directly in a pipeline, so we need to create a wrapper around it
# so that it does.

# Finally, Neighborhoods has a pretty high cardinality (25 different neighborhoods). So we can apply a Kmeans algorithm
# that groups the neighborhoods up by sale price data in the training set.

# In summary: from sklearn we use LabelEncoder, OneHotEncoder and KMeans
# We create some transformer classes so that we can implement these classes in our pipelines

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.cluster import KMeans

In [None]:
class array_reshaper(BaseEstimator, TransformerMixin):
    '''reshape transformer to reshape numpy arrays '''
    
    def transform(self, X, ** transform_params):
        X = np.array(X)
        trans = X.reshape([X.shape[0],1])
        return trans
    
    def fit(self, X, y = None, **fit_params):
        return self

In [None]:
class LabelEncoderWrapper(BaseEstimator, TransformerMixin):
    '''wrapper around label encoder, for the reason that label encoder 
    fit_transform does not take in X
    '''
    def fit(self, X, y = None):
        return self
    
    def transform(self,X,y=None):
        enc = LabelEncoder()
        return enc.fit_transform(X)

In [None]:
class cluster_builder(BaseEstimator,TransformerMixin):
    '''
    transformer that returns clusters for a given categorical variable. Useful for
    categorical variables with high cardinality. Clusters are based on y_train.
    '''
    def __init__(self,n_clusters):
        self.clusters = n_clusters
        self.model = KMeans(self.clusters)
    
    def transform(self,X,**transform_params):
        trans = (X.reset_index()
                 .merge(self.df,how = 'left', on = [X.columns.values[0]]).set_index('index'))
        return trans['cluster_grp']
    
    def fit(self,X,y,**fit_params):
        init_df = pd.concat([X,y], axis = 1)
        self.df = init_df.groupby(X.columns.values[0]).agg({y.name:'mean'}).reset_index()
        kmeans_input = np.array(self.df.iloc[:,-1])
        kmeans_input = kmeans_input.reshape([len(kmeans_input),1])
        self.df['cluster_grp'] = self.model.fit_predict(kmeans_input)
        return self

In [None]:
# Having created our new classes, we now can create our pipelines:
# A pipeline for neighborhood where we don't cluster (NB_one_hot_pipeline)
# A pipeline for neighborhood where we DO cluster (NB_cluster_pipeline)
# A pipeline for num variables where we apply log and normalisation transforms (num_pipeline)
# Finally we combine these pipelines, using featureunion

In [151]:
NB_one_hot_pipeline = Pipeline([
    ('selector', SelectColumnsTransformer(columns = ['Neighborhood'])),
    ('label_transform',LabelEncoderWrapper()),
    ('array_reshape',array_reshaper()),
    ('one_hot',OneHotEncoder(sparse = False)) 
])

In [152]:
NB_cluster_pipeline = Pipeline(
    [('selector',SelectColumnsTransformer(columns=['Neighborhood'])),
     ('build_clusters',cluster_builder(6)),
     ('array_reshape',array_reshaper()),
     ('one_hot',OneHotEncoder(sparse=False))])

In [153]:
num_pipeline = Pipeline([
    ('selector',SelectColumnsTransformer(columns = ['GrLivArea','OverallQual'])),
    ('log_transform',base_transformer(np.log)),
    ('scaler',My_Scaler())
])

In [154]:
Final_pipeline_1 = Pipeline([
    ('union' , FeatureUnion(
    transformer_list = [
        ('Neighborhood_one_hot',NB_one_hot_pipeline),
        ('num_pipeline', num_pipeline)])),
    ('xgb_reg',xgb.XGBRegressor())
])

In [155]:
Final_pipeline_2 = Pipeline([
    ('union' , FeatureUnion(
    transformer_list = [
        ('Neighborhood_cluster',NB_cluster_pipeline),
        ('num_pipeline', num_pipeline)])),
    ('xgb_reg',xgb.XGBRegressor())
])

In [157]:
xgb_test_1 = Final_pipeline_1.fit(X_train,y_train)
xgb_test_1.score(X_train,y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.8879328124580523

In [159]:
xgb_test_2 = Final_pipeline_2.fit(X_train,y_train)
xgb_test_2.score(X_test,y_test)

0.813659246061051

In [150]:
# Using neighborhoods without clusters actually leads to a reduction in 
# in accuracy on test data, compard to not using 
# Neigborhoods at all. This is due to overfitting: There are too many variables
# for the algorithm to fit to with only 1019 data points.

# We see that clustering neighborhoods actually works: Accuracy on test data has improved by 
# 1%.

# However bear in mind that we need to have a separate data set for scoring again: (X_val, y_val), 
# because otherwise you tune the hyperparameters (k-means number of clusters in this example, or XGboost params) 
# too much to your test set.

In [None]:
## WORK IN PROGRESS ##

In [None]:
# Work in progress: transformer that replaces nulls. Mostly for numerical I think, but perhaps can make it more
# general for categorical columns as well. 
class null_replacer(BaseEstimator,TransformerMixin):
    # Removes all columns with a certain ratio of nulls:
    # For numeric columns, we want to replace the numeric value with some numeric_function
    def __init__(self,impute_func):
    
    def transform(self,X,**transform_params):
    
    def fit(self,X,y = None, **fit_params):
        self.num_replacer = X.apply(self.func)

In [None]:
# useful wrapper for around ML algorithms that you want to use within the pipeline (instead of as an end point)
# E.G. when you want to reduce the cardinality of categorical variables using KMeans
class ModelTransformer(BaseEstimator, TransformerMixin):
    '''
    wrapper for around Kmeans, to make it return predictions.
    default behaviour of fit_transform is an array of arrays, which is not
    useful for us. 
    '''
    def __init__(self, model):
        self.model = model
    
    def transform(self, X, **transform_params):
        return self.model.predict(X)
    
    def fit(self, *args, **kwargs):
        self.model.fit(*args, **kwargs)
        return self