In [22]:
# In this notebook we build XGBoost and linear regression models using the scikit-learn Pipeline framework.
# Nothing special yet: we simply create a class that selects columns, a class that transforms
# numerical columns, and a class that z-score normalises columns.

In [None]:
# standard libraries
import pandas as pd
import numpy as np
# Transformer objects:
from sklearn.base import TransformerMixin, BaseEstimator, clone
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
import xgboost as xgb

In [2]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

In [3]:
# Load the training and test CSV files, in that order, into two DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/test.csv')
)

In [4]:
# Outlier removal: keep only the rows whose 'GrLivArea' is below 4000.
# Open question carried over from the original note: should this filter
# be a pipeline step instead of a one-off cell?
df_train = df_train[df_train['GrLivArea'].lt(4000)]

In [7]:
# Hold out 30% of the rows for evaluation. Features are every column except
# 'SalePrice'; the target is 'SalePrice'. random_state is fixed so the split
# is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_train.drop(columns=['SalePrice']),
    df_train.loc[:, 'SalePrice'],
    test_size=0.3,
    random_state=0,
)

In [10]:
class SelectColumnsTransformer(BaseEstimator,TransformerMixin):
    """A DataFrame transformer that provides column selection.

    Allows to select columns by name from pandas dataframes in scikit-
    learn pipelines.

    Parameters
    ----------
    columns : list of column labels, default None
        Labels of the columns to keep. ``None`` (the default) is treated
        as an empty selection, which is what the previous ``[]`` default
        produced.
    """

    def __init__(self, columns=None):
        # A ``None`` sentinel replaces the former ``[]`` default: a list
        # literal in the signature is created once and shared by every
        # instance built without arguments, so mutating one instance's
        # ``columns`` would silently change all the others.
        # The value is stored untouched; it is only resolved in transform().
        self.columns = columns

    def transform(self, X, **transform_params):
        """Return a copy of ``X`` restricted to the configured columns.

        Parameters
        ----------
        X : pandas DataFrame
        **transform_params
            Accepted for signature compatibility; unused.

        Returns
        -------
        A copy of ``X.loc[:, columns]``.

        Raises
        ------
        KeyError
            Raised by pandas when a requested label is not a column of ``X``.
        """
        selected = [] if self.columns is None else self.columns
        return X.loc[:, selected].copy()

    def fit(self, X, y=None, **fit_params):
        """Do nothing; column selection has no state to learn.

        Parameters
        ----------
        X : pandas DataFrame
        y : default None
        **fit_params
            Accepted for signature compatibility; unused.

        Returns
        -------
        self
        """
        return self

In [11]:
class base_transformer(BaseEstimator,TransformerMixin):
    '''Stateless transformer that applies a user-supplied function.

    ``func`` is handed to ``DataFrame.apply`` with its default arguments.
    Nothing is learned from the data, so ``fit`` is a no-op.
    '''

    def __init__(self, func):
        self.func = func

    def fit(self, X, y=None, **fit_params):
        '''Nothing to fit; return the transformer itself.'''
        return self

    def transform(self, X, **transform_params):
        '''Wrap ``X`` in a DataFrame, apply ``func`` and return a copy of the result.'''
        as_frame = pd.DataFrame(X)
        mapped = as_frame.apply(self.func)
        return mapped.copy()

In [12]:
class My_Scaler(BaseEstimator,TransformerMixin):
    '''DataFrame transformer that applies normalisation scaling to
    numerical columns.

    ``fit`` stores the per-column mean (``mu_series``) and population
    standard deviation (``sd_series``); ``transform`` returns
    ``(X - mu_series) / sd_series`` column by column.
    '''
    def transform(self, X, **transform_params):
        '''Return the z-scored version of ``X`` as a new DataFrame.

        Parameters
        ----------
        X : pandas DataFrame
            Columns are matched to the fitted statistics by label.
        **transform_params
            Accepted for signature compatibility; unused.

        Raises
        ------
        AttributeError
            If called before ``fit`` (the fitted statistics do not exist yet).
        '''
        # A column that was constant during fit has a standard deviation of
        # exactly 0; dividing by it would produce NaN/inf and break the
        # downstream estimator. Divide such columns by 1 instead, leaving
        # them as (x - mean).
        scale = self.sd_series.replace(0, 1)
        # DataFrame-minus-Series aligns the Series index with the column
        # labels, which is the same alignment the former row-by-row apply
        # performed, without a Python-level loop over every row.
        return (X - self.mu_series) / scale

    def fit(self, X, y=None, **fit_params):
        '''Learn the per-column mean and standard deviation of ``X``.

        Parameters
        ----------
        X : pandas DataFrame of numerical columns
        y : default None, ignored
        **fit_params
            Accepted for signature compatibility; unused.

        Returns
        -------
        self
        '''
        self.mu_series = X.mean()
        # ddof=0 keeps the population standard deviation that np.std computes;
        # pandas' own default would be the sample estimate (ddof=1).
        self.sd_series = X.std(ddof=0)
        return self

In [None]:
# Below are linear regression and XGBoost pipeline examples.
# Note that at this stage we only use two numerical columns, but we already get OK results.
# XGBoost already clearly outperforms the linear regression. Why is that? Something to find out.

In [18]:
# Linear regression pipeline: select two columns, log-transform them,
# z-score them, then fit an ordinary least squares model.
lin_reg_pipeline = Pipeline(
    steps=[
        (
            'selector',
            SelectColumnsTransformer(columns=['GrLivArea', 'OverallQual']),
        ),
        ('log_transform', base_transformer(np.log)),
        ('scaler', My_Scaler()),
        ('lin_reg', LinearRegression()),
    ]
)

In [19]:
# Fit the linear regression pipeline on the training split, then display
# its score on the held-out split (last expression of the cell).
pipe_test = lin_reg_pipeline.fit(X=X_train, y=y_train)
pipe_test.score(X=X_test, y=y_test)

0.6913712339273694

In [20]:
# XGBoost pipeline: same preprocessing as the linear regression pipeline
# (select two columns, log-transform, z-score) with an XGBRegressor on top.
xg_boost_pipeline = Pipeline(
    steps=[
        (
            'selector',
            SelectColumnsTransformer(columns=['GrLivArea', 'OverallQual']),
        ),
        ('log_transform', base_transformer(np.log)),
        ('scaler', My_Scaler()),
        ('xgb_reg', xgb.XGBRegressor()),
    ]
)

In [21]:
# Fit the XGBoost pipeline on the training split, then display its score
# on the held-out split (last expression of the cell).
xgb_test = xg_boost_pipeline.fit(X=X_train, y=y_train)
xgb_test.score(X=X_test, y=y_test)

0.803562830166198

In [24]:
# Predict with the fitted XGBoost pipeline on the test DataFrame. The whole
# frame is passed in; the pipeline's 'selector' step picks the columns it needs.
y_pred = xgb_test.predict(X=df_test)

In [None]:
# Work in progress: transformer that replaces nulls. Mostly for numerical I think, but perhaps can make it more
# general for categorical columns as well. 
class null_replacer(BaseEstimator,TransformerMixin):
    # Removes all columns with a certain ratio of nulls:
    # For numeric columns, we want to replace the numeric value with some numeric_function
    def __init__(self,impute_func):
    
    def transform(self,X,**transform_params):
    
    def fit(self,X,y = None, **fit_params):
        self.num_replacer = X.apply(self.func)