# House Prices solution with Pipeline 
- Pipeline (make_pipeline)
- Basic Transformers (SimpleImputer, RobustScaler)
- Linear Regression
- Performance Measure - R2

In [1]:
import numpy as np 
import pandas as pd 

from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin

import sklearn.model_selection as model_selection

from sklearn.linear_model import LinearRegression

from sklearn.model_selection import cross_val_score

In [29]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [31]:
num_cols = train.drop(['SalePrice','Id'], axis=1).select_dtypes("number").columns
cat_cols = train.select_dtypes("object").columns

In [32]:
num_cols

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold'],
      dtype='object')

In [33]:
X = train.select_dtypes("number").drop(['SalePrice','Id'], axis=1)
y = train.SalePrice

In [34]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X,y,test_size=0.2, random_state = 200)

In [35]:
#https://ramhiser.com/post/2018-04-16-building-scikit-learn-pipeline-with-pandas-dataframe/

In [36]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        assert isinstance(X, pd.DataFrame)

        try:
            return X[self.columns]
        except KeyError:
            cols_error = list(set(self.columns) - set(X.columns))
            raise KeyError("The DataFrame does not include the columns: %s" % cols_error)

In [37]:
preprocessor = make_column_transformer(
    
    (make_pipeline(SimpleImputer(strategy = 'median'), 
                   RobustScaler()),num_cols),
    
    #(make_pipeline(SimpleImputer(strategy = 'most_frequent'),
                   #OneHotEncoder(categories = 'auto', handle_unknown = 'ignore')),cat_cols)
    
)

In [38]:
linearModel = make_pipeline(preprocessor, LinearRegression())

In [39]:
print(f'Train R2 score is: {cross_val_score(linearModel, X_train, y_train).mean():.4f}')
print(f'Test R2 score is: {cross_val_score(linearModel, X_test, y_test).mean():.4f}')

Train R2 score is: 0.6436
Test R2 score is: 0.5480


In [40]:
linearModel.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('pipeline',
                                                  Pipeline(memory=None,
                                                           steps=[('simpleimputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                             

In [41]:
X_submission = test[num_cols]
X_submission.columns

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold'],
      dtype='object')

In [42]:
prediction = linearModel.predict(X_submission)

In [44]:
#Create Submission dataframe 
submission_df = pd.DataFrame({'Id' : test['Id'], 'SalePrice' : prediction})

submission_df.to_csv('Regression_Pipeline.csv', index = False)