<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Pipeline" data-toc-modified-id="Pipeline-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Pipeline</a></span></li></ul></div>

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline

In [2]:
data = pd.read_csv('Datasets/trainHousePrices.csv')
cat_cols = data.select_dtypes('object').columns
data.drop(columns=cat_cols, inplace=True)
data.dropna(axis = 0, how ='any',inplace=True) 
data.shape

(1121, 38)

In [3]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(data.drop(columns=['SalePrice']), data['SalePrice'], test_size=0.2, random_state=33)
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor
rfe = RFE(estimator=DecisionTreeRegressor(), n_features_to_select=5)
rfe.fit(xtrain, ytrain)
feature_list=[]
for i, col in zip(range(xtrain.shape[1]), xtrain.columns):
    if rfe.ranking_[i] == 1:
        feature_list.append(col)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
class Metrics:
    def evaluate(self, model, features, target):
        ypred = model.predict(features)
        mae = mean_absolute_error(y_true=target, y_pred=ypred)
        mse = mean_squared_error(y_true=target, y_pred=ypred)
        r2 = r2_score(y_true=target, y_pred=ypred)*100
        print(f"MAE :: {mae: .4f}")
        print(f"MSE :: {mse: .4f}")
        print(f"R2 :: {r2: .4f}")
        return [np.round(mae, 4), np.round(mse, 4), np.round(r2, 4)]
evaluator = Metrics()

In [56]:
class OutlierTreatment:
    
    def __init__(self, dff):
        self.dff = dff
    
    def outlier(self):
        Q1 = self.dff.quantile(0.25)
        Q3 = self.dff.quantile(0.75)
        IQR = Q3 - Q1
        lower_limit = Q1 - 1.5*IQR
        upper_limit = Q3 + 1.5*IQR
        return lower_limit, upper_limit
        
    def countoutlier(self, dfx):
        self.lower_limit, self.upper_limit = self.outlier()
        Total_outlier = len(dfx[(self.dff <= self.lower_limit)|(self.dff >= self.upper_limit)])
        return Total_outlier
    
    def cleanoutlier(self, dfx, dfy):
        self.lower_limit, self.upper_limit = self.outlier()
        outliers = dfx[(self.dff <= self.lower_limit)|(self.dff >= self.upper_limit)]
        dfx.drop(outliers.index, inplace=True)
        dfy.drop(outliers.index, inplace=True)
        return dfx, dfy

In [57]:
print(xtrain[feature_list].shape)
print(ytrain.shape)
print("After removing outlier")
# checking total outlier in every feature column
for i in xtrain[feature_list].columns:
    d = OutlierTreatment(xtrain[i])
    if d.countoutlier(xtrain) < 5:
        d.cleanoutlier(xtrain, ytrain)
print(xtrain[feature_list].shape)
print(ytrain.shape)

(896, 5)
(896,)
After removing outlier
(891, 5)
(891,)


In [4]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

In [5]:
xtrain1 = xtrain[feature_list].copy()
xtest1 = xtest[feature_list].copy()

In [6]:
scaler = StandardScaler()
# fit and transform are both applied on training data
xtrain[feature_list] = scaler.fit_transform(xtrain[feature_list])
# only transform is applied on test data as features used to fit training data is applied in test data to perform transform 
xtest[feature_list] = scaler.transform(xtest[feature_list])

In [7]:
Linearmodel1 = LinearRegression()
Linearmodel1.fit(xtrain[feature_list], ytrain)

In [8]:
evaluator = Metrics()
evaluator.evaluate(Linearmodel1, xtrain[feature_list], ytrain)

MAE ::  24587.3747
MSE ::  1391110021.6406
R2 ::  78.4110


[24587.3747, 1391110021.6406, 78.411]

In [9]:
evaluator.evaluate(Linearmodel1, xtest[feature_list], ytest)

MAE ::  28971.7497
MSE ::  3278218368.9155
R2 ::  61.4516


[28971.7497, 3278218368.9155, 61.4516]

## Pipeline

In [12]:
from sklearn.pipeline import Pipeline
pipeline = Pipeline([('ScalerObject', StandardScaler()), ('ModelName', LinearRegression())], verbose=True)
pipeline.fit(xtrain1[feature_list], ytrain)

[Pipeline] ...... (step 1 of 2) Processing ScalerObject, total=   0.0s
[Pipeline] ......... (step 2 of 2) Processing ModelName, total=   0.0s


In [13]:
evaluator.evaluate(model=pipeline, features=xtrain1[feature_list], target=ytrain)

MAE ::  24587.3747
MSE ::  1391110021.6406
R2 ::  78.4110


[24587.3747, 1391110021.6406, 78.411]

In [14]:
evaluator.evaluate(model=pipeline, features=xtest1[feature_list], target=ytest)

MAE ::  28971.7497
MSE ::  3278218368.9155
R2 ::  61.4516


[28971.7497, 3278218368.9155, 61.4516]

In [15]:
xtrain1[feature_list].head()

Unnamed: 0,OverallQual,BsmtFinSF1,TotalBsmtSF,2ndFlrSF,GrLivArea
1409,7,0,791,857,2093
1092,6,423,1181,304,1694
1140,5,852,952,0,952
1094,5,248,936,0,936
1013,5,247,1022,0,1022


In [16]:
xtest1[feature_list].head()

Unnamed: 0,OverallQual,BsmtFinSF1,TotalBsmtSF,2ndFlrSF,GrLivArea
913,5,284,1001,1001,2002
144,5,1332,1728,0,1728
1445,6,187,814,0,913
1382,7,0,596,764,1762
885,9,1238,1698,0,1709
