<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Pipeline" data-toc-modified-id="Pipeline-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Pipeline</a></span></li></ul></div>

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline

In [None]:
# Read the training CSV, then keep only the non-object columns and the rows
# that have no missing values in any remaining column.
data = pd.read_csv('Datasets/trainHousePrices.csv')
cat_cols = data.select_dtypes(include='object').columns
data = data.drop(columns=cat_cols).dropna(axis=0, how='any')
data.shape

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    data.drop(columns=['SalePrice']),
    data['SalePrice'],
    test_size=0.2,
    random_state=33,
)

# Recursive feature elimination down to 5 features.
# The tree is seeded as well: DecisionTreeRegressor permutes features randomly
# at each split, so without a fixed random_state the selected features (and
# every downstream metric) could change from one run to the next.
rfe = RFE(estimator=DecisionTreeRegressor(random_state=33), n_features_to_select=5)
rfe.fit(xtrain, ytrain)

# support_ is the boolean mask of the selected features (those with ranking_ == 1).
feature_list = xtrain.columns[rfe.support_].tolist()
class Metrics:
    """Regression scoring helper that prints and returns MAE, MSE and R2."""

    def evaluate(self, model, features, target):
        """Score ``model`` on ``features`` against ``target``.

        Parameters
        ----------
        model : object exposing ``predict(features)``.
        features : input passed unchanged to ``model.predict``.
        target : true values compared with the predictions.

        Returns
        -------
        list
            ``[mae, mse, r2]`` rounded to 4 decimals, where ``r2`` is the
            R2 score multiplied by 100.
        """
        predictions = model.predict(features)
        mae = mean_absolute_error(target, predictions)
        mse = mean_squared_error(target, predictions)
        # R2 is reported on a 0-100 scale rather than 0-1.
        r2 = 100 * r2_score(target, predictions)
        print(f"MAE :: {mae: .4f}")
        print(f"MSE :: {mse: .4f}")
        print(f"R2 :: {r2: .4f}")
        return [np.round(score, 4) for score in (mae, mse, r2)]


evaluator = Metrics()

In [None]:
class OutlierTreatment:
    """IQR-based (1.5 * IQR fence) outlier detection and removal for one column.

    Parameters
    ----------
    dff : pandas.Series
        The column whose values are tested against the fences.
    """

    def __init__(self, dff):
        self.dff = dff

    def outlier(self):
        """Return ``(lower_limit, upper_limit)``: Q1 - 1.5*IQR and Q3 + 1.5*IQR."""
        Q1 = self.dff.quantile(0.25)
        Q3 = self.dff.quantile(0.75)
        IQR = Q3 - Q1
        return Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

    def _outlier_mask(self):
        """Refresh the stored fences and return a boolean mask of the outliers.

        Comparisons are strict: a value lying exactly on a fence is not an
        outlier. With inclusive comparisons a column whose IQR is 0 (both
        fences equal to the dominant value) would have every row holding that
        value flagged as an outlier.
        """
        self.lower_limit, self.upper_limit = self.outlier()
        return (self.dff < self.lower_limit) | (self.dff > self.upper_limit)

    def countoutlier(self, dfx):
        """Return the number of rows of ``dfx`` flagged as outliers.

        The mask is indexed like the column given to the constructor, so
        ``dfx`` is expected to share that index.
        """
        return len(dfx[self._outlier_mask()])

    def cleanoutlier(self, dfx, dfy):
        """Drop the outlier rows from ``dfx`` and ``dfy`` IN PLACE.

        Both objects are mutated (callers may ignore the return value) and are
        also returned as ``(dfx, dfy)``. ``dfy`` must contain the index labels
        of the rows being removed.
        """
        outlier_index = dfx[self._outlier_mask()].index
        dfx.drop(outlier_index, inplace=True)
        dfy.drop(outlier_index, inplace=True)
        return dfx, dfy

In [None]:
# Shapes before any rows are removed.
print(xtrain[feature_list].shape)
print(ytrain.shape)
print("After removing outlier")
# For each selected feature, count its outliers and drop those rows from
# xtrain / ytrain (in place) only when there are fewer than 5 of them.
for feature in feature_list:
    treatment = OutlierTreatment(xtrain[feature])
    n_outliers = treatment.countoutlier(xtrain)
    if n_outliers < 5:
        treatment.cleanoutlier(xtrain, ytrain)
# Shapes after the in-place clean-up.
print(xtrain[feature_list].shape)
print(ytrain.shape)

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

In [None]:
# Independent copies of the selected feature columns, taken from both splits.
xtrain1, xtest1 = (split[feature_list].copy() for split in (xtrain, xtest))

In [None]:
scaler = StandardScaler()
# Fit the scaler on the training features only, then overwrite them with
# their scaled values.
xtrain[feature_list] = scaler.fit(xtrain[feature_list]).transform(xtrain[feature_list])
# The test features are transformed with the statistics learned from the
# training data; the scaler is not refitted on them.
xtest[feature_list] = scaler.transform(xtest[feature_list])

In [None]:
# Fit a linear regression on the selected training features.
Linearmodel1 = LinearRegression().fit(xtrain[feature_list], ytrain)
# Keep the fitted estimator as the cell's displayed value.
Linearmodel1

In [None]:
# Score the fitted linear model on the training split.
evaluator = Metrics()
evaluator.evaluate(model=Linearmodel1, features=xtrain[feature_list], target=ytrain)

In [None]:
# Score the same model on the held-out test split.
evaluator.evaluate(model=Linearmodel1, features=xtest[feature_list], target=ytest)

## Pipeline

In [None]:
from sklearn.pipeline import Pipeline

# Scaling and regression chained into one estimator; it is fitted on the
# xtrain1 copy of the selected features.
pipeline = Pipeline(
    steps=[
        ('ScalerObject', StandardScaler()),
        ('ModelName', LinearRegression()),
    ],
    verbose=True,
)
pipeline.fit(xtrain1[feature_list], ytrain)

In [None]:
# Pipeline metrics on the training split.
evaluator.evaluate(pipeline, xtrain1[feature_list], ytrain)

In [None]:
# Pipeline metrics on the held-out test split.
evaluator.evaluate(pipeline, xtest1[feature_list], ytest)

In [None]:
# First five rows of the selected training features held in xtrain1.
xtrain1[feature_list].head(5)

In [None]:
# First five rows of the selected test features held in xtest1.
xtest1[feature_list].head(5)