In [133]:
import numpy as np # to carryout basic math viz. sum, square, square_root etc.
import pandas as pd # we'll bw working with DataFrames and hence this library is required
import seaborn as sns # for visualization
import matplotlib.pyplot as plt # for visualization

from sklearn import linear_model , metrics
from sklearn.linear_model import LinearRegression, Ridge, Lasso #to carryout various regularization models
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split #to split train & test models into ratios viz. 80:20, 70:30 etc.
from sklearn.metrics import mean_squared_error, r2_score # to compute metrics of various models
from sklearn.preprocessing import PolynomialFeatures , MinMaxScaler

import os

# hide warnings
import warnings
warnings.filterwarnings('ignore')

In [134]:
df = pd.read_csv('train.csv') #read the data from csv file 'train.csv'
df.shape

(1460, 81)

In [192]:
# df.columns

In [136]:
threshold = 0.5 # 50% threshold for null values in a column
# retaining columns with <=50% missing values and storing the data_frame as df_cleaned
df_cleaned = df.dropna( thresh = int( threshold * len( df )), axis = 1 )
df_cleaned.shape

(1460, 77)

In [137]:
# replacing null / missing values of the columns in 'df_cleaned' DataFrame with 'mode' of the column values
for column in df_cleaned.columns:
    if df_cleaned[column].dtype == 'object': # object means non-numeric values (viz. True, False, A, B etc.)
        df_cleaned[column].fillna(df_cleaned[column].mode()[0], inplace = True) # [0] represent mode value, [1] for freq of mode
    else:
        df_cleaned[column].fillna(df_cleaned[column].median() , inplace = True) # if column values are numeric

In [138]:
# removing 'Id' column as it doesn't add any value to the regression analysis
df_cleaned.drop( columns = ['Id'] , inplace = True )

In [139]:
df_cleaned.shape

(1460, 76)

In [140]:
# Encoding categorical varibales with dummy variables with one-HOT coding
df_cleaned = pd.get_dummies( df_cleaned , drop_first = True)
df_cleaned.shape

(1460, 237)

In [141]:
pd.set_option('display.max_seq_items', None)
########## print(df_cleaned.columns) # just inspecting the cleaned columns

In [142]:
## STEP-5 REMOVING DUPLICATES
# removing duplicate rows
df_cleaned.drop_duplicates( inplace = True )

In [143]:
df_cleaned.shape

(1460, 237)

In [144]:
### pd.DataFrame.drop_duplicates? # just in case if a refresher is needed on "drop_duplicates" method in pandas dataframe

# Linear Regression Modelling

In [237]:
X = df_cleaned.drop(columns= ['SalePrice'] ) # storing all the 236 predictors in 'X'
y = df_cleaned['SalePrice'].values.reshape(-1, 1) # storing the responsible variable 'SalePrice' in 'y' and reshaping into
# 2D array
cols = X.columns

In [233]:
# scaling the predictors & response variables
# instantiate the 'MinMaxScaler'
scaler = MinMaxScaler()
# fitting the scaler on predictor (X) and response (y) variables
X = scaler.fit_transform( X )
y = scaler.fit_transform( y )

In [190]:
# splitting both predictors & response varibales into train & test tests
X_train , X_test , y_train , y_test = train_test_split( X , y , train_size = 0.7 , test_size = 0.3 , random_state = 100 )

numpy.ndarray

In [148]:
# initialize the Linear Regression object
lm = LinearRegression()

# taking partial derivative of RSS w.r.t. beta_0, beta_1 , ... , beta_236 and solving the resultant normal equations
lm.fit( X_train , y_train )

In [149]:
# print the intercept term (beta_0) and model coefficients (beta_1 ... beta_236)
# lm.intercept_
lm.coef_.shape

(1, 236)

In [150]:
# predicting the responses based on the training set of predictors 'y_pred_train'
y_pred_train = lm.predict( X_train )

# predicting the responses based on the testing set of predictors 'y_pred_test'
y_pred_test = lm.predict( X_test )

# initiating a blank pandas series
metric =[ ]

#computing R^2 value of the model on training set
r2_train_lr = r2_score( y_train , y_pred_train )
print("R_sqaure_train_LR ", r2_train_lr)

# appending R^2 value of the model on training set to the pandas series named 'metric'
metric.append( r2_train_lr )

# compute R^2 value of the model on test set
r2_test_lr = r2_score( y_test , y_pred_test )
print("R_sqaure_test_LR ", r2_test_lr)

# appending R^2 value of the model on test set to the pandas series named 'metric'
metric.append(r2_test_lr)

# compute RSS value of the model on train test
rss_train_lr = np.sum( np.square( y_train - y_pred_train ) )
print( rss_train_lr )

# appending RSS value of the model on training data set to the pandas series named 'metric'
metric.append( rss_train_lr)

# computing RSS value of the model on test set
rss_test_lr = np.sum( np.square( y_test - y_pred_test ) )
print( rss_test_lr )

# appending RSS value of the model on test data set to the pandas series named 'metric'
metric.append( rss_test_lr )

# compute mean_squre_error of train set
mse_train_lr = mean_squared_error( y_train , y_pred_train )
print( mse_train_lr )

# append mse_train_lr to 'metric'
metric.append( mse_train_lr)

# compute mean_squared_error of test set
mse_test_lr = mean_squared_error( y_test , y_pred_test )
print( mse_test_lr**0.5 )

# append mse_test_lr to 'metric'
metric.append( mse_test_lr**0.5 )

R_sqaure_train_LR  0.9484056979385009
R_sqaure_test_LR  -3.5487209322400804e+18
0.6348693551077066
1.929021252877183e+19
0.0006218113174414364
209860861.32463437


# RIDGE regression

In [151]:
# list of hyper_parameters (aka lambdas or alphas)
params = {'alpha': [0.0001,0.001,0.01,0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,2,3,4,5,6,7,8,9,10,20,50,100,500,1000]}

In [152]:
# initialize RIDGE object into variable 'ridge'
ridge = Ridge()

# cross validation to find best "lambda" using GridSearchCV() function
folds = 5
model_cv = GridSearchCV( estimator = ridge , param_grid = params , scoring = 'neg_mean_absolute_error' , cv = folds , 
                       return_train_score = True , verbose = 1)
model_cv.fit( X_train , y_train )

Fitting 5 folds for each of 28 candidates, totalling 140 fits


In [153]:
# from all of these fits I got the best value of 'lambda' as '20' using the function 'best_params_'
# that means this value gave highest negative_mean_absolute_error
print( model_cv.best_params_)

{'alpha': 6}


In [168]:
# assiging '6' to variable 'alpha'
alpha = 6

# once we get this value, I fit the RIDGE reg model again on the trining set
ridge = Ridge(alpha = alpha)
ridge.fit(X_train,y_train)
ridge.coef_.shape

(1, 236)

In [169]:
# but what we're actually interested in is comparing R^2 values on the Training & Testing sets
y_pred_train = ridge.predict( X_train)
y_pred_test = ridge.predict( X_test)
metric2 =[]

r2_train_lr = r2_score(y_train , y_pred_train)
print( r2_train_lr)
metric2.append( r2_train_lr )

r2_test_lr = r2_score(y_test , y_pred_test)
print( r2_test_lr)
metric2.append( r2_test_lr )

rss_train_lr = np.sum( np.square( y_train - y_pred_train ) )
print( rss_train_lr )
metric2.append( rss_train_lr )

rss_test_lr = np.sum( np.square( y_test - y_pred_test ) )
print(rss_test_lr)
metric2.append( rss_test_lr )

mse_train_lr = mean_squared_error( y_train , y_pred_train )
print( mse_train_lr )
metric2.append( mse_train_lr**0.5 )

mse_test_lr = mean_squared_error( y_test , y_pred_test )
print( mse_test_lr )
metric2.append( mse_test_lr**0.5)

0.8911471184965996
0.8657193372160505
1.3394378045720112
0.7299256754944763
0.001311888153351627
0.001666496976014786


In [170]:
# notice the difference between R^2 value of  the training data & the testing data has reduced quite a bit.
# so RIDGE regression did help us with the overfitting

# LASSO regression

In [171]:
# let's see how LASSO regression performs here. The code remains the same except for the estimator RIDGE to LASSO.
# initializaing LASSO variable object
lasso = Lasso()

In [172]:
# 'GridSearchCV' in onrder to find best 'lambda'
model_cv = GridSearchCV( estimator = lasso , param_grid = params , 
                       scoring = 'neg_mean_absolute_error' , cv = folds , 
                       return_train_score = True , verbose = 1 )
model_cv.fit( X_train , y_train )

Fitting 5 folds for each of 28 candidates, totalling 140 fits


In [173]:
# finding the best hyperparameter 'alpha'
print( model_cv.best_params_)

{'alpha': 0.0001}


In [174]:
# the best paramter after GridSearchCV, for LASSO, the optimum vallue of lambda = 20
alpha = 0.0001

# let's fit the LASSO model
lasso= Lasso(alpha=alpha)
lasso.fit(X_train , y_train)

In [175]:
# get model coefficients
###### ***NOTICE THE NO OF ZEROES***
lasso.coef_.shape

(236,)

In [176]:
# now we are more interested in comparing the R^2 value of training and test data sets to see if the overfitting issue 
# has been solved
y_pred_train = lasso.predict( X_train )
y_pred_test = lasso.predict( X_test )

metric3=[]
r2_train_lr = r2_score( y_train , y_pred_train )
print( r2_train_lr )
metric3.append( r2_train_lr )

r2_test_lr = r2_score( y_test , y_pred_test )
print( r2_test_lr )
metric3.append( r2_test_lr )

rss_train_lr = np.sum(np.square( y_train - y_pred_train ) )
print( rss_train_lr )
metric3.append( rss_train_lr )

rss_test_lr = np.sum( np.square( y_test - y_pred_test ) )
print( rss_test_lr )
metric3.append( rss_test_lr )

mse_train_lr = mean_squared_error( y_train , y_pred_train )
print( mse_train_lr )
metric3.append( mse_train_lr**0.5 )

mse_test_lr = mean_squared_error( y_test , y_pred_test )
print( mse_test_lr )
metric3.append( mse_test_lr**0.5 )

0.904873424926977
0.8576028712258623
23451.08464745864
4208.712439874329
0.0011464595625180077
0.0017672267888422524


## here also it does better than the LINEAR regression. In my view the R^2 value for the RIDGE regression of the test data set better than LASSO. 
## This sort of tells us that the variables that we considered in the model to build RIDGE regression, they all sort of related to the response variable. There were less noisy variables in the model. And LASSO regression does # feature selection & dropped a few variables, wasn't that a great idea.

In [177]:
# now let me put all the metrics viz R^2 value, RSS, MSE & RMSE in a single data frame for all the 3 models
lr_table = {'Metric': ['R2 score(Train)','R2 score(Test)', 'RSS(Train)', 'RSS(Test)' , 'MSE(Train)' , 'MSE(Test)'] ,
           'Linear Regression' : metric}

lr_metric = pd.DataFrame(lr_table , columns = ['Metric' , 'Linear Regression'])
rg_metric = pd.Series(metric2 , name = 'Ridge Regression' )
ls_metric = pd.Series( metric3 , name = 'Lasso Regression')
final_metric = pd.concat([lr_metric , rg_metric , ls_metric] , axis=1)

final_metric

Unnamed: 0,Metric,Linear Regression,Ridge Regression,Lasso Regression
0,R2 score(Train),0.9484057,0.891147,0.904873
1,R2 score(Test),-3.548721e+18,0.865719,0.857603
2,RSS(Train),0.6348694,1.339438,23451.084647
3,RSS(Test),1.929021e+19,0.729926,4208.71244
4,MSE(Train),0.0006218113,0.03622,0.033859
5,MSE(Test),209860900.0,0.040823,0.042038


# let's observe the changes in the model coefficients after regularization

In [246]:
betas = pd.DataFrame(X)
betas= pd.DataFrame( index=X.columns)
betas.rows = cols
betas['Linear'] = lm.coef_.flatten()
betas['Ridge'] = ridge.coef_.flatten()
betas['Lasso'] = lasso.coef_.flatten()

In [247]:
betas

Unnamed: 0,Linear,Ridge,Lasso
MSSubClass,-1.076166e-02,-0.026616,-0.035726
LotFrontage,3.747782e-02,-0.002683,-0.000000
LotArea,2.113548e-01,0.023847,0.040855
OverallQual,7.674150e-02,0.079511,0.119170
OverallCond,6.328699e-02,0.028846,0.039518
...,...,...,...
SaleCondition_AdjLand,2.350224e-02,0.004453,0.000000
SaleCondition_Alloca,2.049720e-02,0.007765,0.000000
SaleCondition_Family,-6.151795e-04,-0.003696,-0.000000
SaleCondition_Normal,6.051779e-03,0.003744,0.002606


In [248]:
#######betas.loc[betas['Lasso']==0,betas.index].tolist()

len(betas.index[betas['Lasso']!=0].tolist())

116

# Significant Predictors predicting the house price
## betas.index[betas['Lasso']!=0].tolist()



# Optimal value of lambda (hyper tuning parameter)
## 0.0001

In [249]:
# Calculate the absolute value of the 'Lasso' coefficients
betas['Lasso_abs'] = betas['Lasso'].abs()

# Sort the DataFrame based on the absolute value of 'Lasso' coefficients
betas_sorted = betas.sort_values(by='Lasso_abs', ascending=False)

# Get the top 5 predictors
top_5_predictors = betas_sorted.head(5)

print(top_5_predictors)

                            Linear     Ridge     Lasso  Lasso_abs
GrLivArea            -1.405244e+10  0.070722  0.335491   0.335491
Condition2_PosN      -6.463950e-01 -0.044711 -0.314755   0.314755
OverallQual           7.674150e-02  0.079511  0.119170   0.119170
RoofMatl_WdShngl      1.112107e+00  0.054069  0.118307   0.118307
Neighborhood_NoRidge  4.028440e-02  0.063531  0.067002   0.067002
