In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model, metrics
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

import os

# hide warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv("train.csv")

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [4]:
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
#numeric data
data_numeric = data.select_dtypes(include=['float64', 'int64'])
data_numeric.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,61,0,0,0,0,0,2,2008,208500
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,298,0,0,0,0,0,0,5,2007,181500
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,42,0,0,0,0,0,9,2008,223500
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,35,272,0,0,0,0,2,2006,140000
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,192,84,0,0,0,0,0,12,2008,250000


In [6]:
data_numeric = data_numeric.drop(['Id','BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'KitchenAbvGr', 'Fireplaces', 'GarageCars', 'YrSold'], axis=1)
data_numeric.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,SalePrice
0,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,548,0,61,0,0,0,0,0,2,208500
1,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,460,298,0,0,0,0,0,0,5,181500
2,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,608,0,42,0,0,0,0,0,9,223500
3,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,642,0,35,272,0,0,0,0,2,140000
4,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,836,192,84,0,0,0,0,0,12,250000


In [7]:
corr = data_numeric.corr()
corr

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,SalePrice
MSSubClass,1.0,-0.386347,-0.139781,0.032628,-0.059316,0.02785,0.040581,0.022936,-0.069836,-0.065649,...,-0.098672,-0.012579,-0.0061,-0.012037,-0.043825,-0.02603,0.008283,-0.007683,-0.013585,-0.084284
LotFrontage,-0.386347,1.0,0.426095,0.251646,-0.059213,0.123349,0.088866,0.193458,0.233633,0.0499,...,0.344997,0.088521,0.151972,0.0107,0.070029,0.041383,0.206167,0.003368,0.0112,0.351799
LotArea,-0.139781,0.426095,1.0,0.105806,-0.005636,0.014228,0.013788,0.10416,0.214103,0.11117,...,0.180403,0.171698,0.084774,-0.01834,0.020423,0.04316,0.077672,0.038068,0.001205,0.263843
OverallQual,0.032628,0.251646,0.105806,1.0,-0.091932,0.572323,0.550684,0.411876,0.239666,-0.059119,...,0.562022,0.238923,0.308819,-0.113937,0.030371,0.064886,0.065166,-0.031406,0.070815,0.790982
OverallCond,-0.059316,-0.059213,-0.005636,-0.091932,1.0,-0.375983,0.073741,-0.128101,-0.046231,0.040229,...,-0.151521,-0.003334,-0.032589,0.070356,0.025504,0.054811,-0.001985,0.068777,-0.003511,-0.077856
YearBuilt,0.02785,0.123349,0.014228,0.572323,-0.375983,1.0,0.592855,0.315707,0.249503,-0.049107,...,0.478954,0.22488,0.188686,-0.387268,0.031355,-0.050364,0.00495,-0.034383,0.012398,0.522897
YearRemodAdd,0.040581,0.088866,0.013788,0.550684,0.073741,0.592855,1.0,0.179618,0.128451,-0.067759,...,0.3716,0.205726,0.226298,-0.193919,0.045286,-0.03874,0.005829,-0.010286,0.02149,0.507101
MasVnrArea,0.022936,0.193458,0.10416,0.411876,-0.128101,0.315707,0.179618,1.0,0.264736,-0.072319,...,0.373066,0.159718,0.125703,-0.110204,0.018796,0.061466,0.011723,-0.029815,-0.005965,0.477493
BsmtFinSF1,-0.069836,0.233633,0.214103,0.239666,-0.046231,0.249503,0.128451,0.264736,1.0,-0.050117,...,0.29697,0.204306,0.111761,-0.102303,0.026451,0.062021,0.140491,0.003571,-0.015727,0.38642
BsmtFinSF2,-0.065649,0.0499,0.11117,-0.059119,0.040229,-0.049107,-0.067759,-0.072319,-0.050117,1.0,...,-0.018227,0.067898,0.003093,0.036543,-0.029993,0.088871,0.041709,0.00494,-0.015211,-0.011378


In [8]:
data[['BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'KitchenAbvGr', 'Fireplaces', 'GarageCars', 'YrSold']] = data[['BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'KitchenAbvGr', 'Fireplaces', 'GarageCars', 'YrSold']].astype('object')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [9]:
# creating dummy variables for categorical variables

# subset all categorical variables
data_categorical = data.select_dtypes(include=['object'])
data_categorical.head()

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageCars,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,YrSold,SaleType,SaleCondition
0,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,2,TA,TA,Y,,,,2008,WD,Normal
1,RL,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,...,2,TA,TA,Y,,,,2007,WD,Normal
2,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,2,TA,TA,Y,,,,2008,WD,Normal
3,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,...,3,TA,TA,Y,,,,2006,WD,Abnorml
4,RL,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,...,3,TA,TA,Y,,,,2008,WD,Normal


In [10]:
data_dummies = pd.get_dummies(data_categorical, drop_first=True)
data_dummies.head()

Unnamed: 0,MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Pave,Alley_Pave,LotShape_IR2,LotShape_IR3,LotShape_Reg,LandContour_HLS,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,0,0,1,0,1,0,0,0,1,0,...,0,0,0,0,1,0,0,0,1,0
1,0,0,1,0,1,0,0,0,1,0,...,0,0,0,0,1,0,0,0,1,0
2,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
3,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [11]:
# drop categorical variables 
data = data.drop(list(data_categorical.columns), axis=1)

# concat dummy variables with data
data = pd.concat([data, data_dummies], axis=1)

In [12]:
X = data.loc[:, ~data.columns.isin(['SalePrice', 'Id'])]
y = data['SalePrice']

In [13]:
# scaling the features
from sklearn.preprocessing import scale

cols = X.columns
X = pd.DataFrame(scale(X))
X.columns = cols
X.columns

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       ...
       'SaleType_ConLI', 'SaleType_ConLw', 'SaleType_New', 'SaleType_Oth',
       'SaleType_WD', 'SaleCondition_AdjLand', 'SaleCondition_Alloca',
       'SaleCondition_Family', 'SaleCondition_Normal',
       'SaleCondition_Partial'],
      dtype='object', length=261)

# Linear Regression

In [14]:
# Instantiate
X.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,0.073375,-0.208034,-0.207142,0.651479,-0.5172,1.050994,0.878668,0.510015,0.575425,-0.288653,...,-0.058621,-0.058621,-0.301962,-0.045376,0.390293,-0.052414,-0.091035,-0.117851,0.467651,-0.305995
1,-0.872563,0.409895,-0.091886,-0.071836,2.179628,0.156734,-0.429577,-0.572835,1.171992,-0.288653,...,-0.058621,-0.058621,-0.301962,-0.045376,0.390293,-0.052414,-0.091035,-0.117851,0.467651,-0.305995
2,0.073375,-0.084449,0.07348,0.651479,-0.5172,0.984752,0.830215,0.322174,0.092907,-0.288653,...,-0.058621,-0.058621,-0.301962,-0.045376,0.390293,-0.052414,-0.091035,-0.117851,0.467651,-0.305995
3,0.309859,-0.414011,-0.096897,0.651479,-0.5172,-1.863632,-0.720298,-0.572835,-0.499274,-0.288653,...,-0.058621,-0.058621,-0.301962,-0.045376,0.390293,-0.052414,-0.091035,-0.117851,-2.138345,-0.305995
4,0.073375,0.574676,0.375148,1.374795,-0.5172,0.951632,0.733308,1.360826,0.463568,-0.288653,...,-0.058621,-0.058621,-0.301962,-0.045376,0.390293,-0.052414,-0.091035,-0.117851,0.467651,-0.305995


In [15]:
#rows with at least one numm value
sum([True for idx,row in data.iterrows() if any(row.isnull())])

339

In [16]:
X = X.fillna(X.interpolate())

In [17]:
# split into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7,test_size = 0.3, random_state=100)

In [18]:
# Instantiate
lm = LinearRegression()

# Fit
lm.fit(X_train, y_train)

# Print the coefficients and intercept
print(lm.intercept_)
print(lm.coef_)

223125110729299.5
[-2.08078653e+03  2.70244112e+02  7.43968905e+03  9.11249012e+03
  5.46524083e+03  1.34667001e+04  2.27034128e+03  3.74084888e+03
 -1.72171447e+16 -6.08960509e+15 -1.66799367e+16  1.65605890e+16
 -2.87497949e+15 -3.24637896e+15 -3.61600609e+14  3.90789768e+15
 -2.55137500e+03 -7.39750000e+02  2.35762500e+03  8.77625000e+02
  9.54250000e+02  1.53800000e+03 -4.98593750e+02  8.66625000e+02
  7.65250000e+02  7.06600000e+03  9.90625000e+01 -6.75781250e+02
  5.74818750e+03  2.65868750e+03  1.07782500e+04  6.82925000e+03
  2.19025000e+03 -5.49750000e+02 -1.75867188e+02  3.04187500e+02
  1.10962500e+03  4.44500000e+02 -9.49875000e+02  1.32475000e+03
 -1.94000000e+02  1.98265625e+03 -1.23168750e+03  2.50218750e+02
 -4.03750000e+02  1.06175000e+03 -2.43862500e+03  9.44187500e+02
  8.02500000e+01  2.54962500e+03 -2.40125000e+02  1.51000000e+02
  5.69600000e+03 -1.00800000e+03  9.55500000e+02  1.80885938e+03
  9.62500000e+01 -1.85018750e+03 -4.27437500e+02  1.62100000e+03
 -1.577

In [19]:
from sklearn.metrics import r2_score, mean_squared_error

y_pred_train = lm.predict(X_train)
y_pred_test = lm.predict(X_test)

metric = []
r2_train_lr = r2_score(y_train, y_pred_train)
print(r2_train_lr)
metric.append(r2_train_lr)

r2_test_lr = r2_score(y_test, y_pred_test)
print(r2_test_lr)
metric.append(r2_test_lr)

rss1_lr = np.sum(np.square(y_train - y_pred_train))
print(rss1_lr)
metric.append(rss1_lr)

rss2_lr = np.sum(np.square(y_test - y_pred_test))
print(rss2_lr)
metric.append(rss2_lr)

mse_train_lr = mean_squared_error(y_train, y_pred_train)
print(mse_train_lr)
metric.append(mse_train_lr**0.5)

mse_test_lr = mean_squared_error(y_test, y_pred_test)
print(mse_test_lr)
metric.append(mse_test_lr**0.5)

0.9533813889585732
-1.1694369264941994e+22
297459315505.3203
3.296306518163566e+34
291341151.327444
7.525813968409968e+31


# Ridge Regression


In [43]:
params = {'alpha': [0.0001, 0.001, 0.01, 0.05, 0.1, 
 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 2.0, 3.0, 
 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 20, 50, 100, 500, 1000 ]}

ridge = Ridge()

folds = 5
model_cv = GridSearchCV(estimator = ridge, 
                        param_grid = params, 
                        scoring= 'neg_mean_absolute_error',  
                        cv = folds, 
                        return_train_score=True,
                        verbose = 1)            
model_cv.fit(X_train, y_train) 

Fitting 5 folds for each of 28 candidates, totalling 140 fits


In [46]:
print(model_cv.best_params_)

{'alpha': 100}


In [60]:
alpha = 100
ridge = Ridge(alpha=alpha)

ridge.fit(X_train, y_train)
print(ridge.coef_)

[-2.76808925e+03  4.67642580e+02  4.95683283e+03  1.04256794e+04
  3.84495377e+03  4.69233605e+03  3.26540843e+03  3.74691686e+03
  5.56597900e+03  1.36076947e+03  4.92583623e+02  6.78315716e+03
  8.49070106e+03  7.54831252e+03 -1.44707182e+03  1.23831319e+04
 -1.28986532e+03  2.81425220e+03  1.73944785e+03  2.00258069e+03
  1.28850103e+03  1.72004910e+03 -5.63798887e+02  1.14453181e+03
  1.23837626e+03  3.74473966e+03  4.20990019e+01 -5.16117811e+02
  9.75551162e+02  3.49031053e+02  1.91230092e+03  2.14572999e+02
  1.46211332e+03  9.60251694e+01  4.49424219e+01 -4.79598390e+02
  4.84850178e+02  1.34752559e+03 -1.85612437e+02  1.83660045e+03
 -5.51360240e+02  2.10045598e+03 -1.29170132e+03 -2.59378977e+02
 -5.38421550e+02  8.59260979e+02 -7.02235475e+02  2.29839608e+02
 -7.96414651e+02  3.24927741e+02 -1.26773287e+02 -6.47805340e+02
  3.75710884e+03 -2.94423578e+03 -2.80900048e+02 -8.14469702e+02
 -1.06896638e+03 -2.16079447e+03 -2.13328433e+03  7.19341355e+02
 -9.23096960e+02  5.21242

In [61]:
y_pred_train = ridge.predict(X_train)
y_pred_test = ridge.predict(X_test)

metric2 = []
r2_train_lr = r2_score(y_train, y_pred_train)
print(r2_train_lr)
metric2.append(r2_train_lr)

r2_test_lr = r2_score(y_test, y_pred_test)
print(r2_test_lr)
metric2.append(r2_test_lr)

rss1_lr = np.sum(np.square(y_train - y_pred_train))
print(rss1_lr)
metric2.append(rss1_lr)

rss2_lr = np.sum(np.square(y_test - y_pred_test))
print(rss2_lr)
metric2.append(rss2_lr)

mse_train_lr = mean_squared_error(y_train, y_pred_train)
print(mse_train_lr)
metric2.append(mse_train_lr**0.5)

mse_test_lr = mean_squared_error(y_test, y_pred_test)
print(mse_test_lr)
metric2.append(mse_test_lr**0.5)

0.942633033269852
0.7364320830095437
366041335744.6959
742922190219.3322
358512571.73819387
1696169384.0624022


# Lasso Regression

In [62]:
lasso = Lasso()

# cross validation
model_cv = GridSearchCV(estimator = lasso, 
                        param_grid = params, 
                        scoring= 'neg_mean_absolute_error', 
                        cv = folds, 
                        return_train_score=True,
                        verbose = 1)            

model_cv.fit(X_train, y_train)

Fitting 5 folds for each of 28 candidates, totalling 140 fits


In [63]:
# Printing the best hyperparameter alpha
print(model_cv.best_params_)

{'alpha': 500}


In [96]:
alpha = 500

lasso = Lasso(alpha=alpha)
        
lasso.fit(X_train, y_train)

In [97]:
lasso.coef_

array([-3.84654237e+03,  4.03405975e+02,  4.29339848e+03,  1.23074333e+04,
        5.10245933e+03,  9.04948355e+03,  3.13221723e+03,  2.91101747e+03,
        7.43616048e+03,  1.08202451e+02, -0.00000000e+00,  8.55735486e+03,
        0.00000000e+00,  0.00000000e+00, -2.24310098e+03,  3.04350213e+04,
       -2.19418346e+03,  0.00000000e+00,  1.13303014e+03,  1.01175839e+03,
        6.35900690e+02,  1.24800831e+03, -0.00000000e+00,  5.19572919e+02,
        5.12232649e+02,  2.92004292e+03,  0.00000000e+00, -2.16345964e+02,
        0.00000000e+00,  0.00000000e+00,  8.35168407e+02, -7.43232160e+02,
        1.00858714e+03, -0.00000000e+00,  0.00000000e+00, -0.00000000e+00,
       -0.00000000e+00,  0.00000000e+00, -0.00000000e+00,  0.00000000e+00,
       -4.61349063e+01,  1.52285064e+03, -4.16848265e+02, -0.00000000e+00,
       -0.00000000e+00,  0.00000000e+00, -0.00000000e+00,  0.00000000e+00,
       -0.00000000e+00,  7.03101406e+02,  0.00000000e+00, -0.00000000e+00,
        3.79492293e+03, -

In [83]:
#R2 score, RSS and RMSE

y_pred_train = lasso.predict(X_train)
y_pred_test = lasso.predict(X_test)

metric3 = []
r2_train_lr = r2_score(y_train, y_pred_train)
print(r2_train_lr)
metric3.append(r2_train_lr)

r2_test_lr = r2_score(y_test, y_pred_test)
print(r2_test_lr)
metric3.append(r2_test_lr)

rss1_lr = np.sum(np.square(y_train - y_pred_train))
print(rss1_lr)
metric3.append(rss1_lr)

rss2_lr = np.sum(np.square(y_test - y_pred_test))
print(rss2_lr)
metric3.append(rss2_lr)

mse_train_lr = mean_squared_error(y_train, y_pred_train)
print(mse_train_lr)
metric3.append(mse_train_lr**0.5)

mse_test_lr = mean_squared_error(y_test, y_pred_test)
print(mse_test_lr)
metric3.append(mse_test_lr**0.5)

0.9426563900728945
0.5762099698866037
365892303019.5636
1194542268118.1694
358366604.3286617
2727265452.324588


In [29]:
# Creating a table which contain all the metrics

lr_table = {'Metric': ['R2 Score (Train)','R2 Score (Test)','RSS (Train)','RSS (Test)',
                       'MSE (Train)','MSE (Test)'], 
        'Linear Regression': metric
        }

lr_metric = pd.DataFrame(lr_table ,columns = ['Metric', 'Linear Regression'] )

rg_metric = pd.Series(metric2, name = 'Ridge Regression')
ls_metric = pd.Series(metric3, name = 'Lasso Regression')

final_metric = pd.concat([lr_metric, rg_metric, ls_metric], axis = 1)

final_metric

Unnamed: 0,Metric,Linear Regression,Ridge Regression,Lasso Regression
0,R2 Score (Train),0.9533814,0.942633,0.9426564
1,R2 Score (Test),-1.169437e+22,0.7364321,0.57621
2,RSS (Train),297459300000.0,366041300000.0,365892300000.0
3,RSS (Test),3.296307e+34,742922200000.0,1194542000000.0
4,MSE (Train),17068.72,18934.43,18930.57
5,MSE (Test),8675145000000000.0,41184.58,52223.23


In [85]:
betas = pd.DataFrame(index=X.columns)
betas.rows = X.columns
betas['Linear'] = lm.coef_
betas['Ridge'] = ridge.coef_
betas['Lasso'] = lasso.coef_
pd.set_option('display.max_rows', None)
betas = betas.sort_values(by=['Lasso'], ascending=False)
betas

Unnamed: 0,Linear,Ridge,Lasso
GrLivArea,3907898000000000.0,12383.13,30435.021254
OverallQual,9112.49,10425.68,12307.433295
YearBuilt,13466.7,4692.336,9049.483554
TotalBsmtSF,1.656059e+16,6783.157,8557.354863
BsmtFinSF1,-1.721714e+16,5565.979,7436.160483
GarageCars_3,3327.5,5243.325,6285.349038
FullBath_3,5910.5,6009.871,5321.61617
OverallCond,5465.241,3844.954,5102.45933
BsmtExposure_Gd,4065.875,4455.939,4594.81853
Neighborhood_NoRidge,4706.5,5212.426,4426.436411
