In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler

In [4]:
# Reading in data: `ames_houses` is loaded from train.csv, `ames_house` from test.csv.
TRAIN_CSV = '../datasets/train.csv'
TEST_CSV = '../datasets/test.csv'

ames_houses = pd.read_csv(TRAIN_CSV)
ames_house = pd.read_csv(TEST_CSV)

In [5]:
# Selecting features: the predictor columns pulled from both CSVs.

features = [
    'Garage Cars',
    'Total Bsmt SF',
    '1st Flr SF',
    'Lot Area',
    'Overall Qual',
    'Gr Liv Area',
    'Full Bath',
    'Fireplaces',
    'Year Built',
    'Year Remod/Add',
    'Mas Vnr Area',
    'TotRms AbvGrd',
    'Garage Area',
]

# Work on independent copies so later column assignments never touch the raw frames.
ames_train = ames_houses.loc[:, features].copy()
ames_test = ames_house.loc[:, features].copy()

In [6]:
# Filling null values with 0 (no rows are dropped).
# NOTE(review): this treats a missing garage / basement / masonry-veneer value as
# "feature absent" -- confirm that reading of NaN against the data dictionary.

zero_fill_cols = ['Garage Area', 'Garage Cars', 'Total Bsmt SF', 'Mas Vnr Area']

# A single pass over both frames keeps the train and test imputation identical,
# instead of repeating the same statement once per column and per frame.
for frame in (ames_train, ames_test):
    frame[zero_fill_cols] = frame[zero_fill_cols].fillna(0)

In [10]:
# Feature engineering: interaction term, garage car capacity times garage area.
# The same statement runs on both frames so train and test get the identical column.

for frame in (ames_train, ames_test):
    frame['garage_space'] = frame['Garage Cars'] * frame['Garage Area']

In [11]:
# More feature engineering: basement area times first-floor area.
# NOTE(review): despite the "sq_ftg" name this is a product (interaction term),
# not a summed square footage -- confirm that is what was intended.

for frame in (ames_train, ames_test):
    frame['lower_sq_ftg'] = frame['Total Bsmt SF'] * frame['1st Flr SF']

In [12]:
# More feature engineering: above-grade living area times basement area.
# NOTE(review): despite the "tot_sq_ftg" name this is a product (interaction term),
# not a total of the two areas -- confirm that is what was intended.

for frame in (ames_train, ames_test):
    frame['tot_sq_ftg'] = frame['Gr Liv Area'] * frame['Total Bsmt SF']

In [13]:
# Interaction term: above-grade living area weighted by the overall quality rating.
for frame in (ames_train, ames_test):
    frame['overall_living'] = frame['Gr Liv Area'] * frame['Overall Qual']

In [14]:
# Defining y: the target is the SalePrice column of the training CSV.

y = ames_houses.loc[:, 'SalePrice']

In [15]:
# Train-test splitting to start Ridge & LASSO process.
# The split size is left at scikit-learn's default; the seed is fixed so the
# partition is reproducible.

X_train, X_test, y_train, y_test = train_test_split(
    ames_train,
    y,
    random_state=142,
)

In [16]:
# Standardizing numeric data: the scaler learns its statistics from the training
# split only, and those same statistics are then applied to both splits.

sc = StandardScaler().fit(X_train)
Z_train = sc.transform(X_train)
Z_test = sc.transform(X_test)

In [17]:
# Displaying the standardized training matrix produced by the scaler (sanity check):
Z_train

array([[-0.99843111,  0.2095    ,  0.99497849, ...,  0.3103756 ,
         0.06912072, -0.35325294],
       [ 0.29633031, -0.60245178, -0.90514407, ..., -0.5514676 ,
        -0.29073607,  0.01739085],
       [ 0.29633031, -0.67925803, -1.03730483, ..., -0.60106059,
        -0.38885265, -0.12681461],
       ...,
       [ 0.29633031, -1.46926517, -0.71313693, ..., -0.76153986,
        -0.90657115, -1.20736245],
       [ 1.59109173,  0.74275481,  1.03487608, ...,  0.60563686,
         0.337877  ,  0.28871959],
       [ 0.29633031,  0.69886553,  0.9825105 , ...,  0.56123895,
         0.29746459,  0.25952097]])

In [18]:
# importing Ridge

from sklearn.linear_model import Ridge

In [19]:
# Ridge regression with a fixed penalty strength (alpha = 100), trained on the
# standardized training features. The fit call stays as the last expression so
# the notebook echoes the fitted estimator.
ridge_model = Ridge(alpha=100)

ridge_model.fit(X=Z_train, y=y_train)

Ridge(alpha=100)

In [20]:
# Printing Ridge R2 scores on the training split and on the held-out split.
# NOTE(review): in the recorded run the held-out score is higher than the training
# score; worth re-checking with other random_state values.

ridge_train_r2 = ridge_model.score(Z_train, y_train)
ridge_test_r2 = ridge_model.score(Z_test, y_test)

print('Training score:', ridge_train_r2)
print('Test score:', ridge_test_r2)

Training score: 0.8346097153123837
Test score: 0.8652174599394566


In [21]:
# Importing LASSO:

from sklearn.linear_model import LassoCV

In [22]:
# List of LASSO alphas to check through: 100 log-spaced values from 1e-3 up to 1.
# NOTE(review): the recorded best alpha (0.001) is the smallest value in this grid,
# so the search range may not bracket the optimum -- consider revisiting the bounds.

l_alphas = np.logspace(start=-3, stop=0, num=100)

In [23]:
# Cross-validating over the list of alphas: 5 folds, with a raised iteration cap
# for the solver.

lasso_cv = LassoCV(
    alphas=l_alphas,
    cv=5,
    max_iter=50000,
)

In [24]:
# Fitting the LASSO model on the standardized training split.
# The trailing semicolon suppresses the estimator repr in the cell output.

lasso_cv.fit(X=Z_train, y=y_train);

In [25]:
# Finding optimal LASSO alpha: the penalty strength LassoCV selected from l_alphas
# during its 5-fold cross-validation.

lasso_cv.alpha_

0.001

In [26]:
# Printing LASSO R2 scores on the training split and on the held-out split.

lasso_train_r2 = lasso_cv.score(Z_train, y_train)
lasso_test_r2 = lasso_cv.score(Z_test, y_test)

print('Training score: ', lasso_train_r2)
print('Test score: ', lasso_test_r2)

Training score:  0.8605219502281526
Test score:  0.8777863421930043
