# Machine Learning Tutorial Python - 17: L1 and L2 Regularization | Lasso, Ridge Regression

In this python machine learning tutorial for beginners we will look into,
1) What is overfitting, underfitting
2) How to address overfitting using L1 and L2 regularization
3) Write code in python and sklearn for housing price prediction where we will see a model overfit when we use simple linear regression.
Then we will use Lasso regression (L1 regularization) and ridge regression (L2 regression) to address this overfitting issue

In [2]:
# a polynomial fit can be said to be an overfit
# dealing with overfitting
# import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns



In [3]:
# Supress warnings for clean notebook
import warnings
warnings.filterwarnings('ignore')

In [35]:
# reaad dataset
dataset = pd.read_csv('Melbourne_housing_ML17.csv')
dataset.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2.0,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2.0,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2.0,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3.0,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3.0,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


In [36]:
# df exploration
dataset.nunique()

Suburb             334
Address          22449
Rooms               11
Type                 3
Price             2443
Method               9
SellerG            325
Date                57
Distance           211
Postcode           205
Bedroom2            13
Bathroom            10
Car                 13
Landsize          1559
BuildingArea       684
YearBuilt          155
CouncilArea         33
Lattitude         8391
Longtitude        9102
Regionname           8
Propertycount      329
dtype: int64

In [27]:
dataset.shape


(22861, 21)

In [37]:
# discarding certain columns
cols_to_use = ['Suburb', 'Rooms', 'Type', 'Method', 'SellerG', 'Regionname', 'Propertycount', 'Distance', 'CouncilArea', 'Bedroom2', 'Bathroom', 'Car', 'Landsize',
'BuildingArea', 'Price']

dataset = dataset[cols_to_use]
dataset.head()

Unnamed: 0,Suburb,Rooms,Type,Method,SellerG,Regionname,Propertycount,Distance,CouncilArea,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Price
0,Abbotsford,2.0,h,SS,Jellis,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,1.0,126.0,,
1,Abbotsford,2.0,h,S,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,1.0,202.0,,1480000.0
2,Abbotsford,2.0,h,S,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,0.0,156.0,79.0,1035000.0
3,Abbotsford,3.0,u,VB,Rounds,Northern Metropolitan,4019.0,2.5,Yarra City Council,3.0,2.0,1.0,0.0,,
4,Abbotsford,3.0,h,SP,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,3.0,2.0,0.0,134.0,150.0,1465000.0


In [38]:
dataset.shape

(22861, 15)

In [39]:
# view NaN values
dataset.isna().sum()


Suburb               0
Rooms                1
Type                 1
Method               1
SellerG              1
Regionname           2
Propertycount        2
Distance             1
CouncilArea          2
Bedroom2          4397
Bathroom          4400
Car               4517
Landsize          5779
BuildingArea     13179
Price             5020
dtype: int64

In [40]:
# handling NA
# fill some columns with zero
cols_to_fill_zero = ['Propertycount', 'Distance', 'Bedroom2', 'Bathroom', 'Car']
dataset[cols_to_fill_zero] = dataset[cols_to_fill_zero].fillna(0)
dataset.isna().sum()

Suburb               0
Rooms                1
Type                 1
Method               1
SellerG              1
Regionname           2
Propertycount        0
Distance             0
CouncilArea          2
Bedroom2             0
Bathroom             0
Car                  0
Landsize          5779
BuildingArea     13179
Price             5020
dtype: int64

In [45]:
# fill with mean
dataset['Landsize'] = dataset['Landsize'].fillna(dataset.Landsize.mean())
dataset['BuildingArea'] = dataset['BuildingArea'].fillna(dataset.BuildingArea.mean())

dataset.isna().sum()

Suburb              0
Rooms               1
Type                1
Method              1
SellerG             1
Regionname          2
Propertycount       0
Distance            0
CouncilArea         2
Bedroom2            0
Bathroom            0
Car                 0
Landsize            0
BuildingArea        0
Price            5020
dtype: int64

In [47]:
dataset.loc[dataset['Rooms'] == np.NaN]

Unnamed: 0,Suburb,Rooms,Type,Method,SellerG,Regionname,Propertycount,Distance,CouncilArea,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Price


In [48]:
dataset.isna().sum()

Suburb              0
Rooms               1
Type                1
Method              1
SellerG             1
Regionname          2
Propertycount       0
Distance            0
CouncilArea         2
Bedroom2            0
Bathroom            0
Car                 0
Landsize            0
BuildingArea        0
Price            5020
dtype: int64

In [49]:
# fill rooms with previous value
dataset['Rooms'] = dataset['Rooms'].fillna(method="ffill")
dataset.isna().sum()

Suburb              0
Rooms               0
Type                1
Method              1
SellerG             1
Regionname          2
Propertycount       0
Distance            0
CouncilArea         2
Bedroom2            0
Bathroom            0
Car                 0
Landsize            0
BuildingArea        0
Price            5020
dtype: int64

In [50]:
dataset['Type'] = dataset['Type'].fillna(method="ffill")
dataset['Method'] = dataset['Method'].fillna(method="ffill")
dataset['SellerG'] = dataset['SellerG'].fillna(method="bfill")
dataset.isna().sum()

Suburb              0
Rooms               0
Type                0
Method              0
SellerG             1
Regionname          2
Propertycount       0
Distance            0
CouncilArea         2
Bedroom2            0
Bathroom            0
Car                 0
Landsize            0
BuildingArea        0
Price            5020
dtype: int64

In [51]:
dataset['SellerG'] = dataset['SellerG'].fillna(method="ffill")
dataset.isna().sum()

Suburb              0
Rooms               0
Type                0
Method              0
SellerG             0
Regionname          2
Propertycount       0
Distance            0
CouncilArea         2
Bedroom2            0
Bathroom            0
Car                 0
Landsize            0
BuildingArea        0
Price            5020
dtype: int64

In [52]:
# dropping Regionname and Council Area NAn Values
dataset.dropna(inplace= True)
dataset.isna().sum()

Suburb           0
Rooms            0
Type             0
Method           0
SellerG          0
Regionname       0
Propertycount    0
Distance         0
CouncilArea      0
Bedroom2         0
Bathroom         0
Car              0
Landsize         0
BuildingArea     0
Price            0
dtype: int64

In [53]:
# categorical features converted to into dummies
# one hot encoding
dataset = pd.get_dummies(dataset, drop_first= True)
dataset.head()


Unnamed: 0,Rooms,Propertycount,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Price,Suburb_Aberfeldie,...,CouncilArea_Moorabool Shire Council,CouncilArea_Moreland City Council,CouncilArea_Nillumbik Shire Council,CouncilArea_Port Phillip City Council,CouncilArea_Stonnington City Council,CouncilArea_Whitehorse City Council,CouncilArea_Whittlesea City Council,CouncilArea_Wyndham City Council,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council
1,2.0,4019.0,2.5,2.0,1.0,1.0,202.0,154.556647,1480000.0,0,...,0,0,0,0,0,0,0,0,1,0
2,2.0,4019.0,2.5,2.0,1.0,0.0,156.0,79.0,1035000.0,0,...,0,0,0,0,0,0,0,0,1,0
4,3.0,4019.0,2.5,3.0,2.0,0.0,134.0,150.0,1465000.0,0,...,0,0,0,0,0,0,0,0,1,0
5,3.0,4019.0,2.5,3.0,2.0,1.0,94.0,154.556647,850000.0,0,...,0,0,0,0,0,0,0,0,1,0
6,4.0,4019.0,2.5,3.0,1.0,2.0,120.0,142.0,1600000.0,0,...,0,0,0,0,0,0,0,0,1,0


In [54]:
# create x and y
x = dataset.drop('Price', axis= 1)
y = dataset['Price']

In [68]:
from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size= 0.3, random_state= 2)

In [56]:
# fit a linear regression model
from sklearn.linear_model import LinearRegression
reg = LinearRegression().fit(train_x, train_y)

In [57]:
reg.score(test_x, test_y)

0.5487318622128186

In [58]:
reg.score(train_x, train_y)

0.6958017649542837

In [61]:
# the model performs well on the train et and poorly on test set
# overfitting
# handle this using L1 regulariztion(lasso regression)
from sklearn import linear_model

lasso_reg = linear_model.Lasso(alpha= 50, max_iter= 10, tol= 0.1)
lasso_reg.fit(train_x, train_y)

Lasso(alpha=50, max_iter=10, tol=0.1)

In [62]:
lasso_reg.score(test_x, test_y)

0.5520836405514771

In [63]:
lasso_reg.score(train_x, train_y)

0.6919558137047095

In [71]:
# L2 regularization
# ridge regression
from sklearn.linear_model import Ridge

ridge_reg = linear_model.Ridge(alpha= 50, max_iter= 10, tol= 0.1)
ridge_reg.fit(train_x, train_y)

Ridge(alpha=50, max_iter=10, tol=0.1)

In [73]:
ridge_reg.score(test_x, test_y)

0.5365031794003001

In [74]:
ridge_reg.score(train_x, train_y)

0.6714999425707806