In [38]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [39]:
dataset = pd.read_csv('melbourne.csv')
dataset.head(15)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0
5,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra City Council,-37.7969,144.9969,Northern Metropolitan,4019.0
6,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra City Council,-37.8072,144.9941,Northern Metropolitan,4019.0
7,Abbotsford,16 Maugie St,4,h,,SN,Nelson,6/08/2016,2.5,3067.0,...,2.0,2.0,400.0,220.0,2006.0,Yarra City Council,-37.7965,144.9965,Northern Metropolitan,4019.0
8,Abbotsford,53 Turner St,2,h,,S,Biggin,6/08/2016,2.5,3067.0,...,1.0,2.0,201.0,,1900.0,Yarra City Council,-37.7995,144.9974,Northern Metropolitan,4019.0
9,Abbotsford,99 Turner St,2,h,,S,Collins,6/08/2016,2.5,3067.0,...,2.0,1.0,202.0,,1900.0,Yarra City Council,-37.7996,144.9989,Northern Metropolitan,4019.0


In [40]:
cols_to_use = ['Suburb', 'Rooms', 'Type', 'Method', 'SellerG', 'Regionname', 'Propertycount', 
               'Distance', 'CouncilArea', 'Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea', 'Price']
dataset = dataset[cols_to_use]
dataset.head(15)

Unnamed: 0,Suburb,Rooms,Type,Method,SellerG,Regionname,Propertycount,Distance,CouncilArea,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Price
0,Abbotsford,2,h,SS,Jellis,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,1.0,126.0,,
1,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,1.0,202.0,,1480000.0
2,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,0.0,156.0,79.0,1035000.0
3,Abbotsford,3,u,VB,Rounds,Northern Metropolitan,4019.0,2.5,Yarra City Council,3.0,2.0,1.0,0.0,,
4,Abbotsford,3,h,SP,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,3.0,2.0,0.0,134.0,150.0,1465000.0
5,Abbotsford,3,h,PI,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,3.0,2.0,1.0,94.0,,850000.0
6,Abbotsford,4,h,VB,Nelson,Northern Metropolitan,4019.0,2.5,Yarra City Council,3.0,1.0,2.0,120.0,142.0,1600000.0
7,Abbotsford,4,h,SN,Nelson,Northern Metropolitan,4019.0,2.5,Yarra City Council,3.0,2.0,2.0,400.0,220.0,
8,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,4.0,1.0,2.0,201.0,,
9,Abbotsford,2,h,S,Collins,Northern Metropolitan,4019.0,2.5,Yarra City Council,3.0,2.0,1.0,202.0,,


In [41]:
# Filling NA vals
cols_to_fill_zero = ['Propertycount', 'Distance', 'Bedroom2', 'Bathroom', 'Car']
dataset[cols_to_fill_zero] = dataset[cols_to_fill_zero].fillna(0)

# other continuous features can be imputed with mean for faster results since our focus is on Reducing overfitting
# using Lasso and Ridge Regression
dataset['Landsize'] = dataset['Landsize'].fillna(dataset.Landsize.mean())
dataset['BuildingArea'] = dataset['BuildingArea'].fillna(dataset.BuildingArea.mean())
#Drop NA values of Price, since it's our predictive variable and we can't impute it
dataset.dropna(inplace=True)
dataset.shape

(27244, 15)

In [42]:
dataset

Unnamed: 0,Suburb,Rooms,Type,Method,SellerG,Regionname,Propertycount,Distance,CouncilArea,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Price
1,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,1.0,202.000000,160.2564,1480000.0
2,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,0.0,156.000000,79.0000,1035000.0
4,Abbotsford,3,h,SP,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,3.0,2.0,0.0,134.000000,150.0000,1465000.0
5,Abbotsford,3,h,PI,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,3.0,2.0,1.0,94.000000,160.2564,850000.0
6,Abbotsford,4,h,VB,Nelson,Northern Metropolitan,4019.0,2.5,Yarra City Council,3.0,1.0,2.0,120.000000,142.0000,1600000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34852,Yarraville,4,h,PI,Jas,Western Metropolitan,6543.0,6.3,Maribyrnong City Council,4.0,1.0,3.0,593.000000,160.2564,1480000.0
34853,Yarraville,2,h,SP,Sweeney,Western Metropolitan,6543.0,6.3,Maribyrnong City Council,2.0,2.0,1.0,98.000000,104.0000,888000.0
34854,Yarraville,2,t,S,Jas,Western Metropolitan,6543.0,6.3,Maribyrnong City Council,2.0,1.0,2.0,220.000000,120.0000,705000.0
34855,Yarraville,3,h,SP,hockingstuart,Western Metropolitan,6543.0,6.3,Maribyrnong City Council,0.0,0.0,0.0,593.598993,160.2564,1140000.0


In [43]:
'''from sklearn.preprocessing import LabelEncoder
le_suburb = LabelEncoder()
le_type = LabelEncoder()
le_method = LabelEncoder()
le_sellerg = LabelEncoder()
le_region = LabelEncoder()
le_council = LabelEncoder()
dataset['Suburb_new'] = le_suburb.fit_transform(dataset['Suburb'])
dataset['Type_new'] = le_type.fit_transform(dataset['Type'])
dataset['Method_new'] = le_method.fit_transform(dataset['Method'])
dataset['SellerG_new'] = le_sellerg.fit_transform(dataset['SellerG'])
dataset['Regionname_new'] = le_region.fit_transform(dataset['Regionname'])
dataset['CouncilArea_new'] = le_council.fit_transform(dataset['CouncilArea'])
dataset
'''

"from sklearn.preprocessing import LabelEncoder\nle_suburb = LabelEncoder()\nle_type = LabelEncoder()\nle_method = LabelEncoder()\nle_sellerg = LabelEncoder()\nle_region = LabelEncoder()\nle_council = LabelEncoder()\ndataset['Suburb_new'] = le_suburb.fit_transform(dataset['Suburb'])\ndataset['Type_new'] = le_type.fit_transform(dataset['Type'])\ndataset['Method_new'] = le_method.fit_transform(dataset['Method'])\ndataset['SellerG_new'] = le_sellerg.fit_transform(dataset['SellerG'])\ndataset['Regionname_new'] = le_region.fit_transform(dataset['Regionname'])\ndataset['CouncilArea_new'] = le_council.fit_transform(dataset['CouncilArea'])\ndataset\n"

In [44]:
dataset = pd.get_dummies(dataset, drop_first=True)
dataset.head()

Unnamed: 0,Rooms,Propertycount,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Price,Suburb_Aberfeldie,...,CouncilArea_Moorabool Shire Council,CouncilArea_Moreland City Council,CouncilArea_Nillumbik Shire Council,CouncilArea_Port Phillip City Council,CouncilArea_Stonnington City Council,CouncilArea_Whitehorse City Council,CouncilArea_Whittlesea City Council,CouncilArea_Wyndham City Council,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council
1,2,4019.0,2.5,2.0,1.0,1.0,202.0,160.2564,1480000.0,0,...,0,0,0,0,0,0,0,0,1,0
2,2,4019.0,2.5,2.0,1.0,0.0,156.0,79.0,1035000.0,0,...,0,0,0,0,0,0,0,0,1,0
4,3,4019.0,2.5,3.0,2.0,0.0,134.0,150.0,1465000.0,0,...,0,0,0,0,0,0,0,0,1,0
5,3,4019.0,2.5,3.0,2.0,1.0,94.0,160.2564,850000.0,0,...,0,0,0,0,0,0,0,0,1,0
6,4,4019.0,2.5,3.0,1.0,2.0,120.0,142.0,1600000.0,0,...,0,0,0,0,0,0,0,0,1,0


In [45]:
X_1 = dataset.drop('Price', axis='columns')
X_1

Unnamed: 0,Rooms,Propertycount,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Suburb_Aberfeldie,Suburb_Airport West,...,CouncilArea_Moorabool Shire Council,CouncilArea_Moreland City Council,CouncilArea_Nillumbik Shire Council,CouncilArea_Port Phillip City Council,CouncilArea_Stonnington City Council,CouncilArea_Whitehorse City Council,CouncilArea_Whittlesea City Council,CouncilArea_Wyndham City Council,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council
1,2,4019.0,2.5,2.0,1.0,1.0,202.000000,160.2564,0,0,...,0,0,0,0,0,0,0,0,1,0
2,2,4019.0,2.5,2.0,1.0,0.0,156.000000,79.0000,0,0,...,0,0,0,0,0,0,0,0,1,0
4,3,4019.0,2.5,3.0,2.0,0.0,134.000000,150.0000,0,0,...,0,0,0,0,0,0,0,0,1,0
5,3,4019.0,2.5,3.0,2.0,1.0,94.000000,160.2564,0,0,...,0,0,0,0,0,0,0,0,1,0
6,4,4019.0,2.5,3.0,1.0,2.0,120.000000,142.0000,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34852,4,6543.0,6.3,4.0,1.0,3.0,593.000000,160.2564,0,0,...,0,0,0,0,0,0,0,0,0,0
34853,2,6543.0,6.3,2.0,2.0,1.0,98.000000,104.0000,0,0,...,0,0,0,0,0,0,0,0,0,0
34854,2,6543.0,6.3,2.0,1.0,2.0,220.000000,120.0000,0,0,...,0,0,0,0,0,0,0,0,0,0
34855,3,6543.0,6.3,0.0,0.0,0.0,593.598993,160.2564,0,0,...,0,0,0,0,0,0,0,0,0,0


In [46]:
y_1 = dataset['Price']
y_1

1        1480000.0
2        1035000.0
4        1465000.0
5         850000.0
6        1600000.0
           ...    
34852    1480000.0
34853     888000.0
34854     705000.0
34855    1140000.0
34856    1020000.0
Name: Price, Length: 27244, dtype: float64

In [74]:
from sklearn.model_selection import train_test_split
# Get 60% of the dataset as the training set. Put the remaining 40% in temporary variables: x_ and y_.
x_train, x_, y_train, y_ = train_test_split(X_1, y_1, test_size=0.30, random_state=1)

# Split the 40% subset above into two: one half for cross validation and the other for the test set
x_cv, x_test, y_cv, y_test = train_test_split(x_, y_, test_size=0.50, random_state=1)

print(f"the shape of the training set (input) is: {x_train.shape}")
print(f"the shape of the training set (target) is: {y_train.shape}\n")
print(f"the shape of the cross validation set (input) is: {x_cv.shape}")
print(f"the shape of the cross validation set (target) is: {y_cv.shape}\n")
print(f"the shape of the test set (input) is: {x_test.shape}")
print(f"the shape of the test set (target) is: {y_test.shape}")

the shape of the training set (input) is: (19070, 744)
the shape of the training set (target) is: (19070,)

the shape of the cross validation set (input) is: (4087, 744)
the shape of the cross validation set (target) is: (4087,)

the shape of the test set (input) is: (4087, 744)
the shape of the test set (target) is: (4087,)


<b> Feature Scaling </b>

In [75]:
from sklearn.preprocessing import StandardScaler
# Initialize the class
scaler_linear = StandardScaler()

# Compute the mean and standard deviation of the training set then transform it
scaled_fields = scaler_linear.fit_transform(x_train[['Propertycount','Landsize','BuildingArea']])
out_df = pd.DataFrame(scaled_fields,columns =['Propertycount','Landsize','BuildingArea'])
out_df
#print(f"Computed mean of the training set: {scaler_linear.mean_.squeeze()}")
#print(f"Computed standard deviation of the training set: {scaler_linear.scale_.squeeze()}")

Unnamed: 0,Propertycount,Landsize,BuildingArea
0,-0.463343,-0.093213,-0.103841
1,0.052629,-0.130868,-0.160775
2,-0.480660,0.040075,-0.013946
3,3.122709,-0.132960,-0.238684
4,-1.186015,-0.016109,-0.166768
...,...,...,...
19065,-1.446221,-0.003258,0.001805
19066,-0.914930,-0.000689,0.001805
19067,-1.222426,-0.000689,0.001805
19068,-0.297717,-0.103075,0.060966


In [76]:
X_train_scaled = x_train.drop(['Propertycount','Landsize','BuildingArea'], axis='columns')
s = pd.Series(range(19070))
X_scale = X_train_scaled.set_index([s])
X_scale

Unnamed: 0,Rooms,Distance,Bedroom2,Bathroom,Car,Suburb_Aberfeldie,Suburb_Airport West,Suburb_Albanvale,Suburb_Albert Park,Suburb_Albion,...,CouncilArea_Moorabool Shire Council,CouncilArea_Moreland City Council,CouncilArea_Nillumbik Shire Council,CouncilArea_Port Phillip City Council,CouncilArea_Stonnington City Council,CouncilArea_Whitehorse City Council,CouncilArea_Whittlesea City Council,CouncilArea_Wyndham City Council,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council
0,3,8.4,3.0,2.0,2.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,11.4,2.0,1.0,1.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,13.8,3.0,1.0,2.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2,11.2,2.0,1.0,1.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3,21.5,3.0,1.0,2.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19065,3,13.6,3.0,2.0,2.0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
19066,2,10.4,0.0,0.0,0.0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19067,3,11.8,0.0,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19068,3,3.6,3.0,2.0,0.0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [77]:
X_train_final = pd.concat([X_scale,out_df], axis='columns')
X_train_final

Unnamed: 0,Rooms,Distance,Bedroom2,Bathroom,Car,Suburb_Aberfeldie,Suburb_Airport West,Suburb_Albanvale,Suburb_Albert Park,Suburb_Albion,...,CouncilArea_Port Phillip City Council,CouncilArea_Stonnington City Council,CouncilArea_Whitehorse City Council,CouncilArea_Whittlesea City Council,CouncilArea_Wyndham City Council,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council,Propertycount,Landsize,BuildingArea
0,3,8.4,3.0,2.0,2.0,0,0,0,0,0,...,0,0,0,0,0,0,0,-0.463343,-0.093213,-0.103841
1,2,11.4,2.0,1.0,1.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.052629,-0.130868,-0.160775
2,3,13.8,3.0,1.0,2.0,0,0,0,0,0,...,0,0,0,0,0,0,0,-0.480660,0.040075,-0.013946
3,2,11.2,2.0,1.0,1.0,0,0,0,0,0,...,0,0,0,0,0,0,0,3.122709,-0.132960,-0.238684
4,3,21.5,3.0,1.0,2.0,0,0,0,0,0,...,0,0,0,0,0,0,0,-1.186015,-0.016109,-0.166768
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19065,3,13.6,3.0,2.0,2.0,0,0,0,0,0,...,0,0,0,0,0,0,0,-1.446221,-0.003258,0.001805
19066,2,10.4,0.0,0.0,0.0,0,1,0,0,0,...,0,0,0,0,0,0,0,-0.914930,-0.000689,0.001805
19067,3,11.8,0.0,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,-1.222426,-0.000689,0.001805
19068,3,3.6,3.0,2.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,-0.297717,-0.103075,0.060966


In [78]:
y_train

7073     1046000.0
2985      855000.0
10160    1200000.0
8624      340000.0
32996     811250.0
           ...    
14081     741000.0
22167     575000.0
6803     1355000.0
15633    1600000.0
277      1500000.0
Name: Price, Length: 19070, dtype: float64

In [79]:
y_train = y_train.set_axis(s)
y_train

0        1046000.0
1         855000.0
2        1200000.0
3         340000.0
4         811250.0
           ...    
19065     741000.0
19066     575000.0
19067    1355000.0
19068    1600000.0
19069    1500000.0
Name: Price, Length: 19070, dtype: float64

<b> Training the model </b>

In [80]:
from sklearn.linear_model import LinearRegression
# Initialize the class
linear_model = LinearRegression()

# Train the model
linear_model.fit(X_train_final, y_train )

<b> Evaluating on Cross Validation Set </b>

In [85]:
# Scale the cross validation set using the mean and standard deviation of the training set
#X_cv_scaled = scaler_linear.transform(x_cv)

scaled_cv = scaler_linear.fit_transform(x_cv[['Propertycount','Landsize','BuildingArea']])
out_cv = pd.DataFrame(scaled_cv,columns =['Propertycount','Landsize','BuildingArea'])
X_cv_scaled = x_cv.drop(['Propertycount','Landsize','BuildingArea'], axis='columns')
s = pd.Series(range(4087))
X_ = X_cv_scaled.set_index([s])
X_cv_final = pd.concat([X_,out_cv], axis='columns')
X_cv_final

Unnamed: 0,Rooms,Distance,Bedroom2,Bathroom,Car,Suburb_Aberfeldie,Suburb_Airport West,Suburb_Albanvale,Suburb_Albert Park,Suburb_Albion,...,CouncilArea_Port Phillip City Council,CouncilArea_Stonnington City Council,CouncilArea_Whitehorse City Council,CouncilArea_Whittlesea City Council,CouncilArea_Wyndham City Council,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council,Propertycount,Landsize,BuildingArea
0,3,14.0,3.0,1.0,1.0,0,0,0,0,0,...,0,0,0,0,0,0,0,-0.968908,-0.192128,0.071710
1,1,8.5,1.0,1.0,1.0,0,0,0,0,0,...,0,0,0,0,0,0,0,-0.024403,0.008919,-1.797928
2,3,12.6,3.0,1.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,-0.851205,-0.037251,-1.192072
3,4,11.2,3.0,2.0,1.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.661421,-0.009699,0.732817
4,3,8.8,3.0,1.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,-1.030308,0.004495,-1.217569
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4082,3,14.0,3.0,1.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,-1.494913,0.008669,0.071710
4083,3,5.0,0.0,0.0,0.0,0,0,0,0,0,...,1,0,0,0,0,0,0,1.251265,0.008919,0.071710
4084,4,7.8,3.0,1.0,1.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.799960,-0.066890,0.066831
4085,3,10.5,0.0,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,-0.748797,0.008919,0.071710


In [87]:
y_cv = y_cv.set_axis(s)
y_cv

0        665000.0
1        436000.0
2        710000.0
3       1500000.0
4        800000.0
          ...    
4082     605000.0
4083     670000.0
4084     931000.0
4085     600000.0
4086    2100000.0
Name: Price, Length: 4087, dtype: float64

In [88]:
linear_model.score(X_cv_final, y_cv)

-1.2290116621730204e+16