In [26]:
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd 
import seaborn as sns

In [27]:
# Suppress warning on the notebook
import warnings 
warnings.filterwarnings("ignore")

In [28]:
data= pd.read_csv("Melbourne_housing_FULL.csv")
data.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


In [29]:
data.nunique()

Suburb             351
Address          34009
Rooms               12
Type                 3
Price             2871
Method               9
SellerG            388
Date                78
Distance           215
Postcode           211
Bedroom2            15
Bathroom            11
Car                 15
Landsize          1684
BuildingArea       740
YearBuilt          160
CouncilArea         33
Lattitude        13402
Longtitude       14524
Regionname           8
Propertycount      342
dtype: int64

In [30]:
col_to_use= ["Suburb","Rooms","Type", "Method","SellerG","Regionname","Propertycount",
             "Distance", "CouncilArea","Bedroom2","Bathroom", "Car",
             "Landsize", "BuildingArea","Price"]
data=data[col_to_use]
data.head()


Unnamed: 0,Suburb,Rooms,Type,Method,SellerG,Regionname,Propertycount,Distance,CouncilArea,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Price
0,Abbotsford,2,h,SS,Jellis,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,1.0,126.0,,
1,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,1.0,202.0,,1480000.0
2,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,0.0,156.0,79.0,1035000.0
3,Abbotsford,3,u,VB,Rounds,Northern Metropolitan,4019.0,2.5,Yarra City Council,3.0,2.0,1.0,0.0,,
4,Abbotsford,3,h,SP,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,3.0,2.0,0.0,134.0,150.0,1465000.0


In [31]:
data.shape

(34857, 15)

In [32]:
data.isna().sum()

Suburb               0
Rooms                0
Type                 0
Method               0
SellerG              0
Regionname           3
Propertycount        3
Distance             1
CouncilArea          3
Bedroom2          8217
Bathroom          8226
Car               8728
Landsize         11810
BuildingArea     21115
Price             7610
dtype: int64

In [33]:
# Fill some with zero
cols_fill_zero= ["Propertycount","Distance","Bedroom2","Bathroom","Car"]
data[cols_fill_zero]=data[cols_fill_zero].fillna(0)
data.isna().sum()

Suburb               0
Rooms                0
Type                 0
Method               0
SellerG              0
Regionname           3
Propertycount        0
Distance             0
CouncilArea          3
Bedroom2             0
Bathroom             0
Car                  0
Landsize         11810
BuildingArea     21115
Price             7610
dtype: int64

In [34]:
# fill some with the mean value
data["Landsize"]=data["Landsize"].fillna(data.Landsize.mean())
data["BuildingArea"]=data["BuildingArea"].fillna(data.BuildingArea.mean())

In [35]:
data.isna().sum()

Suburb              0
Rooms               0
Type                0
Method              0
SellerG             0
Regionname          3
Propertycount       0
Distance            0
CouncilArea         3
Bedroom2            0
Bathroom            0
Car                 0
Landsize            0
BuildingArea        0
Price            7610
dtype: int64

In [36]:
## Drop some
data.dropna(inplace=True)
data.isna().sum()

Suburb           0
Rooms            0
Type             0
Method           0
SellerG          0
Regionname       0
Propertycount    0
Distance         0
CouncilArea      0
Bedroom2         0
Bathroom         0
Car              0
Landsize         0
BuildingArea     0
Price            0
dtype: int64

In [37]:
# One oht encoding
data=pd.get_dummies(data, drop_first=True)
data.head()

Unnamed: 0,Rooms,Propertycount,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Price,Suburb_Aberfeldie,...,CouncilArea_Moorabool Shire Council,CouncilArea_Moreland City Council,CouncilArea_Nillumbik Shire Council,CouncilArea_Port Phillip City Council,CouncilArea_Stonnington City Council,CouncilArea_Whitehorse City Council,CouncilArea_Whittlesea City Council,CouncilArea_Wyndham City Council,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council
1,2,4019.0,2.5,2.0,1.0,1.0,202.0,160.2564,1480000.0,False,...,False,False,False,False,False,False,False,False,True,False
2,2,4019.0,2.5,2.0,1.0,0.0,156.0,79.0,1035000.0,False,...,False,False,False,False,False,False,False,False,True,False
4,3,4019.0,2.5,3.0,2.0,0.0,134.0,150.0,1465000.0,False,...,False,False,False,False,False,False,False,False,True,False
5,3,4019.0,2.5,3.0,2.0,1.0,94.0,160.2564,850000.0,False,...,False,False,False,False,False,False,False,False,True,False
6,4,4019.0,2.5,3.0,1.0,2.0,120.0,142.0,1600000.0,False,...,False,False,False,False,False,False,False,False,True,False


In [38]:
x= data.drop("Price", axis=1)
y=data["Price"]

In [39]:
x.shape, y.shape

((27244, 744), (27244,))

In [40]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(x, y, test_size=0.3, random_state=2)

In [42]:
from sklearn.linear_model import LinearRegression
reg= LinearRegression().fit(X_train, y_train)

In [43]:
reg.score(X_test, y_test)

0.13853683161570418

In [47]:
reg.score(X_train,y_train) # Overfitting , do not perform well previously on the test set

0.6827792395792722

In [48]:
reg.coef_

array([ 2.64001655e+05,  4.92118905e+00, -4.64732744e+04, -8.22349031e+04,
        1.17151153e+05,  4.29703140e+04,  2.35173742e+00,  4.70024530e+02,
        2.61342069e+05, -4.61008827e+04, -1.32312259e+05,  2.15453995e+05,
        1.20133839e+05,  2.72683980e+05,  1.74702166e+05, -8.00201053e+04,
       -1.50967673e+05, -4.94461444e+04,  1.17711126e+05, -1.28880253e+05,
       -3.49349656e+04, -8.54261329e+03,  6.19157896e+04, -2.57258028e+05,
       -1.10868116e+05, -2.42391674e+05,  1.79856957e+05, -1.22395100e+05,
        2.04625284e+05,  3.78258911e+04,  1.46681364e+05,  2.32198627e+03,
        7.35395732e+04, -5.29198411e+04,  1.90482393e+05, -3.20451670e+05,
        7.49515014e+04, -2.83019804e+04,  2.69103300e+04,  2.78824270e+05,
        1.51386730e+05, -1.06324212e+05, -8.45087700e+04,  2.96568248e+05,
        1.50644588e+05,  9.96515155e-08,  1.75999576e+05, -1.20057166e+05,
       -9.64544651e+03,  3.68377378e+05, -1.35478625e+05, -6.38092125e+04,
        3.53493139e+04,  

In [50]:
from sklearn import linear_model
lasso_reg= linear_model.Lasso(alpha=50, max_iter=100, tol=0.1)

lasso_reg.fit(X_train, y_train)

0,1,2
,alpha,50
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,100
,tol,0.1
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


In [51]:
lasso_reg.score(X_train, y_train)

0.6766985624766823

In [52]:
lasso_reg.score(X_test, y_test) # improvement

0.6636111369404488

In [53]:
from sklearn.linear_model import Ridge

Ridge_reg= Ridge(alpha=50, max_iter=100, tol=0.1)

Ridge_reg.fit(X_train, y_train)

0,1,2
,alpha,50
,fit_intercept,True
,copy_X,True
,max_iter,100
,tol,0.1
,solver,'auto'
,positive,False
,random_state,


In [54]:
Ridge_reg.score(X_train, y_train)

0.6622376739684328

In [56]:
Ridge_reg.score(X_test, y_test) # improvement

0.6670848945194958