    Note: Can also use decision trees and svm

In [69]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import scipy

%matplotlib inline
# Suppress annoying harmless error.
warnings.filterwarnings(
    action="ignore",
    module="scipy",
    message="^internal gelsd"
)

In [85]:
df = pd.read_csv('Melbourne_housing_FULL.csv')
df.drop(['Lattitude', 'Longtitude'], axis = 1, inplace = True)
df.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,2.0,1.0,1.0,126.0,,,Yarra City Council,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,2.0,1.0,1.0,202.0,,,Yarra City Council,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,3.0,2.0,1.0,0.0,,,Yarra City Council,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,Northern Metropolitan,4019.0


In [86]:
print(df.isnull().sum())

Suburb               0
Address              0
Rooms                0
Type                 0
Price             6734
Method               0
SellerG              0
Date                 0
Distance             1
Postcode             1
Bedroom2          7063
Bathroom          7069
Car               7434
Landsize         10526
BuildingArea     18534
YearBuilt        16822
CouncilArea          3
Regionname           3
Propertycount        3
dtype: int64


    Note: Get rid of feature with lots missing values

In [87]:
# Drop the features with too many missing values
df.drop(['Landsize', 'BuildingArea', 'YearBuilt'], axis = 1, inplace = True)

# For features will less than 10,000 null values, drop the null values
df = df.dropna()

> Goal: Want to predict house value
    
    Regression Problem  

    Note: better to keep all of the features and then do feature selection

In [88]:
from sklearn import preprocessing

# Create a temp df with only float objects
obj_col = df.select_dtypes(include=['object'])

# Make a temp dataframe to store the imputed data
temp = df.loc[:, ~df.columns.isin(obj_col)]

# Normalize the data
names = temp.columns
temp = pd.DataFrame(preprocessing.scale(temp), columns = names)

# Create a new dataframe with the normalized data
new_df = pd.concat([temp, df[list(obj_col)]], axis = 1)
new_df = new_df.dropna()
new_df.head()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Propertycount,Suburb,Address,Type,Method,SellerG,Date,CouncilArea,Regionname
1,-1.076199,-0.070754,-1.28914,-0.413821,-1.045918,-0.826,-1.707401,-0.788466,Abbotsford,85 Turner St,h,S,Biggin,3/12/2016,Yarra City Council,Northern Metropolitan
2,-0.030859,0.588249,-1.28914,-0.413821,-0.012639,0.59674,-1.707401,-0.788466,Abbotsford,25 Bloomburg St,h,S,Biggin,4/02/2016,Yarra City Council,Northern Metropolitan
4,1.014481,0.795145,-1.28914,-0.413821,-0.012639,-0.826,0.31533,-0.788466,Abbotsford,5 Charles St,h,SP,Biggin,4/03/2017,Yarra City Council,Northern Metropolitan
5,-1.076199,-0.214815,-1.28914,-0.413821,-1.045918,-0.826,-1.707401,-0.788466,Abbotsford,40 Federation La,h,PI,Biggin,4/03/2017,Yarra City Council,Northern Metropolitan
6,-0.030859,1.218132,-1.28914,-0.413821,1.02064,0.59674,-1.707401,-0.788466,Abbotsford,55a Park St,h,VB,Nelson,4/06/2016,Yarra City Council,Northern Metropolitan


In [89]:
# Need to change the object types to int/float types for regression
## Want to remove columns with over 50 unique values
for i in obj_col:
    col = obj_col[i]
    print(i)
    print(col.nunique())

Suburb
332
Address
18093
Type
3
Method
5
SellerG
306
Date
69
CouncilArea
33
Regionname
8


In [90]:
# Drop these 3 columns with too many unique values
# new_df.drop(['Suburb', 'Address', 'SellerG', 'Date'], axis = 1, inplace = True)
new_df.drop(['Address'], axis = 1, inplace = True)

# Lasso Regression

In [91]:
# Set targeted variable
# Set features
Y = new_df['Price']
X = new_df.loc[:, ~new_df.columns.isin(['Price'])]
X = pd.get_dummies(X)
X = X.dropna(axis = 1)

# Create a training set and test set sizes
# want training set to be 10 times larger than test set
trainsize = int(new_df.shape[0] * 0.8)
testsize = int(new_df.shape[0] * 0.08)


# Randomly choose 80% of data for training set
X_train, Y_train = X.sample(trainsize), Y.sample(trainsize)

# Randomly choose 8% of data for test set
X_test, Y_test = X.sample(testsize), Y.sample(testsize)

In [93]:
X.shape

(11305, 652)

    Note: Take absolute value of coefficients

In [100]:
from sklearn import linear_model
# Small number of parameters.
lass = linear_model.Lasso(alpha = 0.00001)
lassfit = lass.fit(X_train, Y_train)
print('R² for the model with few features:')
print(lass.score(X_train, Y_train))

coef = lassfit.coef_ 
intercept = lassfit.intercept_
print('\n Coefficient estimates for the model with few features:')
print(coef)
print('\n intercept estimates for the model with few features:')
print(intercept)

R² for the model with few features:
0.0659414573524

 Coefficient estimates for the model with few features:
[ -4.10156963e-02   3.44337602e-02  -1.18176024e-02   3.79479961e-02
  -2.01139526e-02   1.67907369e-02  -1.28992014e-02   8.89017327e-02
   1.62094848e-01  -2.76273984e-01  -3.96936979e-01   1.20787756e-01
  -7.76077131e-02   4.03615539e-03   1.27216935e-01  -1.63291934e-01
   4.82699955e-02  -3.98664274e-02   2.02370938e-01  -1.16916452e-01
   9.06368464e-02   0.00000000e+00  -9.10357979e-01  -1.72978600e-01
   0.00000000e+00  -1.30861744e-01  -5.62134541e-04   3.78715418e-02
   2.20708872e-01   2.04320494e-01  -9.71839689e-01  -2.66722599e-01
  -3.16990366e-01   9.70551158e-02   8.67897948e-02   5.02241350e-01
   2.00238106e-01  -0.00000000e+00   4.26603386e-01   4.57197709e-01
   4.09074941e-01  -5.36909281e-02  -3.39138865e-01   1.77597014e-01
   2.79875484e-01   5.18573677e-02   1.34697941e-01   3.81448969e-02
   7.86691109e-01   1.88270442e-01   1.70184628e-02  -1.3264786

> Previous Result before removing features with too many missing values

    R² for the model with few features:
    0.00181066079691

     Coefficient estimates for the model with few features:
    [ -3.10698036e-03  -3.33257703e-03   3.19408044e-02  -7.72838474e-03
       1.19624481e-02  -2.76874246e-03  -2.92478643e-03   2.53077734e-02
      -3.39585477e-04  -1.02950841e-02  -1.41968399e+11  -1.41968399e+11
      -1.41968399e+11   3.51989559e+09   3.51989559e+09   3.51989559e+09
       3.51989559e+09   3.51989559e+09   3.51989559e+09   3.51989559e+09
       3.51989559e+09   3.51989559e+09   3.08726369e+09   3.08726369e+09
       3.08726369e+09   3.08726369e+09   3.08726369e+09   3.08726369e+09
       3.08726369e+09   3.08726369e+09   3.08726369e+09   3.08726369e+09
       3.08726369e+09   3.08726369e+09   3.08726369e+09   3.08726369e+09
       3.08726369e+09   3.08726369e+09   3.08726369e+09   3.08726369e+09
       3.08726369e+09   3.08726369e+09   3.08726369e+09   3.08726369e+09
       3.08726369e+09   3.08726369e+09   3.08726369e+09   3.08726369e+09
       3.08726369e+09   3.08726369e+09   3.08726369e+09   3.08726369e+09
       3.08726369e+09   3.08726369e+09   3.08726369e+09   5.42050994e+09
       5.42050994e+09   5.42050994e+09   5.42050994e+09   5.42050994e+09
       5.42050994e+09   5.42050994e+09   5.42050994e+09]

     intercept estimates for the model with few features:
    129940730178.0


In [101]:
# Run the model with test set
print('R² for the model with test set:')
print(lass.score(X_test, Y_test))

R² for the model with test set:
-0.0567118251643


In [114]:
# List the 25 most important features
dat = {'Columns': list(X),
       'Coef': abs(coef)}
temp_df = pd.DataFrame(dat)
temp_df = temp_df.nlargest(25, 'Coef')

# Random Forest

In [118]:
from sklearn import ensemble
from sklearn.model_selection import cross_val_score

rfr = ensemble.RandomForestRegressor()

# Set targeted variable
# Set features
Y = new_df['Price']
X = new_df.loc[:, ~new_df.columns.isin(['Price'])]
X = pd.get_dummies(X)
X = X.dropna(axis = 1)

cross_val_score(rfr, X, Y, cv = 10)

array([ 0.15714041,  0.63335974,  0.41595801,  0.54266833,  0.58117662,
        0.67404827,  0.54351292,  0.6168868 ,  0.66219738,  0.67014519])

In [None]:
# Create a training set and test set sizes
# want training set to be 10 times larger than test set
trainsize = int(new_df.shape[0] * 0.8)
testsize = int(new_df.shape[0] * 0.08)


# Randomly choose 80% of data for training set
X_train, Y_train = X.sample(trainsize), Y.sample(trainsize)

# Randomly choose 8% of data for test set
X_test, Y_test = X.sample(testsize), Y.sample(testsize)