### First Model

#### import requirements

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn.metrics import mean_absolute_error
import joblib

#### Reading the Data

In [7]:
df = pd.read_csv("/Users/nirmal/opt/DataBase/Melbourne_housing_FULL.csv")

In [8]:
df.head

<bound method NDFrame.head of            Suburb             Address  Rooms Type      Price Method  \
0      Abbotsford       68 Studley St      2    h        NaN     SS   
1      Abbotsford        85 Turner St      2    h  1480000.0      S   
2      Abbotsford     25 Bloomburg St      2    h  1035000.0      S   
3      Abbotsford  18/659 Victoria St      3    u        NaN     VB   
4      Abbotsford        5 Charles St      3    h  1465000.0     SP   
...           ...                 ...    ...  ...        ...    ...   
34852  Yarraville         13 Burns St      4    h  1480000.0     PI   
34853  Yarraville       29A Murray St      2    h   888000.0     SP   
34854  Yarraville      147A Severn St      2    t   705000.0      S   
34855  Yarraville    12/37 Stephen St      3    h  1140000.0     SP   
34856  Yarraville    3 Tarrengower St      2    h  1020000.0     PI   

             SellerG        Date  Distance  Postcode  ...  Bathroom  Car  \
0             Jellis   3/09/2016       2.

#### data scrubing

In [9]:
del df['Address']

In [11]:
del df['Method']
del df['SellerG']
del df['Date']
del df['Postcode']
del df['Lattitude']
del df['Longtitude']
del df['Regionname']
del df['Propertycount']

In [12]:
df

Unnamed: 0,Suburb,Rooms,Type,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea
0,Abbotsford,2,h,,2.5,2.0,1.0,1.0,126.0,,,Yarra City Council
1,Abbotsford,2,h,1480000.0,2.5,2.0,1.0,1.0,202.0,,,Yarra City Council
2,Abbotsford,2,h,1035000.0,2.5,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra City Council
3,Abbotsford,3,u,,2.5,3.0,2.0,1.0,0.0,,,Yarra City Council
4,Abbotsford,3,h,1465000.0,2.5,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra City Council
...,...,...,...,...,...,...,...,...,...,...,...,...
34852,Yarraville,4,h,1480000.0,6.3,4.0,1.0,3.0,593.0,,,Maribyrnong City Council
34853,Yarraville,2,h,888000.0,6.3,2.0,2.0,1.0,98.0,104.0,2018.0,Maribyrnong City Council
34854,Yarraville,2,t,705000.0,6.3,2.0,1.0,2.0,220.0,120.0,2000.0,Maribyrnong City Council
34855,Yarraville,3,h,1140000.0,6.3,,,,,,,Maribyrnong City Council


In [13]:
# Remove rows with missing values
df.dropna(axis=0,how='any',thresh=None,subset=None,inplace=True)

In [14]:
#use one-hot encoding
features_df = pd.get_dummies(df,columns=['Suburb','CouncilArea','Type'])

In [15]:
# Remove Price from features_df
del features_df['Price']

In [17]:
# Create X and y arrays
X = features_df.values
y = df['Price'].values

In [19]:
X.shape

(8895, 359)

In [20]:
y.shape

(8895,)

#### splitting the data

In [21]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.3,random_state = 0)

In [22]:
X_train.shape

(6226, 359)

#### Use of GradientBoosting Algorithm

In [23]:
model = ensemble.GradientBoostingRegressor(
        n_estimators = 150,
        learning_rate = 0.1,
        max_depth = 30,
        min_samples_split = 4,
        min_samples_leaf = 6,
        max_features = 0.6,
        loss = 'huber')

In [24]:
model.fit(X_train,y_train)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='huber',
                          max_depth=30, max_features=0.6, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=6, min_samples_split=4,
                          min_weight_fraction_leaf=0.0, n_estimators=150,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [25]:
joblib.dump(model,'house_trained_model.pkl')

['house_trained_model.pkl']

In [26]:
mse = mean_absolute_error(y_train,model.predict(X_train))
print("Training Set Mean Absolute Error:%.2f"%mse)

Training Set Mean Absolute Error:29468.39


In [27]:
mst = mean_absolute_error(y_test,model.predict(X_test))
print("Test Set Mean Absolute Error:%.2f"%mst)

Test Set Mean Absolute Error:163412.87


In [28]:
model_2 = ensemble.GradientBoostingRegressor(
        n_estimators = 250,
        learning_rate = 0.1,
        max_depth = 5,
        min_samples_split = 4,
        max_features = 0.6,
        loss = 'huber')

In [29]:
model_2.fit(X_train,y_train)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='huber',
                          max_depth=5, max_features=0.6, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=4,
                          min_weight_fraction_leaf=0.0, n_estimators=250,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [30]:
joblib.dump(model_2,'housing_price_model2.pkl')

['housing_price_model2.pkl']

In [31]:
mse = mean_absolute_error(y_train,model_2.predict(X_train))
print("Training Set Mean Absolute Error:%.2f"%mse)

Training Set Mean Absolute Error:117885.26


In [33]:
msf = mean_absolute_error(y_test,model_2.predict(X_test))
print("Test Set Mean Absolute Error:%.2f"%msf)

Test Set Mean Absolute Error:161469.99
