In [18]:
# Import libraries
import pandas as pd


---
# Selecting Data for Modeling

In [19]:
# Load the Melbourne housing CSV (path is relative to the working directory)
# and preview the first three rows to check the columns parsed as expected.
melbourne_data = pd.read_csv('melb_data.csv')
melbourne_data.head(3)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0


In [20]:
# (rows, columns) of the frame as loaded, before any rows are dropped.
melbourne_data.shape

(13580, 21)

In [21]:
# Keep only rows with no missing value in ANY column. This also discards rows
# whose gaps are in columns the model never uses (e.g. BuildingArea, YearBuilt),
# so the modelling set is smaller than the raw file.
# The result is rebound instead of using inplace=True, so the originally loaded
# frame object is not mutated and the call stays chainable.
melbourne_data = melbourne_data.dropna(axis=0)

### Selecting the prediction `Target` and the `Features`

In [22]:
# Target: the sale price column, i.e. the value the models are trained to predict.
y = melbourne_data['Price']

In [23]:
# Features: the columns handed to the models as inputs.
# The spellings 'Lattitude' / 'Longtitude' match the CSV headers and must stay as-is.
feature_columns = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']
x = melbourne_data[feature_columns]

## Build Your `Model`

In [24]:
from sklearn.tree import DecisionTreeRegressor

Define your Model

In [25]:
# random_state is fixed so that repeated fits of this tree give the same result.
melbourne_model = DecisionTreeRegressor(random_state=1)
melbourne_model  # bare expression: shows the estimator as the cell output

Fit the Model

In [26]:
# Train on every cleaned row; no data is held out at this stage.
melbourne_model.fit(x,y)

In [27]:
# Show the first five feature rows next to the model's predictions for them.
# These rows were part of the training data, so this is a sanity check rather
# than an estimate of accuracy on unseen houses.
first_five_houses = x.head()
print('Making prediction for the following 5 houses')
print(first_five_houses)
print('The prediction are')
print(melbourne_model.predict(first_five_houses))

Making prediction for the following 5 houses
   Rooms  Bathroom  Landsize  Lattitude  Longtitude
1      2       1.0     156.0   -37.8079    144.9934
2      3       2.0     134.0   -37.8093    144.9944
4      4       1.0     120.0   -37.8072    144.9941
6      3       2.0     245.0   -37.8024    144.9993
7      2       1.0     256.0   -37.8060    144.9954
The prediction are
[1035000. 1465000. 1600000. 1876000. 1636000.]


---
# Model Validation

`Mean Absolute Error (MAE)`

<b>error = actual - predicted</b>

So, if a house costs \$150,000 and you predicted it would cost \$100,000, the <b>error is \$50,000.</b> MAE takes the absolute value of each such error and averages them over all houses.

In [28]:
from sklearn.metrics import mean_absolute_error

In [29]:
# Predict a price for every house the model was trained on.
predicted_house_prices = melbourne_model.predict(x)

# In-sample Mean Absolute Error: the average of |actual - predicted| over the
# training rows. The tree has already seen these rows, so this score is
# optimistic compared with an error measured on held-out data.
mean_absolute_error(y_true=y, y_pred=predicted_house_prices)

np.float64(1115.7467183128902)

## Splitting the Data

In [30]:
from sklearn.model_selection import train_test_split

In [31]:
# Hold out part of the data for validation. random_state pins the shuffle so
# the same rows land in each split on every run. No test_size is passed, so
# scikit-learn's default proportion applies (25% test per its docs; confirm
# for the installed version).
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=0)

In [32]:
# Define the model. random_state is pinned (same seed as the first tree in this
# notebook) because an unseeded DecisionTreeRegressor may break ties between
# equally good splits differently on each run, which makes the validation MAE
# drift from one execution to the next.
melbourne_model = DecisionTreeRegressor(random_state=1)

# Fit on the training split only, so the test split stays unseen.
melbourne_model.fit(x_train,y_train)

In [33]:
# Predict prices for the held-out rows the tree did not see during fitting.
predictions = melbourne_model.predict(x_test)

# Out-of-sample Mean Absolute Error
validation_mae = mean_absolute_error(y_test, predictions)
print('MAE:', validation_mae)

MAE: 274488.6010329245


---
# Underfitting and Overfitting

In [34]:
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

In [42]:
def get_mae(max_leaf_nodes, x_train,x_test,y_train,y_test):
    """Return the validation MAE of a decision tree limited to ``max_leaf_nodes`` leaves.

    A ``DecisionTreeRegressor`` (``random_state=0``) is fitted on
    ``x_train`` / ``y_train``; its predictions for ``x_test`` are then scored
    against ``y_test`` with ``mean_absolute_error``.
    """
    tree = DecisionTreeRegressor(random_state=0, max_leaf_nodes=max_leaf_nodes)
    tree.fit(x_train, y_train)
    return mean_absolute_error(y_test, tree.predict(x_test))

In [43]:
# Sweep several tree sizes and report the validation MAE for each one.
# %d truncates the float MAE to a whole number for display.
candidate_leaf_counts = [5, 50, 500, 5000]
report_line = "Max leaf nodes: %d \t\t Mean Absolute Error: %d"
for max_leaf_nodes in candidate_leaf_counts:
    my_mae = get_mae(max_leaf_nodes, x_train, x_test, y_train, y_test)
    print(report_line % (max_leaf_nodes, my_mae))

Max leaf nodes: 5 		 Mean Absolute Error: 385696
Max leaf nodes: 50 		 Mean Absolute Error: 279794
Max leaf nodes: 500 		 Mean Absolute Error: 261718
Max leaf nodes: 5000 		 Mean Absolute Error: 271320


---
# Random Forests

In [44]:
from sklearn.ensemble import RandomForestRegressor

In [45]:
# Fit a seeded random forest on the training split and report its Mean
# Absolute Error on the held-out split.
forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(x_train,y_train)
melb_preds = forest_model.predict(x_test)
forest_mae = mean_absolute_error(y_test, melb_preds)
print(forest_mae)

207190.6873773146
