In [25]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Load the Melbourne house-price data from the CSV file (path is relative to the working directory)
house_df = pd.read_csv('melb_data.csv')

In [26]:
# DataFrame.describe() summarises the numeric columns of the data frame with a few useful statistics:
# count, mean, standard deviation, min, quartiles (25%, 50%, 75%) and max.
# A count below the total row count means that column has missing values.
house_df.describe()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0,13518.0,13580.0,7130.0,8205.0,13580.0,13580.0,13580.0
mean,2.937997,1075684.0,10.137776,3105.301915,2.914728,1.534242,1.610075,558.416127,151.96765,1964.684217,-37.809203,144.995216,7454.417378
std,0.955748,639310.7,5.868725,90.676964,0.965921,0.691712,0.962634,3990.669241,541.014538,37.273762,0.07926,0.103916,4378.581772
min,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,-38.18255,144.43181,249.0
25%,2.0,650000.0,6.1,3044.0,2.0,1.0,1.0,177.0,93.0,1940.0,-37.856822,144.9296,4380.0
50%,3.0,903000.0,9.2,3084.0,3.0,1.0,2.0,440.0,126.0,1970.0,-37.802355,145.0001,6555.0
75%,3.0,1330000.0,13.0,3148.0,3.0,2.0,2.0,651.0,174.0,1999.0,-37.7564,145.058305,10331.0
max,10.0,9000000.0,48.1,3977.0,20.0,8.0,10.0,433014.0,44515.0,2018.0,-37.40853,145.52635,21650.0


In [27]:
# Show the column labels of the Melbourne data frame
house_df.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [28]:
# Drop every row that has a null value in ANY column, to clean the dataset.
# NOTE(review): the describe() output shows Price and the five selected features are
# non-null in all 13,580 rows, yet only 6,196 rows survive this step; the nulls sit in
# other columns (e.g. Car, BuildingArea, YearBuilt). If losing those rows is not
# intended, consider dropna(subset=...) restricted to the columns actually used.
house_df = house_df.dropna(axis='index')

# Feature columns used to predict the price, which is our target column
house_features = [
    'Rooms',
    'Bathroom',
    'Landsize',
    'Lattitude',
    'Longtitude',
]

In [29]:
# y is the target column we want to predict
y = house_df['Price']
# X holds the feature columns the model will be fitted on
X = house_df[house_features]

In [30]:
# Summary statistics for the selected feature columns
X.describe()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
count,6196.0,6196.0,6196.0,6196.0,6196.0
mean,2.931407,1.57634,471.00694,-37.807904,144.990201
std,0.971079,0.711362,897.449881,0.07585,0.099165
min,1.0,1.0,0.0,-38.16492,144.54237
25%,2.0,1.0,152.0,-37.855438,144.926198
50%,3.0,1.0,373.0,-37.80225,144.9958
75%,4.0,2.0,628.0,-37.7582,145.0527
max,8.0,8.0,37000.0,-37.45709,145.52635


In [31]:
# Preview the first rows of the feature matrix
X.head()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
1,2,1.0,156.0,-37.8079,144.9934
2,3,2.0,134.0,-37.8093,144.9944
4,4,1.0,120.0,-37.8072,144.9941
6,3,2.0,245.0,-37.8024,144.9993
7,2,1.0,256.0,-37.806,144.9954


In [32]:
# Hold out part of the data for validation: split both the features and the target.
# random_state fixes the shuffle so the same split is produced on every run.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# Define and fit the model in one step (fit() returns the fitted estimator).
# The tree gets its own random_state: the seed given to train_test_split only
# controls how the rows are split, not how the tree is built.
house_model = DecisionTreeRegressor(random_state=1).fit(train_X, train_y)

# Predict prices for the held-out validation features
val_predictions = house_model.predict(val_X)

# Helper that fits a size-limited tree and scores it, so that different values of
# max_leaf_nodes can be compared by their validation MAE
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation MAE of a decision tree limited to ``max_leaf_nodes`` leaves.

    A new ``DecisionTreeRegressor`` (with ``random_state=1``) is fitted on
    ``train_X`` / ``train_y``; its predictions for ``val_X`` are then scored
    against ``val_y`` with ``mean_absolute_error``.

    Parameters:
        max_leaf_nodes: value forwarded to ``DecisionTreeRegressor(max_leaf_nodes=...)``.
        train_X, train_y: features and target used to fit the tree.
        val_X, val_y: features and target used to score the tree.

    Returns:
        The mean absolute error between ``val_y`` and the tree's predictions.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=1).fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))

In [33]:
# Visual sanity check: the first five validation targets next to the model's
# predictions for the same five houses
print('Making predictions for the following 5 houses:')
print(val_y.head(5))
print('The predictions are')
print(val_predictions[:5])
# Also predict over the full feature matrix X (which includes the rows the model was trained on)
predictions = house_model.predict(X)

Making predictions for the following 5 houses:
6048     620000.0
9186    2320000.0
3991     750000.0
5829    1120000.0
3616    6500000.0
Name: Price, dtype: float64
The predictions are
[ 503000. 1857000.  760000. 1395000. 4250000.]


In [34]:
# MAE (mean absolute error) of the unconstrained tree on the held-out validation split.
# This is an out-of-sample score: val_X / val_y were not used to fit house_model.
# It is printed explicitly because a bare expression in the middle of a cell is not displayed.
baseline_mae = mean_absolute_error(val_y, val_predictions)
print("Baseline MAE (no max_leaf_nodes limit): %d" % baseline_mae)

# Compare validation MAE across different values of max_leaf_nodes, keeping every score
# so the best size is selected from the results instead of being copied by hand
candidate_max_leaf_nodes = [5, 50, 100, 250, 500, 5000]
mae_by_leaf_nodes = {}
for max_leaf_nodes in candidate_max_leaf_nodes:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    mae_by_leaf_nodes[max_leaf_nodes] = my_mae
    print("Max leaf nodes: %d \t\t Mean Absolute Error: %d" %(max_leaf_nodes, my_mae))

# Pick the tree size with the lowest validation MAE (the first one wins on a tie)
best_tree_size = min(mae_by_leaf_nodes, key=mae_by_leaf_nodes.get)
final_model = DecisionTreeRegressor(max_leaf_nodes=best_tree_size, random_state=1)

# Fit on the training split and score on the validation split, so the reported
# MAE is directly comparable with the values printed by the loop
final_model.fit(train_X, train_y)
final_predict = final_model.predict(val_X)
print("MAE:")
print(mean_absolute_error(val_y, final_predict))



Max leaf nodes: 5 		 Mean Absolute Error: 369673
Max leaf nodes: 50 		 Mean Absolute Error: 266644
Max leaf nodes: 100 		 Mean Absolute Error: 256533
Max leaf nodes: 250 		 Mean Absolute Error: 242217
Max leaf nodes: 500 		 Mean Absolute Error: 244780
Max leaf nodes: 5000 		 Mean Absolute Error: 258379
MAE:
242217.1817972604


In [35]:
# Build a random forest (seeded so results repeat between runs) and fit it on the training split
house_model_forest = RandomForestRegressor(random_state=1).fit(train_X, train_y)

# Predict the validation prices with the fitted forest
melb_preds = house_model_forest.predict(val_X)

# Report the forest's MAE on the validation split
print(mean_absolute_error(val_y, melb_preds))



190414.59149025998
