In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor

# Read the housing CSV into a DataFrame
df_pathway = 'statistical_model/melb_data.csv'
df = pd.read_csv(df_pathway)

# Keep only fully populated rows. Note that dropna() discards a row when ANY
# column is missing, not just when the price is missing.
filtered_df = df.dropna()

# Feature columns fed to the model, and the prediction target (sale price)
df_feature = ['Rooms', 'Bathroom', 'Landsize', 'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude']
y = filtered_df['Price']
X = filtered_df[df_feature]

# Decision tree with a fixed seed so refitting gives the same tree
df_model = DecisionTreeRegressor(random_state=1)

# Train on every row that survived the missing-value filter
df_model.fit(X, y)

In [2]:
# Show the first five feature rows, then the model's predictions for those rows
head_rows = X.head()
print(head_rows)
print(df_model.predict(head_rows))

   Rooms  Bathroom  Landsize  BuildingArea  YearBuilt  Lattitude  Longtitude
1      2       1.0     156.0          79.0     1900.0   -37.8079    144.9934
2      3       2.0     134.0         150.0     1900.0   -37.8093    144.9944
4      4       1.0     120.0         142.0     2014.0   -37.8072    144.9941
6      3       2.0     245.0         210.0     1910.0   -37.8024    144.9993
7      2       1.0     256.0         107.0     1890.0   -37.8060    144.9954
[1035000. 1465000. 1600000. 1876000. 1636000.]


In [3]:
from sklearn.metrics import mean_absolute_error

# "In-sample" evaluation: the model is scored on the very rows it was fitted
# on, so this error says little about how it performs on unseen data.
predicted_home_prices = df_model.predict(X)
mean_absolute_error(y_true=y, y_pred=predicted_home_prices)

434.71594577146544

In [4]:
from sklearn.model_selection import train_test_split

# Hold-out validation (one train/validation split, not k-fold cross-validation)

# Split both the features and the target into training and validation data.
# The split is driven by a random number generator; supplying a fixed integer
# as random_state guarantees we get the same split every time this cell runs.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

# Define model. random_state is pinned here too: without it the tree (and the
# MAE printed below) can change from run to run even though the split is fixed.
# NOTE: this rebinds df_model, replacing the model fitted on the full data above.
df_model = DecisionTreeRegressor(random_state=0)

# Fit on the training portion only
df_model.fit(train_X, train_y)

# Mean absolute error of the predicted prices on the held-out validation rows
val_predictions = df_model.predict(val_X)
print(mean_absolute_error(val_y, val_predictions))


260973.54551323433


In [5]:
# Helper for comparing validation MAE across different max_leaf_nodes settings
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Fit a size-limited decision tree and return its validation MAE.

    Args:
        max_leaf_nodes: Upper bound on the number of leaves in the tree.
        train_X: Feature rows used to fit the tree.
        val_X: Feature rows the fitted tree predicts on.
        train_y: Target values matching ``train_X``.
        val_y: True target values matching ``val_X``.

    Returns:
        Mean absolute error between ``val_y`` and the tree's predictions
        for ``val_X``.
    """
    # The seed is fixed so repeated calls with the same data give the same tree
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    tree.fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))

In [6]:
# Print the validation MAE for a few tree sizes.
# %d truncates the MAE to an integer for display.
for max_leaf_nodes in (5, 50, 500, 5000):
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    report_line = "Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" % (max_leaf_nodes, my_mae)
    print(report_line)

Max leaf nodes: 5  		 Mean Absolute Error:  347380
Max leaf nodes: 50  		 Mean Absolute Error:  258171
Max leaf nodes: 500  		 Mean Absolute Error:  243495
Max leaf nodes: 5000  		 Mean Absolute Error:  254983


In [7]:
# Score every candidate size with a dict comprehension: {leaf count: validation MAE}
loop_leaf_nodes = [5, 50, 500, 5000]
explicit_loop = {
    leaf_size: get_mae(leaf_size, train_X, val_X, train_y, val_y)
    for leaf_size in loop_leaf_nodes
}
print(explicit_loop)

# Keep the leaf count whose validation MAE is lowest
best_tree_size = min(explicit_loop, key=lambda size: explicit_loop[size])
print(best_tree_size)

{5: 347380.33833344496, 50: 258171.21202406782, 500: 243495.96361790417, 5000: 254983.64299548094}
500


In [15]:
# Finally, fit the best-sized tree on all of the filtered data (X, y),
# not just the training split
final_model = DecisionTreeRegressor(random_state=0, max_leaf_nodes=best_tree_size)
final_model.fit(X=X, y=y)