In [5]:
import pandas as pd
# Load the Melbourne housing data from the CSV file into a dataframe
melbourne_data = pd.read_csv('melb_data.csv')

In [6]:
#Print a summary of the data in the melbourne_data dataframe
#describe() reports summary statistics (count, mean, std, min, quartiles, max) for each numeric column
melbourne_data.describe()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0,13518.0,13580.0,7130.0,8205.0,13580.0,13580.0,13580.0
mean,2.937997,1075684.0,10.137776,3105.301915,2.914728,1.534242,1.610075,558.416127,151.96765,1964.684217,-37.809203,144.995216,7454.417378
std,0.955748,639310.7,5.868725,90.676964,0.965921,0.691712,0.962634,3990.669241,541.014538,37.273762,0.07926,0.103916,4378.581772
min,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,-38.18255,144.43181,249.0
25%,2.0,650000.0,6.1,3044.0,2.0,1.0,1.0,177.0,93.0,1940.0,-37.856822,144.9296,4380.0
50%,3.0,903000.0,9.2,3084.0,3.0,1.0,2.0,440.0,126.0,1970.0,-37.802355,145.0001,6555.0
75%,3.0,1330000.0,13.0,3148.0,3.0,2.0,2.0,651.0,174.0,1999.0,-37.7564,145.058305,10331.0
max,10.0,9000000.0,48.1,3977.0,20.0,8.0,10.0,433014.0,44515.0,2018.0,-37.40853,145.52635,21650.0


Interpreting Data Description

In [14]:
'''
The results show 8 numbers for each numeric column in the dataset/dataframe. The first number is the count, which shows how
many rows have non-missing values.
The second value is the mean, which is the average. After the mean we have the std (standard deviation), which measures how
numerically spread out the values are.
The min, 25% (pronounced "25th percentile"), 50% (pronounced "50th percentile"), 75% and max values follow. To read them,
imagine sorting each column from its lowest to its highest value: the min is the smallest value, the 25% value is larger than
25% of the values and smaller than the other 75%, the 50% value (the median) sits in the middle, the 75% value is larger than
75% of the values, and the max is the largest value.
'''

'\nThe results show 8 numbers for each numeric column in the dataset/dataframe. The first number is the count, which shows how\nmany rows have non-missing values.\nThe second value is the mean, which is the average. After the mean we have the std (standard deviation), which measures how\nnumerically spread out the values are.\nThe min, 25% (pronounced "25th percentile"), 50% (pronounced "50th percentile"), 75% and max values follow. To read them,\nimagine sorting each column from its lowest to its highest value: the min is the smallest value, the 25% value is larger than\n25% of the values and smaller than the other 75%, the 50% value (the median) sits in the middle, the 75% value is larger than\n75% of the values, and the max is the largest value.\n'

In [18]:
#Drop every row (axis=0) that contains at least one missing value
#The original melbourne_data dataframe is left untouched; the cleaned copy gets its own name
filtered_melbourne_data = melbourne_data.dropna(axis=0)

In [21]:
#Choose the prediction target and the features
#y is the target (Price): we want to predict the prices of houses
#melbourne_features lists the columns the model uses as inputs to predict the target
#X holds just those feature columns
y = filtered_melbourne_data['Price']
melbourne_features = [
    'Rooms',
    'Bathroom',
    'Landsize',
    'BuildingArea',
    'YearBuilt',
    'Lattitude',
    'Longtitude',
]
X = filtered_melbourne_data[melbourne_features]

In [None]:
#When we have a dataset with many records, it is not feasible to compare the predictions against the actuals one by
#one
#Therefore, a number of metrics are used to summarise the predictive accuracy of a model; one of them is the Mean Absolute
#Error (MAE), which tells us how far off our predictions are from the actuals on average
#It first calculates the error of each prediction (error = actual - prediction)
#It then takes the absolute value of each error, i.e. negative errors are converted to positive values
#Finally, it takes the mean of those absolute errors

In [22]:
#Scikit-learn provides the train_test_split function that is used to split the dataset into training and validation (testing) pieces
#These two pieces of data will be used to determine the quality of the model (i.e. its predictive accuracy)
#Keeping the two pieces separate is important in order to avoid in-sample scores when calculating the Mean Absolute Error (MAE)
#In-sample scores happen when we use the same data to train and validate the model, which gives misleadingly optimistic scores
from sklearn.model_selection import train_test_split

In [23]:
# Split both the features (X) and the target (y) into a training part and a
# validation part.
# The rows are assigned by a random number generator; passing a fixed number as
# random_state makes the split identical on every run of this notebook.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

In [None]:
#In the code below, the mean_absolute_error function from scikit-learn is used to measure the error of our model
#The DecisionTreeRegressor, on the other hand, accepts a max_leaf_nodes parameter which helps control overfitting/underfitting
#Overfitting and underfitting describe how well a model performs on new data that it was not trained on
#In decision trees, the depth of the tree (and hence its number of leaves) determines whether the model overfits or underfits
#A good tree size can be found by trying several values of the max_leaf_nodes parameter and comparing the resulting errors

In [25]:
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation MAE of a decision tree limited to ``max_leaf_nodes`` leaves.

    Parameters
    ----------
    max_leaf_nodes : value passed to ``DecisionTreeRegressor(max_leaf_nodes=...)``
        to cap the size of the tree.
    train_X, train_y : features and target used to fit the tree.
    val_X, val_y : features and target used to score the fitted tree.

    Returns
    -------
    The mean absolute error between ``val_y`` and the tree's predictions
    for ``val_X``.
    """
    # random_state is fixed so repeated calls with the same data give the same tree
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    # fit() returns the fitted estimator itself, so predict() can be chained
    predictions = tree.fit(train_X, train_y).predict(val_X)
    return mean_absolute_error(val_y, predictions)

In [None]:
#The code below measures the model's error for different tree sizes, controlled by the max_leaf_nodes parameter

In [26]:
#Compare MAE for differing values of max_leaf_nodes
#%d formats the MAE as a whole number, so the fractional part is not shown
for max_leaf_nodes in (5, 50, 500, 5000):
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print("Max leaf nodes:%d \t\t Mean Absolute Error:%d" % (max_leaf_nodes, my_mae))

Max leaf nodes:5 		 Mean Absolute Error:347380
Max leaf nodes:50 		 Mean Absolute Error:257829
Max leaf nodes:500 		 Mean Absolute Error:243176
Max leaf nodes:5000 		 Mean Absolute Error:254915


In [28]:
#From the above we find that, of the options tried (5, 50, 500 and 5000 leaf nodes), the best number of leaves is 500
#This is because it gives us the lowest Mean Absolute Error (MAE) compared to the others
#Models can suffer from overfitting or underfitting
#Overfitting: capturing spurious patterns that won't recur in the future, leading to less accurate predictions
#Underfitting: failing to capture relevant patterns, again leading to less accurate predictions

Random Forests

In [29]:
#Despite the workarounds we have used above, decision trees require a lot of work in order to find the sweet spot
#between overfitting and underfitting, which determines the model's accuracy
#This is because a deep tree with lots of leaves will overfit, since each prediction comes from historical data
#from only the few houses at its leaf
#However, a shallow tree with few leaves will perform poorly because it fails to capture as many distinctions in the
#raw data
#This problem is not unique to decision trees: other, more sophisticated models suffer from the same tension
#between underfitting and overfitting
#The Random Forest model strives to offer an alternative

In [30]:
#Random forest uses many trees, and then makes a prediction by averaging the predictions of each component tree
#It generally has better predictive accuracy than a single decision tree and works well with default parameters
#One difference of note in the code is that the Random Forest model is built with the RandomForestRegressor class,
#whereas the Decision Tree model is built with the DecisionTreeRegressor class

In [32]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Build the forest with a fixed random_state so the result is reproducible
forest_model = RandomForestRegressor(random_state=1)
# Train on the training split only
forest_model.fit(train_X, train_y)
# Predict prices for the held-out validation rows and report the MAE
melb_preds = forest_model.predict(val_X)
print(mean_absolute_error(val_y, melb_preds))

202806.6128254788


In [None]:
#The value above (about 203,000) is much better than the best result of the decision tree, which gave us an
#error of about 243,000.
#Generally, the random forest offers better performance compared to the decision tree, and its parameters can be tuned to
#improve performance further.
#However, the beauty of the Random Forest model is that it generally works reasonably well even with its default parameters
#and is therefore relatively easier to use compared to other models, e.g. the XGBoost model