# Using scikit-learn to create a decision tree model that predicts housing prices, using mean absolute error to choose the maximum number of leaf nodes. A random forest model is then built and compared with the decision tree model

In [9]:

import pandas as pd
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

## Import data and read with pandas. Then set target variable we would like to predict as the housing price and use the list of features from the csv file

In [6]:
# Path of the CSV file to read
path = '/home/holden/programming/python/data_analysis/train.csv'


# Read the housing price data using pandas
housing_price_data = pd.read_csv(path)

# Target variable we would like to predict
y = housing_price_data.SalePrice

# Columns used as model inputs
features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
X = housing_price_data[features]


# Split into validation and training data; the fixed random_state keeps the split reproducible
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)


# Specify the model
model = DecisionTreeRegressor(random_state=1)
# Fit the model on the training split only
model.fit(train_X, train_y)


# Make validation predictions and calculate the mean absolute error.
# mean_absolute_error is documented as (y_true, y_pred), so the true values go first;
# the metric is symmetric, which means the reported number is unchanged.
val_predictions = model.predict(val_X)
val_mae = mean_absolute_error(val_y, val_predictions)
print("Validation MAE: {:,.0f}".format(val_mae))




Validation MAE: 29,653


## Now we will compare different tree sizes to see which one produces the lowest validation error

In [7]:
# Helper that trains a size-limited decision tree and scores it on the validation data
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation mean absolute error of a size-limited decision tree.

    A DecisionTreeRegressor capped at ``max_leaf_nodes`` leaves (random_state=0)
    is fitted on ``train_X``/``train_y``; its predictions for ``val_X`` are then
    compared against ``val_y``.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    tree.fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))

In [8]:

# Try each candidate cap on the number of leaf nodes and report the
# validation mean absolute error that the resulting tree achieves
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]
for max_leaf_nodes in candidate_max_leaf_nodes:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    # %d truncates the float MAE to a whole number for display
    report_line = "Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" % (max_leaf_nodes, my_mae)
    print(report_line)

Max leaf nodes: 5  		 Mean Absolute Error:  35044
Max leaf nodes: 25  		 Mean Absolute Error:  29016
Max leaf nodes: 50  		 Mean Absolute Error:  27405
Max leaf nodes: 100  		 Mean Absolute Error:  27282
Max leaf nodes: 250  		 Mean Absolute Error:  27893
Max leaf nodes: 500  		 Mean Absolute Error:  29454


## Based on the above results, a maximum of 100 leaf nodes produces the smallest mean absolute error (27,282), so it is the best choice for the decision tree model

## Now we will create a new random forest model and compare it with the decision tree model

In [10]:
# Define the model. Set random_state to 1 so the result is reproducible
rf_model = RandomForestRegressor(random_state=1)

# Fit the model on the training split
rf_model.fit(train_X, train_y)

# Predict on the validation split (this reassigns val_predictions)
val_predictions = rf_model.predict(val_X)

# Calculate the mean absolute error of the random forest model on the validation data.
# mean_absolute_error is documented as (y_true, y_pred), so the true values go first;
# the metric is symmetric, which means the reported number is unchanged.
rf_val_mae = mean_absolute_error(val_y, val_predictions)

print("Validation MAE for Random Forest Model: {}".format(rf_val_mae))


Validation MAE for Random Forest Model: 21857.15912981083


## As the random forest model has a much lower mean absolute error (about 21,857) than the decision tree model (27,282 at its best, with 100 leaf nodes), it is the better model to use.