## Predicting House Prices Using a Decision Tree Regressor


In [28]:

import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor


# Location of the training CSV, relative to the notebook's working directory.
iowa_file_path = 'train.csv'

# Read the whole table, then list its columns so the target and features
# selected in later cells can be checked against what is actually available.
home_data = pd.read_csv(iowa_file_path)
print(home_data.columns)



Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [29]:
# Prediction target: the SalePrice column, stored under the name y.
y = home_data['SalePrice']
print(y.head())

0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64


In [30]:
# Create X: the subset of columns used as model inputs.
features = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]
X = home_data[features]

## FITTING THE DECISION TREE MODEL

### We use data to decide how to break the houses into two groups, and then again to determine the predicted price in each group. This step of capturing patterns from data is called fitting or training the model. The data used to fit the model is called the training data.

In [31]:
# Hold out part of the data for validation; the fixed seed keeps the split repeatable.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# Build the tree with a fixed seed, then train it on the training portion only
# so the validation rows stay unseen until scoring.
iowa_model = DecisionTreeRegressor(random_state=1)
iowa_model.fit(train_X, train_y)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=1, splitter='best')

## FINDING mean absolute error (MAE) 

### MAE measures the average absolute difference between two continuous variables: here, the predicted and the actual sale prices. We use it to judge how well the model has been trained, i.e. how close its predictions are to the true values.

In [32]:
# Predict on the held-out validation rows and score them with mean absolute error.
val_predictions = iowa_model.predict(val_X)
# scikit-learn's signature is mean_absolute_error(y_true, y_pred): ground truth
# goes first. MAE is symmetric, so the value is unchanged, but the call now
# matches the documented order.
val_mae = mean_absolute_error(val_y, val_predictions)
print("Validation MAE: $",val_mae)

Validation MAE: $ 29652.931506849316


# The mean absolute error of the house price predictions is about $29,653, which is very high.
## Next, I will try to reduce the error by checking the MAE for trees of various sizes (different maximum numbers of leaf nodes).

### Helper function to calculate the MAE for a given maximum number of leaf nodes.

In [33]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation MAE of a decision tree limited to ``max_leaf_nodes`` leaves.

    A fresh ``DecisionTreeRegressor`` (seeded with ``random_state=0``) is fitted
    on ``train_X``/``train_y``. Its predictions for ``val_X`` are then compared
    against ``val_y`` using ``mean_absolute_error``.
    """
    tree = DecisionTreeRegressor(random_state=0, max_leaf_nodes=max_leaf_nodes)
    tree.fit(train_X, train_y)
    validation_predictions = tree.predict(val_X)
    return mean_absolute_error(val_y, validation_predictions)

## Step 1: Compare Different Tree Sizes

In [34]:
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]

# Validation MAE for every candidate tree size, keyed by max_leaf_nodes.
mae_by_size = {
    leaf_count: get_mae(leaf_count, train_X, val_X, train_y, val_y)
    for leaf_count in candidate_max_leaf_nodes
}

# Storing the best value of max_leaf_nodes (it will be either 5, 25, 50, 100, 250 or 500).
# min() keeps the first candidate on ties and needs no "0 means unset" sentinel,
# so a genuine MAE of 0.0 is handled correctly.
best_tree_size = min(mae_by_size, key=mae_by_size.get)

# Lowest validation MAE found, kept under its original name.
min_err = mae_by_size[best_tree_size]


## Step 2: Fit Model Using All Data

Now that we know the best tree size, we can prepare the model for use in practice. We can make it even more accurate by keeping that tree size and training on all of the data; that is, we no longer need to hold out the validation data.

In [35]:
# Reuse the tree size selected on the validation split for the final model.
final_model = DecisionTreeRegressor(random_state=1, max_leaf_nodes=best_tree_size)

# Train on every row of X and y rather than on the training split only.
final_model.fit(X, y)


DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=100, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=1, splitter='best')

In [36]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Fit a seeded random forest, otherwise with default settings, on the training split.
forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(train_X, train_y)

# Score it on the held-out validation rows with mean absolute error.
melb_preds = forest_model.predict(val_X)
forest_val_mae = mean_absolute_error(val_y, melb_preds)
print("The mean_absolute_error in Predicted house price is $", forest_val_mae)

The mean_absolute_error in Predicted house price is $ 22762.42931506849


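Switching from a single decision tree to a random forest with default settings lowers the validation MAE from about \$29,653 to about \$22,762, so the results improve even without tuning the forest.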
