IMPORTING NECESSARY LIBRARIES:

In [107]:
from sklearn.model_selection import train_test_split 
from sklearn.tree import DecisionTreeRegressor
from sklearn.dummy import DummyRegressor
from sklearn import preprocessing
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

Skipping detailed steps as they are same as MLR in previous file

In [None]:
df = pd.read_csv('/content/house_prices_train - house_prices_train.csv')

In [None]:
label_encoder = preprocessing.LabelEncoder()
df['CentralAir']= label_encoder.fit_transform(df['CentralAir'])
df['RoofStyle']= label_encoder.fit_transform(df['RoofStyle'])

MODEL BUILDING WITH HYPERPARAMETER TUNING:

Correlation between inputs & outliers doesn't affect decision trees. Decison Trees don't require scaling too

In [None]:
X = df.drop(['Id', 'SalePrice'], axis=1) 
y= df['SalePrice']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 2)

CRITERION ==> MSE

In [None]:
dt_mse = DecisionTreeRegressor(criterion='squared_error', max_depth=4, random_state=3)

In [None]:
dt_mse.fit(X_train, y_train)

DecisionTreeRegressor(max_depth=4, random_state=3)

In [None]:
dt_mse.score(X_test, y_test)

0.7786596381981745

Lets tune hyperparameter and check scores with different depths

In [119]:
pr_scores_mse = []
para = []

for g in range(2,15,2):
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)
  dt_mse = DecisionTreeRegressor(criterion='squared_error', max_depth=g, random_state=0)
  dt_mse.fit(X_train, y_train)  
  pr_scores_mse.append(dt_mse.score(X_test, y_test))
  para.append(g)

In [120]:
model_parameters = pd.DataFrame({'Score':pr_scores_mse,'Depth':para})

In [121]:
model_parameters

Unnamed: 0,Score,Depth
0,0.655664,2
1,0.727799,4
2,0.794617,6
3,0.789566,8
4,0.785499,10
5,0.756834,12
6,0.7804,14


Model works best with depth set to 6

*NOTE: I tried checking the scores with a nested loop on random states of train test split and decision tree but the average was the same for all for some reason. Hence I decided to keep it simple and use the usual random state of 42 for tts and 0 for tree*

K_FOLD CROSS VALIDATION CHECK WITH 5 FOLDS

In [122]:
cross_val_score(estimator= dt_mse,X=X, y=y, cv=5)

array([0.67772682, 0.70053383, 0.78338019, 0.69582256, 0.69401021])

METRICS

In [129]:
dummy_regr = DummyRegressor(strategy="mean")
dummy_regr.fit(X_train, y_train)
pr_dr = dummy_regr.predict(X_test)
print("MAE: ")
print("{0:.2f}".format(mean_absolute_error(y_true=y_test, y_pred= pr_dr)))
print("MSE: ")
print("{0:.2f}".format(mean_squared_error(y_true=y_test, y_pred= pr_dr)))
print("RMSE: ")
print("{0:.2f}".format(mean_squared_error(y_true=y_test, y_pred= pr_dr, squared=False)))

MAE: 
59592.08
MSE: 
7341536468.34
RMSE: 
85682.77


In [130]:
predictions = dt_mse.predict(X_test)
print("MAE: ")
print("{0:.2f}".format(mean_absolute_error(y_true=y_test, y_pred= predictions)))
print("MSE: ")
print("{0:.2f}".format(mean_squared_error(y_true=y_test, y_pred= predictions)))
print("RMSE: ")
print("{0:.2f}".format(mean_squared_error(y_true=y_test, y_pred= predictions, squared=False)))

MAE: 
26170.21
MSE: 
1612167813.88
RMSE: 
40151.81


The metrics of our model are less than the dummy model hence model is working well

CRITERION ==> MAE

In [126]:
pr_scores_mae = []
para = []

for h in range(2,19,2):
  dt_mae = DecisionTreeRegressor(criterion='absolute_error', max_depth=h, random_state=0)
  dt_mae.fit(X_train, y_train)  
  pr_scores_mae.append(dt_mae.score(X_test, y_test))
  para.append(h)

In [127]:
model_parameters_a = pd.DataFrame({'Score':pr_scores_mae,'Depth':para})

In [128]:
model_parameters_a

Unnamed: 0,Score,Depth
0,0.572084,2
1,0.720423,4
2,0.825392,6
3,0.797153,8
4,0.73071,10
5,0.737811,12
6,0.800613,14
7,0.738465,16
8,0.786641,18


Model works best with depth set to 14 with criterion set to MAE

K_FOLD CROSS VALIDATION CHECK WITH 5 FOLDS

In [108]:
cross_val_score(estimator= dt_mae,X=X, y=y, cv=5)

array([0.64307157, 0.6711742 , 0.78814638, 0.66622951, 0.68134642])

METRICS

In [131]:
print("MAE: ")
print("{0:.2f}".format(mean_absolute_error(y_true=y_test, y_pred= pr_dr)))
print("MSE: ")
print("{0:.2f}".format(mean_squared_error(y_true=y_test, y_pred= pr_dr)))
print("RMSE: ")
print("{0:.2f}".format(mean_squared_error(y_true=y_test, y_pred= pr_dr, squared=False)))

MAE: 
59592.08
MSE: 
7341536468.34
RMSE: 
85682.77


In [132]:
predictions_mae = dt_mae.predict(X_test)
print("MAE: ")
print("{0:.2f}".format(mean_absolute_error(y_true=y_test, y_pred= predictions_mae)))
print("MSE: ")
print("{0:.2f}".format(mean_squared_error(y_true=y_test, y_pred= predictions_mae)))
print("RMSE: ")
print("{0:.2f}".format(mean_squared_error(y_true=y_test, y_pred= predictions_mae, squared=False)))

MAE: 
27084.18
MSE: 
1566352333.92
RMSE: 
39577.17


The result shows the model with criterion MAE works slightly better