# Q13

Tree Based Models - Q13 - 18/July

We are interested in understanding what impacts the mileage of cars. 
400 cars were measured and its data is available in the file 06_Car_mileage.csv. 

https://drive.google.com/drive/folders/1Jl8iDu7nGmrqCECbrLqmVafgwE5PYfiU

    1) Train a decision tree and identify the features that impact the mileage of cars. 
        Note that cylinders though numerical can only take specific values, and origin is categorical.
    2) How good would the prediction be if we use 300 cars for training and test it on the rest of the data?
    3) Are there outliers that influence the result? How can we minimize the impact of outliers?

In [1]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import LocalOutlierFactor # for outlier detection and removal

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("06_Car_mileage.csv")
df['hp'] = pd.to_numeric(df['hp'], errors='coerce')
df.dropna(inplace=True)
df.head(2)

Unnamed: 0,cylinders,displacement,hp,weight,acceleration,origin,mpg
0,8,307.0,130.0,3504,12.0,1,18.0
1,8,350.0,165.0,3693,11.5,1,15.0


In [3]:
#df[df['hp'].isnull()]

In [4]:
df.shape

(392, 7)

In [5]:
df.describe()

Unnamed: 0,cylinders,displacement,hp,weight,acceleration,origin,mpg
count,392.0,392.0,392.0,392.0,392.0,392.0,392.0
mean,5.471939,194.41199,104.469388,2977.584184,15.541327,1.576531,23.445918
std,1.705783,104.644004,38.49116,849.40256,2.758864,0.805518,7.805007
min,3.0,68.0,46.0,1613.0,8.0,1.0,9.0
25%,4.0,105.0,75.0,2225.25,13.775,1.0,17.0
50%,4.0,151.0,93.5,2803.5,15.5,1.0,22.75
75%,8.0,275.75,126.0,3614.75,17.025,2.0,29.0
max,8.0,455.0,230.0,5140.0,24.8,3.0,46.6


# 1. Train a decision tree and identify the features that impact the mileage of cars. 

In [6]:
cat_vars = ['cylinders', 'origin']
num_vars = ['displacement', 'hp', 'weight', 'acceleration']

df = pd.get_dummies(df, columns=cat_vars, prefix_sep='_')

df.head()

Unnamed: 0,displacement,hp,weight,acceleration,mpg,cylinders_3,cylinders_4,cylinders_5,cylinders_6,cylinders_8,origin_1,origin_2,origin_3
0,307.0,130.0,3504,12.0,18.0,0,0,0,0,1,1,0,0
1,350.0,165.0,3693,11.5,15.0,0,0,0,0,1,1,0,0
2,318.0,150.0,3436,11.0,18.0,0,0,0,0,1,1,0,0
3,304.0,150.0,3433,12.0,16.0,0,0,0,0,1,1,0,0
4,302.0,140.0,3449,10.5,17.0,0,0,0,0,1,1,0,0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 0 to 397
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   displacement  392 non-null    float64
 1   hp            392 non-null    float64
 2   weight        392 non-null    int64  
 3   acceleration  392 non-null    float64
 4   mpg           392 non-null    float64
 5   cylinders_3   392 non-null    uint8  
 6   cylinders_4   392 non-null    uint8  
 7   cylinders_5   392 non-null    uint8  
 8   cylinders_6   392 non-null    uint8  
 9   cylinders_8   392 non-null    uint8  
 10  origin_1      392 non-null    uint8  
 11  origin_2      392 non-null    uint8  
 12  origin_3      392 non-null    uint8  
dtypes: float64(4), int64(1), uint8(8)
memory usage: 21.4 KB


In [8]:
x_vars = ['cylinders_3', 'cylinders_4', 'cylinders_5', 'cylinders_6', 'cylinders_8', 
          'origin_1', 'origin_2', 'origin_3'] + num_vars
y_var = 'mpg'

In [9]:
tune_parm_space = {'min_samples_split':range(1, 20),
                   'max_depth':range(1, 20),
                   'min_samples_leaf':range(1, 20)
                  }

tree_reg_model = DecisionTreeRegressor(random_state=1)
tree_reg_model_cv_1 = GridSearchCV(tree_reg_model, tune_parm_space, cv=5)
tree_reg_model_cv_1.fit(df[x_vars], df[y_var])

GridSearchCV(cv=5, estimator=DecisionTreeRegressor(random_state=1),
             param_grid={'max_depth': range(1, 20),
                         'min_samples_leaf': range(1, 20),
                         'min_samples_split': range(1, 20)})

In [10]:
tree_reg_model_cv_1.best_params_

{'max_depth': 4, 'min_samples_leaf': 16, 'min_samples_split': 2}

In [11]:
model_sq = tree_reg_model_cv_1.best_score_
rmse = mean_squared_error(df[y_var], tree_reg_model_cv_1.predict(df[x_vars]), squared=False)

print(f"Using Grid Search cv and doing 5 fold cross-validation, decision tree performs as below:")
print(f"Model R square is {np.round(model_sq * 100, 2)}")
print(f"Model RMSE is     {np.round(rmse, 2)}")

Using Grid Search cv and doing 5 fold cross-validation, decision tree performs as below:
Model R square is 39.51
Model RMSE is     3.45


# 2. How good would the prediction be if we use 300 cars for training and test it on the rest of the data?

In [12]:
x_train, x_test, y_train, y_test = train_test_split(df[x_vars], df[y_var])

In [13]:
tune_parm_space = {'min_samples_split':range(1, 20),
                   'max_depth':range(1, 20),
                   'min_samples_leaf':range(1, 20)
                  }

tree_reg_model = DecisionTreeRegressor(random_state=1)
tree_reg_model_cv_2 = GridSearchCV(tree_reg_model, tune_parm_space, cv=5)
tree_reg_model_cv_2.fit(x_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeRegressor(random_state=1),
             param_grid={'max_depth': range(1, 20),
                         'min_samples_leaf': range(1, 20),
                         'min_samples_split': range(1, 20)})

In [14]:
tree_reg_model_cv_2.best_params_

{'max_depth': 5, 'min_samples_leaf': 7, 'min_samples_split': 2}

In [21]:
model_sq = tree_reg_model_cv_2.best_score_
test_rmse = mean_squared_error(y_test, tree_reg_model_cv_2.predict(x_test), squared=False)

print(f"Using Grid Search cv, after diving data into train and test set and doing 5 fold cross-validation" +  
      "decision tree performs as below:")
print(f"Model R square is {np.round(model_sq * 100, 2)}")
print(f"Model Test RMSE is     {np.round(test_rmse, 2)}")

Using Grid Search cv, after diving data into train and test set and doing 5 fold cross-validationdecision tree performs as below:
Model R square is 74.09
Model Test RMSE is     4.44


# 3. Are there outliers that influence the result? How can we minimize the impact of outliers?

In [16]:
lof = LocalOutlierFactor(n_neighbors = 20)

x_train['lof'] = lof.fit_predict(x_train)
x_train['negative_outlier_factor'] = lof.negative_outlier_factor_
x_train[x_train['negative_outlier_factor'] <= -1.5]

Unnamed: 0,cylinders_3,cylinders_4,cylinders_5,cylinders_6,cylinders_8,origin_1,origin_2,origin_3,displacement,hp,weight,acceleration,lof,negative_outlier_factor
345,0,1,0,0,0,0,0,1,81.0,60.0,1760,16.1,-1,-1.617454
95,0,0,0,0,1,1,0,0,455.0,225.0,4951,11.0,-1,-1.798082
103,0,0,0,0,1,1,0,0,400.0,150.0,4997,14.0,-1,-1.876071
104,0,0,0,0,1,1,0,0,400.0,167.0,4906,12.5,-1,-1.672449
245,0,1,0,0,0,1,0,0,98.0,66.0,1800,14.4,-1,-1.511546
198,0,1,0,0,0,0,0,1,91.0,53.0,1795,17.4,-1,-1.528974
53,0,1,0,0,0,0,0,1,71.0,65.0,1773,19.0,-1,-1.580362
90,0,0,0,0,1,1,0,0,429.0,198.0,4952,11.5,-1,-1.779741
343,0,1,0,0,0,0,0,1,79.0,58.0,1755,16.9,-1,-1.635253
13,0,0,0,0,1,1,0,0,455.0,225.0,3086,10.0,-1,-2.12371


In [17]:
index = x_train['negative_outlier_factor'] > -1.5
x_train_without_outlier = x_train[index][x_vars]
y_train_without_outlier = y_train[index]

In [18]:
tune_parm_space = {'min_samples_split':range(1, 20),
                   'max_depth':range(1, 20),
                   'min_samples_leaf':range(1, 20)
                  }

tree_reg_model = DecisionTreeRegressor(random_state=1)
tree_reg_model_cv_3 = GridSearchCV(tree_reg_model, tune_parm_space, cv=5)
tree_reg_model_cv_3.fit(x_train_without_outlier, y_train_without_outlier)

GridSearchCV(cv=5, estimator=DecisionTreeRegressor(random_state=1),
             param_grid={'max_depth': range(1, 20),
                         'min_samples_leaf': range(1, 20),
                         'min_samples_split': range(1, 20)})

In [19]:
tree_reg_model_cv_3.best_params_

{'max_depth': 4, 'min_samples_leaf': 18, 'min_samples_split': 2}

In [22]:
model_sq = tree_reg_model_cv_3.best_score_
test_rmse = mean_squared_error(y_test, tree_reg_model_cv_3.predict(x_test), squared=False)

print(f"Using Grid Search cv, after dividing data into training and test set, with 5 fold cross-validation, " +
      "also after removing outliers, decision tree performs as below:")
print(f"Model R square is {np.round(model_sq * 100, 2)}")
print(f"Model Test RMSE is     {np.round(test_rmse, 2)}")

Using Grid Search cv, after dividing data into training and test set, with 5 fold cross-validation, also after removing outliers, decision tree performs as below:
Model R square is 72.49
Model Test RMSE is     4.21
