In [1]:
# Datset source
# https://archive.ics.uci.edu/ml/datasets/Auto+MPG

In [2]:
# Problem Statement: Fuel consumption of cars based on various factors

In [3]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [4]:
# Read the dataset

import pandas as pd
pd.options.display.max_columns = 1000
ampg_df = pd.read_csv('auto-mpg.data', sep='\s+', header=None, na_values='?',
                 names=['mpg','cylinders','displacement','horsepower','weight','acceleration','model_year','origin','car_name'],)
#                  dtype={'horsepower':np.float64})
print(ampg_df.shape)
ampg_df.head()

(398, 9)


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


In [5]:
# Check for NAN values in the entire dataframe

ampg_df.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model_year      0
origin          0
car_name        0
dtype: int64

In [6]:
# Remove NAN values from the dataframe

ampg_df.dropna(inplace=True)
print(ampg_df.shape)

(392, 9)


In [7]:
# Split the dataframe into features and labels

X = ampg_df.drop(['mpg', 'car_name'], axis=1).values
y = ampg_df.loc[:, 'mpg'].values
print("X shape: ", X.shape, "y shape: ", y.shape)
print("Sample X values: ", X[:5], "\n", "Sample y values: ", y[:5])

X shape:  (392, 7) y shape:  (392,)
Sample X values:  [[8.000e+00 3.070e+02 1.300e+02 3.504e+03 1.200e+01 7.000e+01 1.000e+00]
 [8.000e+00 3.500e+02 1.650e+02 3.693e+03 1.150e+01 7.000e+01 1.000e+00]
 [8.000e+00 3.180e+02 1.500e+02 3.436e+03 1.100e+01 7.000e+01 1.000e+00]
 [8.000e+00 3.040e+02 1.500e+02 3.433e+03 1.200e+01 7.000e+01 1.000e+00]
 [8.000e+00 3.020e+02 1.400e+02 3.449e+03 1.050e+01 7.000e+01 1.000e+00]] 
 Sample y values:  [18. 15. 18. 16. 17.]


In [8]:
# Split the dataset into train and test sets

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=2)

print(" X_train shape: ", X_train.shape,"\n", "y_train shape: ", y_train.shape,"\n",
        "X_test shape: ", X_test.shape,"\n", "y_test shape: ", y_test.shape,"\n")

 X_train shape:  (372, 7) 
 y_train shape:  (372,) 
 X_test shape:  (20, 7) 
 y_test shape:  (20,) 



In [9]:
# Model 1
# Sklearn DecisionTreeRegressor model with max_depth 10

from sklearn.tree import DecisionTreeRegressor
dt_reg = DecisionTreeRegressor(max_depth=10, random_state=2)
dt_reg.fit(X_train, y_train)

DecisionTreeRegressor(max_depth=10, random_state=2)

In [10]:
# R^2 values for train and test sets

print("Train set R^2 score: ", dt_reg.score(X_train, y_train))
print("Test set R^2 score: ", dt_reg.score(X_test, y_test))

Train set R^2 score:  0.9926279782704994
Test set R^2 score:  0.7949484741280166


In [11]:
# Mean Squared Errors of train and test sets

from sklearn.metrics import mean_squared_error
print("Train set mse: ", mean_squared_error(y_train, dt_reg.predict(X_train)))
print("Test set mse: ", mean_squared_error(y_test, dt_reg.predict(X_test)))

Train set mse:  0.4467669225672073
Test set mse:  12.645625000000004


In [12]:
# Mean Absolute Errors of train and test sets

from sklearn.metrics import mean_absolute_error
print("Train set mae: ", mean_absolute_error(y_train, dt_reg.predict(X_train)))
print("Test set mae: ", mean_absolute_error(y_test, dt_reg.predict(X_test)))

Train set mae:  0.30330211323569967
Test set mae:  2.2175000000000002


In [13]:
# The Decision Tree Regressor with max depth 10 achieves a good R^2 score, complex models will be developed to achieve higher R^2 score on test set

In [14]:
# Model 2
# Sklearn DecisionTreeRegressor model with RandomizedSearchCV
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV

param_distributions = {'max_depth': list(range(3, 10)), 'min_samples_split': list(range(1, 10)), 'max_leaf_nodes': list(range(2, 200))}
dt_reg_rnd_search_cv = RandomizedSearchCV(DecisionTreeRegressor(random_state=2), param_distributions, n_iter=1000, n_jobs=10, verbose=5, cv=3, random_state=2)
dt_reg_rnd_search_cv.fit(X_train, y_train)

Fitting 3 folds for each of 1000 candidates, totalling 3000 fits


RandomizedSearchCV(cv=3, estimator=DecisionTreeRegressor(random_state=2),
                   n_iter=1000, n_jobs=10,
                   param_distributions={'max_depth': [3, 4, 5, 6, 7, 8, 9],
                                        'max_leaf_nodes': [2, 3, 4, 5, 6, 7, 8,
                                                           9, 10, 11, 12, 13,
                                                           14, 15, 16, 17, 18,
                                                           19, 20, 21, 22, 23,
                                                           24, 25, 26, 27, 28,
                                                           29, 30, 31, ...],
                                        'min_samples_split': [1, 2, 3, 4, 5, 6,
                                                              7, 8, 9]},
                   random_state=2, verbose=5)

In [15]:
dt_reg_rnd_search_cv.best_estimator_

DecisionTreeRegressor(max_depth=7, max_leaf_nodes=28, min_samples_split=8,
                      random_state=2)

In [16]:
# R^2 values for train and test sets

print("Train set R^2 score: ", dt_reg_rnd_search_cv.best_estimator_.score(X_train, y_train))
print("Test set R^2 score: ", dt_reg_rnd_search_cv.best_estimator_.score(X_test, y_test))

Train set R^2 score:  0.9304965784375936
Test set R^2 score:  0.788845246505163


In [17]:
# Mean Squared Errors of train and test sets

from sklearn.metrics import mean_squared_error
print("Train set mse: ", mean_squared_error(y_train, dt_reg_rnd_search_cv.best_estimator_.predict(X_train)))
print("Test set mse: ", mean_squared_error(y_test, dt_reg_rnd_search_cv.best_estimator_.predict(X_test)))

Train set mse:  4.212118588184275
Test set mse:  13.022013946534505


In [18]:
# Mean Absolute Errors of train and test sets

from sklearn.metrics import mean_absolute_error
print("Train set mae: ", mean_absolute_error(y_train, dt_reg_rnd_search_cv.best_estimator_.predict(X_train)))
print("Test set mae: ", mean_absolute_error(y_test, dt_reg_rnd_search_cv.best_estimator_.predict(X_test)))

Train set mae:  1.5752477753583372
Test set mae:  2.6275829367764842
