In [1]:
# Dataset source
# https://archive.ics.uci.edu/ml/datasets/Auto+MPG

In [2]:
# Problem statement: predict the fuel consumption (mpg) of cars from their other attributes

In [3]:
# Python ≥3.5 is required
import re
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required.
# Compare numeric (major, minor) tuples: comparing the raw version strings is
# lexicographic, so e.g. "0.9" >= "0.20" would wrongly pass.
import sklearn
_sklearn_version = tuple(int(part) for part in re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups())
assert _sklearn_version >= (0, 20)

# Common imports
import numpy as np
import os

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [4]:
# Read the dataset
#
# The file is whitespace-delimited and has no header row, so the column names
# are supplied explicitly; '?' entries are parsed as NaN.

import pandas as pd
pd.options.display.max_columns = 1000

column_names = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
                'acceleration', 'model_year', 'origin', 'car_name']
# Raw string: '\s' inside a normal string literal is an invalid escape sequence
# and triggers a DeprecationWarning/SyntaxWarning on recent Python versions.
ampg_df = pd.read_csv('auto-mpg.data', sep=r'\s+', header=None, na_values='?',
                      names=column_names)
print(ampg_df.shape)
ampg_df.head()

(398, 9)


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


In [5]:
# Count the missing (NaN) entries in every column of the dataframe

ampg_df.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model_year      0
origin          0
car_name        0
dtype: int64

In [6]:
# Drop every row that contains at least one NaN value.
# Rebinding the name (instead of using inplace=True) avoids mutating, in place,
# a frame object that an earlier cell has already displayed.

ampg_df = ampg_df.dropna()
assert not ampg_df.isnull().values.any(), "NaN values remain after dropna()"
print(ampg_df.shape)

(392, 9)


In [7]:
# Separate the predictors (everything except 'mpg' and 'car_name') from the 'mpg' target

feature_frame = ampg_df.drop(columns=['mpg', 'car_name'])
X = feature_frame.to_numpy()
y = ampg_df['mpg'].to_numpy()
print("X shape: ", X.shape, "y shape: ", y.shape)
print("Sample X values: ", X[:5], "\n", "Sample y values: ", y[:5])

X shape:  (392, 7) y shape:  (392,)
Sample X values:  [[8.000e+00 3.070e+02 1.300e+02 3.504e+03 1.200e+01 7.000e+01 1.000e+00]
 [8.000e+00 3.500e+02 1.650e+02 3.693e+03 1.150e+01 7.000e+01 1.000e+00]
 [8.000e+00 3.180e+02 1.500e+02 3.436e+03 1.100e+01 7.000e+01 1.000e+00]
 [8.000e+00 3.040e+02 1.500e+02 3.433e+03 1.200e+01 7.000e+01 1.000e+00]
 [8.000e+00 3.020e+02 1.400e+02 3.449e+03 1.050e+01 7.000e+01 1.000e+00]] 
 Sample y values:  [18. 15. 18. 16. 17.]


In [8]:
# Hold out 5% of the samples as the test set (fixed random_state for repeatability)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    random_state=2,
)

# One label/shape pair per line of output
print(
    " X_train shape: ", X_train.shape, "\n",
    "y_train shape: ", y_train.shape, "\n",
    "X_test shape: ", X_test.shape, "\n",
    "y_test shape: ", y_test.shape, "\n",
)

 X_train shape:  (372, 7) 
 y_train shape:  (372,) 
 X_test shape:  (20, 7) 
 y_test shape:  (20,) 



In [9]:
# Standardize the features: the scaler is fitted on the training split only
# and then applied unchanged to both splits.

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
# Model 1
# Sklearn LinearSVR left at its default hyperparameters (only the seed is fixed)

from sklearn.svm import LinearSVR

lin_svr = LinearSVR(random_state=2).fit(X_train_scaled, y_train)
lin_svr

LinearSVR(random_state=2)

In [11]:
# R^2 of the LinearSVR model on the train and test sets

lin_train_r2 = lin_svr.score(X_train_scaled, y_train)
lin_test_r2 = lin_svr.score(X_test_scaled, y_test)
print("Train set R^2 score: ", lin_train_r2)
print("Test set R^2 score: ", lin_test_r2)

Train set R^2 score:  0.8146017061930315
Test set R^2 score:  0.6265220371968854


In [12]:
# Mean squared error of the LinearSVR model on the train and test sets

from sklearn.metrics import mean_squared_error

lin_train_pred = lin_svr.predict(X_train_scaled)
lin_test_pred = lin_svr.predict(X_test_scaled)
print("Train set mse: ", mean_squared_error(y_train, lin_train_pred))
print("Test set mse: ", mean_squared_error(y_test, lin_test_pred))

Train set mse:  11.235700085078577
Test set mse:  23.032563368100412


In [13]:
# Mean absolute error of the LinearSVR model on the train and test sets

from sklearn.metrics import mean_absolute_error

lin_train_pred = lin_svr.predict(X_train_scaled)
lin_test_pred = lin_svr.predict(X_test_scaled)
print("Train set mae: ", mean_absolute_error(y_train, lin_train_pred))
print("Test set mae: ", mean_absolute_error(y_test, lin_test_pred))

Train set mae:  2.4086147401588267
Test set mae:  3.107470318936552


In [14]:
# LinearSVR with default hyperparameters achieved only a modest R^2 score; next we try to improve it by using nonlinear kernels

In [15]:
# Model 2
# Sklearn SVR model with rbf kernel, tuned with a randomized hyperparameter search

from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal, uniform

# Search space for gamma and C.
# NOTE: scipy's uniform(loc, scale) spans [loc, loc + scale], so C is drawn from [1, 11].
param_distributions = {
    "gamma": reciprocal(0.001, 1.0),
    "C": uniform(1, 10),
}
rbf_rnd_search_cv = RandomizedSearchCV(
    SVR(),
    param_distributions,
    n_iter=30,
    n_jobs=6,
    verbose=5,
    cv=3,
    random_state=2,
)
rbf_rnd_search_cv.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


RandomizedSearchCV(cv=3, estimator=SVR(), n_iter=30, n_jobs=6,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f6d1aa89450>,
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f6d1ac0c510>},
                   random_state=2, verbose=5)

In [16]:
# Display the best estimator found by the rbf-kernel randomized search
rbf_rnd_search_cv.best_estimator_

SVR(C=10.645510800892552, gamma=0.03162460308853514)

In [17]:
# R^2 of the best rbf-kernel SVR on the train and test sets

best_rbf_svr = rbf_rnd_search_cv.best_estimator_
rbf_train_r2 = best_rbf_svr.score(X_train_scaled, y_train)
rbf_test_r2 = best_rbf_svr.score(X_test_scaled, y_test)
print("Train set R^2 score: ", rbf_train_r2)
print("Test set R^2 score: ", rbf_test_r2)

Train set R^2 score:  0.8863727134615561
Test set R^2 score:  0.7737639397310399


In [18]:
# Mean squared error of the best rbf-kernel SVR on the train and test sets

best_rbf_svr = rbf_rnd_search_cv.best_estimator_
rbf_train_pred = best_rbf_svr.predict(X_train_scaled)
rbf_test_pred = best_rbf_svr.predict(X_test_scaled)
print("Train set mse: ", mean_squared_error(y_train, rbf_train_pred))
print("Test set mse: ", mean_squared_error(y_test, rbf_test_pred))

Train set mse:  6.886158911238349
Test set mse:  13.952085298915401


In [19]:
# Mean absolute error of the best rbf-kernel SVR on the train and test sets

from sklearn.metrics import mean_absolute_error

best_rbf_svr = rbf_rnd_search_cv.best_estimator_
rbf_train_pred = best_rbf_svr.predict(X_train_scaled)
rbf_test_pred = best_rbf_svr.predict(X_test_scaled)
print("Train set mae: ", mean_absolute_error(y_train, rbf_train_pred))
print("Test set mae: ", mean_absolute_error(y_test, rbf_test_pred))

Train set mae:  1.7920386067257676
Test set mae:  2.639389638280142


In [20]:
# Model 3
# Sklearn SVR model with polynomial kernel (degree 3, coef0=1), tuned with a randomized search

from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal, uniform

# Search space for gamma and C.
# NOTE: scipy's uniform(loc, scale) spans [loc, loc + scale], so C is drawn from [1, 11].
poly_param_distributions = {
    "gamma": reciprocal(0.001, 1.0),
    "C": uniform(1, 10),
}
poly_svr = SVR(kernel='poly', degree=3, coef0=1)
poly_rnd_search_cv = RandomizedSearchCV(
    poly_svr,
    poly_param_distributions,
    n_iter=10,
    n_jobs=6,
    verbose=5,
    cv=3,
    random_state=2,
)
poly_rnd_search_cv.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


RandomizedSearchCV(cv=3, estimator=SVR(coef0=1, kernel='poly'), n_jobs=6,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f6de5e2ffd0>,
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f6de5e2fc50>},
                   random_state=2, verbose=5)

In [21]:
# Display the best estimator found by the polynomial-kernel randomized search
poly_rnd_search_cv.best_estimator_

SVR(C=3.046486340378425, coef0=1, gamma=0.07207968815585897, kernel='poly')

In [22]:
# R^2 of the best polynomial-kernel SVR on the train and test sets

best_poly_svr = poly_rnd_search_cv.best_estimator_
poly_train_r2 = best_poly_svr.score(X_train_scaled, y_train)
poly_test_r2 = best_poly_svr.score(X_test_scaled, y_test)
print("Train set R^2 score: ", poly_train_r2)
print("Test set R^2 score: ", poly_test_r2)

Train set R^2 score:  0.8963555161303869
Test set R^2 score:  0.8060757182871098


In [23]:
# Mean squared error of the best polynomial-kernel SVR on the train and test sets

best_poly_svr = poly_rnd_search_cv.best_estimator_
poly_train_pred = best_poly_svr.predict(X_train_scaled)
poly_test_pred = best_poly_svr.predict(X_test_scaled)
print("Train set mse: ", mean_squared_error(y_train, poly_train_pred))
print("Test set mse: ", mean_squared_error(y_test, poly_test_pred))

Train set mse:  6.281170728810485
Test set mse:  11.959402567267755


In [24]:
# Mean absolute error of the best polynomial-kernel SVR on the train and test sets

from sklearn.metrics import mean_absolute_error

best_poly_svr = poly_rnd_search_cv.best_estimator_
poly_train_pred = best_poly_svr.predict(X_train_scaled)
poly_test_pred = best_poly_svr.predict(X_test_scaled)
print("Train set mae: ", mean_absolute_error(y_train, poly_train_pred))
print("Test set mae: ", mean_absolute_error(y_test, poly_test_pred))

Train set mae:  1.6783775364825753
Test set mae:  2.387413969160702
