In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

In [3]:
auto = pd.read_csv('Auto.csv')
auto.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [4]:
auto['mpg_high']= np.where(auto.mpg >= np.median(auto['mpg']),1,0)
auto.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name,mpg_high
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu,0
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320,0
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite,0
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst,0
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino,0


#### Logistic Regression

In [5]:
#Convert non-numbers to Nans
auto['horsepower']= pd.to_numeric(auto['horsepower'],errors = 'coerce')
#Drop NaNs
auto = auto.dropna()


In [6]:
auto_log = pd.get_dummies(auto,columns=['origin'])
auto_log = auto_log.drop(['name'], axis=1)

auto_log['const']=1

auto_log.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,mpg_high,origin_1,origin_2,origin_3,const
0,18.0,8,307.0,130.0,3504,12.0,70,0,1,0,0,1
1,15.0,8,350.0,165.0,3693,11.5,70,0,1,0,0,1
2,18.0,8,318.0,150.0,3436,11.0,70,0,1,0,0,1
3,16.0,8,304.0,150.0,3433,12.0,70,0,1,0,0,1
4,17.0,8,302.0,140.0,3449,10.5,70,0,1,0,0,1


In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, LeaveOneOut, KFold
from sklearn import metrics 
from sklearn.metrics import classification_report, mean_squared_error
from pylab import rcParams
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

xvals = auto_log[['const','cylinders','displacement','horsepower',
              'weight','acceleration','year','origin_1','origin_2']].values
yvals = auto_log['mpg_high'].values


In [25]:
kf = KFold(n_splits=4, random_state=25, shuffle=True)
kf.get_n_splits(xvals)

MSE_vec_kf = np.zeros(4)
ones_error = np.zeros(4)
zeros_error= np.zeros(4)
k_ind = int(0)
for train_index, test_index in kf.split(xvals):
    # print("TRAIN:", train_index, "TEST:", test_index)
    # print('k index=', k_ind)
    X_train, X_test = xvals[train_index], xvals[test_index]
    y_train, y_test = yvals[train_index], yvals[test_index]
    LogReg = LogisticRegression(max_iter=300)
    LogReg.fit(X_train, y_train)
    y_pred = LogReg.predict(X_test)
    MSE_vec_kf[k_ind] = ((y_test - y_pred) ** 2).mean()
    ones_error[k_ind] = 1-((np.sum((y_test==1) & (y_pred==1)))/(np.sum(y_test==1)))
    zeros_error[k_ind] = 1- ((np.sum((y_test==0) & (y_pred==0)))/(np.sum(y_test==0)))
    # print('MSE for test set', k_ind, ' is', MSE_vec_kf[k_ind])
    k_ind += 1

MSE_logreg_kf = MSE_vec_kf.mean()
print('Test MSE Log Reg and K-fold=', MSE_logreg_kf)
print('Average Error Rate for mpg high = 1: ',ones_error.mean())
print('Average Error Rate for mpg high = 0: ',zeros_error.mean())

Test MSE Log Reg and K-fold= 0.09948979591836735
Average Error Rate for mpg high = 1:  0.07794684205076571
Average Error Rate for mpg high = 0:  0.1193288810332874




#### Random Forest

In [26]:
auto_df = auto.drop(['name'], axis=1)

In [27]:
Xvars= auto_df[['mpg','cylinders','displacement','horsepower','weight','acceleration','year','origin']]
yvars= auto_df[['mpg_high']]

In [28]:
#Create training and test data
X_train, X_test, y_train, y_test = train_test_split(Xvars, yvars, test_size=0.4)

In [29]:
#Create first Random Forest object
auto_rf = RandomForestClassifier()
auto_rf.fit(X_train,y_train)

  This is separate from the ipykernel package so we can avoid doing imports until


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [30]:
#Tune hyperparameters
param_rf = {'n_estimators': [10,200],
                   'max_depth': [3, 8],'min_samples_split': sp_randint(2, 20),
                   'min_samples_leaf': sp_randint(2, 20),'max_features':sp_randint(1,8)}

auto_tune = RandomizedSearchCV(auto_rf, param_distributions=param_rf,
                                    n_iter=100, n_jobs=-1, cv=4,
                                   random_state=25,scoring='neg_mean_squared_error')

In [31]:
auto_tune.fit(X_train,y_train)
print('TunedBestEstimator1=', auto_tune.best_estimator_)
print('TunedBestParams1=', auto_tune.best_params_)
print('TunedBestScore1=', -auto_tune.best_score_)

TunedBestEstimator1= RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features=7, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=17, min_samples_split=14,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
TunedBestParams1= {'max_depth': 3, 'max_features': 7, 'min_samples_leaf': 17, 'min_samples_split': 14, 'n_estimators': 10}
TunedBestScore1= -0.0


  self.best_estimator_.fit(X, y, **fit_params)


In [32]:
#Run new Random Forest with optimal parameters

auto_rf_opt = RandomForestRegressor(n_estimators=10, max_depth=3, min_samples_split=14, min_samples_leaf=17,
                                     max_features=7, bootstrap=True,
                                     n_jobs=-1, oob_score=True, random_state=25)


auto_rf_opt.fit(X_train,y_train)

y_testpred_rf = auto_rf_opt.predict(X_test)
MSE_randomforest = mean_squared_error(y_test, y_testpred_rf)
print("Test MSE - Random Forest",MSE_randomforest)

  
  warn("Some inputs do not have OOB scores. "


Test MSE - Random Forest 0.002789243878912638


##### SVM

In [33]:
from scipy.stats import uniform as sp_uniform
from sklearn import svm

In [34]:
#Create first SVC object
svc_auto = svm.SVC()
svc_auto.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [35]:
#Tune hyperparameters
param_svc = {'C': sp_uniform(loc=0.2, scale=4.0),
             'gamma': ['scale', 'auto'],
             'shrinking': [True, False]}

svc_tune = RandomizedSearchCV(svc_auto, param_distributions=param_svc,
                                    n_iter=100, n_jobs=-1, cv=4,
                                   random_state=25,scoring='neg_mean_squared_error')

svc_tune.fit(X_train,y_train)

print('TunedBestEstimator1=', svc_tune.best_estimator_)
print('TunedBestParams1=', svc_tune.best_params_)
print('TunedBestScore1=', -svc_tune.best_score_)

TunedBestEstimator1= SVC(C=0.3377990724342859, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=False,
  tol=0.001, verbose=False)
TunedBestParams1= {'C': 0.3377990724342859, 'gamma': 'scale', 'shrinking': False}
TunedBestScore1= 0.10212765957446808


  y = column_or_1d(y, warn=True)


In [36]:
#Run with optimal parameters

svc_opt = svc_auto = svm.SVC(kernel='rbf', gamma='scale', C=3.23683,shrinking=False)
svc_opt.fit(X_train,y_train)

y_testpred_sv = svc_opt.predict(X_test)
MSE_svm = mean_squared_error(y_test, y_testpred_sv)
print("Test MSE",MSE_svm)


Test MSE 0.14012738853503184


  y = column_or_1d(y, warn=True)


In [37]:
results = pd.DataFrame({'Logistic Regression (a)':MSE_logreg_kf,
                        'Random Forest (b)':MSE_randomforest,
                        'Support Vector Machines (c)':MSE_svm},
                       index=['MSE'])
results

Unnamed: 0,Logistic Regression (a),Random Forest (b),Support Vector Machines (c)
MSE,0.09949,0.002789,0.140127
