In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from sklearn.metrics import confusion_matrix, make_scorer, accuracy_score
import sklearn.metrics as metrics
from sklearn import model_selection, linear_model, preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import learning_curve, \
StratifiedKFold, train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.feature_selection import RFECV, SelectFromModel
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn import tree
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score, classification_report, average_precision_score, roc_curve, auc
from sklearn.naive_bayes import GaussianNB
%matplotlib inline

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Import data
car = pd.read_csv('car.data', names=['buying','maint','doors','persons','lug_boot','safety','class'])

In [4]:
# Convert to dummy variable
X = pd.get_dummies(car.iloc[:,:6])
y = car['class']

### Split Data

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=45)
f_measure_score_c = {'decision_tree':{},'knn':{},'logistic':{},'NB':{},'svm':{}}

### Decision Tree

In [6]:
# Set up possible values of parameters to optimize over
param_dict={'criterion':['gini','entropy'], 'max_depth':range(1,11), 'min_samples_leaf':range(1,5), 
            'min_samples_split':range(1,10)} 
d_tree = DecisionTreeClassifier(random_state=42)

grid_tree = GridSearchCV(d_tree, param_dict, cv=cv, n_jobs=-1, verbose=1)
grid_tree.fit(X_train,y_train)


y_pred_tree = grid_tree.predict(X_test)
nested_score_tree = cross_val_score(grid_tree, X=X, y=y, cv=cv) 
f_measure_score_c['decision_tree']['mean'] = np.mean(nested_score_tree)
f_measure_score_c['decision_tree']['std'] = np.std(nested_score_tree)

Fitting 10 folds for each of 720 candidates, totalling 7200 fits
Fitting 10 folds for each of 720 candidates, totalling 7200 fits
Fitting 10 folds for each of 720 candidates, totalling 7200 fits
Fitting 10 folds for each of 720 candidates, totalling 7200 fits
Fitting 10 folds for each of 720 candidates, totalling 7200 fits
Fitting 10 folds for each of 720 candidates, totalling 7200 fits
Fitting 10 folds for each of 720 candidates, totalling 7200 fits
Fitting 10 folds for each of 720 candidates, totalling 7200 fits
Fitting 10 folds for each of 720 candidates, totalling 7200 fits
Fitting 10 folds for each of 720 candidates, totalling 7200 fits
Fitting 10 folds for each of 720 candidates, totalling 7200 fits


In [7]:
print(classification_report(y_test,y_pred_tree))

              precision    recall  f1-score   support

         acc       0.94      0.88      0.91       129
        good       0.73      0.95      0.83        20
       unacc       0.98      0.99      0.99       397
       vgood       0.83      0.80      0.82        25

    accuracy                           0.96       571
   macro avg       0.87      0.91      0.89       571
weighted avg       0.96      0.96      0.96       571



In [8]:
# View best hyperparameters
grid_tree.best_params_

{'criterion': 'gini',
 'max_depth': 10,
 'min_samples_leaf': 1,
 'min_samples_split': 3}

### Logistic Regression

In [9]:
param_dict = {'C':[0.0001,0.001, 0.01, 1, 0.1, 10, 100, 1000], 'penalty':['l1','l2'],
              'solver':['lbfgs','sag','saga','newton-cg']}

logistic = linear_model.LogisticRegression(random_state=42)

grid_log = GridSearchCV(logistic, param_dict, cv=cv, n_jobs=-1, verbose=1)
grid_log.fit(X_train, y_train)

y_pred_log = grid_log.predict(X_test)
nested_score_log = cross_val_score(grid_log, X=X, y=y, cv=cv) 
f_measure_score_c['logistic']['mean'] = np.mean(nested_score_log)
f_measure_score_c['logistic']['std'] = np.std(nested_score_log)

Fitting 10 folds for each of 64 candidates, totalling 640 fits
Fitting 10 folds for each of 64 candidates, totalling 640 fits
Fitting 10 folds for each of 64 candidates, totalling 640 fits
Fitting 10 folds for each of 64 candidates, totalling 640 fits
Fitting 10 folds for each of 64 candidates, totalling 640 fits
Fitting 10 folds for each of 64 candidates, totalling 640 fits
Fitting 10 folds for each of 64 candidates, totalling 640 fits
Fitting 10 folds for each of 64 candidates, totalling 640 fits
Fitting 10 folds for each of 64 candidates, totalling 640 fits
Fitting 10 folds for each of 64 candidates, totalling 640 fits
Fitting 10 folds for each of 64 candidates, totalling 640 fits


In [10]:
print(classification_report(y_test,y_pred_log),'\n')

              precision    recall  f1-score   support

         acc       0.91      0.82      0.86       129
        good       0.72      0.90      0.80        20
       unacc       0.97      0.97      0.97       397
       vgood       0.80      0.96      0.87        25

    accuracy                           0.94       571
   macro avg       0.85      0.91      0.88       571
weighted avg       0.94      0.94      0.94       571
 



In [11]:
# View best hyperparameters
grid_log.best_params_

{'C': 100, 'penalty': 'l1', 'solver': 'saga'}

### KNN

In [12]:
param_dict = {'n_neighbors':list(range(1,31)), 'weights':['uniform', 'distance']}

knn = KNeighborsClassifier()

grid_knn = GridSearchCV(knn, param_dict, cv=cv, n_jobs=-1, verbose=1)
grid_knn.fit(X_train,y_train)

y_pred_knn = grid_knn.predict(X_test)
nested_score_knn = cross_val_score(grid_knn, X=X, y=y, cv=cv) 
f_measure_score_c['knn']['mean'] = np.mean(nested_score_knn)
f_measure_score_c['knn']['std'] = np.std(nested_score_knn)

Fitting 10 folds for each of 60 candidates, totalling 600 fits
Fitting 10 folds for each of 60 candidates, totalling 600 fits
Fitting 10 folds for each of 60 candidates, totalling 600 fits
Fitting 10 folds for each of 60 candidates, totalling 600 fits
Fitting 10 folds for each of 60 candidates, totalling 600 fits
Fitting 10 folds for each of 60 candidates, totalling 600 fits
Fitting 10 folds for each of 60 candidates, totalling 600 fits
Fitting 10 folds for each of 60 candidates, totalling 600 fits
Fitting 10 folds for each of 60 candidates, totalling 600 fits
Fitting 10 folds for each of 60 candidates, totalling 600 fits
Fitting 10 folds for each of 60 candidates, totalling 600 fits


In [13]:
print(classification_report(y_test,y_pred_knn))

              precision    recall  f1-score   support

         acc       0.80      0.86      0.83       129
        good       0.57      0.20      0.30        20
       unacc       0.96      0.99      0.98       397
       vgood       0.93      0.56      0.70        25

    accuracy                           0.92       571
   macro avg       0.82      0.65      0.70       571
weighted avg       0.91      0.92      0.91       571



In [14]:
# View best hyperparameters
grid_knn.best_params_

{'n_neighbors': 9, 'weights': 'distance'}

### Naive Bayes

In [15]:
nb = GaussianNB()
nb.fit(X_train, y_train)

y_pred_nb = nb.predict(X_test)
nested_score_nb = cross_val_score(nb, X=X, y=y, cv=cv) 
f_measure_score_c['NB']['mean'] = np.mean(nested_score_nb)
f_measure_score_c['NB']['std'] = np.std(nested_score_nb)

In [16]:
print(classification_report(y_test,y_pred_nb),'\n')

              precision    recall  f1-score   support

         acc       0.59      0.77      0.67       129
        good       0.44      0.85      0.58        20
       unacc       1.00      0.83      0.90       397
       vgood       0.68      1.00      0.81        25

    accuracy                           0.82       571
   macro avg       0.68      0.86      0.74       571
weighted avg       0.87      0.82      0.84       571
 



### SVM

In [17]:
param_dict = {'C':[0.1,1,100,1000],'kernel':['rbf','linear'], 'gamma':[1, 0.1, 0.01, 0.001]}

svm = SVC(probability=True)

grid_svm = GridSearchCV(svm, param_dict, cv=cv, n_jobs=-1, verbose=1)
grid_svm.fit(X_train, y_train)

y_pred_svm = grid_svm.predict(X_test)
nested_score_svm = cross_val_score(grid_svm, X=X, y=y, cv=cv) 
f_measure_score_c['svm']['mean'] = np.mean(nested_score_svm)
f_measure_score_c['svm']['std'] = np.std(nested_score_svm)

Fitting 10 folds for each of 32 candidates, totalling 320 fits
Fitting 10 folds for each of 32 candidates, totalling 320 fits
Fitting 10 folds for each of 32 candidates, totalling 320 fits
Fitting 10 folds for each of 32 candidates, totalling 320 fits
Fitting 10 folds for each of 32 candidates, totalling 320 fits
Fitting 10 folds for each of 32 candidates, totalling 320 fits
Fitting 10 folds for each of 32 candidates, totalling 320 fits
Fitting 10 folds for each of 32 candidates, totalling 320 fits
Fitting 10 folds for each of 32 candidates, totalling 320 fits
Fitting 10 folds for each of 32 candidates, totalling 320 fits
Fitting 10 folds for each of 32 candidates, totalling 320 fits


In [18]:
print(classification_report(y_test,y_pred_svm),'\n')

              precision    recall  f1-score   support

         acc       0.98      0.97      0.98       129
        good       0.86      0.95      0.90        20
       unacc       1.00      1.00      1.00       397
       vgood       0.92      0.92      0.92        25

    accuracy                           0.99       571
   macro avg       0.94      0.96      0.95       571
weighted avg       0.99      0.99      0.99       571
 



In [19]:
# View best hyperparameters
grid_svm.best_params_

{'C': 100, 'gamma': 0.1, 'kernel': 'rbf'}

### Models Comparision 

In [20]:
for a,b in f_measure_score_c.items():
    print(a, ': ', b)

decision_tree :  {'mean': 0.972788681274365, 'std': 0.011899684402510432}
knn :  {'mean': 0.9317112515123, 'std': 0.0171712971111848}
logistic :  {'mean': 0.9346014249227046, 'std': 0.01830887424272436}
NB :  {'mean': 0.8026582874042208, 'std': 0.03286866941083019}
svm :  {'mean': 0.9982591746202447, 'std': 0.0026591688489214986}


#### Result: SVM outperformed other models.  I used it as my final model to further know the hyperparameter and performance matrix.

In [21]:
grid_svm.fit(X_train, y_train)
y_pred_svm = grid_svm.predict(X_test)
print('best params: ', grid_svm.best_params_)
print('best score: ', grid_svm.best_score_)
print(confusion_matrix(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))

Fitting 10 folds for each of 32 candidates, totalling 320 fits
best params:  {'C': 100, 'gamma': 0.1, 'kernel': 'rbf'}
best score:  0.9982683658170914
[[125   3   0   1]
 [  0  19   0   1]
 [  0   0 397   0]
 [  2   0   0  23]]
              precision    recall  f1-score   support

         acc       0.98      0.97      0.98       129
        good       0.86      0.95      0.90        20
       unacc       1.00      1.00      1.00       397
       vgood       0.92      0.92      0.92        25

    accuracy                           0.99       571
   macro avg       0.94      0.96      0.95       571
weighted avg       0.99      0.99      0.99       571

