In [99]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from sklearn.metrics import confusion_matrix, make_scorer, accuracy_score
import sklearn.metrics as metrics
from sklearn import model_selection, linear_model, preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import learning_curve, \
StratifiedKFold, train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.feature_selection import RFECV, SelectFromModel
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn import tree
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score, classification_report, average_precision_score, roc_curve, auc
from sklearn.naive_bayes import GaussianNB
%matplotlib inline

## Numeric Attempt

In [72]:
# Import data
car = pd.read_csv('car.data', names=['buying','maint','doors','persons','lug_boot','safety','class'])
car

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
...,...,...,...,...,...,...,...
1723,low,low,5more,more,med,med,good
1724,low,low,5more,more,med,high,vgood
1725,low,low,5more,more,big,low,unacc
1726,low,low,5more,more,big,med,good


In [73]:
# As all the columns are categorical, check for unique values of each column
# Check categorical values in each column
for i in car.columns:
    print(car[i].unique(),':',car[i].nunique())

['vhigh' 'high' 'med' 'low'] : 4
['vhigh' 'high' 'med' 'low'] : 4
['2' '3' '4' '5more'] : 4
['2' '4' 'more'] : 3
['small' 'med' 'big'] : 3
['low' 'med' 'high'] : 3
['unacc' 'acc' 'vgood' 'good'] : 4


In [74]:
# Check missing value
car.isna().any()

buying      False
maint       False
doors       False
persons     False
lug_boot    False
safety      False
class       False
dtype: bool

In [75]:
# Convert to numeric variable
car.replace({'buying':{'low':0,'med':1/3,'high':2/3, 'vhigh':1}},inplace=True)
car.replace({'maint':{'low':0,'med':1/3,'high':2/3, 'vhigh':1}},inplace=True)
car.replace({'doors':{'2':0,'3':1/3,'4':2/3,'5more':1}},inplace=True)
car.replace({'persons':{'2':0,'4':0.5,'more':1}},inplace=True)
car.replace({'lug_boot':{'small':0,'med':0.5,'big':1}},inplace=True)
car.replace({'safety':{'low':0,'med':0.5,'high':1}},inplace=True)
#car.replace({'class':{'unacc':0,'acc':1/3,'good':2/3,'vgood':1}},inplace=True)
car.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,1.0,1.0,0.0,0.0,0.0,0.0,unacc
1,1.0,1.0,0.0,0.0,0.0,0.5,unacc
2,1.0,1.0,0.0,0.0,0.0,1.0,unacc
3,1.0,1.0,0.0,0.0,0.5,0.0,unacc
4,1.0,1.0,0.0,0.0,0.5,0.5,unacc


### Split Data

In [88]:
X = car.iloc[:,:6]
y = car['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=45)
f_measure_score = {'decision_tree':{},'knn':{},'logistic':{},'NB':{},'svm':{}}

### Decision Tree

In [182]:
# Set up possible values of parameters to optimize over
param_dict={'criterion':['gini','entropy'], 'max_depth':range(1,11), 'min_samples_leaf':range(1,5), 
            'min_samples_split':range(1,10)} 
d_tree = DecisionTreeClassifier(random_state=42)

grid_tree = GridSearchCV(d_tree, param_dict, cv=cv, n_jobs=-1, verbose=1)
grid_tree.fit(X_train,y_train)


y_pred_tree = grid_tree.predict(X_test)
nested_score_tree = cross_val_score(grid_tree, X=X, y=y, cv=cv) 
f_measure_score['decision_tree']['mean'] = np.mean(nested_score_tree)
f_measure_score['decision_tree']['std'] = np.std(nested_score_tree)

Fitting 10 folds for each of 720 candidates, totalling 7200 fits


 0.70268366 0.70268366 0.70268366        nan 0.70268366 0.70268366
 0.70268366 0.70268366 0.70268366 0.70268366 0.70268366 0.70268366
        nan 0.70268366 0.70268366 0.70268366 0.70268366 0.70268366
 0.70268366 0.70268366 0.70268366        nan 0.70268366 0.70268366
 0.70268366 0.70268366 0.70268366 0.70268366 0.70268366 0.70268366
        nan 0.76653673 0.76653673 0.76653673 0.76653673 0.76653673
 0.76653673 0.76653673 0.76653673        nan 0.76653673 0.76653673
 0.76653673 0.76653673 0.76653673 0.76653673 0.76653673 0.76653673
        nan 0.76653673 0.76653673 0.76653673 0.76653673 0.76653673
 0.76653673 0.76653673 0.76653673        nan 0.76653673 0.76653673
 0.76653673 0.76653673 0.76653673 0.76653673 0.76653673 0.76653673
        nan 0.78988756 0.78988756 0.78988756 0.78988756 0.78988756
 0.78988756 0.78988756 0.78988756        nan 0.78988756 0.78988756
 0.78988756 0.78988756 0.78988756 0.78988756 0.78988756 0.78988756
        nan 0.78988756 0.78988756 0.78988756 0.78988756 0.7898

Fitting 10 folds for each of 720 candidates, totalling 7200 fits


 0.70032672 0.70032672 0.70032672        nan 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
        nan 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672        nan 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
        nan 0.77682796 0.77682796 0.77682796 0.77682796 0.77682796
 0.77682796 0.77682796 0.77682796        nan 0.77682796 0.77682796
 0.77682796 0.77682796 0.77682796 0.77682796 0.77682796 0.77682796
        nan 0.77682796 0.77682796 0.77682796 0.77682796 0.77682796
 0.77682796 0.77682796 0.77682796        nan 0.77682796 0.77682796
 0.77682796 0.77682796 0.77682796 0.77682796 0.77682796 0.77682796
        nan 0.79033085 0.79033085 0.79033085 0.79033085 0.79033085
 0.79033085 0.79033085 0.79033085        nan 0.79033085 0.79033085
 0.79033085 0.79033085 0.79033085 0.79033085 0.79033085 0.79033085
        nan 0.79033085 0.79033085 0.79033085 0.79033085 0.7903

Fitting 10 folds for each of 720 candidates, totalling 7200 fits


 0.70032672 0.70032672 0.70032672        nan 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
        nan 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672        nan 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
        nan 0.78330025 0.78330025 0.78330025 0.78330025 0.78330025
 0.78330025 0.78330025 0.78330025        nan 0.78330025 0.78330025
 0.78330025 0.78330025 0.78330025 0.78330025 0.78330025 0.78330025
        nan 0.78330025 0.78330025 0.78330025 0.78330025 0.78330025
 0.78330025 0.78330025 0.78330025        nan 0.78330025 0.78330025
 0.78330025 0.78330025 0.78330025 0.78330025 0.78330025 0.78330025
        nan 0.79420182 0.79420182 0.79420182 0.79420182 0.79420182
 0.79420182 0.79420182 0.79420182        nan 0.79420182 0.79420182
 0.79420182 0.79420182 0.79420182 0.79420182 0.79420182 0.79420182
        nan 0.79420182 0.79420182 0.79420182 0.79420182 0.7942

Fitting 10 folds for each of 720 candidates, totalling 7200 fits


 0.70032672 0.70032672 0.70032672        nan 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
        nan 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672        nan 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
        nan 0.78073615 0.78073615 0.78073615 0.78073615 0.78073615
 0.78073615 0.78073615 0.78073615        nan 0.78073615 0.78073615
 0.78073615 0.78073615 0.78073615 0.78073615 0.78073615 0.78073615
        nan 0.78073615 0.78073615 0.78073615 0.78073615 0.78073615
 0.78073615 0.78073615 0.78073615        nan 0.78073615 0.78073615
 0.78073615 0.78073615 0.78073615 0.78073615 0.78073615 0.78073615
        nan 0.79421836 0.79421836 0.79421836 0.79421836 0.79421836
 0.79421836 0.79421836 0.79421836        nan 0.79421836 0.79421836
 0.79421836 0.79421836 0.79421836 0.79421836 0.79421836 0.79421836
        nan 0.79421836 0.79421836 0.79421836 0.79421836 0.7942

Fitting 10 folds for each of 720 candidates, totalling 7200 fits


 0.70032672 0.70032672 0.70032672        nan 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
        nan 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672        nan 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
        nan 0.78133581 0.78133581 0.78133581 0.78133581 0.78133581
 0.78133581 0.78133581 0.78133581        nan 0.78133581 0.78133581
 0.78133581 0.78133581 0.78133581 0.78133581 0.78133581 0.78133581
        nan 0.78133581 0.78133581 0.78133581 0.78133581 0.78133581
 0.78133581 0.78133581 0.78133581        nan 0.78133581 0.78133581
 0.78133581 0.78133581 0.78133581 0.78133581 0.78133581 0.78133581
        nan 0.79356493 0.79356493 0.79356493 0.79356493 0.79356493
 0.79356493 0.79356493 0.79356493        nan 0.79356493 0.79356493
 0.79356493 0.79356493 0.79356493 0.79356493 0.79356493 0.79356493
        nan 0.79356493 0.79356493 0.79356493 0.79356493 0.7935

Fitting 10 folds for each of 720 candidates, totalling 7200 fits


 0.70032672 0.70032672 0.70032672        nan 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
        nan 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672        nan 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
        nan 0.77813896 0.77813896 0.77813896 0.77813896 0.77813896
 0.77813896 0.77813896 0.77813896        nan 0.77813896 0.77813896
 0.77813896 0.77813896 0.77813896 0.77813896 0.77813896 0.77813896
        nan 0.77813896 0.77813896 0.77813896 0.77813896 0.77813896
 0.77813896 0.77813896 0.77813896        nan 0.77813896 0.77813896
 0.77813896 0.77813896 0.77813896 0.77813896 0.77813896 0.77813896
        nan 0.78971878 0.78971878 0.78971878 0.78971878 0.78971878
 0.78971878 0.78971878 0.78971878        nan 0.78971878 0.78971878
 0.78971878 0.78971878 0.78971878 0.78971878 0.78971878 0.78971878
        nan 0.78971878 0.78971878 0.78971878 0.78971878 0.7897

Fitting 10 folds for each of 720 candidates, totalling 7200 fits


 0.70032672 0.70032672 0.70032672        nan 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
        nan 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672        nan 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
        nan 0.77299421 0.77299421 0.77299421 0.77299421 0.77299421
 0.77299421 0.77299421 0.77299421        nan 0.77299421 0.77299421
 0.77299421 0.77299421 0.77299421 0.77299421 0.77299421 0.77299421
        nan 0.77299421 0.77299421 0.77299421 0.77299421 0.77299421
 0.77299421 0.77299421 0.77299421        nan 0.77299421 0.77299421
 0.77299421 0.77299421 0.77299421 0.77299421 0.77299421 0.77299421
        nan 0.79035567 0.79035567 0.79035567 0.79035567 0.79035567
 0.79035567 0.79035567 0.79035567        nan 0.79035567 0.79035567
 0.79035567 0.79035567 0.79035567 0.79035567 0.79035567 0.79035567
        nan 0.79035567 0.79035567 0.79035567 0.79035567 0.7903

Fitting 10 folds for each of 720 candidates, totalling 7200 fits


 0.70032672 0.70032672 0.70032672        nan 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
        nan 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672        nan 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
        nan 0.77232837 0.77232837 0.77232837 0.77232837 0.77232837
 0.77232837 0.77232837 0.77232837        nan 0.77232837 0.77232837
 0.77232837 0.77232837 0.77232837 0.77232837 0.77232837 0.77232837
        nan 0.77232837 0.77232837 0.77232837 0.77232837 0.77232837
 0.77232837 0.77232837 0.77232837        nan 0.77232837 0.77232837
 0.77232837 0.77232837 0.77232837 0.77232837 0.77232837 0.77232837
        nan 0.79035567 0.79035567 0.79035567 0.79035567 0.79035567
 0.79035567 0.79035567 0.79035567        nan 0.79035567 0.79035567
 0.79035567 0.79035567 0.79035567 0.79035567 0.79035567 0.79035567
        nan 0.79035567 0.79035567 0.79035567 0.79035567 0.7903

Fitting 10 folds for each of 720 candidates, totalling 7200 fits


 0.70032672 0.70032672 0.70032672        nan 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
        nan 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672        nan 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
        nan 0.77941687 0.77941687 0.77941687 0.77941687 0.77941687
 0.77941687 0.77941687 0.77941687        nan 0.77941687 0.77941687
 0.77941687 0.77941687 0.77941687 0.77941687 0.77941687 0.77941687
        nan 0.77941687 0.77941687 0.77941687 0.77941687 0.77941687
 0.77941687 0.77941687 0.77941687        nan 0.77941687 0.77941687
 0.77941687 0.77941687 0.77941687 0.77941687 0.77941687 0.77941687
        nan 0.79097601 0.79097601 0.79097601 0.79097601 0.79097601
 0.79097601 0.79097601 0.79097601        nan 0.79097601 0.79097601
 0.79097601 0.79097601 0.79097601 0.79097601 0.79097601 0.79097601
        nan 0.79097601 0.79097601 0.79097601 0.79097601 0.7909

Fitting 10 folds for each of 720 candidates, totalling 7200 fits


 0.69987593 0.69987593 0.69987593        nan 0.69987593 0.69987593
 0.69987593 0.69987593 0.69987593 0.69987593 0.69987593 0.69987593
        nan 0.69987593 0.69987593 0.69987593 0.69987593 0.69987593
 0.69987593 0.69987593 0.69987593        nan 0.69987593 0.69987593
 0.69987593 0.69987593 0.69987593 0.69987593 0.69987593 0.69987593
        nan 0.77634409 0.77634409 0.77634409 0.77634409 0.77634409
 0.77634409 0.77634409 0.77634409        nan 0.77634409 0.77634409
 0.77634409 0.77634409 0.77634409 0.77634409 0.77634409 0.77634409
        nan 0.77634409 0.77634409 0.77634409 0.77634409 0.77634409
 0.77634409 0.77634409 0.77634409        nan 0.77634409 0.77634409
 0.77634409 0.77634409 0.77634409 0.77634409 0.77634409 0.77634409
        nan 0.78986766 0.78986766 0.78986766 0.78986766 0.78986766
 0.78986766 0.78986766 0.78986766        nan 0.78986766 0.78986766
 0.78986766 0.78986766 0.78986766 0.78986766 0.78986766 0.78986766
        nan 0.78986766 0.78986766 0.78986766 0.78986766 0.7898

Fitting 10 folds for each of 720 candidates, totalling 7200 fits


 0.69987593 0.69987593 0.69987593        nan 0.69987593 0.69987593
 0.69987593 0.69987593 0.69987593 0.69987593 0.69987593 0.69987593
        nan 0.69987593 0.69987593 0.69987593 0.69987593 0.69987593
 0.69987593 0.69987593 0.69987593        nan 0.69987593 0.69987593
 0.69987593 0.69987593 0.69987593 0.69987593 0.69987593 0.69987593
        nan 0.77635649 0.77635649 0.77635649 0.77635649 0.77635649
 0.77635649 0.77635649 0.77635649        nan 0.77635649 0.77635649
 0.77635649 0.77635649 0.77635649 0.77635649 0.77635649 0.77635649
        nan 0.77635649 0.77635649 0.77635649 0.77635649 0.77635649
 0.77635649 0.77635649 0.77635649        nan 0.77635649 0.77635649
 0.77635649 0.77635649 0.77635649 0.77635649 0.77635649 0.77635649
        nan 0.79305624 0.79305624 0.79305624 0.79305624 0.79305624
 0.79305624 0.79305624 0.79305624        nan 0.79305624 0.79305624
 0.79305624 0.79305624 0.79305624 0.79305624 0.79305624 0.79305624
        nan 0.79305624 0.79305624 0.79305624 0.79305624 0.7930

In [183]:
print(classification_report(y_test,y_pred_tree))

              precision    recall  f1-score   support

         acc       0.94      0.88      0.91       129
        good       0.73      0.95      0.83        20
       unacc       0.98      0.99      0.99       397
       vgood       0.83      0.80      0.82        25

    accuracy                           0.96       571
   macro avg       0.87      0.91      0.89       571
weighted avg       0.96      0.96      0.96       571



In [91]:
# View best hyperparameters
grid_tree.best_params_

{'criterion': 'entropy',
 'max_depth': 10,
 'min_samples_leaf': 1,
 'min_samples_split': 2}

### Logistic Regression

In [92]:
param_dict = {'C':[0.0001,0.001, 0.01, 1, 0.1, 10, 100, 1000], 'penalty':['l1','l2'],
              'solver':['lbfgs','sag','saga','newton-cg']}

logistic = linear_model.LogisticRegression(random_state=42)

grid_log = GridSearchCV(logistic, param_dict, cv=cv, n_jobs=-1, verbose=1)
grid_log.fit(X_train, y_train)

y_pred_log = grid_log.predict(X_test)
nested_score_log = cross_val_score(grid_log, X=X, y=y, cv=cv) 
f_measure_score['logistic']['mean'] = np.mean(nested_score_log)
f_measure_score['logistic']['std'] = np.std(nested_score_log)

Fitting 10 folds for each of 64 candidates, totalling 640 fits


 0.70268366 0.70268366        nan        nan 0.70268366        nan
 0.70268366 0.70268366 0.70268366 0.70268366        nan        nan
 0.70268366        nan 0.70268366 0.70268366 0.70268366 0.70268366
        nan        nan 0.81763868        nan 0.81676162 0.81676162
 0.81676162 0.81676162        nan        nan 0.7744003         nan
 0.76493253 0.76493253 0.76493253 0.76493253        nan        nan
 0.82107196        nan 0.82194903 0.82194903 0.82194903 0.82194903
        nan        nan 0.8228036         nan 0.82194153 0.82194153
 0.8228036  0.82194153        nan        nan 0.8228036         nan
 0.82193403 0.82193403 0.8228036  0.82193403]


Fitting 10 folds for each of 64 candidates, totalling 640 fits


 0.70032672 0.70032672        nan        nan 0.70032672        nan
 0.70032672 0.70032672 0.70032672 0.70032672        nan        nan
 0.70032672        nan 0.70418528 0.70418528 0.70418528 0.70418528
        nan        nan 0.82383788        nan 0.81933002 0.81933002
 0.81933002 0.81933002        nan        nan 0.78137304        nan
 0.76786187 0.76786187 0.76786187 0.76786187        nan        nan
 0.83090984        nan 0.82897849 0.82897849 0.82897849 0.82897849
        nan        nan 0.83155087        nan 0.83219603 0.83219603
 0.83284119 0.83219603        nan        nan 0.83155087        nan
 0.83155087 0.83155087 0.83155087 0.83155087]


Fitting 10 folds for each of 64 candidates, totalling 640 fits


 0.70032672 0.70032672        nan        nan 0.70032672        nan
 0.70032672 0.70032672 0.70032672 0.70032672        nan        nan
 0.70032672        nan 0.70546319 0.70546319 0.70546319 0.70546319
        nan        nan 0.82767577        nan 0.82443342 0.82443342
 0.82443342 0.82443342        nan        nan 0.79162531        nan
 0.7742804  0.7742804  0.7742804  0.7742804         nan        nan
 0.83218776        nan 0.83218362 0.83153846 0.83218362 0.83218362
        nan        nan 0.83411911        nan 0.83347395 0.83347395
 0.83283292 0.83347395        nan        nan 0.83411911        nan
 0.83347395 0.83411911 0.83411911 0.83347395]


Fitting 10 folds for each of 64 candidates, totalling 640 fits


 0.70032672 0.70032672        nan        nan 0.70032672        nan
 0.70032672 0.70032672 0.70032672 0.70032672        nan        nan
 0.70032672        nan 0.70483457 0.70483457 0.70483457 0.70483457
        nan        nan 0.82186931        nan 0.81737386 0.81737386
 0.81737386 0.81737386        nan        nan 0.78522333        nan
 0.76722911 0.76722911 0.76722911 0.76722911        nan        nan
 0.8302316         nan 0.82766336 0.82830438 0.82766336 0.82766336
        nan        nan 0.8302316         nan 0.83087262 0.83087262
 0.83087262 0.83087262        nan        nan 0.8302316         nan
 0.8302316  0.8302316  0.8302316  0.8302316 ]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = 

Fitting 10 folds for each of 64 candidates, totalling 640 fits


 0.70032672 0.70032672        nan        nan 0.70032672        nan
 0.70032672 0.70032672 0.70032672 0.70032672        nan        nan
 0.70032672        nan 0.7054756  0.7054756  0.7054756  0.7054756
        nan        nan 0.83146816        nan 0.8237469  0.82310587
 0.8237469  0.8237469         nan        nan 0.78708437        nan
 0.77038875 0.77038875 0.77038875 0.77038875        nan        nan
 0.83402399        nan 0.8333871  0.8333871  0.8333871  0.8333871
        nan        nan 0.83531431        nan 0.83531431 0.83531431
 0.83531431 0.83531431        nan        nan 0.83531431        nan
 0.83466915 0.83466915 0.83531431 0.83466915]


Fitting 10 folds for each of 64 candidates, totalling 640 fits


 0.70032672 0.70032672        nan        nan 0.70032672        nan
 0.70032672 0.70032672 0.70032672 0.70032672        nan        nan
 0.70032672        nan 0.70418941 0.70418941 0.70418941 0.70418941
        nan        nan 0.82508685        nan 0.82251861 0.82251861
 0.82251861 0.82251861        nan        nan 0.78457403        nan
 0.76657568 0.76657568 0.76657568 0.76657568        nan        nan
 0.83215054        nan 0.8302316  0.8302316  0.8302316  0.8302316
        nan        nan 0.83215054        nan 0.8327957  0.8327957
 0.8327957  0.8327957         nan        nan 0.83215054        nan
 0.83215054 0.83215054 0.83215054 0.83215054]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _c

Fitting 10 folds for each of 64 candidates, totalling 640 fits


 0.70032672 0.70032672        nan        nan 0.70032672        nan
 0.70032672 0.70032672 0.70032672 0.70032672        nan        nan
 0.70032672        nan 0.70675352 0.70675352 0.70675352 0.70675352
        nan        nan 0.81872622        nan 0.81742763 0.81742763
 0.81742763 0.81742763        nan        nan 0.78204301        nan
 0.76855252 0.76855252 0.76855252 0.76855252        nan        nan
 0.82578991        nan 0.82578991 0.82578991 0.82578991 0.82578991
        nan        nan 0.82643093        nan 0.82643093 0.82643093
 0.82643093 0.82643093        nan        nan 0.82643093        nan
 0.82643093 0.82643093 0.82643093 0.82643093]


Fitting 10 folds for each of 64 candidates, totalling 640 fits


 0.70032672 0.70032672        nan        nan 0.70032672        nan
 0.70032672 0.70032672 0.70032672 0.70032672        nan        nan
 0.70032672        nan 0.70676592 0.70676592 0.70676592 0.70676592
        nan        nan 0.8231555         nan 0.81672457 0.81672457
 0.81672457 0.81672457        nan        nan 0.7807196         nan
 0.76465674 0.76465674 0.76465674 0.76465674        nan        nan
 0.82765095        nan 0.82507858 0.82507858 0.82507858 0.82507858
        nan        nan 0.82829611        nan 0.82829611 0.82829611
 0.82829611 0.82829611        nan        nan 0.82829611        nan
 0.82765509 0.82765509 0.82829611 0.82765509]


Fitting 10 folds for each of 64 candidates, totalling 640 fits


 0.70032672 0.70032672        nan        nan 0.70032672        nan
 0.70032672 0.70032672 0.70032672 0.70032672        nan        nan
 0.70032672        nan 0.7080397  0.7080397  0.7080397  0.7080397
        nan        nan 0.82254342        nan 0.81545906 0.81545906
 0.81545906 0.81545906        nan        nan 0.78458644        nan
 0.77235732 0.77235732 0.77235732 0.77235732        nan        nan
 0.82640612        nan 0.82576096 0.82576096 0.82576096 0.82576096
        nan        nan 0.82897436        nan 0.82897436 0.82897436
 0.82897436 0.82897436        nan        nan 0.82897436        nan
 0.82769231 0.82833333 0.82897436 0.82769231]


Fitting 10 folds for each of 64 candidates, totalling 640 fits


 0.69987593 0.69987593        nan        nan 0.69987593        nan
 0.69987593 0.69987593 0.69987593 0.69987593        nan        nan
 0.69987593        nan 0.70630687 0.70630687 0.70630687 0.70630687
        nan        nan 0.8252316         nan 0.82909016 0.82909016
 0.82909016 0.82909016        nan        nan 0.78795285        nan
 0.77188999 0.77188999 0.77188999 0.77188999        nan        nan
 0.83616625        nan 0.83360215 0.83360215 0.83360215 0.83360215
        nan        nan 0.83617039        nan 0.83617039 0.83617039
 0.83552523 0.83617039        nan        nan 0.83617039        nan
 0.83617039 0.83617039 0.83617039 0.83552936]


Fitting 10 folds for each of 64 candidates, totalling 640 fits


 0.69987593 0.69987593        nan        nan 0.69987593        nan
 0.69987593 0.69987593 0.69987593 0.69987593        nan        nan
 0.69987593        nan 0.70630687 0.70630687 0.70630687 0.70630687
        nan        nan 0.83035567        nan 0.82650538 0.82650538
 0.82650538 0.82650538        nan        nan 0.78731183        nan
 0.77509926 0.77509926 0.77509926 0.77509926        nan        nan
 0.83228288        nan 0.83099256 0.83099256 0.83099256 0.83099256
        nan        nan 0.8329239         nan 0.83228288 0.83228288
 0.83228288 0.83228288        nan        nan 0.83356907        nan
 0.83356907 0.83356907 0.83356907 0.83356907]


In [93]:
print(classification_report(y_test,y_pred_log),'\n')

              precision    recall  f1-score   support

         acc       0.68      0.58      0.63       129
        good       0.67      0.50      0.57        20
       unacc       0.88      0.93      0.90       397
       vgood       0.73      0.76      0.75        25

    accuracy                           0.83       571
   macro avg       0.74      0.69      0.71       571
weighted avg       0.82      0.83      0.82       571
 



In [94]:
# View best hyperparameters
grid_log.best_params_

{'C': 100, 'penalty': 'l1', 'solver': 'saga'}

### KNN

In [96]:
param_dict = {'n_neighbors':list(range(1,31)), 'weights':['uniform', 'distance']}

knn = KNeighborsClassifier()

grid_knn = GridSearchCV(knn, param_dict, cv=cv, n_jobs=-1, verbose=1)
grid_knn.fit(X_train,y_train)

y_pred_knn = grid_knn.predict(X_test)
nested_score_knn = cross_val_score(grid_knn, X=X, y=y, cv=cv) 
f_measure_score['knn']['mean'] = np.mean(nested_score_knn)
f_measure_score['knn']['std'] = np.std(nested_score_knn)

Fitting 10 folds for each of 60 candidates, totalling 600 fits
Fitting 10 folds for each of 60 candidates, totalling 600 fits
Fitting 10 folds for each of 60 candidates, totalling 600 fits
Fitting 10 folds for each of 60 candidates, totalling 600 fits
Fitting 10 folds for each of 60 candidates, totalling 600 fits
Fitting 10 folds for each of 60 candidates, totalling 600 fits
Fitting 10 folds for each of 60 candidates, totalling 600 fits
Fitting 10 folds for each of 60 candidates, totalling 600 fits
Fitting 10 folds for each of 60 candidates, totalling 600 fits
Fitting 10 folds for each of 60 candidates, totalling 600 fits
Fitting 10 folds for each of 60 candidates, totalling 600 fits


In [97]:
print(classification_report(y_test,y_pred_knn))

              precision    recall  f1-score   support

         acc       0.90      0.91      0.91       129
        good       0.83      0.75      0.79        20
       unacc       0.98      0.99      0.98       397
       vgood       1.00      0.80      0.89        25

    accuracy                           0.96       571
   macro avg       0.93      0.86      0.89       571
weighted avg       0.96      0.96      0.96       571



In [98]:
# View best hyperparameters
grid_knn.best_params_

{'n_neighbors': 15, 'weights': 'distance'}

### Naive Bayes

In [102]:
nb = GaussianNB()
nb.fit(X_train, y_train)

y_pred_nb = nb.predict(X_test)
nested_score_nb = cross_val_score(nb, X=X, y=y, cv=cv) 
f_measure_score['NB']['mean'] = np.mean(nested_score_nb)
f_measure_score['NB']['std'] = np.std(nested_score_nb)

In [103]:
print(classification_report(y_test,y_pred_nb),'\n')

              precision    recall  f1-score   support

         acc       0.60      0.20      0.30       129
        good       0.45      0.25      0.32        20
       unacc       0.87      0.86      0.86       397
       vgood       0.20      1.00      0.34        25

    accuracy                           0.70       571
   macro avg       0.53      0.58      0.46       571
weighted avg       0.76      0.70      0.69       571
 



### SVM

In [108]:
param_dict = {'C':[0.1,1,100,1000],'kernel':['rbf','linear'], 'gamma':[1, 0.1, 0.01, 0.001]}

svm = SVC(probability=True)

grid_svm = GridSearchCV(svm, param_dict, cv=cv, n_jobs=-1, verbose=1)
grid_svm.fit(X_train, y_train)

y_pred_svm = grid_svm.predict(X_test)
nested_score_svm = cross_val_score(grid_svm, X=X, y=y, cv=cv) 
f_measure_score['svm']['mean'] = np.mean(nested_score_svm)
f_measure_score['svm']['std'] = np.std(nested_score_svm)

Fitting 10 folds for each of 32 candidates, totalling 320 fits
Fitting 10 folds for each of 32 candidates, totalling 320 fits
Fitting 10 folds for each of 32 candidates, totalling 320 fits
Fitting 10 folds for each of 32 candidates, totalling 320 fits
Fitting 10 folds for each of 32 candidates, totalling 320 fits
Fitting 10 folds for each of 32 candidates, totalling 320 fits
Fitting 10 folds for each of 32 candidates, totalling 320 fits
Fitting 10 folds for each of 32 candidates, totalling 320 fits
Fitting 10 folds for each of 32 candidates, totalling 320 fits
Fitting 10 folds for each of 32 candidates, totalling 320 fits
Fitting 10 folds for each of 32 candidates, totalling 320 fits


In [109]:
print(classification_report(y_test,y_pred_svm),'\n')

              precision    recall  f1-score   support

         acc       0.98      0.93      0.96       129
        good       0.83      1.00      0.91        20
       unacc       0.99      1.00      0.99       397
       vgood       1.00      0.96      0.98        25

    accuracy                           0.98       571
   macro avg       0.95      0.97      0.96       571
weighted avg       0.98      0.98      0.98       571
 



In [111]:
# View best hyperparameters
grid_svm.best_params_

{'C': 1000, 'gamma': 1, 'kernel': 'rbf'}

### Models Comparision (numeric)

In [112]:
for k,v in f_measure_score.items():
    print(k, ': ', v)

decision_tree :  {'mean': 0.9733599946229331, 'std': 0.014727376194893544}
knn :  {'mean': 0.9583445355558544, 'std': 0.01432197139682613}
logistic :  {'mean': 0.8309819868261863, 'std': 0.02560389666625801}
NB :  {'mean': 0.6944448178518619, 'std': 0.018954651788038478}
svm :  {'mean': 0.9942028498454093, 'std': 0.008222189044405052}


#### Result: SVM got the best performance in numeric attempt

In [116]:
# Build confusion matrix and calculate other indicators
grid_predictions = grid_svm.predict(X_test)
print(confusion_matrix(y_test,y_pred_svm))
print("")
print(classification_report(y_test,y_pred_svm))

[[120   4   5   0]
 [  0  20   0   0]
 [  1   0 396   0]
 [  1   0   0  24]]

              precision    recall  f1-score   support

         acc       0.98      0.93      0.96       129
        good       0.83      1.00      0.91        20
       unacc       0.99      1.00      0.99       397
       vgood       1.00      0.96      0.98        25

    accuracy                           0.98       571
   macro avg       0.95      0.97      0.96       571
weighted avg       0.98      0.98      0.98       571



## Categorical Attempt

In [188]:
# Import data
car = pd.read_csv('car.data', names=['buying','maint','doors','persons','lug_boot','safety','class'])

In [189]:
# Convert to dummy variable
X = pd.get_dummies(car.iloc[:,:6])
y = car['class']

### Split Data

In [190]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=45)
f_measure_score_c = {'decision_tree':{},'knn':{},'logistic':{},'NB':{},'svm':{}}

### Decision Tree

In [191]:
# Set up possible values of parameters to optimize over
param_dict={'criterion':['gini','entropy'], 'max_depth':range(1,11), 'min_samples_leaf':range(1,5), 
            'min_samples_split':range(1,10)} 
d_tree = DecisionTreeClassifier(random_state=42)

grid_tree = GridSearchCV(d_tree, param_dict, cv=cv, n_jobs=-1, verbose=1)
grid_tree.fit(X_train,y_train)


y_pred_tree = grid_tree.predict(X_test)
nested_score_tree = cross_val_score(grid_tree, X=X, y=y, cv=cv) 
f_measure_score_c['decision_tree']['mean'] = np.mean(nested_score_tree)
f_measure_score_c['decision_tree']['std'] = np.std(nested_score_tree)

Fitting 10 folds for each of 720 candidates, totalling 7200 fits


 0.70268366 0.70268366 0.70268366        nan 0.70268366 0.70268366
 0.70268366 0.70268366 0.70268366 0.70268366 0.70268366 0.70268366
        nan 0.70268366 0.70268366 0.70268366 0.70268366 0.70268366
 0.70268366 0.70268366 0.70268366        nan 0.70268366 0.70268366
 0.70268366 0.70268366 0.70268366 0.70268366 0.70268366 0.70268366
        nan 0.76653673 0.76653673 0.76653673 0.76653673 0.76653673
 0.76653673 0.76653673 0.76653673        nan 0.76653673 0.76653673
 0.76653673 0.76653673 0.76653673 0.76653673 0.76653673 0.76653673
        nan 0.76653673 0.76653673 0.76653673 0.76653673 0.76653673
 0.76653673 0.76653673 0.76653673        nan 0.76653673 0.76653673
 0.76653673 0.76653673 0.76653673 0.76653673 0.76653673 0.76653673
        nan 0.78988756 0.78988756 0.78988756 0.78988756 0.78988756
 0.78988756 0.78988756 0.78988756        nan 0.78988756 0.78988756
 0.78988756 0.78988756 0.78988756 0.78988756 0.78988756 0.78988756
        nan 0.78988756 0.78988756 0.78988756 0.78988756 0.7898

Fitting 10 folds for each of 720 candidates, totalling 7200 fits


 0.70032672 0.70032672 0.70032672        nan 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
        nan 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672        nan 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
        nan 0.77682796 0.77682796 0.77682796 0.77682796 0.77682796
 0.77682796 0.77682796 0.77682796        nan 0.77682796 0.77682796
 0.77682796 0.77682796 0.77682796 0.77682796 0.77682796 0.77682796
        nan 0.77682796 0.77682796 0.77682796 0.77682796 0.77682796
 0.77682796 0.77682796 0.77682796        nan 0.77682796 0.77682796
 0.77682796 0.77682796 0.77682796 0.77682796 0.77682796 0.77682796
        nan 0.79614971 0.79614971 0.79614971 0.79614971 0.79614971
 0.79614971 0.79614971 0.79614971        nan 0.79614971 0.79614971
 0.79614971 0.79614971 0.79614971 0.79614971 0.79614971 0.79614971
        nan 0.79614971 0.79614971 0.79614971 0.79614971 0.7961

Fitting 10 folds for each of 720 candidates, totalling 7200 fits


 0.70032672 0.70032672 0.70032672        nan 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
        nan 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672        nan 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
        nan 0.78330025 0.78330025 0.78330025 0.78330025 0.78330025
 0.78330025 0.78330025 0.78330025        nan 0.78330025 0.78330025
 0.78330025 0.78330025 0.78330025 0.78330025 0.78330025 0.78330025
        nan 0.78330025 0.78330025 0.78330025 0.78330025 0.78330025
 0.78330025 0.78330025 0.78330025        nan 0.78330025 0.78330025
 0.78330025 0.78330025 0.78330025 0.78330025 0.78330025 0.78330025
        nan 0.80132754 0.80132754 0.80132754 0.80132754 0.80132754
 0.80132754 0.80132754 0.80132754        nan 0.80132754 0.80132754
 0.80132754 0.80132754 0.80132754 0.80132754 0.80132754 0.80132754
        nan 0.80132754 0.80132754 0.80132754 0.80132754 0.8013

Fitting 10 folds for each of 720 candidates, totalling 7200 fits


 0.70032672 0.70032672 0.70032672        nan 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
        nan 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672        nan 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
        nan 0.78073615 0.78073615 0.78073615 0.78073615 0.78073615
 0.78073615 0.78073615 0.78073615        nan 0.78073615 0.78073615
 0.78073615 0.78073615 0.78073615 0.78073615 0.78073615 0.78073615
        nan 0.78073615 0.78073615 0.78073615 0.78073615 0.78073615
 0.78073615 0.78073615 0.78073615        nan 0.78073615 0.78073615
 0.78073615 0.78073615 0.78073615 0.78073615 0.78073615 0.78073615
        nan 0.78779983 0.78779983 0.78779983 0.78779983 0.78779983
 0.78779983 0.78779983 0.78779983        nan 0.78779983 0.78779983
 0.78779983 0.78779983 0.78779983 0.78779983 0.78779983 0.78779983
        nan 0.78779983 0.78779983 0.78779983 0.78779983 0.7877

Fitting 10 folds for each of 720 candidates, totalling 7200 fits


 0.70032672 0.70032672 0.70032672        nan 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
        nan 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672        nan 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
        nan 0.78133581 0.78133581 0.78133581 0.78133581 0.78133581
 0.78133581 0.78133581 0.78133581        nan 0.78133581 0.78133581
 0.78133581 0.78133581 0.78133581 0.78133581 0.78133581 0.78133581
        nan 0.78133581 0.78133581 0.78133581 0.78133581 0.78133581
 0.78133581 0.78133581 0.78133581        nan 0.78133581 0.78133581
 0.78133581 0.78133581 0.78133581 0.78133581 0.78133581 0.78133581
        nan 0.79163772 0.79163772 0.79163772 0.79163772 0.79163772
 0.79163772 0.79163772 0.79163772        nan 0.79163772 0.79163772
 0.79163772 0.79163772 0.79163772 0.79163772 0.79163772 0.79163772
        nan 0.79163772 0.79163772 0.79163772 0.79163772 0.7916

Fitting 10 folds for each of 720 candidates, totalling 7200 fits


 0.70032672 0.70032672 0.70032672        nan 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
        nan 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672        nan 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
        nan 0.77813896 0.77813896 0.77813896 0.77813896 0.77813896
 0.77813896 0.77813896 0.77813896        nan 0.77813896 0.77813896
 0.77813896 0.77813896 0.77813896 0.77813896 0.77813896 0.77813896
        nan 0.77813896 0.77813896 0.77813896 0.77813896 0.77813896
 0.77813896 0.77813896 0.77813896        nan 0.77813896 0.77813896
 0.77813896 0.77813896 0.77813896 0.77813896 0.77813896 0.77813896
        nan 0.79166667 0.79166667 0.79166667 0.79166667 0.79166667
 0.79166667 0.79166667 0.79166667        nan 0.79166667 0.79166667
 0.79166667 0.79166667 0.79166667 0.79166667 0.79166667 0.79166667
        nan 0.79166667 0.79166667 0.79166667 0.79166667 0.7916

Fitting 10 folds for each of 720 candidates, totalling 7200 fits


 0.70032672 0.70032672 0.70032672        nan 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
        nan 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672        nan 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
        nan 0.77299421 0.77299421 0.77299421 0.77299421 0.77299421
 0.77299421 0.77299421 0.77299421        nan 0.77299421 0.77299421
 0.77299421 0.77299421 0.77299421 0.77299421 0.77299421 0.77299421
        nan 0.77299421 0.77299421 0.77299421 0.77299421 0.77299421
 0.77299421 0.77299421 0.77299421        nan 0.77299421 0.77299421
 0.77299421 0.77299421 0.77299421 0.77299421 0.77299421 0.77299421
        nan 0.79487593 0.79487593 0.79487593 0.79487593 0.79487593
 0.79487593 0.79487593 0.79487593        nan 0.79487593 0.79487593
 0.79487593 0.79487593 0.79487593 0.79487593 0.79487593 0.79487593
        nan 0.79487593 0.79487593 0.79487593 0.79487593 0.7948

Fitting 10 folds for each of 720 candidates, totalling 7200 fits


 0.70032672 0.70032672 0.70032672        nan 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
        nan 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672        nan 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
        nan 0.77232837 0.77232837 0.77232837 0.77232837 0.77232837
 0.77232837 0.77232837 0.77232837        nan 0.77232837 0.77232837
 0.77232837 0.77232837 0.77232837 0.77232837 0.77232837 0.77232837
        nan 0.77232837 0.77232837 0.77232837 0.77232837 0.77232837
 0.77232837 0.77232837 0.77232837        nan 0.77232837 0.77232837
 0.77232837 0.77232837 0.77232837 0.77232837 0.77232837 0.77232837
        nan 0.79807692 0.79807692 0.79807692 0.79807692 0.79807692
 0.79807692 0.79807692 0.79807692        nan 0.79807692 0.79807692
 0.79807692 0.79807692 0.79807692 0.79807692 0.79807692 0.79807692
        nan 0.79807692 0.79807692 0.79807692 0.79807692 0.7980

Fitting 10 folds for each of 720 candidates, totalling 7200 fits


 0.70032672 0.70032672 0.70032672        nan 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
        nan 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672        nan 0.70032672 0.70032672
 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672 0.70032672
        nan 0.77941687 0.77941687 0.77941687 0.77941687 0.77941687
 0.77941687 0.77941687 0.77941687        nan 0.77941687 0.77941687
 0.77941687 0.77941687 0.77941687 0.77941687 0.77941687 0.77941687
        nan 0.77941687 0.77941687 0.77941687 0.77941687 0.77941687
 0.77941687 0.77941687 0.77941687        nan 0.77941687 0.77941687
 0.77941687 0.77941687 0.77941687 0.77941687 0.77941687 0.77941687
        nan 0.80901985 0.80901985 0.80901985 0.80901985 0.80901985
 0.80901985 0.80901985 0.80901985        nan 0.80901985 0.80901985
 0.80901985 0.80901985 0.80901985 0.80901985 0.80901985 0.80901985
        nan 0.80901985 0.80901985 0.80901985 0.80901985 0.8090

Fitting 10 folds for each of 720 candidates, totalling 7200 fits


 0.69987593 0.69987593 0.69987593        nan 0.69987593 0.69987593
 0.69987593 0.69987593 0.69987593 0.69987593 0.69987593 0.69987593
        nan 0.69987593 0.69987593 0.69987593 0.69987593 0.69987593
 0.69987593 0.69987593 0.69987593        nan 0.69987593 0.69987593
 0.69987593 0.69987593 0.69987593 0.69987593 0.69987593 0.69987593
        nan 0.77634409 0.77634409 0.77634409 0.77634409 0.77634409
 0.77634409 0.77634409 0.77634409        nan 0.77634409 0.77634409
 0.77634409 0.77634409 0.77634409 0.77634409 0.77634409 0.77634409
        nan 0.77634409 0.77634409 0.77634409 0.77634409 0.77634409
 0.77634409 0.77634409 0.77634409        nan 0.77634409 0.77634409
 0.77634409 0.77634409 0.77634409 0.77634409 0.77634409 0.77634409
        nan 0.79628205 0.79628205 0.79628205 0.79628205 0.79628205
 0.79628205 0.79628205 0.79628205        nan 0.79628205 0.79628205
 0.79628205 0.79628205 0.79628205 0.79628205 0.79628205 0.79628205
        nan 0.79628205 0.79628205 0.79628205 0.79628205 0.7962

Fitting 10 folds for each of 720 candidates, totalling 7200 fits


 0.69987593 0.69987593 0.69987593        nan 0.69987593 0.69987593
 0.69987593 0.69987593 0.69987593 0.69987593 0.69987593 0.69987593
        nan 0.69987593 0.69987593 0.69987593 0.69987593 0.69987593
 0.69987593 0.69987593 0.69987593        nan 0.69987593 0.69987593
 0.69987593 0.69987593 0.69987593 0.69987593 0.69987593 0.69987593
        nan 0.77635649 0.77635649 0.77635649 0.77635649 0.77635649
 0.77635649 0.77635649 0.77635649        nan 0.77635649 0.77635649
 0.77635649 0.77635649 0.77635649 0.77635649 0.77635649 0.77635649
        nan 0.77635649 0.77635649 0.77635649 0.77635649 0.77635649
 0.77635649 0.77635649 0.77635649        nan 0.77635649 0.77635649
 0.77635649 0.77635649 0.77635649 0.77635649 0.77635649 0.77635649
        nan 0.79886683 0.79886683 0.79886683 0.79886683 0.79886683
 0.79886683 0.79886683 0.79886683        nan 0.79886683 0.79886683
 0.79886683 0.79886683 0.79886683 0.79886683 0.79886683 0.79886683
        nan 0.79886683 0.79886683 0.79886683 0.79886683 0.7988

In [192]:
print(classification_report(y_test,y_pred_tree))

              precision    recall  f1-score   support

         acc       0.94      0.88      0.91       129
        good       0.73      0.95      0.83        20
       unacc       0.98      0.99      0.99       397
       vgood       0.83      0.80      0.82        25

    accuracy                           0.96       571
   macro avg       0.87      0.91      0.89       571
weighted avg       0.96      0.96      0.96       571



In [193]:
# View best hyperparameters
grid_tree.best_params_

{'criterion': 'gini',
 'max_depth': 10,
 'min_samples_leaf': 1,
 'min_samples_split': 3}

### Logistic Regression

In [194]:
param_dict = {'C':[0.0001,0.001, 0.01, 1, 0.1, 10, 100, 1000], 'penalty':['l1','l2'],
              'solver':['lbfgs','sag','saga','newton-cg']}

logistic = linear_model.LogisticRegression(random_state=42)

grid_log = GridSearchCV(logistic, param_dict, cv=cv, n_jobs=-1, verbose=1)
grid_log.fit(X_train, y_train)

y_pred_log = grid_log.predict(X_test)
nested_score_log = cross_val_score(grid_log, X=X, y=y, cv=cv) 
f_measure_score_c['logistic']['mean'] = np.mean(nested_score_log)
f_measure_score_c['logistic']['std'] = np.std(nested_score_log)

Fitting 10 folds for each of 64 candidates, totalling 640 fits


 0.70268366 0.70268366        nan        nan 0.70268366        nan
 0.70268366 0.70268366 0.70268366 0.70268366        nan        nan
 0.70268366        nan 0.71132684 0.71132684 0.71132684 0.71132684
        nan        nan 0.90491754        nan 0.90148426 0.90148426
 0.90148426 0.90148426        nan        nan 0.84270615        nan
 0.8495952  0.8495952  0.8495952  0.8495952         nan        nan
 0.92997001        nan 0.92306597 0.90228636 0.92306597 0.92392804
        nan        nan 0.93342579        nan 0.93256372 0.90317091
 0.93256372 0.93256372        nan        nan 0.93342579        nan
 0.93256372 0.91438531 0.93342579 0.93256372]


Fitting 10 folds for each of 64 candidates, totalling 640 fits


 0.70032672 0.70032672        nan        nan 0.70032672        nan
 0.70032672 0.70032672 0.70032672 0.70032672        nan        nan
 0.70032672        nan 0.73183209 0.73183209 0.73183209 0.73183209
        nan        nan 0.91639371        nan 0.90354012 0.90354012
 0.90354012 0.90354012        nan        nan 0.85078164        nan
 0.85980976 0.85980976 0.85980976 0.85980976        nan        nan
 0.9298842         nan 0.92796526 0.90802316 0.92732423 0.92796526
        nan        nan 0.93180314        nan 0.93052109 0.91318031
 0.93052109 0.93244417        nan        nan 0.93115798        nan
 0.93308933 0.92026882 0.93180314 0.93308933]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = 

Fitting 10 folds for each of 64 candidates, totalling 640 fits


 0.70032672 0.70032672        nan        nan 0.70032672        nan
 0.70032672 0.70032672 0.70032672 0.70032672        nan        nan
 0.70032672        nan 0.73312242 0.73312242 0.73312242 0.73312242
        nan        nan 0.91641853        nan 0.91062035 0.91126137
 0.91062035 0.91062035        nan        nan 0.85145161        nan
 0.85979322 0.85979322 0.85979322 0.85979322        nan        nan
 0.93248553        nan 0.92798594 0.91061208 0.92669975 0.92862696
        nan        nan 0.93377585        nan 0.93377585 0.93120347
 0.93377585 0.93377585        nan        nan 0.93377585        nan
 0.93378412 0.92861456 0.93377585 0.93313896]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = 

Fitting 10 folds for each of 64 candidates, totalling 640 fits


 0.70032672 0.70032672        nan        nan 0.70032672        nan
 0.70032672 0.70032672 0.70032672 0.70032672        nan        nan
 0.70032672        nan 0.7343962  0.7343962  0.7343962  0.7343962
        nan        nan 0.91383375        nan 0.90677006 0.90741108
 0.90677006 0.90677006        nan        nan 0.85595947        nan
 0.8610794  0.8610794  0.8610794  0.8610794         nan        nan
 0.93184036        nan 0.92991315 0.90736146 0.92991315 0.92991315
        nan        nan 0.93119934        nan 0.93055418 0.92028122
 0.9311952  0.93119934        nan        nan 0.93119934        nan
 0.93312655 0.926067   0.93119934 0.93248553]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _

Fitting 10 folds for each of 64 candidates, totalling 640 fits


 0.70032672 0.70032672        nan        nan 0.70032672        nan
 0.70032672 0.70032672 0.70032672 0.70032672        nan        nan
 0.70032672        nan 0.73504549 0.73504549 0.73504549 0.73504549
        nan        nan 0.91770471        nan 0.91318859 0.91254756
 0.91318859 0.91318859        nan        nan 0.8553019         nan
 0.86108354 0.86108354 0.86108354 0.86108354        nan        nan
 0.93311828        nan 0.93247725 0.90485112 0.93183623 0.93247725
        nan        nan 0.93311828        nan 0.9344086  0.93056658
 0.93376344 0.93376344        nan        nan 0.93375931        nan
 0.93440447 0.93182382 0.93311828 0.93440447]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = 

Fitting 10 folds for each of 64 candidates, totalling 640 fits


 0.70032672 0.70032672        nan        nan 0.70032672        nan
 0.70032672 0.70032672 0.70032672 0.70032672        nan        nan
 0.70032672        nan 0.73633995 0.73633995 0.73633995 0.73633995
        nan        nan 0.9138172         nan 0.90932589 0.90932589
 0.90932589 0.90932589        nan        nan 0.84950372        nan
 0.85656741 0.85656741 0.85656741 0.85656741        nan        nan
 0.92924318        nan 0.92538875 0.91257651 0.92602978 0.92602978
        nan        nan 0.93052109        nan 0.93117039 0.91253102
 0.93052523 0.93052109        nan        nan 0.93052109        nan
 0.93181555 0.92215881 0.93052109 0.93245658]


Fitting 10 folds for each of 64 candidates, totalling 640 fits


 0.70032672 0.70032672        nan        nan 0.70032672        nan
 0.70032672 0.70032672 0.70032672 0.70032672        nan        nan
 0.70032672        nan 0.73054177 0.73054177 0.73054177 0.73054177
        nan        nan 0.91832093        nan 0.90866832 0.90930935
 0.90866832 0.90866832        nan        nan 0.85529777        nan
 0.86300248 0.86300248 0.86300248 0.86300248        nan        nan
 0.9279818         nan 0.92733664 0.91385856 0.92734078 0.92734078
        nan        nan 0.92862283        nan 0.92862283 0.90289495
 0.92862283 0.92862283        nan        nan 0.92862283        nan
 0.92669148 0.92093052 0.92862283 0.92669148]


Fitting 10 folds for each of 64 candidates, totalling 640 fits


 0.70032672 0.70032672        nan        nan 0.70032672        nan
 0.70032672 0.70032672 0.70032672 0.70032672        nan        nan
 0.70032672        nan 0.72991729 0.72991729 0.72991729 0.72991729
        nan        nan 0.91380893        nan 0.90673284 0.90545079
 0.90673284 0.90673284        nan        nan 0.85398263        nan
 0.85528122 0.85528122 0.85528122 0.85528122        nan        nan
 0.9305335         nan 0.93117866 0.91060794 0.9305335  0.93117866
        nan        nan 0.93117866        nan 0.93181969 0.92669148
 0.9305335  0.93117866        nan        nan 0.93117866        nan
 0.93246071 0.92666667 0.93117866 0.93310174]


Fitting 10 folds for each of 64 candidates, totalling 640 fits


 0.70032672 0.70032672        nan        nan 0.70032672        nan
 0.70032672 0.70032672 0.70032672 0.70032672        nan        nan
 0.70032672        nan 0.73377171 0.73377171 0.73377171 0.73377171
        nan        nan 0.91444169        nan 0.91057899 0.91122002
 0.91057899 0.91057899        nan        nan 0.85656741        nan
 0.85141853 0.85141853 0.85141853 0.85141853        nan        nan
 0.93372622        nan 0.93180314 0.91058726 0.93180314 0.93180314
        nan        nan 0.93501241        nan 0.93436725 0.91701406
 0.93501241 0.93501241        nan        nan 0.93501241        nan
 0.93565343 0.92086849 0.93501241 0.93565757]


Fitting 10 folds for each of 64 candidates, totalling 640 fits


 0.69987593 0.69987593        nan        nan 0.69987593        nan
 0.69987593 0.69987593 0.69987593 0.69987593        nan        nan
 0.69987593        nan 0.73522333 0.73522333 0.73522333 0.73522333
        nan        nan 0.9203019         nan 0.91901572 0.91901572
 0.91901572 0.91901572        nan        nan 0.85028536        nan
 0.86247312 0.86247312 0.86247312 0.86247312        nan        nan
 0.93251861        nan 0.93123242 0.90805624 0.93123242 0.93123242
        nan        nan 0.93251861        nan 0.93251447 0.92032672
 0.93315964 0.93251861        nan        nan 0.93251861        nan
 0.93508685 0.93057072 0.93251861 0.93508685]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = 

Fitting 10 folds for each of 64 candidates, totalling 640 fits


 0.69987593 0.69987593        nan        nan 0.69987593        nan
 0.69987593 0.69987593 0.69987593 0.69987593        nan        nan
 0.69987593        nan 0.73651365 0.73651365 0.73651365 0.73651365
        nan        nan 0.91647643        nan 0.91198098 0.91198098
 0.91198098 0.91198098        nan        nan 0.85541356        nan
 0.85992142 0.85992142 0.85992142 0.85992142        nan        nan
 0.93445823        nan 0.93059553 0.9177378  0.92995037 0.93059553
        nan        nan 0.93510339        nan 0.93381307 0.92224979
 0.93445823 0.93445823        nan        nan 0.93510339        nan
 0.93445823 0.92672457 0.93510339 0.93509926]


In [195]:
print(classification_report(y_test,y_pred_log),'\n')

              precision    recall  f1-score   support

         acc       0.91      0.82      0.86       129
        good       0.72      0.90      0.80        20
       unacc       0.97      0.97      0.97       397
       vgood       0.80      0.96      0.87        25

    accuracy                           0.94       571
   macro avg       0.85      0.91      0.88       571
weighted avg       0.94      0.94      0.94       571
 



In [196]:
# View best hyperparameters
grid_log.best_params_

{'C': 100, 'penalty': 'l1', 'solver': 'saga'}

### KNN

In [197]:
param_dict = {'n_neighbors':list(range(1,31)), 'weights':['uniform', 'distance']}

knn = KNeighborsClassifier()

grid_knn = GridSearchCV(knn, param_dict, cv=cv, n_jobs=-1, verbose=1)
grid_knn.fit(X_train,y_train)

y_pred_knn = grid_knn.predict(X_test)
nested_score_knn = cross_val_score(grid_knn, X=X, y=y, cv=cv) 
f_measure_score_c['knn']['mean'] = np.mean(nested_score_knn)
f_measure_score_c['knn']['std'] = np.std(nested_score_knn)

Fitting 10 folds for each of 60 candidates, totalling 600 fits
Fitting 10 folds for each of 60 candidates, totalling 600 fits
Fitting 10 folds for each of 60 candidates, totalling 600 fits
Fitting 10 folds for each of 60 candidates, totalling 600 fits
Fitting 10 folds for each of 60 candidates, totalling 600 fits
Fitting 10 folds for each of 60 candidates, totalling 600 fits
Fitting 10 folds for each of 60 candidates, totalling 600 fits
Fitting 10 folds for each of 60 candidates, totalling 600 fits
Fitting 10 folds for each of 60 candidates, totalling 600 fits
Fitting 10 folds for each of 60 candidates, totalling 600 fits
Fitting 10 folds for each of 60 candidates, totalling 600 fits


In [198]:
print(classification_report(y_test,y_pred_knn))

              precision    recall  f1-score   support

         acc       0.80      0.86      0.83       129
        good       0.57      0.20      0.30        20
       unacc       0.96      0.99      0.98       397
       vgood       0.93      0.56      0.70        25

    accuracy                           0.92       571
   macro avg       0.82      0.65      0.70       571
weighted avg       0.91      0.92      0.91       571



In [199]:
# View best hyperparameters
grid_knn.best_params_

{'n_neighbors': 9, 'weights': 'distance'}

### Naive Bayes

In [200]:
nb = GaussianNB()
nb.fit(X_train, y_train)

y_pred_nb = nb.predict(X_test)
nested_score_nb = cross_val_score(nb, X=X, y=y, cv=cv) 
f_measure_score_c['NB']['mean'] = np.mean(nested_score_nb)
f_measure_score_c['NB']['std'] = np.std(nested_score_nb)

In [201]:
print(classification_report(y_test,y_pred_nb),'\n')

              precision    recall  f1-score   support

         acc       0.59      0.77      0.67       129
        good       0.44      0.85      0.58        20
       unacc       1.00      0.83      0.90       397
       vgood       0.68      1.00      0.81        25

    accuracy                           0.82       571
   macro avg       0.68      0.86      0.74       571
weighted avg       0.87      0.82      0.84       571
 



### SVM

In [202]:
param_dict = {'C':[0.1,1,100,1000],'kernel':['rbf','linear'], 'gamma':[1, 0.1, 0.01, 0.001]}

svm = SVC(probability=True)

grid_svm = GridSearchCV(svm, param_dict, cv=cv, n_jobs=-1, verbose=1)
grid_svm.fit(X_train, y_train)

y_pred_svm = grid_svm.predict(X_test)
nested_score_svm = cross_val_score(grid_svm, X=X, y=y, cv=cv) 
f_measure_score_c['svm']['mean'] = np.mean(nested_score_svm)
f_measure_score_c['svm']['std'] = np.std(nested_score_svm)

Fitting 10 folds for each of 32 candidates, totalling 320 fits
Fitting 10 folds for each of 32 candidates, totalling 320 fits
Fitting 10 folds for each of 32 candidates, totalling 320 fits
Fitting 10 folds for each of 32 candidates, totalling 320 fits
Fitting 10 folds for each of 32 candidates, totalling 320 fits
Fitting 10 folds for each of 32 candidates, totalling 320 fits
Fitting 10 folds for each of 32 candidates, totalling 320 fits
Fitting 10 folds for each of 32 candidates, totalling 320 fits
Fitting 10 folds for each of 32 candidates, totalling 320 fits
Fitting 10 folds for each of 32 candidates, totalling 320 fits
Fitting 10 folds for each of 32 candidates, totalling 320 fits


In [203]:
print(classification_report(y_test,y_pred_svm),'\n')

              precision    recall  f1-score   support

         acc       0.98      0.97      0.98       129
        good       0.86      0.95      0.90        20
       unacc       1.00      1.00      1.00       397
       vgood       0.92      0.92      0.92        25

    accuracy                           0.99       571
   macro avg       0.94      0.96      0.95       571
weighted avg       0.99      0.99      0.99       571
 



In [204]:
# View best hyperparameters
grid_svm.best_params_

{'C': 100, 'gamma': 0.1, 'kernel': 'rbf'}

### Models Comparision (categorical)

In [205]:
for a,b in f_measure_score_c.items():
    print(a, ': ', b)

decision_tree :  {'mean': 0.972788681274365, 'std': 0.011899684402510432}
knn :  {'mean': 0.9317112515123, 'std': 0.0171712971111848}
logistic :  {'mean': 0.9346014249227046, 'std': 0.01830887424272436}
NB :  {'mean': 0.8026582874042208, 'std': 0.03286866941083019}
svm :  {'mean': 0.9982591746202447, 'std': 0.0026591688489214986}


#### Result: SVM outperforms other models in categorical attempt
#### We should treat variables as categorical. Because SVM model would calculate the distance using the method of numeric data, but distance between categories cannot be measured. Thus, it's not a good method to use numeric variable in this kind of cases, although the outcome seems exceptional.