<div style="background-color:#F0F8FF;">
    <center>
        <h3 style="color:purple; font-size:35px; font-family:Andale Mono,monospace;">Assignment 07</h3>
    </center>
    <font style="font-size:20px; font-family:Andale Mono,monospace;">Training models used in this assignment are </font>
    <ul style="font-size:20px; font-family:Andale Mono,monospace;">
        <li>K-Nearest Neighbors Classifier</li>
        <li>Support Vector Classifier</li>
    </ul>
</div>

In [1]:
# python 3.7.3
import itertools, csv

# numpy  1.17.1
import numpy as np

# pandas  0.25.1
import pandas as pd
pd.set_option('display.max_columns', 999)

# scikit-learn  0.21.3
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer 
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.svm import SVC

# matplotlib  3.1.1
import matplotlib
from matplotlib import pyplot as plt
from matplotlib.colors import ListedColormap
matplotlib.rcParams.update({'font.size': 22})

# load utility classes/functions that has been taught in previous labs
# e.g., plot_decision_regions()
from lib import *

<div style="background-color:#F0F8FF;">
    <font style="font-size:20px; font-family:Andale Mono,monospace;">Load datasets </font>
</div>

In [2]:
# Column names for the UCI mushroom data: the class label first, followed by
# the 22 categorical attributes. The raw file is read with header=None, so the
# names are attached by hand after loading.
column_name = [
    'classes',
    'cap-shape', 'cap-surface', 'cap-color', 'bruises?', 'odor',
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
    'stalk-shape', 'stalk-root',
    'stalk-surface-above-ring', 'stalk-surface-below-ring',
    'stalk-color-above-ring', 'stalk-color-below-ring',
    'veil-type', 'veil-color',
    'ring-number', 'ring-type',
    'spore-print-color', 'population', 'habitat',
]

# Download the header-less CSV and label its columns.
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/'
    'mushroom/agaricus-lepiota.data',
    header=None,
    engine='python',
)
df.columns = column_name

# Preview the first rows.
df.head()

Unnamed: 0,classes,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,e,e,s,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,e,c,s,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,e,c,s,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,e,e,s,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,t,e,s,s,w,w,p,w,o,e,n,a,g


In [3]:
# (rows, columns) of the loaded frame
df.shape

(8124, 23)

<div style="background-color:#F0F8FF;">
    <center>
        <h3 style="color:purple; font-size:25px; font-family:Andale Mono,monospace;">Data Preprocessing</h3>
    </center>
    <font style="font-size:20px; font-family:Andale Mono,monospace;">Check for missing values in each feature of the dataset </font>
</div>

In [4]:
# The raw data marks missing entries with '?'; turn those into NaN on a copy
# so they can be counted, leaving `df` itself untouched.
df_NullCheck = df.replace('?', np.nan)

# Number of missing entries per column.
missing_per_column = df_NullCheck.isna().sum()
display(missing_per_column)

classes                        0
cap-shape                      0
cap-surface                    0
cap-color                      0
bruises?                       0
odor                           0
gill-attachment                0
gill-spacing                   0
gill-size                      0
gill-color                     0
stalk-shape                    0
stalk-root                  2480
stalk-surface-above-ring       0
stalk-surface-below-ring       0
stalk-color-above-ring         0
stalk-color-below-ring         0
veil-type                      0
veil-color                     0
ring-number                    0
ring-type                      0
spore-print-color              0
population                     0
habitat                        0
dtype: int64

<div style="background-color:#F0F8FF;">
    <font style="font-size:20px; font-family:Andale Mono,monospace;">Unfortunately, some values are missing, which means we have to handle them.</font>
    <br />
    <font style="font-size:20px; font-family:Andale Mono,monospace;">First, encode each categorical value as a numerical value</font>
</div>

In [5]:
# --- target ---------------------------------------------------------------
# Map the class letters to integer codes (overwrites df['classes'] in place).
label_le = LabelEncoder()
df['classes'] = label_le.fit_transform(df['classes'].values)

# --- features -------------------------------------------------------------
# Every column other than 'classes' is categorical.
catego_features = [
    'cap-shape', 'cap-surface', 'cap-color', 'bruises?', 'odor',
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
    'stalk-shape', 'stalk-root',
    'stalk-surface-above-ring', 'stalk-surface-below-ring',
    'stalk-color-above-ring', 'stalk-color-below-ring',
    'veil-type', 'veil-color',
    'ring-number', 'ring-type',
    'spore-print-color', 'population', 'habitat',
]

# One encoder object is re-fitted for each column in turn, so after the loop
# it holds the classes of the last column only.
catego_le = LabelEncoder()

# Per-column array of code values 0..n_classes-1 ('?' still counted).
categories = []

for feature in catego_features:
    # Letters -> integer codes. Note that the '?' placeholder receives a code
    # too, so it has to be turned back into a real missing value below.
    df[feature] = catego_le.fit_transform(df[feature].values)
    classes_list = catego_le.classes_.tolist()

    if '?' in classes_list:
        # Blank out every cell that carries the code assigned to '?'.
        idx = classes_list.index('?')
        df[feature] = df[feature].mask(df[feature] == idx)

    categories.append(np.arange(len(classes_list)))

display(df.head())

Unnamed: 0,classes,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,0,3.0,2,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,0,2.0,2,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,0,2.0,2,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,0,3.0,2,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,1,3.0,2,2,7,7,0,2,1,0,3,0,1


<div style="background-color:#F0F8FF;">
    <font style="font-size:20px; font-family:Andale Mono,monospace;">Since 2480 rows contain a missing value and the whole dataset consists of only 8124 rows, simply dropping every row with a missing value might mislead the model. Another option is to fill in the missing values. Because the mean is very sensitive to outliers, I decided to fill in the missing values with the median of that feature instead of the mean. </font>
</div>

In [6]:
# Fill each missing entry with the median of its column.
# `np.nan` is used instead of the `np.NaN` alias: the alias was removed in
# NumPy 2.0, and the other cells of this notebook already spell it `np.nan`.
imr = SimpleImputer(missing_values=np.nan, strategy='median')

# Fit and transform in one step; `imr` stays available as the fitted imputer.
# The imputer returns a plain float array, so column names are re-attached.
data = imr.fit_transform(df.values)

df_imputer = pd.DataFrame(data=data, columns=column_name)
df_imputer.head()

Unnamed: 0,classes,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1.0,5.0,2.0,4.0,1.0,6.0,1.0,0.0,1.0,4.0,0.0,3.0,2.0,2.0,7.0,7.0,0.0,2.0,1.0,4.0,2.0,3.0,5.0
1,0.0,5.0,2.0,9.0,1.0,0.0,1.0,0.0,0.0,4.0,0.0,2.0,2.0,2.0,7.0,7.0,0.0,2.0,1.0,4.0,3.0,2.0,1.0
2,0.0,0.0,2.0,8.0,1.0,3.0,1.0,0.0,0.0,5.0,0.0,2.0,2.0,2.0,7.0,7.0,0.0,2.0,1.0,4.0,3.0,2.0,3.0
3,1.0,5.0,3.0,8.0,1.0,6.0,1.0,0.0,1.0,5.0,0.0,3.0,2.0,2.0,7.0,7.0,0.0,2.0,1.0,4.0,2.0,3.0,5.0
4,0.0,5.0,2.0,3.0,0.0,5.0,1.0,1.0,0.0,4.0,1.0,3.0,2.0,2.0,7.0,7.0,0.0,2.0,1.0,0.0,3.0,0.0,1.0


<div style="background-color:#F0F8FF;">
    <font style="font-size:20px; font-family:Andale Mono,monospace;">Now, check for missing values in each feature of the imputed dataset </font>
</div>

In [7]:
# Number of missing entries per column after imputation.
display(df_imputer.isna().sum())

classes                     0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises?                    0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

<div style="background-color:#F0F8FF;">
    <font style="font-size:20px; font-family:Andale Mono,monospace;">So, there are no missing values left in the whole dataset </font>
    <br />
    <font style="font-size:20px; font-family:Andale Mono,monospace;">Use the feature 'classes' as the label y, and the other features as the input X </font>
</div>

In [8]:
# Target vector: the encoded 'classes' column.
y = df_imputer['classes'].values

# Feature matrix: every remaining column.
X = df_imputer.drop(columns=['classes']).values

<div style="background-color:#F0F8FF;">
    <font style="font-size:20px; font-family:Andale Mono,monospace;">Do one hot encoding</font>
</div>

In [9]:
# Work on the categorical feature columns only.
dfX = df_imputer[catego_features]

# Positional index of every categorical feature inside dfX.
dfX_columns = dfX.columns.tolist()
catego_features_idx = [dfX_columns.index(fea) for fea in catego_features]

# One-hot encode the listed column positions; any column that is not listed
# would be passed through unchanged.
one_hot_step = (
    "one_hot_encoder",
    OneHotEncoder(sparse = False, categories = "auto"),
    catego_features_idx,
)
ohe = ColumnTransformer([one_hot_step], remainder = "passthrough")

# Keep the pre-encoding matrix around so both shapes can be reported.
X_orig = X.copy()
X = ohe.fit_transform(dfX.values)
df_oneHot = pd.DataFrame(data=X)

print("Before One Hot Encoding : " + str(X_orig.shape))
print("After One Hot Encoding : " + str(X.shape))

Before One Hot Encoding : (8124, 22)
After One Hot Encoding : (8124, 116)


<div style="background-color:#F0F8FF;">
    <font style="font-size:20px; font-family:Andale Mono,monospace;">Split the dataset into a training set and a testing set</font>
</div>

In [10]:
# Hold out 20% of the samples for testing; the fixed seed makes the split
# repeatable.
split = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_test, y_train, y_test = split

In [11]:
# Learn the scaling statistics from the training split only, then apply the
# same transform to both splits.
sc = StandardScaler().fit(X_train)
X_train_std, X_test_std = sc.transform(X_train), sc.transform(X_test)

<div style="background-color:#F0F8FF;">
    <center>
        <h3 style="color:purple; font-size:25px; font-family:Andale Mono,monospace;">K-Nearest Neighbors Classifier</h3>
    </center>
    <font style="font-size:20px; font-family:Andale Mono,monospace;">1. Just with KNN Only</font>
</div>

In [12]:
# Candidate values of k, plus the best (accuracy, k, model) seen so far.
# NOTE(review): k is chosen by accuracy on the test split, so the winning
# score is an optimistic estimate - consider selecting k with cross-validation
# on the training split instead.
neighbors = [1, 11, 15, 20]
best_acc, best_ng, best_knn = -1, -1, -1

for ng in neighbors:
    # Minkowski distance with p=2 is the Euclidean distance.
    knn = KNeighborsClassifier(n_neighbors=ng, p=2, metric='minkowski')
    knn.fit(X_train_std, y_train)

    # Score the fitted model on the held-out split.
    y_pred = knn.predict(X_test_std)
    acc = accuracy_score(y_test, y_pred)
    n_misclassified = (y_test != y_pred).sum()

    print('[KNN with %d neighbors]' %ng)
    print('Misclassified samples: %d' % n_misclassified)
    print('Accuracy: %.2f' % acc)

    # Strict comparison: on a tie the earlier k is kept.
    if best_acc < acc:
        best_acc, best_ng, best_knn = acc, ng, knn

[KNN with 1 neighbors]
Misclassified samples: 0
Accuracy: 1.00
[KNN with 11 neighbors]
Misclassified samples: 5
Accuracy: 1.00
[KNN with 15 neighbors]
Misclassified samples: 5
Accuracy: 1.00
[KNN with 20 neighbors]
Misclassified samples: 5
Accuracy: 1.00


<div style="background-color:#F0F8FF;">
    <font style="font-size:20px; font-family:Andale Mono,monospace;">2. Pipeline</font>
</div>

In [13]:
# Chain scaling and the KNN classifier (with the k selected above) so that
# the raw, unscaled splits can be fed in directly.
knn_steps = [
    ("scl", StandardScaler()),
    ("clf", KNeighborsClassifier(n_neighbors = best_ng, p = 2, metric = "minkowski")),
]
pipe_knn = Pipeline(knn_steps)

# Fit on the raw training split, then predict the raw test split.
pipe_knn.fit(X_train, y_train)
y_pred = pipe_knn.predict(X_test)

n_misclassified = (y_test != y_pred).sum()
print('[KNN with Pipeline]')
print('Misclassified samples: %d' % n_misclassified)
print('Accuracy: %.4f' % accuracy_score(y_test, y_pred))

[KNN with Pipeline]
Misclassified samples: 0
Accuracy: 1.0000


<div style="background-color:#F0F8FF;">
    <font style="font-size:20px; font-family:Andale Mono,monospace;">Surprisingly, its testing accuracy is 100% ?!</font>
</div>

<div style="background-color:#F0F8FF;">
    <center>
        <h3 style="color:purple; font-size:25px; font-family:Andale Mono,monospace;">Support Vector Classifier</h3>
    </center>
    <font style="font-size:20px; font-family:Andale Mono,monospace;">1. Linear SVC version</font>
</div>

In [14]:
# Candidate penalty strengths, plus the best (accuracy, C, model) seen so far.
# NOTE(review): C is chosen by accuracy on the test split, so the winning
# score is an optimistic estimate.
Cparams = [0.1, 1.0, 10.0, 1000.0, 10000.0]
best_acc, best_c_li, best_svmli = -1, -1, -1

for cp in Cparams:
    # kernel picks the kernel function ('linear', 'poly', 'rbf', ...);
    # C weights the error penalty term.
    svm_linear = SVC(kernel='linear', C=cp, random_state=0)
    svm_linear.fit(X_train_std, y_train)

    # Score the fitted model on the held-out split.
    y_pred = svm_linear.predict(X_test_std)
    acc = accuracy_score(y_test, y_pred)
    n_misclassified = (y_test != y_pred).sum()

    print('[Linear SVC with %.1f C]' % cp)
    print('Misclassified samples: %d' % n_misclassified)
    print('Accuracy: %.2f' % acc)

    # Strict comparison: on a tie the earlier C is kept.
    if best_acc < acc:
        best_acc, best_c_li, best_svmli = acc, cp, svm_linear

[Linear SVC with 0.1 C]
Misclassified samples: 0
Accuracy: 1.00
[Linear SVC with 1.0 C]
Misclassified samples: 0
Accuracy: 1.00
[Linear SVC with 10.0 C]
Misclassified samples: 0
Accuracy: 1.00
[Linear SVC with 1000.0 C]
Misclassified samples: 0
Accuracy: 1.00
[Linear SVC with 10000.0 C]
Misclassified samples: 0
Accuracy: 1.00


<div style="background-color:#F0F8FF;">
    <font style="font-size:20px; font-family:Andale Mono,monospace;">2. NonLinear SVC version</font>
</div>

In [15]:
# Candidate penalty strengths, plus the best (accuracy, C, model) seen so far.
# NOTE(review): C is chosen by accuracy on the test split, so the winning
# score is an optimistic estimate.
Cparams = [0.1, 1.0, 10.0, 1000.0, 10000.0]
best_acc, best_c_rbf, best_svmrbf = -1, -1, -1

for cp in Cparams:
    # C weights the error penalty term; gamma is the RBF kernel coefficient
    # and is held fixed at 0.2 for this sweep.
    svm_rbf = SVC(kernel='rbf', random_state=0, gamma=0.2, C=cp)
    svm_rbf.fit(X_train_std, y_train)

    # Score the fitted model on the held-out split.
    y_pred = svm_rbf.predict(X_test_std)
    acc = accuracy_score(y_test, y_pred)
    n_misclassified = (y_test != y_pred).sum()

    print('[Nonlinear SVC with %.1f C]' %cp)
    print('Misclassified samples: %d' % n_misclassified)
    print('Accuracy: %.2f' % acc)

    # Strict comparison: on a tie the earlier C is kept.
    if best_acc < acc:
        best_acc, best_c_rbf, best_svmrbf = acc, cp, svm_rbf

[Nonlinear SVC with 0.1 C]
Misclassified samples: 805
Accuracy: 0.50
[Nonlinear SVC with 1.0 C]
Misclassified samples: 15
Accuracy: 0.99
[Nonlinear SVC with 10.0 C]
Misclassified samples: 15
Accuracy: 0.99
[Nonlinear SVC with 1000.0 C]
Misclassified samples: 15
Accuracy: 0.99
[Nonlinear SVC with 10000.0 C]
Misclassified samples: 15
Accuracy: 0.99


<div style="background-color:#F0F8FF;">
    <font style="font-size:20px; font-family:Andale Mono,monospace;">3. Nonlinear SVC gridsearch version</font>
</div>

In [16]:
# Search grid: every combination of C and gamma for an RBF-kernel SVC.
param_C = [0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0]
param_gamma = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0]
svm = SVC(random_state=0)

# param_grid is a list of dictionaries; each dictionary describes one grid.
param_grid = [{'C': param_C, 
               'gamma': param_gamma, 
               'kernel': ['rbf']}]

# Cross-validated search on the training split, scored by accuracy.
gs = GridSearchCV(estimator=svm, 
                  param_grid=param_grid, 
                  scoring='accuracy')
gs = gs.fit(X_train_std, y_train)

# With the default refit=True, GridSearchCV has already refit the best
# parameter combination on the whole training split, so best_estimator_ is
# ready to use and a second fit() would only repeat that work.
clf = gs.best_estimator_

print("Best parameters :" + str(gs.best_params_))
print('\n[Nonlinear SVC: grid search]')
print('Test accuracy: %.2f' % clf.score(X_test_std, y_test))



Best parameters :{'C': 10.0, 'gamma': 0.001, 'kernel': 'rbf'}

[Nonlinear SVC: grid search]
Test accuracy: 1.00


<div style="background-color:#F0F8FF;">
    <font style="font-size:20px; font-family:Andale Mono,monospace;">4. Pipeline</font>
</div>

In [17]:
# Chain scaling and the SVC so that the raw, unscaled splits can be fed in
# directly. The SVC hyperparameters (C, gamma, kernel) come from the grid
# search result `gs.best_params_` instead of being copied in by hand, so this
# cell cannot drift out of sync when the grid or the data changes.
pipe_svm = Pipeline([
    ('scl', StandardScaler()),
    ('clf', SVC(random_state=0, **gs.best_params_))
])

# Fit on the raw training split, then predict the raw test split.
pipe_svm.fit(X_train, y_train)
y_pred = pipe_svm.predict(X_test)

print('\n[SVC with Pipeline]')
print('Misclassified samples: %d' % (y_test != y_pred).sum())
print('Accuracy: %.4f' % accuracy_score(y_test, y_pred))


[SVC with Pipeline]
Misclassified samples: 2
Accuracy: 0.9988


<div style="background-color:#F0F8FF;">
    <center>
        <h3 style="color:purple; font-size:25px; font-family:Andale Mono,monospace;">Summary</h3>
    </center>
</div>

<font style="font-size:20px; font-family:Andale Mono,monospace;">BEST ACCURACY OF EACH METHOD</font>
<table style="float:left; width:70%; font-size:16px;">
    <tr>
        <th style="color:white; background-color:#2E8B57;text-align:center;border:#006400 1px solid;">
            <b></b>
        </th>
        <th style="color:white; background-color:#2E8B57;text-align:center;border:#006400 1px solid;">
            <b>K-Nearest Neighbors Classifier</b>
        </th>
        <th style="color:white; background-color:#2E8B57;text-align:center;border:#006400 1px solid;">
            <b>Support Vector Classifier</b>
        </th>
    </tr>
    <tr>
        <td style="background-color:#2E8B57;text-align:center;border:#006400 1px solid; color:white;">
            <b>Basic</b>
        </td>
        <td style="background-color:#90EE90;text-align:center;border:#006400 1px solid;">
            <b>100%</b>
        </td>
        <td style="background-color:#90EE90;text-align:center;border:#006400 1px solid;">
            <b>100%(linear)</b>
            <br />
            <b>99%(non-linear)</b>
        </td>
    </tr>
    <tr>
        <td style="background-color:#2E8B57;text-align:center;border:#006400 1px solid; color:white;">
            <b>Gridsearch</b>
        </td>
        <td style="background-color:#90EE90;text-align:center;border:#006400 1px solid;">
            <b>&#128683;</b>
        </td>
        <td style="background-color:#90EE90;text-align:center;border:#006400 1px solid;">
            <b>100%(non-linear)</b>
        </td>
    </tr>
    <tr>
        <td style="background-color:#2E8B57;text-align:center;border:#006400 1px solid; color:white;">
            <b>Pipeline</b>
        </td>
        <td style="background-color:#90EE90;text-align:center;border:#006400 1px solid;">
            <b>100%</b>
        </td>
        <td style="background-color:#90EE90;text-align:center;border:#006400 1px solid;">
            <b>99.88%(non-linear)</b>
        </td>
    </tr>
</table>

<div style="background-color:#F0F8FF;">
    <center>
        <h3 style="color:purple; font-size:25px; font-family:Andale Mono,monospace;">Conclusion</h3>
    </center>
    <font style="font-size:20px; font-family:Andale Mono,monospace;">Well, since both KNN and SVM achieve great accuracy on this dataset, it is hard to decide which model achieves the best accuracy. </font>
    <br/>
    <font style="font-size:20px; font-family:Andale Mono,monospace;">However, KNN takes less time compared to SVM, and they reach similar accuracy in the end, so KNN is the best model in this case.</font>
</div>

<center>
    <font style="font-size:20px; font-family:Andale Mono,monospace;">EXECUTION TIME</font>
</center>
<table style="float:left; width:70%; font-size:16px;">
    <tr>
        <th style="color:white; background-color:#2E8B57;text-align:center;border:#006400 1px solid;">
            <b>Core</b>
        </th>
        <th style="color:white; background-color:#2E8B57;text-align:center;border:#006400 1px solid;">
            <b>Real</b>
        </th>
        <th style="color:white; background-color:#2E8B57;text-align:center;border:#006400 1px solid;">
            <b>User</b>
        </th>
        <th style="color:white; background-color:#2E8B57;text-align:center;border:#006400 1px solid;">
            <b>Sys</b>
        </th>
    </tr>
    <tr>
        <td style="background-color:#2E8B57;text-align:center;border:#006400 1px solid; color:white;">
            <b>1</b>
        </td>
        <td style="background-color:#90EE90;text-align:center;border:#006400 1px solid;">
            <b>1m51.381s</b>
        </td>
        <td style="background-color:#90EE90;text-align:center;border:#006400 1px solid;">
            <b>0m45.852s</b>
        </td>
        <td style="background-color:#90EE90;text-align:center;border:#006400 1px solid;">
            <b>0m1.268s</b>
        </td>
    </tr>
    <tr>
        <td style="background-color:#2E8B57;text-align:center;border:#006400 1px solid; color:white;">
            <b>2</b>
        </td>
        <td style="background-color:#90EE90;text-align:center;border:#006400 1px solid;">
            <b>1m51.554s</b>
        </td>
        <td style="background-color:#90EE90;text-align:center;border:#006400 1px solid;">
            <b>0m45.872s</b>
        </td>
        <td style="background-color:#90EE90;text-align:center;border:#006400 1px solid;">
            <b>0m1.368s</b>
        </td>
    </tr>
</table>