# Support Vector Machine
#### Name: 沈家駿
#### Student ID: 0712223

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

## Data Input

In [2]:
# Read the comma-separated ionosphere file. header=None tells pandas the file
# has no header row, so the columns get integer labels.
df = pd.read_csv('ionosphere.data', header=None, sep=',')
# Preview the first rows as a quick sanity check.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,1,0,0.99539,-0.05889,0.85243,0.02306,0.83398,-0.37708,1.0,0.0376,...,-0.51171,0.41078,-0.46168,0.21266,-0.3409,0.42267,-0.54487,0.18641,-0.453,g
1,1,0,1.0,-0.18829,0.93035,-0.36156,-0.10868,-0.93597,1.0,-0.04549,...,-0.26569,-0.20468,-0.18401,-0.1904,-0.11593,-0.16626,-0.06288,-0.13738,-0.02447,b
2,1,0,1.0,-0.03365,1.0,0.00485,1.0,-0.12062,0.88965,0.01198,...,-0.4022,0.58984,-0.22145,0.431,-0.17365,0.60436,-0.2418,0.56045,-0.38238,g
3,1,0,1.0,-0.45161,1.0,1.0,0.71216,-1.0,0.0,0.0,...,0.90695,0.51613,1.0,1.0,-0.20099,0.25682,1.0,-0.32382,1.0,b
4,1,0,1.0,-0.02401,0.9414,0.06531,0.92106,-0.23255,0.77152,-0.16399,...,-0.65158,0.1329,-0.53206,0.02431,-0.62197,-0.05707,-0.59573,-0.04608,-0.65697,g


## Data Preprocessing
### Divide the data into X (features) and y (labels).

In [3]:
# The last column holds the class label; every preceding column is a feature.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

## Support Vector Machine
### Train-Test-Split

In [4]:
# Hold out 30% of the samples for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

### Define a function to show the metrics

In [5]:
def show_metrics(metrics, lable):
    """Render classification metrics as four small tables.

    Parameters
    ----------
    metrics : tuple
        ``(cm, acc, pc, recall)``: the confusion matrix, the accuracy, and the
        per-class precision and recall. ``pc`` and ``recall`` must support
        ``.size`` / ``.reshape`` (e.g. NumPy arrays).
    lable : sequence of str
        Class names, used to label the rows and columns of the tables.

    The confusion matrix is labelled with predictions on the rows and actual
    classes on the columns, so ``cm`` must already be in that orientation.
    Nothing is returned; each table is shown with ``display``.
    """
    confusion, accuracy, precision, recall_scores = metrics
    n_classes = precision.size

    predicted_names = ['Predicted ' + name for name in lable]
    actual_names = ['Actual ' + name for name in lable]

    confusion_table = pd.DataFrame(confusion, index=predicted_names, columns=actual_names)
    display(confusion_table)

    accuracy_table = pd.DataFrame([accuracy], index=['Accuracy'], columns=['Avg.'])
    display(accuracy_table)

    # Per-class scores are 1-D; turn each into a single-row table.
    precision_table = pd.DataFrame(
        precision.reshape(1, n_classes), index=['Precision'], columns=lable
    )
    display(precision_table)

    recall_table = pd.DataFrame(
        recall_scores.reshape(1, n_classes), index=['Recall'], columns=lable
    )
    display(recall_table)

### Linear kernel

In [6]:
# Fit a linear-kernel SVM on the training split, then predict the held-out samples.
svc_linear = SVC(kernel='linear').fit(X_train, y_train)
y_pred = svc_linear.predict(X_test)

# Score the predictions. The confusion matrix is transposed so that rows are
# predictions and columns are actual classes, which is how show_metrics labels it.
precision, recall, _, _ = sklearn.metrics.precision_recall_fscore_support(y_test, y_pred)
acc = sklearn.metrics.accuracy_score(y_test, y_pred)
cm = sklearn.metrics.confusion_matrix(y_test, y_pred).T

print('Results:', 'Confusion matrix', sep='\n')
show_metrics((cm, acc, precision, recall), ['Bad', 'Good'])

Results:
Confusion matrix


Unnamed: 0,Actual Bad,Actual Good
Predicted Bad,26,0
Predicted Good,10,70


Unnamed: 0,Avg.
Accuracy,0.90566


Unnamed: 0,Bad,Good
Precision,1.0,0.875


Unnamed: 0,Bad,Good
Recall,0.722222,1.0


### Polynomial kernel
#### Parameter Search

In [7]:
# Candidate hyperparameters for the polynomial kernel: the search evaluates
# every combination of the coef0, degree and gamma values listed here.
poly_parameters = {'kernel':['poly'], 'coef0':[0.1, 1, 10], 'degree':[2, 3, 4], 'gamma':[0.1, 0.5, 1]}
Grid_poly = GridSearchCV(SVC(), poly_parameters)
Grid_poly.fit(X_train, y_train)

# Mean cross-validated score of each candidate (one entry per parameter
# combination, aligned with cv_results_['params']).
test_score_poly = Grid_poly.cv_results_['mean_test_score']

# best_params_ is the top-ranked candidate of the search; reading it directly
# replaces the manual lookup of the first rank-1 entry in cv_results_.
best_params = Grid_poly.best_params_

print('Best params: ', best_params)

Best params:  {'coef0': 0.1, 'degree': 2, 'gamma': 0.5, 'kernel': 'poly'}


#### Results

In [8]:
# Take the parameters from the polynomial search object itself. The shared
# name `best_params` is rebound by the RBF search cell, so relying on it here
# would train the wrong kernel whenever cells are re-run out of order.
svc_poly = SVC(**Grid_poly.best_params_)
svc_poly.fit(X_train, y_train)
y_pred = svc_poly.predict(X_test)

# Transpose the confusion matrix so rows are predictions and columns are
# actual classes, matching the labels show_metrics puts on the table.
cm = sklearn.metrics.confusion_matrix(y_test, y_pred).transpose()
acc = sklearn.metrics.accuracy_score(y_test, y_pred)
precision, recall, _, _ = sklearn.metrics.precision_recall_fscore_support(y_test, y_pred)

print('Results:')
print('Confusion matrix')
show_metrics((cm, acc, precision, recall), ['Bad', 'Good'])

Results:
Confusion matrix


Unnamed: 0,Actual Bad,Actual Good
Predicted Bad,25,0
Predicted Good,11,70


Unnamed: 0,Avg.
Accuracy,0.896226


Unnamed: 0,Bad,Good
Precision,1.0,0.864198


Unnamed: 0,Bad,Good
Recall,0.694444,1.0


### RBF kernel
#### Parameter Search

In [9]:
# Candidate hyperparameters for the RBF kernel: only gamma is searched.
rbf_parameters = {'kernel':['rbf'], 'gamma':[0.1, 0.5, 1]}
Grid_rbf = GridSearchCV(SVC(), rbf_parameters)
Grid_rbf.fit(X_train, y_train)

# Mean cross-validated score of each candidate (one entry per gamma value,
# aligned with cv_results_['params']).
test_score_rbf = Grid_rbf.cv_results_['mean_test_score']

# best_params_ is the top-ranked candidate of the search; reading it directly
# replaces the manual lookup of the first rank-1 entry in cv_results_.
best_params = Grid_rbf.best_params_

print('Best params: ', best_params)

Best params:  {'gamma': 0.1, 'kernel': 'rbf'}


#### Results

In [10]:
# Take the parameters from the RBF search object itself. The shared name
# `best_params` is also assigned by the polynomial search cell, so relying on
# it here would train the wrong kernel whenever cells are re-run out of order.
svc_rbf = SVC(**Grid_rbf.best_params_)
svc_rbf.fit(X_train, y_train)
y_pred = svc_rbf.predict(X_test)

# Transpose the confusion matrix so rows are predictions and columns are
# actual classes, matching the labels show_metrics puts on the table.
cm = sklearn.metrics.confusion_matrix(y_test, y_pred).transpose()
acc = sklearn.metrics.accuracy_score(y_test, y_pred)
precision, recall, _, _ = sklearn.metrics.precision_recall_fscore_support(y_test, y_pred)

print('Results:')
print('Confusion matrix')
show_metrics((cm, acc, precision, recall), ['Bad', 'Good'])

Results:
Confusion matrix


Unnamed: 0,Actual Bad,Actual Good
Predicted Bad,28,0
Predicted Good,8,70


Unnamed: 0,Avg.
Accuracy,0.924528


Unnamed: 0,Bad,Good
Precision,1.0,0.897436


Unnamed: 0,Bad,Good
Recall,0.777778,1.0


## Comparison & Conclusion
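- Non-linear kernels can sometimes outperform the linear kernel because they implicitly map the data into a higher-dimensional feature space. Here the RBF kernel gave the best test accuracy (0.925, versus 0.906 for the linear kernel), while the polynomial kernel (0.896) did not improve on the linear one.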

- Grid search with cross-validation provides a convenient, systematic way to find a well-performing combination of hyperparameters.

## Questions 
> Show the average performance of K-fold cross-validation of parameter search in tables for each kernel. 
### Polynomial kernel

In [11]:
# Rows are every (coef0, degree) pair and columns are the gamma values. Both
# are derived from poly_parameters so the table follows the grid if it changes.
coef0_degree_index = pd.MultiIndex.from_product(
    [poly_parameters['coef0'], poly_parameters['degree']],
    names=['coef0', 'degree'],
)
gamma_columns = pd.Index(poly_parameters['gamma'], name='gamma')

# NOTE(review): the reshape relies on the scores being ordered with coef0
# varying slowest and gamma fastest (the grid's parameter names in sorted
# order, with the single-valued kernel last). Re-check if names are added or renamed.
poly_scores = pd.DataFrame(
    test_score_poly.reshape(len(coef0_degree_index), len(gamma_columns)),
    index=coef0_degree_index,
    columns=gamma_columns,
)

# Keep the raw scores in poly_scores; df1 is only the captioned view for display.
df1 = poly_scores.style.set_caption("Score of polynomial kernel")
display(df1)

Unnamed: 0_level_0,gamma,0.1,0.5,1.0
coef0,degree,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.1,2,0.881633,0.910204,0.889796
0.1,3,0.889796,0.877551,0.840816
0.1,4,0.840816,0.853061,0.84898
1.0,2,0.865306,0.893878,0.885714
1.0,3,0.902041,0.861224,0.853061
1.0,4,0.897959,0.865306,0.840816
10.0,2,0.873469,0.873469,0.877551
10.0,3,0.873469,0.84898,0.840816
10.0,4,0.840816,0.836735,0.844898


### RBF kernel

In [12]:
# Tabulate the mean cross-validated score for each gamma tried in the RBF search.
df_grid_rbf = pd.DataFrame(
    {'Score of RBF kernel': test_score_rbf},
    index=pd.Index(rbf_parameters['gamma'], name='gamma'),
)
df_grid_rbf

Unnamed: 0_level_0,Score of RBF kernel
gamma,Unnamed: 1_level_1
0.1,0.959184
0.5,0.926531
1.0,0.914286
