In [49]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [50]:
import os
import sys
import matplotlib.pyplot as plt

module_path = os.path.abspath(os.path.join('..\..'))
if module_path not in sys.path:
    sys.path.insert(0, module_path)

In [51]:
from funcs import SVM
import pandas as pd
from sklearn import preprocessing
import numpy as np
from sklearn.model_selection import train_test_split
import itertools
from sklearn.model_selection import KFold
from sklearn.svm import SVC

In [52]:
df = pd.read_csv('data/Letters_Q_O.csv')

train_df, test_df = train_test_split(df, test_size=0.2, random_state=1939671)

In [53]:
def process_df(df):
    X = np.array(df.drop(['letter'], axis=1))
    y_letter = np.array(df[['letter']])

    scaler = preprocessing.StandardScaler().fit(X)
    X = scaler.transform(X)

    y = []
    for i in y_letter:
        if i[0] == 'Q':
            y.append(-1)
        else:
            y.append(1)
    y = np.array(y)
    return X, y

In [71]:
grid_C = [1, 5, 10, 20, 30, 40, 50]
grid_gamma = [1, 2, 3, 4, 5]
grid_kernel = ['polynomial']

iterables = [grid_C, grid_gamma, grid_kernel]
min_loss = 1000
k_fold = 5

kf5 = KFold(n_splits=k_fold, shuffle=False)

list_ = [['C', 'Gamma', 'Kernel', 'Validation Loss']]

for t in itertools.product(*iterables):
    
    val_loss = 0
    val_loss_sklearn = 0
    
    C = t[0]
    gamma = t[1]
    kernel = t[2]
    
    print('C: ', C)
    print('gamma: ', gamma)
    print('kernel: ', kernel)
    
    for train_index, test_index in kf5.split(train_df):
        
        X_, y_ = process_df(train_df.iloc[train_index])
        X_val, y_val = process_df(train_df.iloc[test_index])

        svm = SVM(X_, y_, C=C, gamma=gamma, kernel=kernel)
        svm.fit(tol=1e-5, fix_intercept=False)
        
        val_loss += svm.eval(X_val, y_val)
        
        # sklearn implementation
        clf = SVC(kernel = 'poly', degree=gamma, C=C)
        clf.fit(X_, y_)
        val_loss_sklearn += clf.score(X_val, y_val)
        
    val_loss = val_loss/k_fold
    val_loss_sklearn = val_loss_sklearn/k_fold
    print('Accuracy: ', val_loss)
    print('Accuracy Sklearn: ', val_loss_sklearn)
    print('=======================')
    print('')
    
    list_.append([C, gamma, kernel, val_loss])

C:  1
gamma:  1
kernel:  polynomial
Accuracy:  0.9643982273704534
Accuracy Sklearn:  0.9625334561888466

C:  1
gamma:  2
kernel:  polynomial
Accuracy:  0.9718924136720636
Accuracy Sklearn:  0.8623535606160327

C:  1
gamma:  3
kernel:  polynomial
Accuracy:  0.9719011890658594
Accuracy Sklearn:  0.9672063533851082

C:  1
gamma:  4
kernel:  polynomial
Accuracy:  0.8988504234127506
Accuracy Sklearn:  0.8258172085472335

C:  1
gamma:  5
kernel:  polynomial
Accuracy:  0.8614189811767803
Accuracy Sklearn:  0.8360800315914177

C:  5
gamma:  1
kernel:  polynomial
Accuracy:  0.9653371945066036
Accuracy Sklearn:  0.9672019656882103

C:  5
gamma:  2
kernel:  polynomial
Accuracy:  0.9709534465359134
Accuracy Sklearn:  0.8979421701548856

C:  5
gamma:  3
kernel:  polynomial
Accuracy:  0.9719011890658594
Accuracy Sklearn:  0.974722478171208

C:  5
gamma:  4
kernel:  polynomial
Accuracy:  0.8988504234127506
Accuracy Sklearn:  0.9138344083190735

C:  5
gamma:  5
kernel:  polynomial
Accuracy:  0.8614189

KeyboardInterrupt: 

In [72]:
grid_C = [1, 5, 10, 20, 30, 40, 50]
grid_gamma = [0.01, 0.1, 0.2, 0.5, 0.75, 0.9]
grid_kernel = ['rbf']

iterables = [grid_C, grid_gamma, grid_kernel]
min_loss = 1000
k_fold = 5

kf5 = KFold(n_splits=k_fold, shuffle=False)

for t in itertools.product(*iterables):
    
    val_loss = 0
    val_loss_sklearn = 0
    
    C = t[0]
    gamma = t[1]
    kernel = t[2]
    
    print('C: ', C)
    print('gamma: ', gamma)
    print('kernel: ', kernel)
    
    for train_index, test_index in kf5.split(train_df):
        
        X_, y_ = process_df(train_df.iloc[train_index])
        X_val, y_val = process_df(train_df.iloc[test_index])

        svm = SVM(X_, y_, C=C, gamma=gamma, kernel=kernel)
        svm.fit(tol=1e-5, fix_intercept=False)
        
        # sklearn implementation
        clf = SVC(kernel = 'rbf', gamma=gamma, C=1/C)
        clf.fit(X_, y_)
        val_loss_sklearn += clf.score(X_val, y_val)
        
        val_loss += svm.eval(X_val, y_val)
        
    val_loss = val_loss/k_fold
    val_loss_sklearn = val_loss_sklearn/k_fold
    print('Accuracy: ', val_loss)
    print('Accuracy Sklearn: ', val_loss_sklearn)
    print('=======================')
    print('')
    
    list_.append([C, gamma, kernel, val_loss])

C:  1
gamma:  0.01
kernel:  rbf
Accuracy:  0.9616076521433901
Accuracy Sklearn:  0.9559914001140802

C:  1
gamma:  0.1
kernel:  rbf
Accuracy:  0.9878329165021281
Accuracy Sklearn:  0.9878329165021281

C:  1
gamma:  0.2
kernel:  rbf
Accuracy:  0.9868939493659779
Accuracy Sklearn:  0.9859549822298277

C:  1
gamma:  0.5
kernel:  rbf
Accuracy:  0.9616295906278796
Accuracy Sklearn:  0.9616295906278796

C:  1
gamma:  0.75
kernel:  rbf
Accuracy:  0.9270194374972578
Accuracy Sklearn:  0.9242113114826027

C:  1
gamma:  0.9
kernel:  rbf
Accuracy:  0.897069018472204
Accuracy Sklearn:  0.8877056732920892

C:  5
gamma:  0.01
kernel:  rbf
Accuracy:  0.9775174410951692
Accuracy Sklearn:  0.9306919398007985

C:  5
gamma:  0.1
kernel:  rbf
Accuracy:  0.990636654819885
Accuracy Sklearn:  0.9672239041726997

C:  5
gamma:  0.2
kernel:  rbf
Accuracy:  0.9859549822298275
Accuracy Sklearn:  0.9550831468562151

C:  5
gamma:  0.5
kernel:  rbf
Accuracy:  0.9644333289456364
Accuracy Sklearn:  0.8258874116975999


KeyboardInterrupt: 

In [244]:
res_df = pd.DataFrame(list_)
res_df.to_csv('res.csv', index=False, header=False)