In [23]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [24]:
import os
import sys
import matplotlib.pyplot as plt

module_path = os.path.abspath(os.path.join('..\..'))
if module_path not in sys.path:
    sys.path.insert(0, module_path)

In [25]:
from funcs import SVM
import pandas as pd
from sklearn import preprocessing
import numpy as np
from sklearn.model_selection import train_test_split
import itertools
from sklearn.model_selection import KFold
from sklearn.svm import SVC

In [28]:
df = pd.read_csv('data/Letters_Q_O.csv')
columns = list(df.columns[:-1])

In [29]:
df[columns] = preprocessing.StandardScaler().fit_transform(df[columns])

train_df, test_df = train_test_split(df, test_size=0.2, random_state=1939671)

In [30]:
def process_df(df):
    X = np.array(df.drop(['letter'], axis=1))
    y_letter = np.array(df[['letter']])

    y = []
    for i in y_letter:
        if i[0] == 'Q':
            y.append(-1)
        else:
            y.append(1)
    y = np.array(y)
    return X, y

In [31]:
grid_C = [1, 5, 10, 20, 30, 40, 50]
grid_gamma = [1, 2, 3, 4, 5]
grid_kernel = ['polynomial']

iterables = [grid_C, grid_gamma, grid_kernel]
min_loss = 1000
k_fold = 5

kf5 = KFold(n_splits=k_fold, shuffle=False)

list_ = [['C', 'Gamma', 'Kernel', 'Validation Loss']]

for t in itertools.product(*iterables):
    
    val_loss = 0
    val_loss_sklearn = 0
    
    C = t[0]
    gamma = t[1]
    kernel = t[2]
    
    print('C: ', C)
    print('gamma: ', gamma)
    print('kernel: ', kernel)
    
    for train_index, test_index in kf5.split(train_df):
        
        X_, y_ = process_df(train_df.iloc[train_index])
        X_val, y_val = process_df(train_df.iloc[test_index])

        svm = SVM(X_, y_, C=C, gamma=gamma, kernel=kernel)
        svm.fit(tol=1e-5, fix_intercept=False)
        
        val_loss += svm.eval(X_val, y_val)
        
        # sklearn implementation
        clf = SVC(kernel = 'poly', degree=gamma, C=C)
        clf.fit(X_, y_)
        val_loss_sklearn += clf.score(X_val, y_val)
        
    val_loss = val_loss/k_fold
    val_loss_sklearn = val_loss_sklearn/k_fold
    print('Accuracy: ', val_loss)
    print('Accuracy Sklearn: ', val_loss_sklearn)
    print('=======================')
    print('')
    
    list_.append([C, gamma, kernel, val_loss])

C:  1
gamma:  1
kernel:  polynomial
Accuracy:  0.967219516475802
Accuracy Sklearn:  0.9634855864156904

C:  1
gamma:  2
kernel:  polynomial
Accuracy:  0.9812557588521784
Accuracy Sklearn:  0.8604887894344259

C:  1
gamma:  3
kernel:  polynomial
Accuracy:  0.9831380808213768
Accuracy Sklearn:  0.9728401562020095

C:  1
gamma:  4
kernel:  polynomial
Accuracy:  0.904444736957571
Accuracy Sklearn:  0.8183405730332147

C:  1
gamma:  5
kernel:  polynomial
Accuracy:  0.8707823263568952
Accuracy Sklearn:  0.8286165591680927

C:  5
gamma:  1
kernel:  polynomial
Accuracy:  0.9672107410820061
Accuracy Sklearn:  0.9634724233249967

C:  5
gamma:  2
kernel:  polynomial
Accuracy:  0.9812557588521784
Accuracy Sklearn:  0.9101092536527577

C:  5
gamma:  3
kernel:  polynomial
Accuracy:  0.9831380808213768
Accuracy Sklearn:  0.9812689219428721

C:  5
gamma:  4
kernel:  polynomial
Accuracy:  0.904444736957571
Accuracy Sklearn:  0.9091702865166076

C:  5
gamma:  5
kernel:  polynomial


KeyboardInterrupt: 

In [None]:
grid_C = [1, 5, 10, 20, 30, 40, 50]
grid_gamma = [0.01, 0.1, 0.2, 0.5, 0.75, 0.9]
grid_kernel = ['rbf']

iterables = [grid_C, grid_gamma, grid_kernel]
min_loss = 1000
k_fold = 5

kf5 = KFold(n_splits=k_fold, shuffle=False)

for t in itertools.product(*iterables):
    
    val_loss = 0
    val_loss_sklearn = 0
    
    C = t[0]
    gamma = t[1]
    kernel = t[2]
    
    print('C: ', C)
    print('gamma: ', gamma)
    print('kernel: ', kernel)
    
    for train_index, test_index in kf5.split(train_df):
        
        X_, y_ = process_df(train_df.iloc[train_index])
        X_val, y_val = process_df(train_df.iloc[test_index])

        svm = SVM(X_, y_, C=C, gamma=gamma, kernel=kernel)
        svm.fit(tol=1e-5, fix_intercept=False)
        
        # sklearn implementation
        clf = SVC(kernel = 'rbf', gamma=gamma, C=1/C)
        clf.fit(X_, y_)
        val_loss_sklearn += clf.score(X_val, y_val)
        
        val_loss += svm.eval(X_val, y_val)
        
    val_loss = val_loss/k_fold
    val_loss_sklearn = val_loss_sklearn/k_fold
    print('Accuracy: ', val_loss)
    print('Accuracy Sklearn: ', val_loss_sklearn)
    print('=======================')
    print('')
    
    list_.append([C, gamma, kernel, val_loss])

In [None]:
res_df = pd.DataFrame(list_)
res_df.to_csv('res.csv', index=False, header=False)

## MultiClass SVM

In [130]:
def convert_labels(list_, dict_):
    """
    List contains the predictions e.g. [-1, 1, 1, -1, ...]
    Dictionary contains the relation, e.g. {'-1': 'Q', '1': 'D'}
    """
    
    l = [dict_[i] for i in list_]
    
    return l

In [131]:
def voting(list_):
    """
    list_ is a list of lists
    """
    best = []
    l = list(map(list, zip(*list_)))
    for i in l:
        t = {'Q':0, 'O':0, 'D':0}
        for j in i:
            t[j] += 1
        best.append(max(t, key=t.get))
    return best

In [132]:
import copy

def process_df(df, letters=['Q', 'O'], test=False):
    df = df.copy()
    if test:
        X = np.array(df.drop(['letter'], axis=1))
        y_letter = np.array(df[['letter']])
        return X, np.array(y_letter)
    else:
        df = df[df.letter.isin(letters)]
        X = np.array(df.drop(['letter'], axis=1))
        y_letter = np.array(df[['letter']])

        y = []
        for i in y_letter:
            if i[0] == letters[0]:
                y.append(-1)
            else:
                y.append(1)
        y = np.array(y)
        return X, y

In [133]:
df_Q_O = pd.read_csv('data/Letters_Q_O.csv')
df_D = pd.read_csv('data/Letter_D.csv')
df = pd.concat([df_D, df_Q_O])

columns = list(df.columns[:-1])

df[columns] = preprocessing.StandardScaler().fit_transform(df[columns])

train_df, test_df = train_test_split(df, test_size=0.2, random_state=1939671)

In [134]:
C = 5
gamma = 0.1
kernel = 'rbf'

X_Q_O, y_Q_O = process_df(train_df, ['Q', 'O'])
X_Q_D, y_Q_D = process_df(train_df, ['Q', 'D'])
X_D_O, y_D_O = process_df(train_df, ['D', 'O'])

X_test, y_test = process_df(test_df, test=True)

In [135]:
SVM_Q_O = SVM(X_Q_O, y_Q_O, C=C, gamma=gamma, kernel=kernel)
SVM_Q_O.fit(tol=1e-5, fix_intercept=False)

SVM_Q_D = SVM(X_Q_D, y_Q_D, C=C, gamma=gamma, kernel=kernel)
SVM_Q_D.fit(tol=1e-5, fix_intercept=False)

SVM_D_O = SVM(X_D_O, y_D_O, C=C, gamma=gamma, kernel=kernel)
SVM_D_O.fit(tol=1e-5, fix_intercept=False)

In [136]:
l1 = convert_labels(SVM_Q_O.pred(X_test), {-1: 'Q', 1: 'O'})
l2 = convert_labels(SVM_Q_D.pred(X_test), {-1: 'Q', 1: 'D'})
l3 = convert_labels(SVM_D_O.pred(X_test), {-1: 'D', 1: 'O'})

l = [l1, l2, l3]
l = voting(l)

y_test = [item for sublist in y_test for item in sublist]

print('Accuracy: ', sum(np.array(y_test) == np.array(l))/len(y_test))

Accuracy:  0.9901960784313726
