# SVM for classification

## Dataset Preparation

In [41]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, accuracy_score
np.random.seed(102)

## Implementation of SVM

We choose the Gaussian kernel (RBF kernel) for SVM. For any $x_i, x_j \in\mathbb{R}^d$, we define
$$
\kappa(x_i, x_j) = \exp \bigl( - \|x_i - x_j\|^2 / (2 \sigma^2) \bigr)
$$ 
where $\sigma > 0$ is the width of the Gaussian kernel.

In [42]:
class SVM():
    '''
    This is a class for support vector machine.
    
    The class contains the hyper-parameters such as $C$ and the kernel bandwidth $\sigma$. It also contains 
    the alpha vector, the tolerance for prediction error and the maximum number of iteration.
    
    It contains the functions for calculating the kernel matrix, fitting the model to estimate alpha and b 
    with SMO algorithm, making predictions and other fundamental functions.
    
    Attributes:
        C (positive number)         - the hyperparameter for SVM algorithm
        sigma (positive number)     - the kernel bandwidth $\sigma$ of Gaussian kernel 
        toler (positive number)     - the threshold value of prediction error. If the prediction error of 
                                      a sample is larger than this value, the corresponding alpha_i will be 
                                      probably updated.
        maxIter (positive integer)  - the maximum number of iteration to search a pair of alpha's to update
        alphas (vector, num_samples)- the alpha vector in the dual problem 
        b (number)                  - the bias b
    '''
    
    def __init__(self, C = 1, sigma = 1, toler = 1, maxIter = 10):
        self.C = C
        self.sigma = sigma
        self.toler = toler
        self.maxIter = maxIter
        self.alphas = 0
        self.b = 0
        
    def rbfkernel(self, X, Y):
        '''
        Calculate the kernel matrix whose (i,j)-th entry is $k(X[i,:], Y[j,:])$.
        '''
        m = X.shape[0]
        n = Y.shape[0]
        K = np.zeros(shape=(m, n))
        for i in range(m):
            for j in range(n):
                K[i, j] = np.sum((X[i,:] - Y[j,:])**2)
        K = np.exp(-K / (2 * self.sigma**2))
        return K

    def selectJrand(self, i, m):
        '''
        Randomly choose an index $j\neq i$ from 0 to m-1
        '''
        j = i 
        while (j == i):
            j = int(np.random.uniform(0, m))
        return j


    def clipAlpha(self, aj, H, L):
        '''
        Clip the vale aj by the lower bound L and upper bound H
        '''
        if aj > H:
            aj = H
        if L > aj:
            aj = L
        return aj


    def fit(self, X_train, y_train):
        '''
        estimate the alphas vector and bias in the SVM model
        
        Args: 
            X_train (matrix, num_train*num_features): features of training samples
            y_train (vector, num_train): label of training samples, each label is either -1 or 1
            
        Returns:
            self.b (a number)                 : the bias
            self.alphas (vector, num_features): the alpha vector 
        ''' 
        K_train = self.rbfkernel(X_train, X_train)
        self.X_train = X_train
        self.y_train = y_train
        m = K_train.shape[0]
        self.alphas = np.zeros((m, ))
        num_iter = 0
        while (num_iter < self.maxIter):
            alphaPairsChanged = 0
            # optimize for each data vector (with kernel trick)
            for i in range(m):   
                fXi = (self.alphas * y_train) @ K_train[i, :] + self.b
                # if checks if an example violates KKT conditions
                Ei = fXi - y_train[i]
                if ((y_train[i] * Ei < -self.toler) and (self.alphas[i] < self.C)) \
                       or ((y_train[i] * Ei > self.toler) and (self.alphas[i] > 0)):
                    j = self.selectJrand(i, m)
                    fXj = (self.alphas * y_train) @ K_train[j, :] + self.b
                    Ej = fXj - y_train[j]
                    alphaJold = self.alphas[j].copy()
                    alphaIold = self.alphas[i].copy()
                    if (y_train[j] != y_train[i]):
                        L = max(0, self.alphas[i] - self.alphas[j])
                        H = min(self.C, self.C + self.alphas[i] - self.alphas[j])
                    else:
                        L = max(0, self.alphas[i] + self.alphas[j] - self.C)
                        H = min(self.C, self.alphas[i] + self.alphas[j])
                    if L == H:
                        continue
                    eta = 2.0 * K_train[j, i] - K_train[j, j] - K_train[i, i]
                    if eta >= 0:
                        continue
                    self.alphas[i] += y_train[i] * (Ei - Ej) / eta
                    self.alphas[i] = self.clipAlpha(self.alphas[i], H, L)
                    if (abs(self.alphas[i] - alphaIold) < 0.00001):
                        continue
                    # update i by the same amount as j, the direction depends on y[i] and y[j]
                    self.alphas[j] += y_train[i] * y_train[j] * (alphaIold - self.alphas[i])
                    # update self.b
                    b1 = self.b - Ej - y_train[j] * (self.alphas[j] - alphaJold) * K_train[
                        j, j] - y_train[i] * (self.alphas[i] - alphaIold) * K_train[j, i]
                    b2 = self.b - Ei - y_train[j] * (self.alphas[j] - alphaJold) * K_train[
                        j, i] - y_train[i] * (self.alphas[i] - alphaIold) * K_train[i, i]
                    if (0 < self.alphas[j]) and (self.C > self.alphas[j]): self.b = b1
                    elif (0 < self.alphas[i]) and (self.C > self.alphas[i]): self.b = b2
                    else: self.b = (b1 + b2) / 2.0
                    alphaPairsChanged += 1
            if (alphaPairsChanged == 0): num_iter += 1
            else: num_iter = 0
        return self.b, self.alphas


    def predict(self,X_test):
        '''
        predict the label of test samples
        Args:
            X_test(matrix, num_test*num_features): features of test samples
        Returns:
            y_hat(vector, num_test): the predicted label of test samples, each label is either -1 or 1
        '''
        K_test = self.rbfkernel(self.X_train, X_test)
        f = K_test.T @ (self.alphas * self.y_train) + self.b
        y_hat = np.sign(f)
        return y_hat

## Run SVM on the dataset

In [None]:
# def loadDataSet(dataset_path):
#     data = pd.read_table(dataset_path, header=None)
#     data_X, data_y = data.iloc[:, :-1], data.iloc[:, -1]
#     data_X = np.array(data_X, dtype=np.float32)
#     data_y = np.array(data_y, dtype=np.int64)
#     data_y[data_y == 0] = -1
#     return data_X, data_y

In [10]:
# # read the data
# X_train, y_train = loadDataSet("horseColicTraining.txt") 
# X_test, y_test = loadDataSet("horseColicTest.txt")
# # normalize
# scaler = MinMaxScaler()
# scaler.fit(X_train)
# X_train = scaler.transform(X_train)
# X_test = scaler.transform(X_test)

In [43]:
# load datasets function
def load_data(data_file_name):
    data_dir = "..\..\..\data\data_classification"
    data_path = os.path.join(data_dir, data_file_name)
    df = pd.read_csv(data_path)
    data_X = df.iloc[:,:-1]
    data_y = df.iloc[:,-1]
    scaler_X = StandardScaler()
    data_X = scaler_X.fit_transform(data_X)
    data_y = pd.Categorical(data_y).codes.reshape(-1)
    return data_X, data_y

def main():

    # read dataset from csv file
    data_name = "messidor_classification"
    data_X, data_y = load_data("{}.csv".format(data_name))
    # pd.csv.split(5)

    # Train and test set
    kf = KFold(n_splits=10)
    res_list = []
    for train_index, test_index in kf.split(data_X):
        train_X, train_y = data_X[train_index,:], data_y[train_index]
        test_X, test_y = data_X[test_index,:], data_y[test_index]
    
    return train_X, train_y,test_X, test_y

In [44]:
def run():
    # running the model and getting the accuracy
    model_SVM = SVM(C = 0.5, sigma = 2, toler=0.001, maxIter=40)
    b, alphas = model_SVM.fit(main()[0], main()[1])
    y_test_hat = model_SVM.predict(main()[2])
    accuracy = np.mean(y_test_hat == main()[3])
    print("The accuracy of SVM is:", accuracy)

In [46]:
print(main()[0])
run()

[[ 5.90538595e-02  2.98212905e-01 -6.41486299e-01 ... -1.29476283e+00
  -4.68655678e-01  1.40504812e+00]
 [ 5.90538595e-02  2.98212905e-01 -5.63391135e-01 ... -8.21678582e-02
   2.00605415e+00 -7.11719399e-01]
 [ 5.90538595e-02  2.98212905e-01  9.20416985e-01 ...  2.74282645e-01
   1.12151640e+00 -7.11719399e-01]
 ...
 [ 5.90538595e-02  2.98212905e-01 -1.11005728e+00 ...  1.65190892e+00
  -1.05576440e+00  1.40504812e+00]
 [ 5.90538595e-02  2.98212905e-01  4.12798418e-01 ... -1.19858966e+00
  -3.96554271e-02 -7.11719399e-01]
 [ 5.90538595e-02  2.98212905e-01  2.95655672e-01 ...  1.20361803e-03
   7.03443253e-01  1.40504812e+00]]
The accuracy of SVM is: 0.043478260869565216
