# 1. SVM for classification

## 1.1 Dataset Preparation

In [7]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler 
np.random.seed(102)

In [9]:
def loadDataSet(dataset_path):
    data = pd.read_table(dataset_path, header=None)
    data_X, data_y = data.iloc[:, :-1], data.iloc[:, -1]
    data_X = np.array(data_X, dtype=np.float32)
    data_y = np.array(data_y, dtype=np.int64)
    data_y[data_y == 0] = -1
    return data_X, data_y

In [10]:
# read the data
X_train, y_train = loadDataSet("horseColicTraining.txt") 
X_test, y_test = loadDataSet("horseColicTest.txt")
# normalize
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## 1.2 Implementation of SVM

We choose the Gaussian kernel (RBF kernel) for SVM. For any $x_i, x_j \in\mathbb{R}^d$, we define
$$
\kappa(x_i, x_j) = \exp \bigl( - \|x_i - x_j\|^2 / (2 \sigma^2) \bigr)
$$ 
where $\sigma > 0$ is the width of the Gaussian kernel.

In [11]:
class SVM():
    '''
    This is a class for support vector machine.
    
    The class contains the hyper-parameters such as $C$ and the kernel bandwidth $\sigma$. It also contains 
    the alpha vector, the tolerance for prediction error and the maximum number of iteration.
    
    It contains the functions for calculating the kernel matrix, fitting the model to estimate alpha and b 
    with SMO algorithm, making predictions and other fundamental functions.
    
    Attributes:
        C (positive number)         - the hyperparameter for SVM algorithm
        sigma (positive number)     - the kernel bandwidth $\sigma$ of Gaussian kernel 
        toler (positive number)     - the threshold value of prediction error. If the prediction error of 
                                      a sample is larger than this value, the corresponding alpha_i will be 
                                      probably updated.
        maxIter (positive integer)  - the maximum number of iteration to search a pair of alpha's to update
        alphas (vector, num_samples)- the alpha vector in the dual problem 
        b (number)                  - the bias b
    '''
    
    def __init__(self, C = 1, sigma = 1, toler = 1, maxIter = 10):
        self.C = C
        self.sigma = sigma
        self.toler = toler
        self.maxIter = maxIter
        self.alphas = 0
        self.b = 0
        
    def rbfkernel(self, X, Y):
        '''
        Calculate the kernel matrix whose (i,j)-th entry is $k(X[i,:], Y[j,:])$.
        '''
        m = X.shape[0]
        n = Y.shape[0]
        K = np.zeros(shape=(m, n))
        for i in range(m):
            for j in range(n):
                K[i, j] = np.sum((X[i,:] - Y[j,:])**2)
        K = np.exp(-K / (2 * self.sigma**2))
        return K

    def selectJrand(self, i, m):
        '''
        Randomly choose an index $j\neq i$ from 0 to m-1
        '''
        j = i 
        while (j == i):
            j = int(np.random.uniform(0, m))
        return j


    def clipAlpha(self, aj, H, L):
        '''
        Clip the vale aj by the lower bound L and upper bound H
        '''
        if aj > H:
            aj = H
        if L > aj:
            aj = L
        return aj


    def fit(self, X_train, y_train):
        '''
        estimate the alphas vector and bias in the SVM model
        
        Args: 
            X_train (matrix, num_train*num_features): features of training samples
            y_train (vector, num_train): label of training samples, each label is either -1 or 1
            
        Returns:
            self.b (a number)                 : the bias
            self.alphas (vector, num_features): the alpha vector 
        ''' 
        K_train = self.rbfkernel(X_train, X_train)
        self.X_train = X_train
        self.y_train = y_train
        m = K_train.shape[0]
        self.alphas = np.zeros((m, ))
        num_iter = 0
        while (num_iter < self.maxIter):
            alphaPairsChanged = 0
            # optimize for each data vector (with kernel trick)
            for i in range(m):   
                fXi = (self.alphas * y_train) @ K_train[i, :] + self.b
                # if checks if an example violates KKT conditions
                Ei = fXi - y_train[i]
                if ((y_train[i] * Ei < -self.toler) and (self.alphas[i] < self.C)) \
                       or ((y_train[i] * Ei > self.toler) and (self.alphas[i] > 0)):
                    j = self.selectJrand(i, m)
                    fXj = (self.alphas * y_train) @ K_train[j, :] + self.b
                    Ej = fXj - y_train[j]
                    alphaJold = self.alphas[j].copy()
                    alphaIold = self.alphas[i].copy()
                    if (y_train[j] != y_train[i]):
                        L = max(0, self.alphas[i] - self.alphas[j])
                        H = min(self.C, self.C + self.alphas[i] - self.alphas[j])
                    else:
                        L = max(0, self.alphas[i] + self.alphas[j] - self.C)
                        H = min(self.C, self.alphas[i] + self.alphas[j])
                    if L == H:
                        continue
                    eta = 2.0 * K_train[j, i] - K_train[j, j] - K_train[i, i]
                    if eta >= 0:
                        continue
                    self.alphas[i] += y_train[i] * (Ei - Ej) / eta
                    self.alphas[i] = self.clipAlpha(self.alphas[i], H, L)
                    if (abs(self.alphas[i] - alphaIold) < 0.00001):
                        continue
                    # update i by the same amount as j, the direction depends on y[i] and y[j]
                    self.alphas[j] += y_train[i] * y_train[j] * (alphaIold - self.alphas[i])
                    # update self.b
                    b1 = self.b - Ej - y_train[j] * (self.alphas[j] - alphaJold) * K_train[
                        j, j] - y_train[i] * (self.alphas[i] - alphaIold) * K_train[j, i]
                    b2 = self.b - Ei - y_train[j] * (self.alphas[j] - alphaJold) * K_train[
                        j, i] - y_train[i] * (self.alphas[i] - alphaIold) * K_train[i, i]
                    if (0 < self.alphas[j]) and (self.C > self.alphas[j]): self.b = b1
                    elif (0 < self.alphas[i]) and (self.C > self.alphas[i]): self.b = b2
                    else: self.b = (b1 + b2) / 2.0
                    alphaPairsChanged += 1
            if (alphaPairsChanged == 0): num_iter += 1
            else: num_iter = 0
        return self.b, self.alphas


    def predict(self,X_test):
        '''
        predict the label of test samples
        Args:
            X_test(matrix, num_test*num_features): features of test samples
        Returns:
            y_hat(vector, num_test): the predicted label of test samples, each label is either -1 or 1
        '''
        K_test = self.rbfkernel(self.X_train, X_test)
        f = K_test.T @ (self.alphas * self.y_train) + self.b
        y_hat = np.sign(f)
        return y_hat

## 1.3 Run SVM on the dataset

In [12]:
model_SVM = SVM(C = 0.5, sigma = 2, toler=0.001, maxIter=40)
b, alphas = model_SVM.fit(X_train, y_train)
y_test_hat = model_SVM.predict(X_test)
accuracy = np.mean(y_test_hat == y_test)
print("The accuracy of SVM is:", accuracy)

The accuracy of SVM is: 0.7761194029850746


## 2. Regression

## 2.1 Dataset Preparation

In [13]:
# Load data for ridge regression and lasso
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
df = pd.read_csv("energydata_rv1.csv")
X = np.array(df.iloc[:,:-1])[:1000,:]
y = np.array(df.iloc[:,-1])[:1000]
# normalize
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

# split the train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2022)

## 2.2 Implementation of SVR

In [14]:
class SVR():
    '''
    This is a class for support vector regressor.
    
    The class contains the hyper-parameters such as $C$, $\epsilon$ and the kernel bandwidth $\sigma$. It also contains 
    the alpha vector, the tolerance for prediction error and the maximum number of iteration.
    
    It contains the functions for calculating the kernel matrix, fitting the model to estimate alpha, hat_alpha and b 
    with SMO algorithm, making predictions and other fundamental functions.
    
    Attributes:
        C (positive number)         - the hyperparameter for SVM algorithm
        sigma (positive number)     - the kernel bandwidth $\sigma$ of Gaussian kernel 
        toler (positive number)     - $\epsilon$, the threshold value of prediction error. If the prediction error of 
                                      a sample is larger than this value, the corresponding alphas[i] or hat_alpha[i]
                                      will be probably updated.
        maxIter (positive integer)  - the maximum number of iteration to search a pair of alpha's to update
        alphas (vector, num_samples) - the alpha vector in the dual problem 
        hat_alphas (vector, num_samples) - the hat_alpha vector in the dual problem 
        b (number)                  - the bias b
    '''
    
    def __init__(self, C = 1, sigma = 1, toler = 1, maxIter = 10):
        self.C = C
        self.sigma = sigma
        self.toler = toler
        self.maxIter = maxIter
        self.alphas = 0
        self.hat_alphas = 0
        self.b = 0
        
    def rbfkernel(self, X, Y):
        '''
        Calculate the kernel matrix whose (i,j)-th entry is $k(X[i,:], Y[j,:])$.
        '''
        m = X.shape[0]
        n = Y.shape[0]
        K = np.zeros(shape=(m, n))
        for i in range(m):
            for j in range(n):
                K[i, j] = np.sum((X[i,:] - Y[j,:])**2)
        K = np.exp(-K / (2 * self.sigma**2))
        return K

    def selectJrand(self, i, m):
        '''
        Randomly choose an index $j\neq i$ from 0 to m-1 and choose alpha[i] (or hat_alpha[i]) if hat_alpha[i] = 0 (or alpha[i]=0).
        '''
        j = i
        while (j == i):
            j = int(np.random.uniform(0, m))
        if self.alphas[j] == 0:
            choose_hat_j = 1
        else:
            choose_hat_j = 0
        return (j, choose_hat_j)


    def clipAlpha(self, aj, H, L):
        '''
        Clip the vale aj by the lower bound L and upper bound H
        '''
        if aj > H:
            aj = H
        if L > aj:
            aj = L
        return aj


    def fit(self, X_train, y_train):
        '''
        estimate the alphas vector, hat_alpha vector and bias in the SVM model
        
        Args: 
            X_train (matrix, num_train*num_features): features of training samples
            y_train (vector, num_train): output of training samples
            
        Returns:
            self.b (a number)                     : the bias
            self.alphas (vector, num_samples)     : the alpha vector 
            self.hat_alphas (vector, num_samples) : the hat_alpha vector 
        ''' 
        K_train = self.rbfkernel(X_train, X_train)
        self.X_train = X_train
        m = K_train.shape[0]
        self.alphas = np.zeros((m, ))
        self.hat_alphas = np.zeros((m, ))
        num_iter = 0
        while (num_iter < self.maxIter):
            alphaPairsChanged = 0
            # optimize for each data vector (with kernel trick)
            for i in range(m):  
                ### choose hat_alphas[i] or alphas[i] to update
                if self.hat_alphas[i] == 0:
                    choose_hat_i = 0
                else:
                    choose_hat_i = 1
                fXi = (self.hat_alphas - self.alphas) @ K_train[i, :] + self.b
                Ei = fXi - y_train[i]
                ### if statement checks if an example violates KKT conditions
                if (abs(Ei) < self.toler) or (self.alphas[i] == self.C) or (self.hat_alphas[i] == self.C):
                    continue
                else:
                    j, choose_hat_j = self.selectJrand(i, m)
                    fXj = (self.hat_alphas-self.alphas) @ K_train[j, :] + self.b
                    Ej = fXj - y_train[j]
                    alphaJold = self.alphas[j]
                    alphaIold = self.alphas[i]
                    hat_alphaJold = self.hat_alphas[j]
                    hat_alphaIold = self.hat_alphas[i]
                    if (choose_hat_i == 0) and (choose_hat_j == 1):
                        L = max(0, self.alphas[i] - self.hat_alphas[j])
                        H = min(self.C, self.C + self.alphas[i] - self.hat_alphas[j])
                    elif (choose_hat_i == 1) and (choose_hat_j == 0):
                        L = max(0, self.hat_alphas[i] - self.alphas[j])
                        H = min(self.C, self.C + self.hat_alphas[i] - self.alphas[j])
                    elif (choose_hat_i == 0) and (choose_hat_j == 0):
                        L = max(0, self.alphas[i] + self.alphas[j] - self.C)
                        H = min(self.C, self.alphas[i] + self.alphas[j])
                    else:
                        L = max(0, self.hat_alphas[i] + self.hat_alphas[j] - self.C)
                        H = min(self.C, self.hat_alphas[i] + self.hat_alphas[j])
                    if L == H:
                        continue
                    eta = 2.0 * K_train[j, i] - K_train[j, j] - K_train[i, i]
                    if eta >= 0:
                        continue
                    ### Update i 
                    if (choose_hat_i == 0) and (choose_hat_j == 1):
                        self.alphas[i] += (2 * self.toler - Ei + Ej) / eta
                    elif (choose_hat_i == 1) and (choose_hat_j == 0):
                        self.hat_alphas[i] += (2 * self.toler + Ei - Ej) / eta
                    elif (choose_hat_i == 0) and (choose_hat_j == 0):
                        self.alphas[i] += (-Ei + Ej) / eta
                    else:
                        self.hat_alphas[i] += (Ei - Ej) / eta
                    if choose_hat_i == 0:
                        self.alphas[i] = self.clipAlpha(self.alphas[i], H, L)
                    else:
                        self.hat_alphas[i] = self.clipAlpha(self.hat_alphas[i], H, L)
                    ### Whether the change amount is large enough or not
                    if choose_hat_i == 0:
                        if (abs(self.alphas[i] - alphaIold) < 0.00001):
                            continue
                    else:
                        if (abs(self.hat_alphas[i] - hat_alphaIold) < 0.00001):
                            continue
                    # update j by the same amount as i
                    if (choose_hat_i == 0) and (choose_hat_j == 1):
                        self.hat_alphas[j] += self.alphas[i] - alphaIold
                    elif (choose_hat_i == 1) and (choose_hat_j == 0):
                        self.alphas[j] += self.hat_alphas[i] - hat_alphaIold
                    elif (choose_hat_i == 0) and (choose_hat_j == 0):
                        self.alphas[j] += alphaIold - self.alphas[i]
                    else:
                        self.hat_alphas[j] += hat_alphaIold - self.hat_alphas[i]
                    # update b
                    b1 = self.b - Ej - (self.hat_alphas[j] - hat_alphaJold - (self.alphas[j] - alphaJold))    \
                        * K_train[j, j] - (self.hat_alphas[i] - hat_alphaIold - (self.alphas[i] - alphaIold)) * K_train[i, j]
                    b2 = self.b - Ei - (self.hat_alphas[j] - hat_alphaJold - (self.alphas[j] - alphaJold))   \
                        * K_train[j, i] - (self.hat_alphas[i] - hat_alphaIold - (self.alphas[i] - alphaIold)) * K_train[i, i]
                    if ((0 < self.alphas[j]) and (self.C > self.alphas[j])) or ((0 < self.hat_alphas[j]) and (self.C > self.hat_alphas[j])): 
                        self.b = b1
                    elif ((0 < self.alphas[i]) and (self.C > self.alphas[i])) or ((0 < self.hat_alphas[i]) and (self.C > self.hat_alphas[i])): 
                        self.b = b2
                    else: self.b = (b1 + b2) / 2.0
                    alphaPairsChanged += 1
            if (alphaPairsChanged == 0): 
                num_iter += 1
            else: 
                num_iter = 0
        return self.b, self.alphas, self.hat_alphas

    def predict(self, X_test):
        '''
        predict the label of test samples
        Args:
            X_test(matrix, num_test*num_features): features of test samples
        Returns:
            y_hat(vector, num_test): the predicted ouput of test samples
        '''
        K_test = self.rbfkernel(self.X_train, X_test)
        m = K_test.shape[0]
        y_hat = K_test.T @ (self.hat_alphas - self.alphas) + self.b
        return y_hat

## 1.3 Run SVR on the dataset

In [15]:
from sklearn.metrics import mean_squared_error
model_SVR = SVR(C = 0.5, sigma = 5, toler = 5, maxIter = 30)
b, alphas, hat_alphas = model_SVR.fit(X_train, y_train)
y_test_hat = model_SVR.predict(X_test)
mse = mean_squared_error(y_test_hat, y_test)
print("The MSE of SVR is:", mse)

The MSE of SVR is: 206.59778382914428
