# Imports

In [3]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import metrics

# SVM 

* My from-scratch implementation of SVM is based on the Pegasos algorithm, whose pseudocode is given in the uploaded paper "PegasosMPB.pdf".
* With Pegasos, the learning rate is computed from the regularization parameter and the epoch number: $\eta_t = 1 / (\lambda t)$.
* To simplify the gradient calculation I used the simplification used by Christopher Bishop in "Pattern Recognition and Machine Learning": the bias/intercept b is absorbed into the weight vector w, and a column of 1s is appended to the data.
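* `lamba` (λ) is the regularization coefficient; it controls how margin violations (misclassifications) are traded off against a wide margin, i.e. soft vs. hard margin. A large λ regularizes more strongly and gives a softer margin (more training misclassifications tolerated), which can translate to better generalization on the testing data; a small λ penalizes violations relatively more and approaches a hard margin.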
* The algorithm uses (sub)gradient descent to compute the weights; the stopping criterion is the number of epochs.

In [315]:
class SVM:
    """Linear SVM trained with a Pegasos-style sub-gradient method.

    The bias is folded into the weight vector: a constant column of 1s is
    appended to the feature matrix, so the last weight acts as the intercept.
    Labels are expected to be encoded as +1 / -1.
    """

    def __init__(self):
        self.x = None  # training features with the intercept column appended (set by fit)
        self.y = None  # training labels as a 1-D array (set by fit)
        self.T = 0     # number of epochs requested in the last call to fit
        self.w = None  # weight vector, intercept last (set by fit / predict)

    @staticmethod
    def _with_intercept(data):
        """Return ``data`` as a 2-D float array with a trailing column of 1s.

        A new array is built, so the caller's DataFrame/array is left untouched.
        """
        features = np.asarray(data, dtype=float)
        if features.ndim != 2:
            raise ValueError("expected 2-D data of shape (n_samples, n_features)")
        return np.hstack([features, np.ones((features.shape[0], 1))])

    def cost_fn(self, w, x, y):
        """Return the descent direction of the hinge loss for one sample.

        Despite its name this does not return a cost: it returns ``y * x``
        when the sample violates the margin (``y * <w, x> < 1``) and a zero
        vector otherwise.

        Parameters
        ----------
        w : array-like, weight vector (a legacy (1, d) array is flattened).
        x : array-like, one sample including its intercept entry.
        y : scalar label, +1 or -1.

        Returns
        -------
        numpy.ndarray
            Always a 1-D array with the same length as ``x``.
        """
        w = np.ravel(w)
        x = np.ravel(x)
        margin = y * np.dot(w, x)
        if margin < 1:  # sample is inside the margin or misclassified
            return y * x
        return np.zeros_like(x)

    def fit(self, data, target, epochs, lamba):
        """Learn the weight vector.

        Parameters
        ----------
        data : DataFrame or 2-D array of shape (n_samples, n_features).
            Not modified; the intercept column is added to an internal copy.
        target : Series or 1-D array of +1 / -1 labels.
        epochs : int, number of full passes over the data.
        lamba : float > 0, regularization coefficient.

        Returns
        -------
        numpy.ndarray
            1-D weight vector of length ``n_features + 1`` (intercept last).
            It is also stored on ``self.w``.

        Raises
        ------
        ValueError
            If ``lamba`` is not positive, ``epochs`` is negative, or ``data``
            and ``target`` disagree on the number of samples.
        """
        if lamba <= 0:
            raise ValueError("lamba must be positive")
        if epochs < 0:
            raise ValueError("epochs must be non-negative")

        features = self._with_intercept(data)
        labels = np.ravel(np.asarray(target))
        if labels.shape[0] != features.shape[0]:
            raise ValueError("data and target must have the same number of samples")

        self.x = features
        self.y = labels
        self.T = epochs

        w = np.zeros(features.shape[1])
        # Exactly `epochs` passes, numbered 1..epochs so the step size is
        # well defined from the first pass.
        for t in range(1, epochs + 1):
            lr = 1 / (lamba * t)
            shrink = 1 - lr * lamba  # constant within an epoch
            for sample, label in zip(features, labels):
                w = w * shrink + lr * self.cost_fn(w, sample, label)

        self.w = w
        return w

    def predict(self, xtest, w=None):
        """Predict labels for ``xtest``.

        Parameters
        ----------
        xtest : DataFrame or 2-D array of shape (n_samples, n_features).
            Not modified.
        w : array-like, optional
            Weight vector as returned by ``fit`` (a (1, d) array is also
            accepted). Defaults to the weights stored by the last ``fit``.

        Returns
        -------
        numpy.ndarray
            1-D float array of ``np.sign`` of the decision values, i.e.
            +1.0 / -1.0, or 0.0 for a point exactly on the boundary.

        Raises
        ------
        ValueError
            If no weights are available or their length does not match the
            number of features plus the intercept.
        """
        if w is None:
            if self.w is None:
                raise ValueError("no weights available: call fit() first or pass w")
            w = self.w
        weights = np.ravel(np.asarray(w, dtype=float))
        self.w = weights

        features = self._with_intercept(xtest)
        if features.shape[1] != weights.shape[0]:
            raise ValueError("xtest has a different number of features than the weights")
        # One matrix-vector product instead of a per-row Python loop.
        return np.sign(features.dot(weights))

# Dummy data to check SVM

In [274]:
# Encode the diagnosis labels as +1 (malignant, 'M') / -1 (benign, 'B').
diagnosis_map = {'M': 1, 'B': -1}

data = pd.read_csv('data.csv')
data = data.assign(diagnosis=data['diagnosis'].map(diagnosis_map))
# Drop the last and the first column by position
# (presumably an empty trailing column and a row id; verify against data.csv).
data = data.drop(columns=data.columns[[-1, 0]])
data.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [290]:
# Target: the encoded diagnosis column; features: every column after the first.
Y = data['diagnosis']
X = data.iloc[:, slice(1, None)]

In [327]:
# 70% training / 30% test; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=109,
)

In [328]:
# Train the from-scratch SVM on the training split.
v = SVM()
W = v.fit(X_train, y_train, epochs=5000, lamba=1e-7)

In [329]:
# Predict labels for the held-out rows using the learned weights.
ypred = v.predict(xtest=X_test, w=W)

In [330]:
# Fraction of held-out samples whose predicted label matches the true label.
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=ypred))

Accuracy: 0.935672514619883
