# SVM

##### Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.utils import  shuffle

#### Model Training: Cost Function

In [3]:
def compute_cost(W, X, Y, reg_strength=None):
    """Return the soft-margin SVM cost for the weight vector ``W``.

    The value is ``0.5 * W.W + C * mean(max(0, 1 - Y * (X @ W)))`` where ``C``
    is the regularization strength.

    Parameters
    ----------
    W : numpy.ndarray
        1-D weight vector, one entry per column of ``X``.
    X : numpy.ndarray
        2-D feature matrix, one sample per row.
    Y : numpy.ndarray
        1-D label vector aligned with the rows of ``X`` (the caller in this
        notebook encodes labels as +1.0 / -1.0).
    reg_strength : float, optional
        Multiplier applied to the averaged hinge loss. When omitted, the
        module-level ``regularization_strength`` is used, so existing
        three-argument calls behave exactly as before.

    Returns
    -------
    float
        The regularized hinge-loss cost.
    """
    if reg_strength is None:
        reg_strength = regularization_strength

    N = X.shape[0]
    # Samples beyond the margin (1 - y * score < 0) contribute no hinge loss.
    distances = np.maximum(0, 1 - Y * np.dot(X, W))
    hinge_loss = reg_strength * (np.sum(distances) / N)
    return 1 / 2 * np.dot(W, W) + hinge_loss

#### Cost Gradient

In [4]:
def calculate_cost_gradient(W, X_batch, Y_batch, reg_strength=None):
    """Return the average (sub)gradient of the SVM cost over a batch.

    For each sample the per-sample gradient is ``W`` when the margin is
    satisfied (``1 - y * (x @ W) <= 0``) and ``W - C * y * x`` otherwise; the
    result is the mean of those per-sample gradients.

    Parameters
    ----------
    W : array-like
        1-D weight vector.
    X_batch : array-like
        Either a single sample (1-D) or a batch of samples (2-D, one per row).
    Y_batch : scalar or array-like
        The label for a single sample, or a 1-D array of labels for a batch.
        Any numeric scalar type is accepted (not only ``numpy.float64``).
    reg_strength : float, optional
        Regularization strength ``C``. When omitted, the module-level
        ``regularization_strength`` is used, so existing three-argument calls
        behave exactly as before.

    Returns
    -------
    numpy.ndarray
        A new 1-D array with the same length as ``W``.
    """
    if reg_strength is None:
        reg_strength = regularization_strength

    W = np.asarray(W, dtype=float)
    # Promote a single (x, y) pair to a batch of one regardless of the scalar's
    # concrete type; arrays that are already batch-shaped pass through as-is.
    Y = np.atleast_1d(np.asarray(Y_batch, dtype=float))
    X = np.atleast_2d(np.asarray(X_batch, dtype=float))

    margins = 1 - Y * np.dot(X, W)
    violating = margins > 0  # only these samples contribute a hinge term

    # Sum of C * y_i * x_i over margin-violating samples (zeros if there are none).
    correction = ((reg_strength * Y[violating])[:, np.newaxis] * X[violating]).sum(axis=0)

    # mean_i(W - [violating_i] * C * y_i * x_i) == W - correction / n
    return W - correction / len(Y)

#### Gradient Descent

In [5]:
def sgd(features, outputs, max_epochs=5000, cost_threshold=0.01, lr=None, random_state=None):
    """Train SVM weights with stochastic gradient descent.

    Each epoch shuffles the data and applies one weight update per sample.
    The cost is evaluated on epochs 1, 2, 4, 8, ... and on the final epoch;
    training stops early once the cost changes by less than
    ``cost_threshold`` (as a fraction of the previously recorded cost).

    Parameters
    ----------
    features : numpy.ndarray
        2-D feature matrix, one sample per row.
    outputs : numpy.ndarray
        1-D label array aligned with the rows of ``features``.
    max_epochs : int, optional
        Upper bound for the epoch counter. The loop runs epochs
        ``1 .. max_epochs - 1`` (the original behaviour), so the default of
        5000 performs at most 4999 passes over the data.
    cost_threshold : float, optional
        Relative cost change below which training stops.
    lr : float, optional
        Step size of each update. When omitted, the module-level
        ``learning_rate`` is used, as before.
    random_state : int or numpy.random.RandomState, optional
        Seed (or generator) for the per-epoch shuffle. ``None`` keeps the
        original unseeded behaviour; pass an int for reproducible runs.

    Returns
    -------
    numpy.ndarray
        The learned 1-D weight vector.
    """
    if lr is None:
        lr = learning_rate

    # Build ONE generator up front: handing the same int seed to shuffle() on
    # every epoch would replay the identical permutation each time.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    weights = np.zeros(features.shape[1])
    nth = 0
    prev_cost = float("inf")
    last_epoch = max_epochs - 1

    for epoch in range(1, max_epochs):
        # shuffle to prevent repeating update cycles
        X, Y = shuffle(features, outputs, random_state=rng)
        for x, y in zip(X, Y):
            gradient = calculate_cost_gradient(weights, x, y)
            weights = weights - (lr * gradient)

        # convergence check on every 2^nth epoch and on the last epoch
        if epoch == 2 ** nth or epoch == last_epoch:
            cost = compute_cost(weights, features, outputs)
            print("Epoch is: {} and Cost is: {}".format(epoch, cost))
            # stop once the cost has improved by less than the threshold;
            # the first check never triggers because prev_cost starts at inf
            if abs(prev_cost - cost) < cost_threshold * prev_cost:
                return weights
            prev_cost = cost
            nth += 1
    return weights

In [6]:
def init():
    """Load ``./data.csv``, train the SVM with ``sgd`` and print test metrics.

    Steps: read the CSV, drop its first and last columns, map the
    ``diagnosis`` labels ('M' -> 1.0, 'B' -> -1.0), min-max scale the
    features, append a constant ``intercept`` column, split 80/20 with a fixed
    seed, train, and print accuracy, recall and precision on the test split.

    Returns
    -------
    None
        Results are reported via ``print`` only.
    """
    # read data in pandas (pd) data frame
    data = pd.read_csv('./data.csv')

    # drop last column (extra column added by pd)
    # and unnecessary first column (id)
    data = data.drop(data.columns[[-1, 0]], axis=1)

    print("applying feature engineering...")
    # convert categorical labels to numbers
    diag_map = {'M': 1.0, 'B': -1.0}
    data['diagnosis'] = data['diagnosis'].map(diag_map)

    # put features & outputs in different data frames
    Y = data.loc[:, 'diagnosis']
    X = data.iloc[:, 1:]

    # normalize data for better convergence and to prevent overflow
    X_normalized = MinMaxScaler().fit_transform(X.values)
    X = pd.DataFrame(X_normalized)

    # insert 1 in every row for intercept b
    X.insert(loc=len(X.columns), column='intercept', value=1)

    # split data into train and test set
    print("splitting dataset into train and test sets...")
    X_train, X_test, y_train, y_test = tts(X, Y, test_size=0.2, random_state=42)

    # train the model
    print("training started...")
    W = sgd(X_train.to_numpy(), y_train.to_numpy())
    print("training finished.")
    print("weights are: {}".format(W))

    # testing the model
    print("testing the model...")
    # One matrix-vector product predicts every test row at once; the sign of
    # the score is the predicted class (+1 / -1).
    y_test_predicted = np.sign(np.dot(X_test.to_numpy(), W))

    print("accuracy on test dataset: {}".format(accuracy_score(y_test, y_test_predicted)))
    print("recall on test dataset: {}".format(recall_score(y_test, y_test_predicted)))
    # precision must come from precision_score, not recall_score
    print("precision on test dataset: {}".format(precision_score(y_test, y_test_predicted)))


# set hyper-parameters and call init
# Both values are module-level globals and must be defined before init() runs:
# - regularization_strength is read by compute_cost and calculate_cost_gradient,
#   where it multiplies the hinge-loss term.
# - learning_rate is read by sgd, where it scales each weight update.
regularization_strength = 10000
learning_rate = 0.000001
# Runs the full pipeline: load ./data.csv, train with sgd, print test metrics.
init()

applying feature engineering...
splitting dataset into train and test sets...
training started...
Epoch is: 1 and Cost is: 5299.811766285014
Epoch is: 2 and Cost is: 3432.5237285774215
Epoch is: 4 and Cost is: 2575.9757246872255
Epoch is: 8 and Cost is: 1878.0861436040977
Epoch is: 16 and Cost is: 1517.605023904006
Epoch is: 32 and Cost is: 1209.2819971388546
Epoch is: 64 and Cost is: 960.1610719880528
Epoch is: 128 and Cost is: 803.0949163360799
Epoch is: 256 and Cost is: 701.2496681676216
Epoch is: 512 and Cost is: 652.1509196853154
Epoch is: 1024 and Cost is: 624.8175291207651
Epoch is: 2048 and Cost is: 612.4443187133793
Epoch is: 4096 and Cost is: 605.0865411077752
Epoch is: 4999 and Cost is: 604.4464847297797
training finished.
weights are: [ 1.33861286  0.84401111  1.1356027   2.17092534 -1.23583805 -3.23432162
  3.28187145  6.8282615  -0.44170528  0.11476862  5.67944927 -1.91044269
  3.26544466  3.76707881  1.66022569 -2.43126177 -1.75177105  0.81739529
 -1.96816379 -1.85807654