# Support Vector Machine

## Abstract
This notebook first trains a Support Vector Classifiers and a Linear Support Vector Classifier.

LinearSVC's performance was relatively poor, which is to be expected given the large number of features. 

The final model is based on a support vector classifer with following features:

## Import and Settings

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import itertools
from joblib import dump, load

In [3]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [4]:
import dask
import dask.dataframe as dd
import dask.array as da

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RandomizedSearchCV

In [None]:
df = dd.read_csv('prepared_ds.csv')

## Initialise Sample Dataframe

In [5]:
df_fast = df.sample(frac=0.01, random_state = 1).compute()

## Train-Test-Split

In [9]:
train, test, train_lbl, test_lbl = train_test_split(df_fast.loc[:, ~df_fast.columns.isin(['Unnamed: 0', "Label"])], df_fast["Label"], test_size=0.2, random_state=1)
X = df_fast.loc[:, ~df_fast.columns.isin(['Unnamed: 0', "Label"])]
y = df_fast["Label"]

## Models

### SVM

In [10]:
# Default SVC
clf = svm.SVC()
clf.fit(train, train_lbl)
score = clf.score(test, test_lbl)
score

0.9916749256689792

In [None]:
param_distributions = {
    "kernel" : ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'],
    "gamma" : ['scale', 'auto'],
    "degree" : list(range(1, 15)),
    "gamma" : [True, False]
}

svc = svm.SVC()

# run random grid search
grid_search_ABC = RandomizedSearchCV(svc, param_distributions=param_distributions, n_iter=4)
grid_search_ABC.fit(df_fast.loc[:, ~df_fast.columns.isin(['Unnamed: 0', "Label"])], df_fast["Label"])
pd.DataFrame.from_dict(grid_search_ABC.cv_results_)

#### Training the best model on the whole dataset
kernel: rbf, gamma: auto, degree: 9, shrinking: False

In [84]:
clf = svm.SVC(kernel="rbf", gamma="auto", degree="9")
clf.fit(train, train_lbl)
score = clf.score(test, test_lbl)
print("Score:", score)
print("Confusion Matrix")
confusion_matrix(test_lbl, clf.predict(test))
dump(clf, 'models/svc_9919392137429799.model') 

Score: 0.9919392137429799
Confusion Matrix


['models/svc_9919392137429799.joblib']

### LinearSVC

In [35]:
clf = LinearSVC()
clf.fit(train, train_lbl)
score = clf.score(test, test_lbl)
score

0.9802371541501976

In [52]:
penalty = ["l1", "l2"]
loss = ["hinge", "squared_hinge"]
dual = [True, False]
tol = [1e-2, 1e-3, 1e-4, 1e-5, 1e-6]

In [82]:
scores = []
for p, l, d, t in list(itertools.product(penalty, loss, dual, tol)):
    try:    
        clf = svm.LinearSVC(penalty=p, loss = l, dual = d, tol = t)
        clf.fit(train2, train_lbl2)
        score = clf.score(test2, test_lbl2)
    except:
        score = -1
    s = f"penalty: {p}, loss: {l}, dual: {d}, tol: {t}"
    scores.append([score, s])

In [55]:
scores.sort(reverse = True)
print(len(scores))
for score, string in scores[:10]:
    print(score, string)

40
0.9808959156785244 penalty: l1, loss: squared_hinge, dual: False, tol: 0.01
0.9802371541501976 penalty: l2, loss: squared_hinge, dual: True, tol: 1e-06
0.9802371541501976 penalty: l2, loss: squared_hinge, dual: True, tol: 1e-05
0.9802371541501976 penalty: l2, loss: squared_hinge, dual: True, tol: 0.01
0.9802371541501976 penalty: l2, loss: squared_hinge, dual: True, tol: 0.001
0.9802371541501976 penalty: l2, loss: squared_hinge, dual: True, tol: 0.0001
0.9795783926218709 penalty: l2, loss: squared_hinge, dual: False, tol: 1e-06
0.9795783926218709 penalty: l2, loss: squared_hinge, dual: False, tol: 1e-05
0.9795783926218709 penalty: l2, loss: squared_hinge, dual: False, tol: 0.001
0.9795783926218709 penalty: l2, loss: squared_hinge, dual: False, tol: 0.0001


### Top
0.9808959156785244 penalty: l1, loss: squared_hinge, dual: False, tol: 0.01

In [89]:
clf = svm.LinearSVC(penalty="l1", loss = "squared_hinge", dual = False, tol = 0.01)
clf.fit(train2, train_lbl2)
score = clf.score(test2, test_lbl2)
print("Score:", score)
print("Confusion Matrix")
confusion_matrix(test_lbl2, clf.predict(test2))
dump(clf, 'models/linear_svc_9777998017839445.model') 

Score: 0.9777337297654444
Confusion Matrix


['models/linear_svc_9777998017839445.joblib']