# SVM Classifier

## 1.1 Import Python libraries and Dataset

### We will use SVM modules from the sklearn library. I will also import the 'train_test_split' from the module_selection package from sklearn, and numpy and pandas for data analysis

In [133]:
import numpy as np
from matplotlib import pyplot
import pandas as pd
from sklearn.svm import SVC
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV 
from sklearn.preprocessing import scale
from sklearn.metrics import accuracy_score

### First, we imported the dataset from the file named 'spambase.data' and readjusted it by importing column names and adding them as the column names for the dataframe

In [89]:
data = np.genfromtxt('spambase.data', delimiter=',')
df = pd.DataFrame(data)
names = pd.read_csv('spamnames.txt')
df.columns = names.columns
df

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,is_spam
0,0.00,0.64,0.64,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.000,0.000,0.0,0.778,0.000,0.000,3.756,61.0,278.0,1.0
1,0.21,0.28,0.50,0.0,0.14,0.28,0.21,0.07,0.00,0.94,...,0.000,0.132,0.0,0.372,0.180,0.048,5.114,101.0,1028.0,1.0
2,0.06,0.00,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.010,0.143,0.0,0.276,0.184,0.010,9.821,485.0,2259.0,1.0
3,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.137,0.0,0.137,0.000,0.000,3.537,40.0,191.0,1.0
4,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.135,0.0,0.135,0.000,0.000,3.537,40.0,191.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4596,0.31,0.00,0.62,0.0,0.00,0.31,0.00,0.00,0.00,0.00,...,0.000,0.232,0.0,0.000,0.000,0.000,1.142,3.0,88.0,0.0
4597,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.000,0.000,0.0,0.353,0.000,0.000,1.555,4.0,14.0,0.0
4598,0.30,0.00,0.30,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.102,0.718,0.0,0.000,0.000,0.000,1.404,6.0,118.0,0.0
4599,0.96,0.00,0.00,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.000,0.057,0.0,0.000,0.000,0.000,1.147,5.0,78.0,0.0


## 1.2 Linear Kernel

### We will begin by using the 'train_test_split' function to split the data into training and testing sets for both the predictor variables (X) and the response variable (y)

In [130]:
#Let X be the predictor variables & y be the response variable
X = df.iloc[:,0:57]
y = df['is_spam']
X_train, X_test, y_train, y_test = train_test_split(X,y , test_size=0.3, random_state=42)
#Use the scale function on predictor variables (since SVM takes in scaled datasets as input)
X_test_scaled = scale(X_test)
X_train_scaled = scale(X_train)

### Next, I will use a linear kernel for my SVM model, and I will use 'accuracy_score' to determine the accuracy of the SVM model. 

In [131]:
linear_clf_svm = svm.SVC(kernel='linear')
linear_clf_svm.fit(X_train_scaled, y_train)

predicted_clf_svm = linear_clf_svm.predict(X_test_scaled)
accuracy_score(y_test, predicted_clf_svm)

0.9225199131064447

### The accuracy score was 0.92, which indicates that the SVM model is relatively accurate in making spam predictions, I will now use GridSearchCV to find the optimal value for C for the SVM model with a linear kernel

In [139]:
param_grid = {'C': [0.5,1,10,100]}
optimal_parameters = GridSearchCV(SVC(), param_grid, scoring='accuracy')
optimal_parameters.fit(X_train_scaled, y_train)
optimal_parameters.best_params_

{'C': 1}

### We find out that the optimal C value is 1, then we plug in the optimal c value in our SVM classifier

In [151]:
linear_clf_svm = svm.SVC(kernel='linear', C=1)
linear_clf_svm.fit(X_train_scaled, y_train)

predicted_clf_svm = linear_clf_svm.predict(X_test_scaled)
accuracy_score(y_test, predicted_clf_svm)

0.9225199131064447

### We get an accuracy score that is identical to the former, since SVMs are fairly accurate in their predictions and the accuracy score was quite large to begin with

## 1.3 Quadratic Kernel

In [143]:
quadratic_clf_svm = svm.SVC(kernel='poly')
quadratic_clf_svm.fit(X_train_scaled, y_train)

predicted_clf_svm = quadratic_clf_svm.predict(X_test_scaled)
accuracy_score(y_test, predicted_clf_svm)

0.7545257060101376

### The accuracy score is 0.75, while it is not a low score, it is much lower than the accuracy of the SVM that has a linear kernal, next, we will have to find an optimal C, which might increase the accuracy score

In [144]:
param_grid = {'C': np.arange( 1, 100+1, 1 ).tolist()}
optimal_parameters = GridSearchCV(SVC(), param_grid, scoring='accuracy')
optimal_parameters.fit(X_train_scaled, y_train)
optimal_parameters.best_params_

{'C': 2}

In [152]:
quadratic_clf_svm = svm.SVC(kernel='poly', C=2)
quadratic_clf_svm.fit(X_train_scaled, y_train)

predicted_clf_svm = quadratic_clf_svm.predict(X_test_scaled)
accuracy_score(y_test, predicted_clf_svm)

0.7892831281679942

### when we used the optimal c value obtained (c=2), we get an imporved accuracy score of 0.79

In [153]:
quadratic_clf_svm = svm.SVC(kernel='poly', C=50)
quadratic_clf_svm.fit(X_train_scaled, y_train)

predicted_clf_svm = quadratic_clf_svm.predict(X_test_scaled)
accuracy_score(y_test, predicted_clf_svm)

0.9065894279507604

### However, when the c value is increased (to c=50), the accuracy score increases significantly, becoming 0.907

## 1.4 Radial Basis Function Kernel

In [154]:
quadratic_clf_svm = svm.SVC(kernel='rbf')
quadratic_clf_svm.fit(X_train_scaled, y_train)

predicted_clf_svm = quadratic_clf_svm.predict(X_test_scaled)
accuracy_score(y_test, predicted_clf_svm)

0.9362780593772628

### The accuracy score obtained with a radial basis function kernel is 0.936, which makes rbf the kernel with the largest svm score in comparison to the former two

In [159]:
quadratic_clf_svm = svm.SVC(kernel='rbf', C=10)
quadratic_clf_svm.fit(X_train_scaled, y_train)

predicted_clf_svm = quadratic_clf_svm.predict(X_test_scaled)
accuracy_score(y_test, predicted_clf_svm)

0.939898624185373

### When we select C=10, the accuracy score increases slightly (from 0.936 to 0.94), but overall, the accuracy score is very high, regardless of the C value

In [161]:
from tabulate import tabulate
print(tabulate([['Linear', 1], ['Quadratic', 50], ['RBF', 10]], headers=['Kernel', 'C-value']))


Kernel       C-value
---------  ---------
Linear             1
Quadratic         50
RBF               10
