In [615]:
import pandas as pd
import numpy as np
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix

pd.set_option('display.max_colwidth', None)

In [616]:
email_df = pd.read_csv('spamassasin.csv')

In [617]:
email_df.shape

(6046, 3)

- Dropped one row due to body missing

In [618]:
email_df.isnull().sum()

Unnamed: 0    0
Body          1
Label         0
dtype: int64

In [619]:
email_df.dropna(subset=['Body'], inplace=True)
email_df.isnull().sum()

Unnamed: 0    0
Body          0
Label         0
dtype: int64

- Looking at the email body's

In [620]:
email_df['Body'].iloc[3]

"##################################################\r\n#                                                #\r\n#                 Adult Club                     #\r\n#           Offers FREE Membership               #\r\n#                                                #\r\n##################################################>>>>>  INSTANT ACCESS TO ALL SITES NOW\r\n>>>>>  Your User Name And Password is.\r\n>>>>>  User Name: zzzz@spamassassin.taint.org\r\n>>>>>  Password: 7603825 of the Best Adult Sites on the Internet for FREE!\r\n---------------------------------------\r\nNEWS 08/18/02\r\nWith just over 2.9 Million Members that signed up for FREE, Last month there were 721,184 New\r\nMembers. Are you one of them yet???\r\n---------------------------------------\r\nOur Membership FAQQ. Why are you offering free access to 5 adult membership sites for free?\r\nA. I have advertisers that pay me for ad space so you don't have to pay for membership.Q. Is it true my membership is for life?\r\nA. 

- Remove special characters

In [621]:
email_df['Label'].value_counts()
email_df['Body'] = email_df['Body'].str.lower()
#email_df['Body'] = email_df['Body'].str.replace(r'[^a-zA-Z]', '', regex=True)

In [622]:
vectorizer = CountVectorizer()

In [623]:
spam = email_df[email_df['Label'] == 1]
spam = spam['Body'].tolist()

ham = email_df[email_df['Label'] == 0]
ham = ham['Body'].tolist()

In [624]:
vectorizer = vectorizer.fit(spam)
vocab_series = pd.Series(vectorizer.vocabulary_)

vocab_series = vocab_series.reset_index(name='counts')
vocab_series[vocab_series['index'] != r'[^a-zA-Z]'].head(5)

Unnamed: 0,index,counts
0,save,21812
1,up,25152
2,to,24307
3,70,2646
4,on,18194


In [625]:
X = vectorizer.transform(email_df['Body'])
y = email_df['Label']
X_test, X_train, y_test, y_train = train_test_split(X, y, test_size = 0.2, train_size= 0.8)

In [626]:
svm = SVC()

param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [0.001, 0.01, 0.1, 1]
}
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=5)

best_svm = SVC(C=grid_search.best_params_['C'],
               gamma=grid_search.best_params_['gamma'])
               
best_svm.fit(X_train, y_train)

y_pred = best_svm.predict(X_test)
confusion_matrix = confusion_matrix(y_test, y_pred)

In [627]:
confusion_matrix

array([[3107,  211],
       [ 141, 1377]], dtype=int64)

- 3107 True positve
- 1377 True negative
- 141 False negative
- 211 False positve

- Overall does a pretty decent job at detecting whether or not an email is spam

In [628]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(f'accuracy: {accuracy}, precision: {precision}, recall: {recall}')

accuracy: 0.9272125723738627, precision: 0.8671284634760705, recall: 0.9071146245059288
