# SVM: Spam email detection system

## Step 1: Loading the dataset

In [5]:
import pandas as pd

total_data = pd.read_csv("../data/emails.csv")
total_data.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [6]:
print(total_data.shape)
total_data = total_data.drop_duplicates()
total_data = total_data.reset_index(inplace = False, drop = True)
total_data.shape

(5728, 2)


(5695, 2)

In [7]:
print(f"Spam: {len(total_data.loc[total_data.spam == 1])}")
print(f"No spam: {len(total_data.loc[total_data.spam == 0])}")

Spam: 1368
No spam: 4327


In [8]:
import regex as re

def preprocess_text(text):
    # Remove any character that is not a letter (a-z) or white space ( )
    text = re.sub(r'[^a-z ]', " ", text)
    
    # Remove white spaces
    text = re.sub(r'\s+[a-zA-Z]\s+', " ", text)
    text = re.sub(r'\^[a-zA-Z]\s+', " ", text)

    # Multiple white spaces into one
    text = re.sub(r'\s+', " ", text.lower())

    # Remove tags
    text = re.sub("&lt;/?.*?&gt;"," &lt;&gt; ", text)

    return text.split()

total_data["text"] = total_data["text"].apply(preprocess_text)
total_data.head()

Unnamed: 0,text,spam
0,"[ubject, naturally, irresistible, your, corpor...",1
1,"[ubject, the, stock, trading, gunslinger, fann...",1
2,"[ubject, unbelievable, new, homes, made, easy,...",1
3,"[ubject, color, printing, special, request, ad...",1
4,"[ubject, do, not, have, money, get, software, ...",1


In [9]:
from nltk import download
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
download("wordnet")
lemmatizer = WordNetLemmatizer()

download("stopwords")
stop_words = stopwords.words("english")

def lemmatize_text(words, lemmatizer = lemmatizer):
    tokens = [lemmatizer.lemmatize(word) for word in words]
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [word for word in tokens if len(word) > 3]
    return tokens

total_data["text"] = total_data["text"].apply(lemmatize_text)
total_data.head()

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,text,spam
0,"[ubject, naturally, irresistible, corporate, i...",1
1,"[ubject, stock, trading, gunslinger, fanny, me...",1
2,"[ubject, unbelievable, home, made, easy, wanti...",1
3,"[ubject, color, printing, special, request, ad...",1
4,"[ubject, money, software, software, compatibil...",1


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

tokens_list = total_data["text"]
tokens_list = [" ".join(tokens) for tokens in tokens_list]

vectorizer = TfidfVectorizer(max_features = 5000, max_df = 0.8, min_df = 5)
X = vectorizer.fit_transform(tokens_list).toarray()
y = total_data["spam"]

X[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [11]:
import pickle

with open('../models/tfidf.sav', 'wb') as pckl:
    pickle.dump(vectorizer, pckl)

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [13]:
from sklearn.svm import SVC

model = SVC(kernel = "linear", random_state = 42)
model.fit(X_train, y_train)

In [14]:
from sklearn.metrics import accuracy_score, f1_score

y_pred = model.predict(X_train)
accuracy_score(y_train, y_pred)
f1_score(y_train, y_pred)

0.9990662931839402

In [15]:
y_pred = model.predict(X_test)

accuracy_score(y_test, y_pred)
f1_score(y_test, y_pred)

0.9829351535836178

In [16]:
with open('../models/SVM.sav', 'wb') as pkl:
    pickle.dump(model, pkl)

In [17]:
import numpy as np
from sklearn.model_selection import GridSearchCV

hyperparams = {
    "C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    "kernel": ["linear", "poly", "rbf", "sigmoid", "precomputed’"],
    "degree": [1, 2, 3, 4, 5],
    "gamma": ["scale", "auto"]
}

# We initialize the random search
grid = GridSearchCV(model, hyperparams, scoring = "f1", cv = 5)
grid