## Importing the Libraries

In [1]:
import numpy as np 
import pandas as pd 
from sklearn.naive_bayes import GaussianNB 
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import CountVectorizer
import re 
import nltk 

## Importing the dataset

In [4]:
completeSpamAssassin = pd.read_csv('completeSpamAssassin.csv')
enronSpamSubset = pd.read_csv('enronSpamSubset.csv')
lingSpam = pd.read_csv('lingSpam.csv')
completeSpamAssassin.head()

Unnamed: 0.1,Unnamed: 0,Body,Label
0,0,\nSave up to 70% on Life Insurance.\nWhy Spend...,1
1,1,1) Fight The Risk of Cancer!\nhttp://www.adcli...,1
2,2,1) Fight The Risk of Cancer!\nhttp://www.adcli...,1
3,3,##############################################...,1
4,4,I thought you might like these:\n1) Slim Down ...,1


#### Dropping index columns

In [5]:
completeSpamAssassin.drop('Unnamed: 0',inplace=True,axis=1)
enronSpamSubset.drop(['Unnamed: 0','Unnamed: 0.1'],inplace=True,axis=1)
lingSpam.drop('Unnamed: 0',inplace=True,axis=1)

completeSpamAssassin.head()

Unnamed: 0,Body,Label
0,\nSave up to 70% on Life Insurance.\nWhy Spend...,1
1,1) Fight The Risk of Cancer!\nhttp://www.adcli...,1
2,1) Fight The Risk of Cancer!\nhttp://www.adcli...,1
3,##############################################...,1
4,I thought you might like these:\n1) Slim Down ...,1


#### Combining datasets

In [6]:
data = pd.concat([completeSpamAssassin, enronSpamSubset, lingSpam], axis=0)
data.head()

Unnamed: 0,Body,Label
0,\nSave up to 70% on Life Insurance.\nWhy Spend...,1
1,1) Fight The Risk of Cancer!\nhttp://www.adcli...,1
2,1) Fight The Risk of Cancer!\nhttp://www.adcli...,1
3,##############################################...,1
4,I thought you might like these:\n1) Slim Down ...,1


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18651 entries, 0 to 2604
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Body    18650 non-null  object
 1   Label   18651 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 437.1+ KB


#### Dropping any empty columns

From the above info, we notice that there is one NULL value which needs to be removed before data processing.

In [8]:
data.dropna(inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18650 entries, 0 to 2604
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Body    18650 non-null  object
 1   Label   18650 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 437.1+ KB


#### Using list comprehension to remove links from the dataset 

In [9]:
remove_link = []
for body in data["Body"]:
    remove_link.append(re.sub(r'http\S+', '', body))

remove_link[1]

"1) Fight The Risk of Cancer!\n Slim Down - Guaranteed to lose 10-12 lbs in 30 days\n Get the Child Support You Deserve - Free Legal Advice\n Join the Web's Fastest Growing Singles Community\n Start Your Private Photo Album Online!\n a Wonderful Day,\nOffer Manager\nPrizeMamaIf you wish to leave this list please use the link below.\n\n-- \nIrish Linux Users' Group: ilug@linux.ie\n for (un)subscription information.\nList maintainer: listmaster@linux.ie"

#### Removing special characters from the emails

In [10]:
clean_dataset = []
for body in remove_link:
    clean_dataset.append(re.sub('[^a-zA-Z0-9 ]', '', body))
    
clean_dataset[1]

'1 Fight The Risk of Cancer Slim Down  Guaranteed to lose 1012 lbs in 30 days Get the Child Support You Deserve  Free Legal Advice Join the Webs Fastest Growing Singles Community Start Your Private Photo Album Online a Wonderful DayOffer ManagerPrizeMamaIf you wish to leave this list please use the link below Irish Linux Users Group iluglinuxie for unsubscription informationList maintainer listmasterlinuxie'

#### To make text easier to analyse, we convert all characters to lowercase.

In [11]:
lower_dataset = []
for body in clean_dataset:
    lower_dataset.append(body.lower())
    
lower_dataset[1]

'1 fight the risk of cancer slim down  guaranteed to lose 1012 lbs in 30 days get the child support you deserve  free legal advice join the webs fastest growing singles community start your private photo album online a wonderful dayoffer managerprizemamaif you wish to leave this list please use the link below irish linux users group iluglinuxie for unsubscription informationlist maintainer listmasterlinuxie'

#### To clean the dataset further, we will need to split each email into a list of words. We can use the Natural Language Toolkit (NLTK) library

In [13]:
nltk.download('punkt')
tokens = []
for body in lower_dataset:
    tokens.append(nltk.word_tokenize(body))
    
tokens[0][:10]

[nltk_data] Downloading package punkt to /Users/eyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['save',
 'up',
 'to',
 '70',
 'on',
 'life',
 'insurancewhy',
 'spend',
 'more',
 'than']

Many words come in different forms such as plural and singular. Lemmaitization can be used which is defined as to "sort so as to group together inflected or variant forms of the same word." We can use the NLTK library again to lemmatize the words in the dataset

In [18]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemma = WordNetLemmatizer()

lemmatized = []
for word_list in tokens:
    temp_list = []
    for word in word_list:
        temp_list.append(lemma.lemmatize(word))
    lemmatized.append(temp_list)

[nltk_data] Downloading package wordnet to /Users/eyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Now we want to filter out the useless words that add no information to each email (e.g i, you, that, and, this). These are called stop words. Again the NLTK library can help us filter the dataset to remove stopwords.

These are the stopwords we will be using

In [20]:
nltk.download('stopwords')
from nltk.corpus import stopwords
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /Users/eyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Filtering out the stopwords and lemmatizing the emails

In [21]:
without_stopwords = []
for word_list in lemmatized:
    temp_list = []
    for word in word_list:
        if word not in stopwords.words('english'):
            temp_list.append(word)
    without_stopwords.append(temp_list)
    
without_stopwords[0]

['save', '70', 'life', 'insurancewhy', 'spend', 'tolife', 'quote', 'savingsensuring', 'family', 'financial', 'security', 'important', 'life', 'quote', 'saving', 'make', 'buying', 'life', 'insurance', 'simple', 'affordable', 'provide', 'free', 'access', 'best', 'company', 'lowest', 'rateslife', 'quote', 'saving', 'fast', 'easy', 'save', 'money', 'let', 'u', 'help', 'get', 'started', 'best', 'value', 'country', 'new', 'coverage', 'save', 'hundred', 'even', 'thousand', 'dollar', 'requesting', 'free', 'quote', 'lifequote', 'saving', 'service', 'take', 'le', '5', 'minute', 'complete', 'shop', 'compare', 'save', '70', 'type', 'life', 'insurance', 'click', 'free', 'quoteprotecting', 'family', 'best', 'investment', 'youll', 'ever', 'makeif', 'receipt', 'email', 'error', 'andor', 'wish', 'removed', 'list', 'please', 'click', 'type', 'remove', 'reside', 'state', 'prohibits', 'email', 'solicitation', 'insurance', 'please', 'disregard', 'email']


#### Concatenating the cleaned text with the spam label column from the original dataset.

In [22]:
df = [' '.join(text) for text in without_stopwords]
df = pd.DataFrame(df)
label = pd.DataFrame(data["Label"])
df.reset_index(drop=True, inplace=True)
label.reset_index(drop=True, inplace=True)
clean = pd.concat([df, label], ignore_index=True, axis=1)
clean.rename(columns={0: 'Body', 1: 'Label'}, inplace=True)
clean

Unnamed: 0,Body,Label
0,save 70 life insurancewhy spend tolife quote s...,1
1,1 fight risk cancer slim guaranteed lose 1012 ...,1
2,1 fight risk cancer slim guaranteed lose 1012 ...,1
3,adult club offer free membership instant acces...,1
4,thought might like these1 slim guaranteed lose...,1
...,...,...
18645,subject computationally intensive method quant...,0
18646,subject book survey american linguistics publi...,0
18647,subject wecol 98 western conference linguistic...,0
18648,subject euralex 98 revised programme euralex 9...,0


#### Saving the cleaned dataset

In [63]:
clean.to_csv("cleaned_email_data.csv", sep=',', encoding='utf-8')