In [2]:
from os import walk
from os.path import join

import pandas as pd
import matplotlib.pyplot as plt

import nltk
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from bs4 import BeautifulSoup
import numpy as np

from sklearn.model_selection import train_test_split

%matplotlib inline

In [3]:
# Extracting email bodies:
def email_body_generator(path):
    
    for root, dirnames, filenames in walk(path):
        for file_name in filenames:
            
            filepath = join(root, file_name)
            
            stream = open(filepath, encoding='latin-1')

            is_body = False
            lines = []

            for line in stream:
                if is_body:
                    lines.append(line)
                elif line == '\n':
                    is_body = True

            stream.close()

            email_body = '\n'.join(lines)
            
            yield file_name, email_body

In [4]:
def df_from_directory(path, classification):
    rows = []
    row_names = []
    
    for file_name, email_body in email_body_generator(path):
        rows.append({'MESSAGE': email_body, 'CATEGORY': classification})
        row_names.append(file_name)
        
    return pd.DataFrame(rows, index=row_names)

In [5]:
spam_1= "D:\Courses\Projects\Spam emails Classification\spam_assassin_corpus\spam_1"
spam_2 = "D:\Courses\Projects\Spam emails Classification\spam_assassin_corpus\spam_2"

spam_emails = df_from_directory(spam_1, 1)
spam_emails = spam_emails.append(df_from_directory(spam_2, 1))
spam_emails.head()

Unnamed: 0,MESSAGE,CATEGORY
00001.7848dde101aa985090474a91ec93fcf0,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Tr...",1
00002.d94f1b97e48ed3b553b3508d116e6a09,1) Fight The Risk of Cancer!\n\nhttp://www.adc...,1
00003.2ee33bc6eacdb11f38d052c44819ba6c,1) Fight The Risk of Cancer!\n\nhttp://www.adc...,1
00004.eac8de8d759b7e74154f142194282724,##############################################...,1
00005.57696a39d7d84318ce497886896bf90d,I thought you might like these:\n\n1) Slim Dow...,1


In [6]:
ham_emails_1 = "D:\Courses\Projects\Spam emails Classification\spam_assassin_corpus\easy_ham_1"
ham_emails_2 = "D:\Courses\Projects\Spam emails Classification\spam_assassin_corpus\easy_ham_2"
ham_emails = df_from_directory(ham_emails_1, 0)
ham_emails = ham_emails.append(df_from_directory(ham_emails_2, 0))
ham_emails.shape

(3901, 2)

In [7]:
data = pd.concat([spam_emails, ham_emails])
print('Shape of entire dataframe is ', data.shape)
data.head()

Shape of entire dataframe is  (5799, 2)


Unnamed: 0,MESSAGE,CATEGORY
00001.7848dde101aa985090474a91ec93fcf0,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Tr...",1
00002.d94f1b97e48ed3b553b3508d116e6a09,1) Fight The Risk of Cancer!\n\nhttp://www.adc...,1
00003.2ee33bc6eacdb11f38d052c44819ba6c,1) Fight The Risk of Cancer!\n\nhttp://www.adc...,1
00004.eac8de8d759b7e74154f142194282724,##############################################...,1
00005.57696a39d7d84318ce497886896bf90d,I thought you might like these:\n\n1) Slim Dow...,1


In [8]:
# check if any message bodies are null
data['MESSAGE'].isnull().values.any()

False

In [9]:
# check if there are empty emails (string length zero)
(data.MESSAGE.str.len() == 0).any()

True

In [10]:
(data.MESSAGE.str.len() == 0).sum()

3

In [11]:
data[data.MESSAGE.str.len() == 0].index

Index(['cmds', 'cmds', 'cmds'], dtype='object')

In [12]:
data.drop(['cmds'], inplace=True)

In [13]:
(data.MESSAGE.str.len() == 0).sum()

0

In [14]:
# Adding document IDs to track emails:

document_ids = range(0, len(data.index))
data['DOC_ID'] = document_ids

data['FILE_NAME'] = data.index  # file names were index so created a columns for file names before changing index to doc_ids
data.set_index('DOC_ID', inplace=True)
data.head()

Unnamed: 0_level_0,MESSAGE,CATEGORY,FILE_NAME
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Tr...",1,00001.7848dde101aa985090474a91ec93fcf0
1,1) Fight The Risk of Cancer!\n\nhttp://www.adc...,1,00002.d94f1b97e48ed3b553b3508d116e6a09
2,1) Fight The Risk of Cancer!\n\nhttp://www.adc...,1,00003.2ee33bc6eacdb11f38d052c44819ba6c
3,##############################################...,1,00004.eac8de8d759b7e74154f142194282724
4,I thought you might like these:\n\n1) Slim Dow...,1,00005.57696a39d7d84318ce497886896bf90d


In [15]:
data.to_json("D:\Courses\Projects\Spam emails Classification\email-text-data.json")

## Text Pre-Processing:

In [16]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\netra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\netra\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [18]:
def clean_msg(message, stemmer=PorterStemmer(), 
                 stop_words=set(stopwords.words('english'))):
    
    # Remove HTML tags
    soup = BeautifulSoup(message, 'html.parser')
    cleaned_text = soup.get_text()
    
    # Converts to Lower Case and splits up the words
    words = word_tokenize(cleaned_text.lower())
    
    filtered_words = []
    
    for word in words:
        # Removes the stop words and punctuation
        if word not in stop_words and word.isalpha():
            filtered_words.append(stemmer.stem(word))
    
    return filtered_words

In [19]:
%%time
nested_list = data.MESSAGE.apply(clean_msg)

" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.


Wall time: 34.7 s


In [49]:
nested_list.head()

DOC_ID
0    [save, life, insur, spend, life, quot, save, g...
1    [fight, risk, cancer, http, slim, guarante, lo...
2    [fight, risk, cancer, http, slim, guarante, lo...
3    [adult, club, offer, free, membership, instant...
4    [thought, might, like, slim, guarante, lose, l...
Name: MESSAGE, dtype: object

In [20]:
doc_ids_spam = data[data.CATEGORY == 1].index
doc_ids_ham = data[data.CATEGORY == 0].index

In [21]:
nested_list_ham = nested_list.loc[doc_ids_ham]
nested_list_spam = nested_list.loc[doc_ids_spam]

In [22]:
flat_list_ham = [item for sublist in nested_list_ham for item in sublist]
normal_words = pd.Series(flat_list_ham).value_counts()

normal_words.shape[0] # total number of unique words in the non-spam messages

20815

In [23]:
flat_list_spam = [item for sublist in nested_list_spam for item in sublist]
spammy_words = pd.Series(flat_list_spam).value_counts()

spammy_words.shape[0] # total number of unique words in the spam messages

13242

In [24]:
normal_words[:10]

http      7563
use       3633
list      2880
one       2373
get       2286
mail      2255
would     2003
like      1931
messag    1849
work      1800
dtype: int64

In [25]:
flat_nested_list = [item for sublist in nested_list for item in sublist]

In [26]:
unique_words = pd.Series(flat_nested_list).value_counts()
print('No. of unique words', unique_words.shape[0])
unique_words.tail()

No. of unique words 27334


removefromlist    1
lorri             1
offbeat           1
expressway        1
ict               1
dtype: int64

In [27]:
frequent_words = unique_words[0:2500]
print('Most common words: \n', frequent_words[:10])

Most common words: 
 http     10660
use       5019
list      4851
email     4367
get       4200
mail      3983
one       3907
free      3202
time      3042
work      2883
dtype: int64


In [28]:
word_ids = list(range(0, 2500))
vocab = pd.DataFrame({'VOCAB_WORD': frequent_words.index.values}, index=word_ids)
vocab.index.name = 'WORD_ID'
vocab.head()

Unnamed: 0_level_0,VOCAB_WORD
WORD_ID,Unnamed: 1_level_1
0,http
1,use
2,list
3,email
4,get


In [35]:
pd.DataFrame(unique_words).to_csv(r"D:\Courses\Projects\Spam emails Classification\unique_words.csv", encoding = 'utf-8')

In [37]:
clean_email_lengths = []
for sublist in nested_list:
    clean_email_lengths.append(len(sublist))

In [39]:
max(clean_email_lengths) # Longest email length

7671

In [41]:
data.at[np.argmax(clean_email_lengths), 'MESSAGE']



In [43]:
word_columns_df = pd.DataFrame(nested_list.tolist())
word_columns_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7661,7662,7663,7664,7665,7666,7667,7668,7669,7670
0,save,life,insur,spend,life,quot,save,g,famili,financi,...,,,,,,,,,,
1,fight,risk,cancer,http,slim,guarante,lose,lb,day,http,...,,,,,,,,,,
2,fight,risk,cancer,http,slim,guarante,lose,lb,day,http,...,,,,,,,,,,
3,adult,club,offer,free,membership,instant,access,site,user,name,...,,,,,,,,,,
4,thought,might,like,slim,guarante,lose,lb,day,http,fight,...,,,,,,,,,,


In [44]:
word_columns_df.shape

(5796, 7671)

## Train Test Split:

In [45]:
X_train, X_test, y_train, y_test = train_test_split(word_columns_df, data.CATEGORY,
                                                   test_size=0.3, random_state=42)

In [46]:
print('No of training samples', X_train.shape[0])
print('Fraction of training set', X_train.shape[0] / word_columns_df.shape[0])

No of training samples 4057
Fraction of training set 0.6999654934437544


In [47]:
X_train.index.name = X_test.index.name = 'DOC_ID'
X_train.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,7661,7662,7663,7664,7665,7666,7667,7668,7669,7670
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4844,ye,inde,agent,directori,verita,cd,unix,subdirectori,file,call,...,,,,,,,,,,
4727,problem,come,tri,instal,harddissssk,like,alreadi,mount,http,yahoo,...,,,,,,,,,,
5022,origin,messag,date,mon,aug,chad,norwood,sven,cc,subject,...,,,,,,,,,,
3504,inlin,folk,sever,major,internet,outag,morn,across,major,provid,...,,,,,,,,,,
3921,url,http,date,bath,chronicl,,,,,,...,,,,,,,,,,


In [48]:
y_train.head()

DOC_ID
4844    0
4727    0
5022    0
3504    0
3921    0
Name: CATEGORY, dtype: int64

In [50]:
word_index = pd.Index(vocab.VOCAB_WORD)
type(word_index[3])

str

In [51]:
def make_sparse_matrix(df, indexed_words, labels):

    nr_rows = df.shape[0]
    nr_cols = df.shape[1]
    word_set = set(indexed_words)
    dict_list = []
    
    for i in range(nr_rows):
        for j in range(nr_cols):
            
            word = df.iat[i, j]
            if word in word_set:
                doc_id = df.index[i]
                word_id = indexed_words.get_loc(word)
                category = labels.at[doc_id]
                
                item = {'LABEL': category, 'DOC_ID': doc_id,
                       'OCCURENCE': 1, 'WORD_ID': word_id}
                
                dict_list.append(item)
    
    return pd.DataFrame(dict_list)

In [52]:
%%time
sparse_train_df = make_sparse_matrix(X_train, word_index, y_train)

Wall time: 12min 25s


In [53]:
train_grouped = sparse_train_df.groupby(['DOC_ID', 'WORD_ID', 'LABEL']).sum()
train_grouped.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,OCCURENCE
DOC_ID,WORD_ID,LABEL,Unnamed: 3_level_1
0,2,1,1
0,3,1,2
0,4,1,1
0,7,1,3
0,11,1,1


In [54]:
train_grouped = train_grouped.reset_index()
train_grouped.head()

Unnamed: 0,DOC_ID,WORD_ID,LABEL,OCCURENCE
0,0,2,1,1
1,0,3,1,2
2,0,4,1,1
3,0,7,1,3
4,0,11,1,1


In [55]:
train_grouped.shape

(258371, 4)

In [57]:
np.savetxt(r"D:\Courses\Projects\Spam emails Classification\training-data.txt", train_grouped, fmt='%d')

In [58]:
%%time
sparse_test_df = make_sparse_matrix(X_test, word_index, y_test)

Wall time: 4min 37s


In [59]:
test_grouped = sparse_test_df.groupby(['DOC_ID', 'WORD_ID', 'LABEL']).sum().reset_index()
test_grouped.head()

Unnamed: 0,DOC_ID,WORD_ID,LABEL,OCCURENCE
0,8,2,1,1
1,8,3,1,4
2,8,4,1,2
3,8,5,1,1
4,8,6,1,2


In [60]:
np.savetxt(r"D:\Courses\Projects\Spam emails Classification\test-data.txt", test_grouped, fmt='%d')