# Notebook imports

In [1]:
from os import walk    #to use walk from system os
from os.path import join

import pandas as pd

import matplotlib.pyplot as plt

%matplotlib inline


In [2]:
import nltk 
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from bs4 import BeautifulSoup

import numpy as np

In [3]:
from sklearn.model_selection import train_test_split

# Constants

In [46]:
EXAMPLE_FILE = 'SpamData/01_Processing/practice_email.txt'

spam_1_path = 'SpamData/01_Processing/spam_assassin_corpus/spam_1'
spam_2_path = 'SpamData/01_Processing/spam_assassin_corpus/spam_1'
easy_nonspam_1_path = 'SpamData/01_Processing/spam_assassin_corpus/easy_ham_1'
easy_nonspam_2_path = 'SpamData/01_Processing/spam_assassin_corpus/easy_ham_2'

SPAM_CAT = 1
HAM_CAT = 0

DATA_JSON_FILE = 'SpamData/01_Processing/email-text-data.json'
WORD_ID_FILE = 'SpamData/01_Processing/word-by-id.csv'

TRAINING_DATA_FILE = 'SpamData/02_Training/train-data.txt'
TEST_DATA_FILE = 'SpamData/02_Training/test-data.txt'

# Spam mail part 1

In [5]:
def email_body_generator(path):
    
    for root, dirnames, filenames in walk(path):
        for file_name in filenames:
            
            filepath = join(root,file_name)
            
            stream = open(filepath,encoding='latin-1')

            is_body = False
            lines = []

            for line in stream:
                if is_body:
                    lines.append(line)
                elif line == '\n':   # try printing lines directly, to remove new line character
                    is_body = True 
    
            stream.close()

            email_body = '\n'.join(lines)
            
            yield file_name,email_body

In [6]:
def df_from_directory(path,classification):
    rows = []
    row_names = []
    
    for file_name,email_body in email_body_generator(path):
        rows.append({'Message':email_body,'Category':classification})
        row_names.append(file_name)
    return pd.DataFrame(rows , index=row_names)  

In [7]:
spam_emails = df_from_directory(spam_1_path,SPAM_CAT)

#adding other files namely spam 2,easyham1,2
spam_emails = spam_emails.append(df_from_directory(spam_2_path,SPAM_CAT))


ham_emails = df_from_directory(easy_nonspam_1_path,HAM_CAT)
ham_emails = ham_emails.append(df_from_directory(easy_nonspam_2_path,HAM_CAT))

data = pd.concat([spam_emails,ham_emails])

data = data.drop(['cmds'])

In [8]:
document_ids = range(0,len(data.index))
data['DOC_ID']= document_ids

data['FILE_NAME'] = data.index
data = data.set_index('DOC_ID')

data.to_json(DATA_JSON_FILE)

# Spam mails part 2

In [9]:
def clean_msg_no_html( msgs , stop_words = set(stopwords.words('english')) , stemmer = PorterStemmer() ):
    
    soup = BeautifulSoup(msgs,'html.parser')
    message = soup.get_text()
    
    words = word_tokenize(message.lower())
    
    filtered_words = []
    
    for word in words:
        if word not in stop_words and word.isalpha():
            filtered_words.append(stemmer.stem(word))
            
    return filtered_words

# Spam mails part 3 

In [10]:
nested_list = data.Message.apply(clean_msg_no_html)

" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup


In [11]:
doc_ids_spam = data[data.Category == 1].index
doc_ids_ham = data[data.Category == 0].index

nested_list_ham = nested_list.loc[doc_ids_ham]
nested_list_spam = nested_list.loc[doc_ids_spam]

ham_words_list = [elements for sublist in nested_list_ham for elements in sublist]
spam_words_list = [elements for sublist in nested_list_spam for elements in sublist]

normal_words = pd.Series(ham_words_list).value_counts()
spam_words = pd.Series(spam_words_list).value_counts()



In [12]:
stemmed_nested_list = data.Message.apply(clean_msg_no_html)

flat_stemmed_list = [item for sublist in stemmed_nested_list for item in sublist]
unique_words = pd.Series(flat_stemmed_list).value_counts()

frequent_words = unique_words.iloc[0:2500]

In [13]:
word_ids = list(range(2500))
vocab = pd.DataFrame({'Vocab_Word':frequent_words.index.values},index=word_ids)
vocab.index.name = 'WORD_ID'

vocab.to_csv(WORD_ID_FILE, index_label=vocab.index.name, header = vocab.Vocab_Word.name )

# Generate features and a sparse matrix

### Creating a dataframe with one word per coloumn

In [14]:
type(stemmed_nested_list) 
type(stemmed_nested_list[2])
# this is a series of lists

# we will convert it to list of lists first
type(stemmed_nested_list.tolist()) # part of pandas .tolist()

list

In [15]:
word_columns_df = pd.DataFrame.from_records(stemmed_nested_list.tolist())
word_columns_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7661,7662,7663,7664,7665,7666,7667,7668,7669,7670
0,save,life,insur,spend,life,quot,save,g,famili,financi,...,,,,,,,,,,
1,fight,risk,cancer,http,slim,guarante,lose,lb,day,http,...,,,,,,,,,,
2,fight,risk,cancer,http,slim,guarante,lose,lb,day,http,...,,,,,,,,,,
3,adult,club,offer,free,membership,instant,access,site,user,name,...,,,,,,,,,,
4,thought,might,like,slim,guarante,lose,lb,day,http,fight,...,,,,,,,,,,


### Splitting the data into training and testing dataset

In [16]:
X_train, X_test, y_train, y_test = train_test_split(word_columns_df, data.Category, test_size = .3, random_state = 42)

In [17]:
print('Number of training samples: ', X_train.shape[0])
print('Fraction of training set: ', X_train.shape[0]/word_columns_df.shape[0])

Number of training samples:  3430
Fraction of training set:  0.7


In [18]:
X_train.index.name = X_test.index.name = 'DOC_ID'
X_train.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,7661,7662,7663,7664,7665,7666,7667,7668,7669,7670
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
736,html,email,messag,see,mail,client,support,html,messag,industri,...,,,,,,,,,,
1620,e,eirikur,hallgrimsson,write,e,ca,tell,import,thing,pictur,...,,,,,,,,,,
336,name,jeremi,lessari,look,invest,properti,midwest,issu,everi,everyon,...,,,,,,,,,,
2302,also,tri,rpm,rebuilddb,chang,question,file,look,small,root,...,,,,,,,,,,
2673,tim,test,train,msg,binari,pickl,databas,approach,million,byte,...,,,,,,,,,,


In [19]:
y_train.head()

DOC_ID
736     1
1620    0
336     1
2302    0
2673    0
Name: Category, dtype: int64

### Create a Sparse Matrrix for training data

In [20]:
word_index = pd.Index(vocab.Vocab_Word)
print(type(word_index))
print(type(word_index[3]))

<class 'pandas.core.indexes.base.Index'>
<class 'str'>


In [21]:
print('Get word_id of word "html" in our vocabulary :',word_index.get_loc('html'))

Get word_id of word "html" in our vocabulary : 647


In [22]:
def make_sparse_matrix( df, indexed_words, labels ):
    """
    Return a sparse matrix as  dataframe
    
    df :A dataframe with words in columns with a  document id as index(X_train or X_test)
    indexed_words : index of words ordered by word id
    labels : category as a series(y_train or y_test)
    """
    
    nr_rows = df.shape[0]
    nr_cols = df.shape[1]
    word_set = set(indexed_words)
    dict_list = []
    
    for i in range(nr_rows):
        for j in range(nr_cols):
            
            word = df.iat[i, j]
            if word in word_set:
                doc_id = df.index[i]
                word_id = indexed_words.get_loc(word)
                category = labels.at[doc_id]
                
                item = {'LABEL': category, 'DOC_ID': doc_id,
                         'OCCURENCE': 1, 'WORD_ID': word_id}
                
                dict_list.append(item)
    
    return pd.DataFrame(dict_list)
    

In [23]:
sparse_train_df = make_sparse_matrix(X_train, word_index, y_train)

In [24]:
sparse_train_df.shape

(351358, 4)

In [25]:
sparse_train_df[-5:]

Unnamed: 0,DOC_ID,LABEL,OCCURENCE,WORD_ID
351353,860,1,1,97
351354,860,1,1,15
351355,860,1,1,2
351356,860,1,1,79
351357,860,1,1,116


### Combine occurances with pandas groupby() method

In [26]:
train_grouped = sparse_train_df.groupby(['DOC_ID', 'WORD_ID', 'LABEL']).sum()
train_grouped.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,OCCURENCE
DOC_ID,WORD_ID,LABEL,Unnamed: 3_level_1
0,2,1,1
0,3,1,1
0,6,1,2
0,12,1,1
0,14,1,1


In [41]:
vocab.at[6, 'Vocab_Word']

'email'

In [42]:
data.Message[0]

'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">\n\n<HTML><HEAD>\n\n<META content=3D"text/html; charset=3Dwindows-1252" http-equiv=3DContent-T=\n\nype>\n\n<META content=3D"MSHTML 5.00.2314.1000" name=3DGENERATOR></HEAD>\n\n<BODY><!-- Inserted by Calypso -->\n\n<TABLE border=3D0 cellPadding=3D0 cellSpacing=3D2 id=3D_CalyPrintHeader_ r=\n\nules=3Dnone \n\nstyle=3D"COLOR: black; DISPLAY: none" width=3D"100%">\n\n  <TBODY>\n\n  <TR>\n\n    <TD colSpan=3D3>\n\n      <HR color=3Dblack noShade SIZE=3D1>\n\n    </TD></TR></TD></TR>\n\n  <TR>\n\n    <TD colSpan=3D3>\n\n      <HR color=3Dblack noShade SIZE=3D1>\n\n    </TD></TR></TBODY></TABLE><!-- End Calypso --><!-- Inserted by Calypso=\n\n --><FONT \n\ncolor=3D#000000 face=3DVERDANA,ARIAL,HELVETICA size=3D-2><BR></FONT></TD><=\n\n/TR></TABLE><!-- End Calypso --><FONT color=3D#ff0000 \n\nface=3D"Copperplate Gothic Bold" size=3D5 PTSIZE=3D"10">\n\n<CENTER>Save up to 70% on Life Insurance.</CENTER></FONT><FONT color=3D#ff=\n\n0000

In [43]:
train_grouped = train_grouped.reset_index()
train_grouped.head()

Unnamed: 0,DOC_ID,WORD_ID,LABEL,OCCURENCE
0,0,2,1,1
1,0,3,1,1
2,0,6,1,2
3,0,12,1,1
4,0,14,1,1


## Save training data as .txt file 

In [45]:
np.savetxt(TRAINING_DATA_FILE, train_grouped, fmt='%d' )

## Creating a sparse matrix for test data and saving in .txt form 

In [47]:
sparse_test_df = make_sparse_matrix( X_test, word_index, y_test )

In [48]:
sparse_test_df.shape

(151129, 4)

In [59]:
test_grouped = sparse_test_df.groupby(['DOC_ID','WORD_ID','LABEL']).sum()
test_grouped.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,OCCURENCE
DOC_ID,WORD_ID,LABEL,Unnamed: 3_level_1
8,2,1,1
8,3,1,2
8,4,1,2
8,5,1,1
8,6,1,4


In [60]:
vocab.Vocab_Word[4]

'one'

In [61]:
data.Message[8]

'TIRED OF THE BULL OUT THERE?\n\nWant To Stop Losing Money?\n\n\n\nWANT A REAL MONEY MAKER?\n\nRECEIVE $1,000-$5,000 TODAY!\n\nEXPERTS ARE CALLING THIS THE FASTEST WAY TO HUGE CASH FLOW EVER CONCEIVED!\n\n\n\nA POWERHOUSE Gifting Program You Don\'t Want To Miss!\n\nWe work as a TEAM! \n\n\n\nThis is YOUR Private Invitation GET IN WITH THE FOUNDERS! This is where the BIG BOYS PLAY! The MAJOR PLAYERS are on This ONE For ONCE be where the Players are\n\n\n\nThis is a system that will drive $1,000\'s to your doorstep \n\nIn a short period of time!\n\n\n\nLeverage $1000.00 into $50,000, Over and Over Again \n\n\n\nTHE QUESTION HERE IS:\n\n\n\nYOU EITHER WANT TO BE WEALTHY OR YOU DON\'T!!!\n\n\n\nWHICH ONE ARE YOU?\n\n\n\nI am tossing you a financial lifeline and for your sake I \n\n\n\nHope you GRAB onto it and hold on tight For the Ride of your life!\n\n\n\nTestimonials\n\n\n\nHear what average people are doing their first few days:\n\n\x93We\'ve received 8,000 in 1 day and we are doing th

In [62]:
test_grouped = test_grouped.reset_index()
test_grouped.head()

Unnamed: 0,DOC_ID,WORD_ID,LABEL,OCCURENCE
0,8,2,1,1
1,8,3,1,2
2,8,4,1,2
3,8,5,1,1
4,8,6,1,4


In [63]:
np.savetxt(TEST_DATA_FILE, test_grouped, fmt='%d')

# Pre-processing subtleties and Checking your Understanding

### No. of emails droped 

In [65]:
train_doc_ids = set(train_grouped.DOC_ID)
test_doc_ids = set(test_grouped.DOC_ID)

In [66]:
len(train_doc_ids)

3395

In [67]:
len(test_doc_ids)

1447

In [68]:
len(X_train)

3430

In [69]:
len(X_test)

1470

In [71]:
X_test.index #stores doc ids

Int64Index([4657, 3539,  907, 4353, 3745, 4633, 2244, 1924, 3802, 2634,
            ...
            3386, 2754,  286, 3958, 4820, 4236, 4458, 3378, 1503, 1971],
           dtype='int64', name='DOC_ID', length=1470)

In [73]:
# convert it to set and compare with test_doc_ids
set(X_test.index.values) - test_doc_ids

{73,
 134,
 179,
 205,
 240,
 274,
 298,
 328,
 402,
 471,
 538,
 577,
 586,
 594,
 627,
 679,
 693,
 705,
 798,
 802,
 828,
 838,
 869}

In [79]:
data.Message[693] # hahahahah

'------=_NextPart_000_006C_6B8D6A65.3C3C56F\n\nContent-Type: text/plain\n\nContent-Transfer-Encoding: base64\n\n\n\nUGxlYXNlIGZvcmdpdmUgdGhlIGludHJ1c2lvbiwgdGhpcyBpcyBhIG9uZSB0aW1lIG9ubHkg\n\ndGVzdCwgcGxlYXNlIGRlbGV0ZS4gDQpZb3Ugc2hvdWxkIG5vdCByZWNlaXZlIGFueSBhZGRp\n\ndGlvbmFsIGVtYWlscyBmcm9tIHRoaXMgYWRkcmVzcywgaWYgeW91IGRvIHBsZWFzZSBzZW5k\n\nIGFuIGVtYWlsIA0Kd2l0aCByZW1vdmUgYXMgc3ViamVjdCB0bzoNCg0KdGVzdDc5OTFAeWFo\n\nb28uY29tDQoNClRoYW5rIHlvdSBmb3IgeW91ciB1bmRlcnN0YW5kaW5nLg0KDQogICAg\n\n------=_NextPart_000_006C_6B8D6A65.3C3C56F--\n\n\n'