# Notebook imports

In [1]:
from os import walk    #to use walk from system os
from os.path import join

import pandas as pd

import matplotlib.pyplot as plt

%matplotlib inline


In [148]:
import nltk 
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from bs4 import BeautifulSoup

import numpy as np

# Constants

In [72]:
EXAMPLE_FILE = 'SpamData/01_Processing/practice_email.txt'

spam_1_path = 'SpamData/01_Processing/spam_assassin_corpus/spam_1'
spam_2_path = 'SpamData/01_Processing/spam_assassin_corpus/spam_1'
easy_nonspam_1_path = 'SpamData/01_Processing/spam_assassin_corpus/easy_ham_1'
easy_nonspam_2_path = 'SpamData/01_Processing/spam_assassin_corpus/easy_ham_2'

SPAM_CAT = 1
HAM_CAT = 0

DATA_JSON_FILE = 'SpamData/01_Processing/email-text-data.json'

In [73]:
WORD_ID_FILE = 'SpamData/01_Processing/word-by-id.csv'

## Spam mail part 1 (5_1)

In [4]:
def email_body_generator(path):
    
    for root, dirnames, filenames in walk(path):
        for file_name in filenames:
            
            filepath = join(root,file_name)
            
            stream = open(filepath,encoding='latin-1')

            is_body = False
            lines = []

            for line in stream:
                if is_body:
                    lines.append(line)
                elif line == '\n':   # try printing lines directly, to remove new line character
                    is_body = True 
    
            stream.close()

            email_body = '\n'.join(lines)
            
            yield file_name,email_body

In [5]:
def df_from_directory(path,classification):
    rows = []
    row_names = []
    
    for file_name,email_body in email_body_generator(path):
        rows.append({'Message':email_body,'Category':classification})
        row_names.append(file_name)
    return pd.DataFrame(rows , index=row_names)  

In [6]:
spam_emails = df_from_directory(spam_1_path,SPAM_CAT)

#adding other files namely spam 2,easyham1,2
spam_emails = spam_emails.append(df_from_directory(spam_2_path,SPAM_CAT))


ham_emails = df_from_directory(easy_nonspam_1_path,HAM_CAT)
ham_emails = ham_emails.append(df_from_directory(easy_nonspam_2_path,HAM_CAT))

data = pd.concat([spam_emails,ham_emails])

In [7]:
data = data.drop(['cmds'])

In [8]:
document_ids = range(0,len(data.index))
data['DOC_ID']= document_ids

data['FILE_NAME'] = data.index
data = data.set_index('DOC_ID')

data.to_json(DATA_JSON_FILE)

## Spam mails part 2 (5_2)

In [9]:
def clean_msg_no_html( msgs , stop_words = set(stopwords.words('english')) , stemmer = PorterStemmer() ):
    
    soup = BeautifulSoup(msgs,'html.parser')
    message = soup.get_text()
    
    words = word_tokenize(message.lower())
    
    filtered_words = []
    
    for word in words:
        if word not in stop_words and word.isalpha():
            filtered_words.append(stemmer.stem(word))
            
    return filtered_words

# Apply cleaning and tokenising to all functions

### Slicing Dataframes and series and creating subsets (5.24)

In [10]:
data.at[2,'Message']

"1) Fight The Risk of Cancer!\n\nhttp://www.adclick.ws/p.cfm?o=315&s=pk007\n\n\n\n2) Slim Down - Guaranteed to lose 10-12 lbs in 30 days\n\nhttp://www.adclick.ws/p.cfm?o=249&s=pk007\n\n\n\n3) Get the Child Support You Deserve - Free Legal Advice\n\nhttp://www.adclick.ws/p.cfm?o=245&s=pk002\n\n\n\n4) Join the Web's Fastest Growing Singles Community\n\nhttp://www.adclick.ws/p.cfm?o=259&s=pk007\n\n\n\n5) Start Your Private Photo Album Online!\n\nhttp://www.adclick.ws/p.cfm?o=283&s=pk007\n\n\n\nHave a Wonderful Day,\n\nOffer Manager\n\nPrizeMama\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nIf you wish to leave this list please use the link below.\n\nhttp://www.qves.com/trim/?zzzz@spamassassin.taint.org%7C17%7C308417\n\n\n"

In [11]:
# Same as above but with no coloumn name and position address
data.iat[2,1]

"1) Fight The Risk of Cancer!\n\nhttp://www.adclick.ws/p.cfm?o=315&s=pk007\n\n\n\n2) Slim Down - Guaranteed to lose 10-12 lbs in 30 days\n\nhttp://www.adclick.ws/p.cfm?o=249&s=pk007\n\n\n\n3) Get the Child Support You Deserve - Free Legal Advice\n\nhttp://www.adclick.ws/p.cfm?o=245&s=pk002\n\n\n\n4) Join the Web's Fastest Growing Singles Community\n\nhttp://www.adclick.ws/p.cfm?o=259&s=pk007\n\n\n\n5) Start Your Private Photo Album Online!\n\nhttp://www.adclick.ws/p.cfm?o=283&s=pk007\n\n\n\nHave a Wonderful Day,\n\nOffer Manager\n\nPrizeMama\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nIf you wish to leave this list please use the link below.\n\nhttp://www.qves.com/trim/?zzzz@spamassassin.taint.org%7C17%7C308417\n\n\n"

In [12]:
data.iloc[5:11] # Works on dataframe as well as a series as done below

Unnamed: 0_level_0,Category,Message,FILE_NAME
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5,1,A POWERHOUSE GIFTING PROGRAM You Don't Want To...,00006.5ab5620d3d7c6c0db76234556a16f6c1
6,1,Help wanted. We are a 14 year old fortune 500...,00007.d8521faf753ff9ee989122f6816f87d7
7,1,<html>\n\n<head>\n\n<title>ReliaQuote - Save U...,00008.dfd941deb10f5eed78b1594b131c9266
8,1,TIRED OF THE BULL OUT THERE?\n\nWant To Stop L...,00009.027bf6e0b0c4ab34db3ce0ea4bf2edab
9,1,"Dear ricardo1 ,\n\n\n\n<html>\n\n<body>\n\n<ce...",00010.445affef4c70feec58f9198cfbc22997
10,1,Cellular Phone Accessories All At Below Wholes...,00011.61816b9ad167657773a427d890d0468e


In [13]:
data.Message.iloc[5:11]

DOC_ID
5     A POWERHOUSE GIFTING PROGRAM You Don't Want To...
6     Help wanted.  We are a 14 year old fortune 500...
7     <html>\n\n<head>\n\n<title>ReliaQuote - Save U...
8     TIRED OF THE BULL OUT THERE?\n\nWant To Stop L...
9     Dear ricardo1 ,\n\n\n\n<html>\n\n<body>\n\n<ce...
10    Cellular Phone Accessories All At Below Wholes...
Name: Message, dtype: object

In [14]:
## IMportant stuff  :)

first_emails = data.Message.iloc[0:3]

first_emails.apply(clean_msg_no_html)

# 'Apply' can be applied to series but not to strings 

DOC_ID
0    [save, life, insur, spend, life, quot, save, g...
1    [fight, risk, cancer, http, slim, guarante, lo...
2    [fight, risk, cancer, http, slim, guarante, lo...
Name: Message, dtype: object

In [15]:
initial_list = first_emails.apply(clean_msg_no_html)

In [16]:
# To make a combined list for first 3 rows of processed words
flat_list = []
for sublist in initial_list:
    for item in sublist:
        flat_list.append(item)

len(flat_list)        

192

In [17]:
# Python list comprehension syntax
# Same as above for rows 3 to 6
kp_emails = data.Message.iloc[3:6]
kp_list2 = kp_emails.apply(clean_msg_no_html)

flat_list2 = [ item for sublist in kp_list2 for item in sublist ]

len(flat_list2)

411

In [18]:
# Apply to all the messages

In [19]:
nested_list = data.Message.apply(clean_msg_no_html)
nested_list.head()

" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup


DOC_ID
0    [save, life, insur, spend, life, quot, save, g...
1    [fight, risk, cancer, http, slim, guarante, lo...
2    [fight, risk, cancer, http, slim, guarante, lo...
3    [adult, club, offer, free, membership, instant...
4    [thought, might, like, slim, guarante, lose, l...
Name: Message, dtype: object

In [20]:
nested_list.tail()

DOC_ID
4895    [http, bizarr, collect, stuf, anim, could, fet...
4896    [care, use, one, also, realli, cute, thing, ja...
4897    [sm, skip, montanaro, write, jeremi, put, anot...
4898    [mark, hammond, like, given, zodb, sound, attr...
4899    [hi, probabl, use, whatsoev, also, problem, re...
Name: Message, dtype: object

In [21]:
len(nested_list)

4900

### Using logic to slice dataframes (5.25)

In [22]:
data['Message']
data[data.Category == 1].shape

(1000, 3)

In [23]:
data[data.Category == 1].tail()

Unnamed: 0_level_0,Category,Message,FILE_NAME
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
995,1,<html>\n\n<head>\n\n<title>Toy</title>\n\n</he...,00496.1a37de098f6c8847c3c7839d73cc7106
996,1,<html>\n\n<head>\n\n<title>Untitled Document</...,00497.ebf699da617b11135f3aa9173b9781b9
997,1,This is an HTML email message. If you see thi...,00498.48c3098854d339353f1a28a13b196017
998,1,<html>\n\n<head>\n\n</head>\n\n <body backgro...,00499.988506a852cf86b396771a8bdc8cf839
999,1,"<STYLE type=""text/css"">\n\n<!--\n\nP{\n\n fon...",00500.85b72f09f6778a085dc8b6821965a76f


In [24]:
doc_ids_spam = data[data.Category == 1].index
doc_ids_ham = data[data.Category == 0].index

In [25]:
doc_ids_ham

Int64Index([1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009,
            ...
            4890, 4891, 4892, 4893, 4894, 4895, 4896, 4897, 4898, 4899],
           dtype='int64', name='DOC_ID', length=3900)

In [26]:
doc_ids_spam

Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
            ...
            990, 991, 992, 993, 994, 995, 996, 997, 998, 999],
           dtype='int64', name='DOC_ID', length=1000)

### Subsetting a series with an index

In [27]:
type(doc_ids_ham)

pandas.core.indexes.numeric.Int64Index

In [28]:
type(nested_list)

pandas.core.series.Series

In [29]:
# Use .loc function on series and dataframes to  access elements by index
data.loc[doc_ids_ham]

nested_list_ham = nested_list.loc[doc_ids_ham]
nested_list_spam = nested_list.loc[doc_ids_spam]

In [30]:
# All words in spam mails into one and ham mails into one

In [31]:
ham_words_list = [elements for sublist in nested_list_ham for elements in sublist]
spam_words_list = [elements for sublist in nested_list_spam for elements in sublist]
len(spam_words_list)

167092

In [32]:
# Cant use unique and other functions on list
# That is why convert it to series

normal_words2 = pd.Series(ham_words_list)
normal_words2.shape

(441403,)

In [33]:
normal_words = pd.Series(ham_words_list).value_counts() # To save unique words
normal_words.shape[0]

20755

In [34]:
normal_words  # Also gives frequency directly

http            7561
use             3630
list            2878
one             2371
get             2284
mail            2255
would           2003
like            1928
messag          1847
work            1798
time            1740
wrote           1653
file            1583
new             1500
peopl           1474
user            1438
make            1393
linux           1374
group           1341
email           1276
chang           1230
think           1213
system          1196
also            1177
inform          1150
way             1147
could           1143
go              1118
said            1118
say             1110
                ... 
mercenari          1
poin               1
reinsur            1
geophysicist       1
julianu            1
righti             1
ltlibobj           1
paudwal            1
growl              1
disobey            1
fuss               1
cote               1
légaux             1
morozov            1
wearin             1
drummer            1
odyssield    

In [35]:
normal_words[:10]

http      7561
use       3630
list      2878
one       2371
get       2284
mail      2255
would     2003
like      1928
messag    1847
work      1798
dtype: int64

In [36]:
spam_words = pd.Series(spam_words_list).value_counts()
spam_words.shape

(7160,)

In [37]:
spam_words[0:15]

free       1300
email      1264
http       1234
receiv     1092
money      1010
list        962
pleas       950
get         932
order       828
name        822
click       820
busi        812
make        810
mail        784
address     768
dtype: int64

# Generate vocabulary and dictionary

In [38]:
stemmed_nested_list = data.Message.apply(clean_msg_no_html)
flat_stemmed_list = [item for sublist in stemmed_nested_list for item in sublist]

" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup


In [42]:
unique_words = pd.Series(flat_stemmed_list).value_counts()

In [50]:
frequent_words = unique_words.iloc[0:2500]
print('Most common words:  \n',frequent_words[:20])


Most common words:  
 http      8795
use       4206
list      3840
get       3216
one       3121
mail      3039
email     2540
time      2448
would     2397
messag    2361
like      2300
work      2298
make      2203
peopl     2200
new       2110
inform    1918
free      1916
file      1689
wrote     1653
user      1610
dtype: int64


## Create vocabulary DataFrame with a WORD_ID ( 5.31 ) 

In [68]:
word_ids = list(range(2500))
vocab = pd.DataFrame({'Vocab_Word':frequent_words.index.values},index=word_ids)
vocab.index.name = 'WORD_ID'

In [70]:
vocab.head()

Unnamed: 0_level_0,Vocab_Word
WORD_ID,Unnamed: 1_level_1
0,http
1,use
2,list
3,get
4,one


## Save the vocabulary as a CSV file

In [74]:
vocab.to_csv(WORD_ID_FILE, index_label=vocab.index.name, header = vocab.Vocab_Word.name )

### Exercise: check if particular word is  part of library (5.32)

In [114]:
# machine learning fun learn data science app brewry
# these are the words
test_words = ['machine','learning','fun','learn','data','science','app','brewry' ]
test_words = set(test_words)

In [115]:
type(test_words)

set

In [103]:
any(vocab.Vocab_Word == 'data') # inefficent way

True

In [108]:
'data' in vocab.Vocab_Word  # wrong answer because "in" works on set and not on DataFrames


False

In [113]:
'data' in set(vocab.Vocab_Word) # here right answer and it works

True

## Exercise : Find the Emails with most number of words ( 5.33 )

In [131]:
# My solution
max_count = 0
i = 0
for sublist in nested_list:
    count_words = 0
    for item in sublist:
        count_words = count_words + 1
    if count_words > max_count:
        max_count = count_words
        location = i
    i = i+1    

In [149]:
# OR
clean_email_length = []
for sublist in nested_list:
    clean_email_length.append(len(sublist))
print('Number of words in longest email', max(clean_email_length))
 
print( 'Email position', np.argmax( clean_email_length ) )


Number of words in longest email 7671
Email position 4879


In [None]:
# Python solution (list comprehension )
clean_email_length[ len(sublist) for sublist in nested_list ]
