# Notebook imports

In [1]:
from os import walk    #to use walk from system os
from os.path import join

import pandas as pd

import matplotlib.pyplot as plt

%matplotlib inline


In [36]:
import nltk 
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from bs4 import BeautifulSoup


# constants

In [3]:
# Single sample e-mail (path relative to the notebook's working directory).
EXAMPLE_FILE = 'SpamData/01_Processing/practice_email.txt'

# Corpus folders: two spam sets and two "easy ham" (non-spam) sets.
# spam_2_path must point at the spam_2 folder; pointing it at spam_1 would
# load the first spam folder twice and never read the second one.
spam_1_path = 'SpamData/01_Processing/spam_assassin_corpus/spam_1'
spam_2_path = 'SpamData/01_Processing/spam_assassin_corpus/spam_2'
easy_nonspam_1_path = 'SpamData/01_Processing/spam_assassin_corpus/easy_ham_1'
easy_nonspam_2_path = 'SpamData/01_Processing/spam_assassin_corpus/easy_ham_2'

# Class labels stored in the 'Category' column.
SPAM_CAT = 1
HAM_CAT = 0

# Output file for the combined, labelled e-mail table.
DATA_JSON_FILE = 'SpamData/01_Processing/email-text-data.json'

# Previous code (5_1 Spam mail project)

In [4]:
def email_body_generator(path):
    """Yield ``(file_name, email_body)`` for every file found under ``path``.

    The directory tree is walked recursively. Within each file, all lines up
    to and including the first line that is exactly ``'\\n'`` are skipped
    (presumably the header block and its blank separator line); the lines
    after it form the body.

    Parameters
    ----------
    path : str
        Root directory to walk.

    Yields
    ------
    tuple of (str, str)
        The bare file name (without directory) and the body text. The body is
        an empty string when the file contains no ``'\\n'``-only line.
    """
    for root, _, filenames in walk(path):
        for file_name in filenames:
            filepath = join(root, file_name)

            # The context manager closes the handle even if reading raises.
            with open(filepath, encoding='latin-1') as stream:
                is_body = False
                lines = []

                for line in stream:
                    if is_body:
                        lines.append(line)
                    elif line == '\n':
                        is_body = True

            # Every collected line still ends with '\n', so joining on '\n'
            # puts an extra blank line between body lines. This is kept as-is
            # because later processing was built on this exact output.
            email_body = '\n'.join(lines)

            yield file_name, email_body

         


In [5]:
def df_from_directory(path,classification):
    """Build a labelled DataFrame from every e-mail file under ``path``.

    Each row holds the body text in ``'Message'`` and ``classification`` in
    ``'Category'``; the row index is the file name the body was read from.
    """
    # Materialise the generator once, then split it into index and records.
    named_bodies = list(email_body_generator(path))

    file_names = [name for name, _ in named_bodies]
    records = [
        {'Message': body, 'Category': classification}
        for _, body in named_bodies
    ]
    return pd.DataFrame(records, index=file_names)

In [6]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat stacks the frames row-wise in the same way.

# Spam: both spam folders, labelled SPAM_CAT.
spam_emails = pd.concat([
    df_from_directory(spam_1_path, SPAM_CAT),
    df_from_directory(spam_2_path, SPAM_CAT),
])

# Non-spam: both "easy ham" folders, labelled HAM_CAT.
ham_emails = pd.concat([
    df_from_directory(easy_nonspam_1_path, HAM_CAT),
    df_from_directory(easy_nonspam_2_path, HAM_CAT),
])

# Full data set: spam rows first, then ham rows, still indexed by file name.
data = pd.concat([spam_emails, ham_emails])

In [7]:
# Rows are indexed by file name, so this removes every row that came from a
# file called 'cmds' (presumably a non-e-mail helper file in the corpus folders).
data = data.drop(index=['cmds'])

In [8]:
# Consecutive integer ids, one per row, starting at 0.
document_ids = range(len(data.index))

# Keep the file name (the current index) as a regular column and make the
# numeric DOC_ID the new index.
data = (
    data
    .assign(DOC_ID=document_ids, FILE_NAME=data.index)
    .set_index('DOC_ID')
)

# Persist the labelled table.
data.to_json(DATA_JSON_FILE)

# Natural language processing

### Text preprocessing

In [9]:
# Convert to lower case. str.lower() returns a new string; msg itself is unchanged.
msg = 'All work and no play makes Jack a dull boy.'
msg.lower()

'all work and no play makes jack a dull boy.'

### Download NLTK resources (tokenizer and stop words)

In [13]:
# Fetch the 'punkt' tokenizer data (no re-download if it is already up to date).
# NOTE(review): newer NLTK releases may require 'punkt_tab' for word_tokenize
# instead; confirm against the installed version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [14]:
# Fetch the stop-word corpus that stopwords.words('english') reads from.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Tokenising

In [10]:
# Lower-case first, then split into word and punctuation tokens
# (the final '.' comes back as its own token).
msg = 'All work and no play makes Jack a dull boy.'
word_tokenize(msg.lower())

['all', 'work', 'and', 'no', 'play', 'makes', 'jack', 'a', 'dull', 'boy', '.']

## Removing stop words

In [11]:
# A set is an unordered collection.
# Each element appears at most once.
# Good for fast membership tests and for finding differences between collections.

In [12]:
# stopwords.words() returns a plain list.
type(stopwords.words('english'))


list

In [13]:
# Convert the list to a set; the following cells test membership with `in`.
stop_words = set(stopwords.words('english'))
type(stop_words)

set

In [14]:
# Membership test: 'this' is one of the English stop words.
if 'this' in stop_words:
    print('presnt')

presnt


In [15]:
# Membership test: 'hello' is not a stop word.
if 'hello' not in stop_words:
    print('not presnt')

not presnt


In [16]:
msg = 'All work and no play makes Jack a dull boy. to be or not to be.'
words = word_tokenize(msg.lower())

# Keep only the tokens that are not English stop words (punctuation survives).
filtered_words = [token for token in words if token not in stop_words]
print(filtered_words)

['work', 'play', 'makes', 'jack', 'dull', 'boy', '.', '.']


## Word stems and stemming

In [17]:
msg = 'All work and no play makes Jack a dull boy. to be or not to be.\
         Nobody expects a SPAnish inquisition!'
words = word_tokenize(msg.lower())

# Build the stemmer once instead of once per token.
stemmer = PorterStemmer()

# Drop stop words, then reduce each remaining token to its stem.
filtered_words = [
    stemmer.stem(word)
    for word in words
    if word not in stop_words
]

print(filtered_words)

['work', 'play', 'make', 'jack', 'dull', 'boy', '.', '.', 'nobodi', 'expect', 'spanish', 'inquisit', '!']


## Removing Punctuation 

In [18]:
msg = 'All work and no play makes Jack a dull boy. to be or not to be. ??? \
         Nobody expects a SPAnish inquisition! ?'
words = word_tokenize(msg.lower())

# Build the stemmer once instead of once per token.
stemmer = PorterStemmer()

# str.isalpha() is False for punctuation tokens such as '.', '?' and '!',
# so they are dropped together with the stop words before stemming.
filtered_words = [
    stemmer.stem(word)
    for word in words
    if word not in stop_words and word.isalpha()
]

print(filtered_words)

['work', 'play', 'make', 'jack', 'dull', 'boy', 'nobodi', 'expect', 'spanish', 'inquisit']


## Removing HTML tags from Emails 

In [46]:
# data.Message[0] looks up the row whose index label (DOC_ID) is 0;
# the message text is parsed with Python's built-in 'html.parser'.
soup = BeautifulSoup(data.Message[0],'html.parser')
# prettify() returns the parsed markup re-indented for easier reading.
print(soup.prettify())

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
<html>
 <head>
  <meta charset="utf-8" content='3D"text/html;' http-equiv="3DContent-T=" ype=""/>
  <meta 5.00.2314.1000"="" content='3D"MSHTML' name="3DGENERATOR"/>
 </head>
 <body>
  <!-- Inserted by Calypso -->
  <table black;="" border="3D0" cellpadding="3D0" cellspacing="3D2" display:="" id="3D_CalyPrintHeader_" none"="" r="ules=3Dnone" style='3D"COLOR:' width='3D"100%"'>
   <tbody>
    <tr>
     <td colspan="3D3">
      <hr color="3Dblack" noshade="" size="3D1"/>
     </td>
    </tr>
   </tbody>
  </table>
 </body>
</html>
<tr>
 <td colspan="3D3">
  <hr color="3Dblack" noshade="" size="3D1"/>
 </td>
</tr>
<!-- End Calypso -->
<!-- Inserted by Calypso=

 -->
<font color="3D#000000" face="3DVERDANA,ARIAL,HELVETICA" size="3D-2">
 <br/>
</font>
&lt;=

/TR&gt;
<!-- End Calypso -->
<font bold"="" color="3D#ff0000" face='3D"Copperplate' gothic="" ptsize='3D"10"' size="3D5">
 <center>
  Save up to 70% on Life Insurance.
 </ce

In [48]:
# Extract only the text content of the parsed message, without the tags.
soup.get_text()

"\n\n\n\n\n\n\n\n\n\n\n\n\n\n<=\n\n/TR>\nSave up to 70% on Life Insurance.\nWhy Spend More Than You Have To?\n\n\nLife Quote Savings\n\n\n\n\n\n\n\n\n\n\n\nEnsurin=\n\ng your \n\n      family's financial security is very important. Life Quote Savings ma=\n\nkes \n\n      buying life insurance simple and affordable. We Provide FREE Access =\n\nto The \n\n      Very Best Companies and The Lowest Rates.\n\n\n\n\n\nLife Quote Savings is FAST, EAS=\n\nY and \n\n            SAVES you money! Let us help you get started with the best val=\n\nues in \n\n            the country on new coverage. You can SAVE hundreds or even tho=\n\nusands \n\n            of dollars by requesting a FREE quote from Lifequote Savings. =\n\nOur \n\n            service will take you less than 5 minutes to complete. Shop an=\n\nd \n\n            compare. SAVE up to 70% on all types of Life insurance! \n\n\n\n\nClick Here For Your=\n\n \n\n            Free Quote!\n\nProtecting your family is the best investment you'll 

# Functions for Email Processing (5.22)

In [51]:
# Tools shared by the cleaning functions: created once at module level
# so they are not rebuilt on every call.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [54]:
def clean_message(message):
    """Return the stemmed, filtered tokens of ``message`` as a list.

    The text is lower-cased and tokenised. Tokens that are stop words or that
    are not purely alphabetic are discarded, and each remaining token is
    stemmed. Relies on the module-level ``stop_words`` and ``stemmer``.
    """
    tokens = word_tokenize(message.lower())
    return [
        stemmer.stem(token)
        for token in tokens
        if token.isalpha() and token not in stop_words
    ]

In [62]:
def clean_msg_no_html(msgs):
    """Strip HTML markup from ``msgs`` and return its cleaned tokens.

    The message is parsed with BeautifulSoup's 'html.parser' and reduced to
    its text content. That text goes through ``clean_message``, so both
    functions share one tokenise / filter / stem implementation.

    Parameters
    ----------
    msgs : str
        Raw message body, which may contain HTML.

    Returns
    -------
    list of str
        Stemmed alphabetic tokens with English stop words removed.
    """
    soup = BeautifulSoup(msgs,'html.parser')
    return clean_message(soup.get_text())


In [63]:
# Sanity check on the first message. Fragments such as 'g', 'ke' and 'ue' in
# the result come from words that the raw text splits across a trailing '='
# and a line break (e.g. 'Ensurin=' followed by 'g your').
clean_msg_no_html(data.Message[0])

['save',
 'life',
 'insur',
 'spend',
 'life',
 'quot',
 'save',
 'g',
 'famili',
 'financi',
 'secur',
 'import',
 'life',
 'quot',
 'save',
 'ke',
 'buy',
 'life',
 'insur',
 'simpl',
 'afford',
 'provid',
 'free',
 'access',
 'best',
 'compani',
 'lowest',
 'rate',
 'life',
 'quot',
 'save',
 'fast',
 'save',
 'money',
 'let',
 'us',
 'help',
 'get',
 'start',
 'best',
 'ue',
 'countri',
 'new',
 'coverag',
 'save',
 'hundr',
 'even',
 'usand',
 'dollar',
 'request',
 'free',
 'quot',
 'lifequot',
 'save',
 'servic',
 'take',
 'less',
 'minut',
 'complet',
 'shop',
 'compar',
 'save',
 'type',
 'life',
 'insur',
 'click',
 'free',
 'quot',
 'protect',
 'famili',
 'best',
 'invest',
 'r',
 'make',
 'receipt',
 'email',
 'error',
 'wish',
 'remov',
 'list',
 'pleas',
 'click',
 'type',
 'remov',
 'resid',
 'state',
 'prohibit',
 'solicit',
 'ce',
 'pleas',
 'disregard',
 'email']

## data.at[] is same as data