In [70]:
import collections
import logging
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import wordcloud
from PIL import Image

#Fetch the NLTK resources used by this notebook
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
#The stop-word corpus is read when tokens are filtered; on a machine that
#has never downloaded it, that lookup raises LookupError
nltk.download('stopwords')

logger = logging.getLogger('FraudEmails')

#Minimum occurrence of a word to appear in the WordCloud
frequency = 100
#Number of words in WordCloud
numb_of_words = 400

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jseme\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\jseme\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [71]:
#Open and convert file to String
try:
    #The with-statement closes the file on exit, so no explicit close() is
    #needed (a close() in a finally clause would itself raise NameError
    #whenever open() fails, hiding the real error)
    with open('../data/fraudulent_emails.txt','r') as file:
        text = file.read()
except Exception as e:
    logger.error('Process failed with error: '+repr(e))
    #Re-raise: every later cell needs `text`, so carrying on would only
    #fail further down with a less helpful NameError
    raise



In [72]:
#Drop every span that starts at "From" and runs (non-greedily, across
#newlines) up to the next "Status: O" / "Status:O"
headerPattern = re.compile('From.*?Status: ?O', flags=re.DOTALL)
formatEmails = headerPattern.sub('', text)

In [73]:
#Normalise case so that the same word in different casing is counted once
formatEmails = str.lower(formatEmails)


In [74]:
#Tokenize into runs of word characters and remove stop words (the, a, on...)
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(formatEmails)
#Load the stop words once, as a set: looking the corpus up again for every
#token makes this cell needlessly slow on a large corpus
stopWords = set(nltk.corpus.stopwords.words('english'))
formatEmailsList = [word for word in tokens if word not in stopWords]

In [75]:

#Keep only words longer than two characters
formatEmailsList = [word for word in formatEmailsList if len(word) > 2]

In [76]:
#Remove html tags

#Tags modified from https://www.w3schools.com/TAGS/default.ASP
htmldf = pd.read_csv('../data/html_tags.txt', sep='\t').dropna()
htmldf['Tag'] = htmldf['Tag'].str.strip()

#Build the tag lookup once, as a set, instead of materialising the whole
#column as a list for every single word
htmlTags = set(htmldf['Tag'])
#A word is dropped when wrapping it in angle brackets yields a known tag
finalEmailFormat = [word for word in formatEmailsList if '<'+word.strip()+'>' not in htmlTags]

In [77]:
#Attach a part-of-speech tag to every remaining word; the result is used
#below as a sequence of (word, tag) pairs.
#Tag meanings: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
tags = nltk.pos_tag(finalEmailFormat)

In [78]:
#Tabulate the tagged words: one row per token, columns 'word' and 'tag'
tagColumns = ['word','tag']
df = pd.DataFrame(data=tags, columns=tagColumns)
df.head()

Unnamed: 0,word,tag
0,james,NNS
1,ngola,VBP
2,confidential,JJ
3,tel,NN
4,233,CD


In [79]:
#Verb dataframe: occurrence count of every word tagged 'VB'
dfverb = df[df['tag']=='VB']
#Name the result columns explicitly ('index' = the word, 'word' = its count).
#A bare reset_index() labels them differently depending on the pandas
#version, which would turn the numeric comparison below into a
#string-vs-int comparison
dfverb = dfverb['word'].value_counts().rename_axis('index').reset_index(name='word')
#Keep only words seen more than `frequency` times
dfverb = dfverb[dfverb['word']>frequency]
dfverb.head()

Unnamed: 0,index,word
0,get,316
1,take,269
2,transfer,245
3,reply,244
4,please,237


In [80]:
#Noun dataframe: occurrence count of every word tagged 'NN'
dfnoun = df[df['tag']=='NN']
#Name the result columns explicitly ('index' = the word, 'word' = its count).
#A bare reset_index() labels them differently depending on the pandas
#version, which would turn the numeric comparison below into a
#string-vs-int comparison
dfnoun = dfnoun['word'].value_counts().rename_axis('index').reset_index(name='word')
#Keep only words seen more than `frequency` times
dfnoun = dfnoun[dfnoun['word']>frequency]
dfnoun.head()

Unnamed: 0,index,word
0,money,4142
1,bank,3595
2,account,2771
3,transaction,2213
4,business,1961


In [81]:
#Adjective dataframe: occurrence count of every word tagged 'JJ'
dfadj = df[df['tag']=='JJ']
#Name the result columns explicitly ('index' = the word, 'word' = its count).
#A bare reset_index() labels them differently depending on the pandas
#version, which would turn the numeric comparison below into a
#string-vs-int comparison
dfadj = dfadj['word'].value_counts().rename_axis('index').reset_index(name='word')
#Keep only words seen more than `frequency` times
dfadj = dfadj[dfadj['word']>frequency]
dfadj.head()

Unnamed: 0,index,word
0,foreign,1599
1,next,1551
2,nbsp,1246
3,private,821
4,united,771


In [82]:
#Noun plural dataframe: occurrence count of every word tagged 'NNS'
dfnounpl = df[df['tag']=='NNS']
#Name the result columns explicitly ('index' = the word, 'word' = its count).
#A bare reset_index() labels them differently depending on the pandas
#version, which would turn the numeric comparison below into a
#string-vs-int comparison
dfnounpl = dfnounpl['word'].value_counts().rename_axis('index').reset_index(name='word')
#Keep only words seen more than `frequency` times
dfnounpl = dfnounpl[dfnounpl['word']>frequency]
dfnounpl.head()

Unnamed: 0,index,word
0,dollars,1375
1,funds,1133
2,regards,629
3,years,625
4,states,598


In [14]:
#Parts of speech kept for the cloud: JJ (adjectives), NN (singular nouns), NNS (plural nouns), VB (verb)
toKeep = ['VB','NN','JJ','NNS']

#Collect the word of every (word, tag) pair whose tag is in the list above
forWordCloud = [word for word, pos in tags if pos in toKeep]


In [15]:
#Count occurrences: scamDict maps each kept word to the number of times it
#appears in forWordCloud
scamDict = collections.Counter(forWordCloud)


In [16]:
#Remove a hand-picked list of unwanted tokens from the counts

toRemove = ['nbsp','http','charset','iso','html','www']

for removal in toRemove:
    #pop with a default never raises, whether scamDict is still a Counter or
    #has already been rebound to a plain dict by the frequency-filter cell
    #(which happens when this cell is re-run out of order)
    scamDict.pop(removal, None)

In [17]:
#Keep only words that occur more than `frequency` times (result is a plain dict)
scamDict = dict((word, count) for word, count in scamDict.items() if count > frequency)


In [18]:
#Open mask and create word cloud
#The outline image is converted to an array and passed as the cloud's mask
nigeriaMask = np.array(Image.open('../data/NigeriaOutline.jpg'))
#Word sizes come from the counts in scamDict; at most numb_of_words are drawn
scamCloud = wordcloud.WordCloud(width=800,height=800,
                                max_words=numb_of_words,mask=nigeriaMask,
                                contour_width=1,contour_color='green').generate_from_frequencies(scamDict)


In [19]:
#Draw the cloud on a 30x30 inch figure with the axes hidden, then save it
fig, ax = plt.subplots(figsize=(30,30))
ax.imshow(scamCloud)
ax.axis('off')
fig.savefig('../output/ScamWordCloud.png',bbox_inches='tight',pad_inches=0,dpi=133)

In [20]:
#Output unformatted frequency: stack the four per-tag tables and order the
#rows by the 'word' column (which holds the counts), largest first
rawFrames = [dfadj, dfnoun, dfnounpl, dfverb]
outputrawdf = pd.concat(rawFrames)
outputrawdf = outputrawdf.sort_values(by=['word'],ascending=False)
outputrawdf = outputrawdf.reset_index(drop=True)
outputrawdf.to_csv('../output/WordFrequency_Unprocessed.csv',index=False,header=False)
outputrawdf.head()

In [21]:
#Output formatted frequency: one row per word (as the index), column 0
#holds its count; rows are ordered by count, largest first
countsByWord = pd.DataFrame.from_dict(scamDict, orient='index')
outputformatdf = countsByWord.sort_values(by=[0],ascending=False)
outputformatdf.to_csv('../output/WordFrequency_Processed.csv',header=False)
outputformatdf.head()