# Add libraries
---

In [1]:
import os
import nltk
import random
from nltk import word_tokenize
from nltk.collocations import *
import pandas as pd

In [2]:
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Load corpus datasets
---

In [3]:
!git clone https://github.com/pachocamacho1990/datasets
! unzip datasets/email/plaintext/corpus1.zip
! unzip datasets/email/plaintext/corpus2.zip
! unzip datasets/email/plaintext/corpus3.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: corpus3/spam/2588.2005-01-07.BG.spam.txt  
  inflating: corpus3/spam/0616.2004-09-15.BG.spam.txt  
  inflating: corpus3/spam/2275.2004-12-22.BG.spam.txt  
  inflating: corpus3/spam/0687.2004-09-19.BG.spam.txt  
  inflating: corpus3/spam/1151.2004-10-21.BG.spam.txt  
  inflating: corpus3/spam/4203.2005-04-02.BG.spam.txt  
  inflating: corpus3/spam/4263.2005-04-06.BG.spam.txt  
  inflating: corpus3/spam/4731.2005-05-07.BG.spam.txt  
  inflating: corpus3/spam/3005.2005-01-25.BG.spam.txt  
  inflating: corpus3/spam/4744.2005-05-08.BG.spam.txt  
  inflating: corpus3/spam/1371.2004-11-02.BG.spam.txt  
  inflating: corpus3/spam/3408.2005-02-12.BG.spam.txt  
  inflating: corpus3/spam/4523.2005-04-24.BG.spam.txt  
  inflating: corpus3/spam/2991.2005-01-24.BG.spam.txt  
  inflating: corpus3/spam/4529.2005-04-24.BG.spam.txt  
  inflating: corpus3/spam/1376.2004-11-02.BG.spam.txt  
  inflating: corpus3/spam/2971.2005-01-

In [4]:
# Get Text and labels from folders with plain text files
def get_text_labels_from_folders(folderBase, folderLabels):
  data = []
  labels = []

  for folderLabel in folderLabels:
    for file in os.listdir('{}/{}'.format(folderBase, folderLabel)):
      with open('{}/{}/{}'.format(folderBase, folderLabel, file), encoding='latin-1') as f:
        data.append(f.read())
        labels.append(folderLabel)

  return data, labels

In [5]:
def set_label_num(label_str):
  if label_str == "spam":
      return 1
  else:
      return 0
  

In [6]:
dataCorpus1, labelsCorpus1 = get_text_labels_from_folders('corpus1', ["spam", "ham"])
dataCorpus2, labelsCorpus2 = get_text_labels_from_folders('corpus2', ["spam", "ham"])
dataCorpus3, labelsCorpus3 = get_text_labels_from_folders('corpus3', ["spam", "ham"])

In [7]:
data = dataCorpus1 + dataCorpus2 + dataCorpus3
labels = labelsCorpus1 + labelsCorpus2 + labelsCorpus3

In [8]:
len(data)

16347

In [9]:
dataframe = pd.DataFrame({'text': data, 'labels': labels})

In [10]:
dataframe = dataframe.sample(frac = 1) 

In [11]:
dataframe['tokens'] = dataframe['text'].apply(lambda x: word_tokenize(x))

In [12]:
dataframe['labels_num'] = dataframe['labels'].apply(lambda x: set_label_num(x))

# Get spam collocations and top words
---

In [13]:
def filter_words_by_threshold(text_tokenized, threshold = 3, ):
  words = []
  words = [word for word in text_tokenized if len(word) > threshold]
  return words

In [14]:
def get_n_grams_collocations_from_words(words, freq_filter = 10, n_best= 10,
                                       n_gran_measure = nltk.collocations.BigramAssocMeasures()):
  finder = BigramCollocationFinder.from_words(words)
  finder.apply_freq_filter(freq_filter)
  email_spam_collocations = finder.nbest(n_gran_measure.pmi, n_best)
  return email_spam_collocations

In [15]:
spamCorpus1, _ = get_text_labels_from_folders('corpus1', ["spam"])
spamCorpus2, _ = get_text_labels_from_folders('corpus2', ["spam"])
spamCorpus3, _ = get_text_labels_from_folders('corpus3', ["spam"])
spamCorpuses = spamCorpus1 + spamCorpus2 + spamCorpus3

In [16]:
filtered_words = []
for text in spamCorpuses:
  filtered_words += filter_words_by_threshold(word_tokenize(text))
filtered_words

['Subject',
 'dating',
 'service',
 'nauuughty',
 'minded',
 'people',
 'real',
 'women',
 'city',
 'city',
 'database',
 'chics',
 'that',
 'like',
 'cheaaatt',
 'some',
 'like',
 'submissive',
 'some',
 'like',
 'kinky',
 'some',
 'like',
 'forced',
 'check',
 'them',
 'absolutely',
 'nnno',
 'cost',
 'peruse',
 'database',
 'free',
 'http',
 'dateeemywifxxp',
 'hell',
 'this',
 'stuff',
 'here',
 'http',
 'safebuyinghouse',
 'Subject',
 'aylesbgry',
 'sgclude',
 'oisv',
 'msjgnkrlf',
 'hello',
 'generic',
 'super',
 'viagra',
 'cialis',
 'available',
 'online',
 'most',
 'trusted',
 'online',
 'source',
 'cialis',
 'super',
 'viag',
 'takes',
 'affect',
 'right',
 'away',
 'lasts',
 'hours',
 'super',
 'viagra',
 'click',
 'here',
 'generic',
 'viagra',
 'costs',
 'less',
 'save',
 'money',
 'viagra',
 'click',
 'here',
 'both',
 'products',
 'shipped',
 'discretely',
 'your',
 'door',
 'interested',
 'afwkny',
 'tfylaytgu',
 'usvqvkd',
 'gyxzc',
 'zyyqywp',
 'obhqrbqtmx',
 'thimy',

In [17]:
email_spam_collocations = get_n_grams_collocations_from_words(filtered_words, 120, 40)
email_spam_collocations

[('unnecessary', 'delays'),
 ('delays', 'complications'),
 ('therein', 'misleading'),
 ('verge', 'bankruptcy'),
 ('litigation', 'reform'),
 ('tras', 'expe'),
 ('ects', 'tras'),
 ('ural', 'ects'),
 ('reas', 'ural'),
 ('reform', '1995'),
 ('ppin', 'xual'),
 ('expe', 'rien'),
 ('rien', 'ppin'),
 ('xual', 'reas'),
 ('cubic', 'feet'),
 ('rights', 'reserved'),
 ('accuracy', 'completeness'),
 ('notes', 'contents'),
 ('poised', 'positioned'),
 ('foresee', 'expects'),
 ('charset', '8859'),
 ('double', 'claiming'),
 ('attached', 'ticket'),
 ('inherent', 'conflict'),
 ('broker', 'dealer'),
 ('respect', 'predictions'),
 ('drew', 'lucky'),
 ('normal', 'saave'),
 ('advises', 'readers'),
 ('avoid', 'unnecessary'),
 ('projects', 'foresee'),
 ('proved', 'reserves'),
 ('projecthoneypot', 'projecthoneypot'),
 ('believes', 'understands'),
 ('construed', 'kind'),
 ('aware', 'inherent'),
 ('nbsp', 'nbsp'),
 ('beliefs', 'plans'),
 ('perfect', 'timing'),
 ('press', 'releases')]

In [18]:
all_spam_words = nltk.FreqDist([w for w in filtered_words])
top_spam_words = all_spam_words.most_common(200)
top_spam_words

[('this', 20904),
 ('your', 20748),
 ('that', 12836),
 ('with', 12011),
 ('from', 10127),
 ('Subject', 9675),
 ('have', 8743),
 ('will', 7908),
 ('company', 6889),
 ('email', 4856),
 ('http', 4752),
 ('information', 4681),
 ('here', 4670),
 ('more', 4585),
 ('please', 4315),
 ('statements', 4020),
 ('business', 3604),
 ('money', 3443),
 ('time', 3345),
 ('only', 3140),
 ('mail', 2956),
 ('which', 2888),
 ('within', 2874),
 ('over', 2846),
 ('been', 2837),
 ('free', 2784),
 ('report', 2760),
 ('price', 2727),
 ('these', 2593),
 ('about', 2551),
 ('just', 2417),
 ('account', 2410),
 ('investment', 2314),
 ('click', 2271),
 ('make', 2265),
 ('other', 2237),
 ('they', 2181),
 ('their', 2166),
 ('future', 2156),
 ('software', 2126),
 ('securities', 2083),
 ('message', 2073),
 ('online', 2060),
 ('adobe', 2054),
 ('forward', 2044),
 ('best', 2019),
 ('like', 2008),
 ('stock', 2003),
 ('what', 1976),
 ('need', 1963),
 ('through', 1960),
 ('into', 1954),
 ('address', 1916),
 ('looking', 1892),

# Get ham collocations and top words
---

In [19]:
hamCorpus1, _ = get_text_labels_from_folders('corpus1', ["ham"])
hamCorpus2, _ = get_text_labels_from_folders('corpus2', ["ham"])
hamCorpus3, _ = get_text_labels_from_folders('corpus3', ["ham"])
hamCorpuses = hamCorpus1 + hamCorpus2 + hamCorpus3

In [20]:
filtered_words = []
for text in hamCorpuses:
  filtered_words += filter_words_by_threshold(word_tokenize(text))
filtered_words

['Subject',
 'enron',
 'actuals',
 'june',
 '2000',
 'teco',
 'iferc',
 'texoma',
 'iferc',
 'enron',
 'Subject',
 'meter',
 '6315',
 'purch',
 'from',
 'torch',
 'rally',
 'october',
 'daren',
 'show',
 'that',
 'extended',
 'purchase',
 'deal',
 'ticket',
 '461059',
 'cover',
 'flow',
 'first',
 'days',
 'however',
 'that',
 'measurement',
 'closed',
 'show',
 'that',
 'have',
 'flow',
 'this',
 'meter',
 'entire',
 'month',
 'with',
 'exception',
 'days',
 'middle',
 'please',
 'know',
 'want',
 'extend',
 'this',
 'deal',
 'should',
 'this',
 'strangers',
 'until',
 'determine',
 'what',
 'dreaded',
 'list',
 'from',
 'mgmt',
 'mary',
 'Subject',
 'management',
 'agreement',
 'daren',
 'will',
 'there',
 'ever',
 'need',
 'utilize',
 'transport',
 'supply',
 'plant',
 'other',
 'than',
 'through',
 'lone',
 'star',
 'agreements',
 'would',
 'using',
 'tenaska',
 'transport',
 'agreements',
 'Subject',
 'nomination',
 'tejas',
 'forwarded',
 'chokshi',
 'corp',
 'enron',
 '2000',
 '

In [21]:
email_ham_collocations = get_n_grams_collocations_from_words(filtered_words, 120, 40)
email_ham_collocations

[('donald', 'reinhardt'),
 ('cotton', 'valley'),
 ('anita', 'luong'),
 ('fuels', 'cotton'),
 ('fernley', 'dyson'),
 ('carlos', 'rodriguez'),
 ('brenda', 'herod'),
 ('howard', 'camp'),
 ('rita', 'wynne'),
 ('shona', 'wilson'),
 ('aimee', 'lannou'),
 ('vice', 'president'),
 ('julie', 'meyers'),
 ('melissa', 'graves'),
 ('george', 'weissman'),
 ('jackie', 'young'),
 ('vance', 'taylor'),
 ('gary', 'hanks'),
 ('michelle', 'lokay'),
 ('feel', 'free'),
 ('mike', 'jordan'),
 ('mary', 'solmonson'),
 ('north', 'america'),
 ('melissa', 'jones'),
 ('robert', 'lloyd'),
 ('robert', 'cotten'),
 ('soon', 'possible'),
 ('sally', 'beck'),
 ('north', 'american'),
 ('office', 'chairman'),
 ('original', 'message'),
 ('susan', 'smith'),
 ('texas', 'utilities'),
 ('brent', 'price'),
 ('make', 'sure'),
 ('daren', 'farmer'),
 ('business', 'unit'),
 ('look', 'forward'),
 ('attached', 'file'),
 ('entered', 'into')]

In [22]:
all_ham_words = nltk.FreqDist([w for w in filtered_words])
top_ham_words = all_ham_words.most_common(200)
top_ham_words

[('enron', 17055),
 ('this', 10319),
 ('will', 9232),
 ('that', 8695),
 ('have', 7486),
 ('with', 6924),
 ('Subject', 6672),
 ('2000', 6574),
 ('from', 6542),
 ('please', 5225),
 ('your', 5054),
 ('subject', 4344),
 ('2001', 3366),
 ('deal', 3192),
 ('thanks', 2923),
 ('corp', 2790),
 ('know', 2524),
 ('meter', 2509),
 ('would', 2482),
 ('need', 2279),
 ('been', 2094),
 ('mmbtu', 2045),
 ('forwarded', 1996),
 ('should', 1987),
 ('information', 1957),
 ('they', 1943),
 ('daren', 1910),
 ('energy', 1899),
 ('these', 1894),
 ('there', 1854),
 ('attached', 1798),
 ('time', 1798),
 ('houston', 1713),
 ('questions', 1667),
 ('which', 1615),
 ('business', 1605),
 ('what', 1601),
 ('also', 1583),
 ('about', 1569),
 ('their', 1512),
 ('more', 1490),
 ('call', 1481),
 ('into', 1462),
 ('message', 1407),
 ('meeting', 1358),
 ('sent', 1354),
 ('management', 1346),
 ('some', 1328),
 ('price', 1324),
 ('risk', 1309),
 ('contract', 1295),
 ('mail', 1290),
 ('power', 1229),
 ('following', 1224),
 ('ca

# Filter top words
---

In [23]:
top_ham_words_iterator = top_ham_words
for word in top_ham_words_iterator:
  if word in top_ham_words and word in top_spam_words: 
    top_ham_words.remove(word)
    top_spam_words.remove(word)

In [24]:
len(top_ham_words)

200

# Get document attributes
---

In [25]:
dataframe.head()

Unnamed: 0,text,labels,tokens,labels_num
1553,Subject: tenaska iv gas\nare you going to make...,ham,"[Subject, :, tenaska, iv, gas, are, you, going...",0
16223,Subject: enron in action 08 . 07 . 00\nare you...,ham,"[Subject, :, enron, in, action, 08, ., 07, ., ...",0
919,Subject: legal operating systems for a third o...,spam,"[Subject, :, legal, operating, systems, for, a...",1
10410,Subject: re [ 7 ] : question with your health\...,spam,"[Subject, :, re, [, 7, ], :, question, with, y...",1
12932,Subject: want to enlarge your penis up to 5 in...,spam,"[Subject, :, want, to, enlarge, your, penis, u...",1


In [26]:
filtered_words = []
for text in data:
  filtered_words += filter_words_by_threshold(word_tokenize(text))
filtered_words
all_words = nltk.FreqDist([w for w in filtered_words])
top_words = all_words.most_common(200)
top_words

[('this', 31223),
 ('your', 25802),
 ('that', 21531),
 ('with', 18935),
 ('will', 17140),
 ('enron', 17058),
 ('from', 16669),
 ('Subject', 16347),
 ('have', 16229),
 ('please', 9540),
 ('company', 7995),
 ('2000', 6918),
 ('information', 6638),
 ('more', 6075),
 ('email', 5601),
 ('here', 5591),
 ('http', 5502),
 ('subject', 5450),
 ('business', 5209),
 ('time', 5143),
 ('been', 4931),
 ('which', 4503),
 ('these', 4487),
 ('mail', 4246),
 ('need', 4242),
 ('would', 4214),
 ('only', 4133),
 ('they', 4124),
 ('about', 4120),
 ('statements', 4070),
 ('price', 4051),
 ('know', 3995),
 ('report', 3950),
 ('over', 3794),
 ('there', 3731),
 ('their', 3678),
 ('thanks', 3638),
 ('money', 3580),
 ('what', 3577),
 ('deal', 3540),
 ('2001', 3520),
 ('should', 3484),
 ('message', 3480),
 ('just', 3428),
 ('into', 3416),
 ('other', 3342),
 ('also', 3341),
 ('within', 3317),
 ('free', 3235),
 ('make', 3187),
 ('like', 3132),
 ('corp', 3055),
 ('through', 2876),
 ('forward', 2830),
 ('click', 2796),

In [27]:
def document_attributes(document):
  document_words = set(document)
  atrib = {}
  for word in top_words:
    atrib['contains({})'.format(word)] = (word in document_words)
  
  for word in top_spam_words:
    atrib['contains_spam_word({})'.format(word)] = (word in document_words)

  for word in top_ham_words:
    atrib['contains_ham_word({})'.format(word)] = (word in document_words)

  
  for word in document_words:
    has_spam_word = False
    has_ham_word = False

    for bigram_position_0, bigram_position_1 in email_spam_collocations:
        if word == bigram_position_0 or word == bigram_position_1:
          has_spam_word = True
          break

    for bigram_position_0, bigram_position_1 in email_ham_collocations:
        if word == bigram_position_0 or word == bigram_position_1:
          has_ham_word = True
          break

      
    atrib['spam_word({})'.format(word)] = has_spam_word
    atrib['ham_word({})'.format(word)] = has_ham_word
    
    
  filtered_words = filter_words_by_threshold(document)
  bigrams = get_n_grams_collocations_from_words(filtered_words, n_best=10, freq_filter=5)

  for i in range(len(bigrams)):
    atrib['bigram_collocation({})'.format(i)] = bigrams[i]
    
  return atrib

In [28]:
dataframe.size

65388

In [29]:
print(dataframe['tokens'].values[0])
print()
print("===="*100)
print()
print(document_attributes(dataframe['tokens'].values[3]))

['Subject', ':', 'tenaska', 'iv', 'gas', 'are', 'you', 'going', 'to', 'make', 'the', 'price', 'changes', 'to', 'the', 'tenaska', 'iv', 'sale', '384258', 'for', '10', '/', '00', 'and', '11', '/', '00', 'that', 'james', 'details', 'below', '?', 'right', 'now', ',', 'we', 'are', 'showing', 'a', 'short', 'pay', 'of', '$', '351', ',', '201', '.', '49', 'for', 'those', 'two', 'months', '.', 'megan', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', 'forwarded', 'by', 'megan', 'parker', '/', 'corp', '/', 'enron', 'on', '12', '/', '29', '/', '2000', '11', ':', '46', 'am', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', 'james', 'armstrong', '@', 'ect', '12', '/', '19', '/', '2000', '10', ':', '31', 'am', 'to', ':', 'daren', 'j', 'farmer', '/', 'hou', '/', 'ect', '@', 'ect', 'cc', ':', 'megan', 'parker', '/', 'corp', '/', 'enron', '@', 'enron', ',', 

In [30]:
fset = [(document_attributes(text), labels) for text, labels in zip(dataframe['tokens'], dataframe['labels_num'].values)]
random.shuffle(fset)
print(len(fset))

16347


In [31]:
train, test = fset[:13078], fset[13078:]

In [32]:
classifier = nltk.NaiveBayesClassifier.train(train)

In [33]:
print(nltk.classify.accuracy(classifier, test))

0.9828693790149893


In [34]:
# Save Model
import pickle
f = open('email_spam_ham_classifier.pickle', 'wb')
pickle.dump(classifier, f)
f.close()

In [35]:
# Load model
import pickle
f = open('email_spam_ham_classifier.pickle', 'rb')
classifierLoaded = pickle.load(f)
f.close()

In [36]:
print(nltk.classify.accuracy(classifierLoaded, test))

0.9828693790149893


In [37]:
email_classification = classifier.classify(document_attributes(word_tokenize("""Subject: New porn web site
Hey, checkout the new best porn site: http://scam.com""")))

print("Spam" if email_classification == 1 else "Ham")

Spam


In [38]:
email_classification = classifier.classify(document_attributes(word_tokenize("""Subject: Congratulations
Hey Iram, congratulations for your new job, so well deserved!!""")))

print("Spam" if email_classification == 1 else "Ham")

Ham


In [39]:
email_classification = classifier.classify(document_attributes(word_tokenize("""Subject: Important news
Hi I'm an advocate for Mr. Marciano Martinez, my client who owns a large company.
He died a year ago and never had children. 
You have the same last name, if you help me, I can give you half of the heredity valued at 32,382,321.00 dollars""")))

print("Spam" if email_classification == 1 else "Ham")

Spam
