# Add libraries
---

In [1]:
import os
import nltk
import random
from nltk import word_tokenize
from nltk.collocations import *
import pandas as pd

In [2]:
# Download NLTK's "punkt" tokenizer package (presumably what word_tokenize needs below;
# NOTE(review): newer NLTK releases may expect "punkt_tab" instead — confirm for the installed version).
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Load corpus datasets
---

In [None]:
!git clone https://github.com/pachocamacho1990/datasets
! unzip datasets/email/plaintext/corpus1.zip
! unzip datasets/email/plaintext/corpus2.zip
! unzip datasets/email/plaintext/corpus3.zip

In [4]:
# Get Text and labels from folders with plain text files
def get_text_labels_from_folders(folderBase, folderLabels):
  """Read every plain-text file under ``folderBase/<label>/`` and label it.

  Args:
    folderBase: Directory that contains one sub-folder per label.
    folderLabels: Iterable of sub-folder names; each name doubles as the
      label assigned to the files found inside it.

  Returns:
    A ``(data, labels)`` tuple of parallel lists: the decoded file contents
    and the folder label each file came from. Files are visited label by
    label, in sorted file-name order.

  Raises:
    FileNotFoundError: If ``folderBase/<label>`` does not exist.
  """
  data = []
  labels = []

  for folderLabel in folderLabels:
    label_dir = os.path.join(folderBase, folderLabel)
    # os.listdir returns entries in arbitrary order; sorting makes the
    # result reproducible across runs and machines.
    for file_name in sorted(os.listdir(label_dir)):
      file_path = os.path.join(label_dir, file_name)
      # Skip nested directories instead of crashing inside open().
      if not os.path.isfile(file_path):
        continue
      # latin-1 maps every byte to a character, so decoding cannot fail.
      with open(file_path, encoding='latin-1') as f:
        data.append(f.read())
        labels.append(folderLabel)

  return data, labels

In [5]:
def set_label_num(label_str):
  """Map a text label to its numeric class: 1 for "spam", 0 for anything else."""
  return 1 if label_str == "spam" else 0
  

In [6]:
# Read the spam and ham folders of each corpus; every call yields a (texts, labels) pair.
(dataCorpus1, labelsCorpus1), (dataCorpus2, labelsCorpus2), (dataCorpus3, labelsCorpus3) = [
    get_text_labels_from_folders(corpus_dir, ["spam", "ham"])
    for corpus_dir in ('corpus1', 'corpus2', 'corpus3')
]

In [7]:
# Merge the three corpora into one list of texts and a parallel list of labels.
data = [*dataCorpus1, *dataCorpus2, *dataCorpus3]
labels = [*labelsCorpus1, *labelsCorpus2, *labelsCorpus3]

In [8]:
# Number of e-mails loaded across the three corpora.
len(data)

16347

In [9]:
# One row per e-mail: the raw text plus the folder label it was read from.
dataframe = pd.DataFrame({'text': data, 'labels': labels})

In [10]:
# Shuffle the rows. A fixed random_state keeps the order identical on every run.
dataframe = dataframe.sample(frac=1, random_state=42)

In [11]:
# Tokenize each e-mail; word_tokenize is passed directly since it already takes one text.
dataframe['tokens'] = dataframe['text'].apply(word_tokenize)

In [12]:
# Numeric target column: 1 for "spam", 0 otherwise.
dataframe['labels_num'] = dataframe['labels'].apply(set_label_num)

# Collocations
---

In [13]:
def filter_words_by_threshold(text_tokenized, threshold=3):
  """Keep only the tokens whose length is strictly greater than ``threshold``.

  Args:
    text_tokenized: Iterable of tokens (anything supporting ``len``).
    threshold: Length cutoff; tokens with ``len(token) <= threshold`` are dropped.

  Returns:
    A new list with the surviving tokens in their original order.
  """
  return list(filter(lambda token: len(token) > threshold, text_tokenized))

In [14]:
def get_n_grams_collocations_from_words(words, freq_filter = 10, n_best= 10,
                                       n_gran_measure = nltk.collocations.BigramAssocMeasures()):
  """Return the top-scoring bigram collocations found in ``words``.

  Args:
    words: Sequence of tokens handed to ``BigramCollocationFinder.from_words``.
    freq_filter: Value passed to the finder's ``apply_freq_filter`` before ranking.
    n_best: How many bigrams to request from ``nbest``.
    n_gran_measure: Object exposing a ``pmi`` scoring attribute; the default
      ``BigramAssocMeasures`` instance is created once, when the function is defined.

  Returns:
    The list produced by ``finder.nbest(n_gran_measure.pmi, n_best)``.
  """
  bigram_finder = BigramCollocationFinder.from_words(words)
  bigram_finder.apply_freq_filter(freq_filter)
  return bigram_finder.nbest(n_gran_measure.pmi, n_best)

In [15]:
# Read only the spam folder of each corpus; the returned labels are discarded into `_`.
(spamCorpus1, _), (spamCorpus2, _), (spamCorpus3, _) = [
    get_text_labels_from_folders(corpus_dir, ["spam"])
    for corpus_dir in ('corpus1', 'corpus2', 'corpus3')
]
spamCorpuses = [*spamCorpus1, *spamCorpus2, *spamCorpus3]

In [None]:
# Flatten every spam e-mail into one list of tokens that pass the default length filter.
filtered_words = [
    word
    for text in spamCorpuses
    for word in filter_words_by_threshold(word_tokenize(text))
]
# Show a short preview only: rendering the full token list would flood the notebook.
filtered_words[:20]

In [None]:
# Reference spam bigrams: 40 best by PMI, with 120 passed as the finder's frequency filter.
email_spam_collocations = get_n_grams_collocations_from_words(
    filtered_words, freq_filter=120, n_best=40
)
email_spam_collocations

# Get document attributes
---

In [18]:
# Preview the first rows (text, labels, tokens, labels_num) before building features.
dataframe.head()

Unnamed: 0,text,labels,tokens,labels_num
4256,Subject: re : tglo status\ncost centers 11814 ...,ham,"[Subject, :, re, :, tglo, status, cost, center...",0
7988,Subject: sitescooper : scoop websites onto you...,spam,"[Subject, :, sitescooper, :, scoop, websites, ...",1
7254,Subject: you were accepted . here is your mone...,spam,"[Subject, :, you, were, accepted, ., here, is,...",1
15518,Subject: re : tw ios posting privileged & conf...,ham,"[Subject, :, re, :, tw, ios, posting, privileg...",0
14486,Subject: cheapest drug & s on the net . . . gu...,spam,"[Subject, :, cheapest, drug, &, s, on, the, ne...",1


In [19]:
def document_attributes(document, spam_collocations=None):
  """Build the feature dictionary used to classify one tokenized e-mail.

  Features produced:
    * ``contains(<word>)``: ``True`` for every distinct token.
    * ``has_spam_word(<word>)``: whether the token appears in any of the
      reference spam bigrams.
    * ``bigram_collocation(<i>)``: the i-th bigram returned by
      ``get_n_grams_collocations_from_words`` for this document's own tokens.

  Args:
    document: Sequence of tokens for one e-mail.
    spam_collocations: Optional iterable of ``(word, word)`` bigrams used as
      the spam reference. When omitted, the notebook-level
      ``email_spam_collocations`` is used.

  Returns:
    dict mapping feature name to feature value.
  """
  if spam_collocations is None:
    spam_collocations = email_spam_collocations

  # Flatten the reference bigrams once so each lookup is a set membership
  # test instead of a scan over every bigram for every word.
  spam_words = {member for bigram in spam_collocations for member in bigram}

  atrib = {}
  for word in set(document):
    # Every word iterated here is, by construction, present in the document.
    atrib['contains({})'.format(word)] = True
    atrib['has_spam_word({})'.format(word)] = word in spam_words

  filtered_words = filter_words_by_threshold(document)
  bigrams = get_n_grams_collocations_from_words(filtered_words, n_best=10, freq_filter=5)

  for i, bigram in enumerate(bigrams):
    atrib['bigram_collocation({})'.format(i)] = bigram

  return atrib

In [20]:
# .size is the total number of cells (rows x columns), not the row count; use len(dataframe) for rows.
dataframe.size

65388

In [21]:
# Inspect the first document: its raw tokens, a separator line, then the features built from them.
print(dataframe['tokens'].values[0])
print()
print("===="*100)
print()
print(document_attributes(dataframe['tokens'].values[0]))

['Subject', ':', 're', ':', 'tglo', 'status', 'cost', 'centers', '11814', 'and', '27117', 'are', 'rolling', 'to', 'texas', 'gas', 'trading', '.', '-', '-', '-', '-', '-', 'original', 'message', '-', '-', '-', '-', '-', 'from', ':', 'coffey', 'jr', '.', ',', 'jim', 'sent', ':', 'tuesday', ',', 'october', '30', ',', '2001', '10', ':', '05', 'am', 'to', ':', 'parker', ',', 'megan', ';', 'boyt', ',', 'eric', ';', 'farmer', ',', 'daren', 'j', '.', 'cc', ':', 'bryan', ',', 'gary', ';', 'pond', ',', 'jim', ';', 'helton', ',', 'susan', ';', 'east', ',', 'laynie', ';', 'roberts', ',', 'linda', 'subject', ':', 're', ':', 'tglo', 'status', 'megan', 'as', 'i', 'mentioned', 'to', 'you', 'yesterday', ',', 'it', 'still', 'looks', 'like', 'the', 'volumes', 'for', 'gas', 'delivered', 'to', 'tx', 'glo', 'do', 'not', 'have', 'linda', 'robert', "'", 's', 'origination', 'fee', 'deducted', '.', 'do', 'we', 'need', 'to', 'execute', 'a', 'pma', 'with', 'txglo', 'to', 'deduct', 'those', 'volumes', 'from', 'the

In [22]:
# Pair each document's feature dict with its numeric label (1 = spam, 0 = ham).
fset = [
    (document_attributes(tokens), label)
    for tokens, label in zip(dataframe['tokens'], dataframe['labels_num'].values)
]
# Shuffle with a dedicated seeded generator so the train/test split below
# (and therefore the reported accuracy) is reproducible across runs.
random.Random(42).shuffle(fset)
print(len(fset))

16347


In [23]:
# Use the first 80% for training and hold out the rest for testing. Deriving the
# cut from len(fset) keeps the split valid if the corpus size changes
# (for 16347 documents this yields the same index as before, 13078).
TRAIN_FRACTION = 0.8
split_index = round(len(fset) * TRAIN_FRACTION)
train, test = fset[:split_index], fset[split_index:]

In [24]:
# Train NLTK's Naive Bayes classifier on the (features, label) training pairs.
classifier = nltk.NaiveBayesClassifier.train(train)

In [25]:
# Accuracy of the trained classifier on the held-out test slice.
print(nltk.classify.accuracy(classifier, test))

0.9798103395533803


In [26]:
# Save Model
import pickle
# The context manager closes the file even if pickling raises.
with open('email_spam_ham_classifier.pickle', 'wb') as f:
  pickle.dump(classifier, f)

In [27]:
# Load model
import pickle
# NOTE: pickle.load can execute arbitrary code, so only load a file this notebook wrote itself.
# The context manager closes the file even if unpickling raises.
with open('email_spam_ham_classifier.pickle', 'rb') as f:
  classifierLoaded = pickle.load(f)

In [28]:
# Round-trip check: the reloaded classifier is scored on the same test set as the in-memory one.
print(nltk.classify.accuracy(classifierLoaded, test))

0.9798103395533803
