# Add libraries
---

In [1]:
import os
import nltk
import random
from nltk import word_tokenize
from nltk.collocations import *
import pandas as pd

In [2]:
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Load corpus datasets
---

In [None]:
!git clone https://github.com/pachocamacho1990/datasets
! unzip datasets/email/plaintext/corpus1.zip
! unzip datasets/email/plaintext/corpus2.zip
! unzip datasets/email/plaintext/corpus3.zip

In [4]:
# Get Text and labels from folders with plain text files
def get_text_labels_from_folders(folderBase, folderLabels):
  data = []
  labels = []

  for folderLabel in folderLabels:
    for file in os.listdir('{}/{}'.format(folderBase, folderLabel)):
      with open('{}/{}/{}'.format(folderBase, folderLabel, file), encoding='latin-1') as f:
        data.append(f.read())
        labels.append(folderLabel)

  return data, labels

In [5]:
def set_label_num(label_str):
  if label_str == "spam":
      return 1
  else:
      return 0
  

In [6]:
dataCorpus1, labelsCorpus1 = get_text_labels_from_folders('corpus1', ["spam", "ham"])
dataCorpus2, labelsCorpus2 = get_text_labels_from_folders('corpus2', ["spam", "ham"])
dataCorpus3, labelsCorpus3 = get_text_labels_from_folders('corpus3', ["spam", "ham"])

In [7]:
data = dataCorpus1 + dataCorpus2 + dataCorpus3
labels = labelsCorpus1 + labelsCorpus2 + labelsCorpus3

In [8]:
len(data)

16347

In [9]:
dataframe = pd.DataFrame({'text': data, 'labels': labels})

In [10]:
dataframe['tokens'] = dataframe['text'].apply(lambda x: word_tokenize(x))

In [11]:
dataframe['labels_num'] = dataframe['labels'].apply(lambda x: set_label_num(x))

# Collocations
---

In [12]:
spamCorpus1, _ = get_text_labels_from_folders('corpus1', ["spam"])
spamCorpus2, _ = get_text_labels_from_folders('corpus2', ["spam"])
spamCorpus3, _ = get_text_labels_from_folders('corpus3', ["spam"])

In [13]:
spamCorpuses = spamCorpus1 + spamCorpus2 + spamCorpus3

In [14]:
threshold = 3
bigram_measure = nltk.collocations.BigramAssocMeasures()
filtered_words = []
for text in spamCorpuses:
  filtered_words += [word for word in word_tokenize(text) if len(word) > threshold]

In [None]:
filtered_words

In [16]:
finder = BigramCollocationFinder.from_words(filtered_words)
finder.apply_freq_filter(120)
email_spam_collocations = finder.nbest(bigram_measure.pmi, 40)

In [None]:
email_spam_collocations

# Get document attributes
---

In [18]:
dataframe.head()

Unnamed: 0,text,labels,tokens,labels_num
0,Subject: dating service for nauuughty minded p...,spam,"[Subject, :, dating, service, for, nauuughty, ...",1
1,Subject: aylesbgry sgclude oisv msjgnkrlf\nhel...,spam,"[Subject, :, aylesbgry, sgclude, oisv, msjgnkr...",1
2,Subject: reverse aging ! new technology !\nrev...,spam,"[Subject, :, reverse, aging, !, new, technolog...",1
3,Subject: reduce monthly - payments\nyour appli...,spam,"[Subject, :, reduce, monthly, -, payments, you...",1
4,Subject: viewsonic airpanel vl 50 15 - inch sm...,spam,"[Subject, :, viewsonic, airpanel, vl, 50, 15, ...",1


In [19]:
def document_attributes(document):
  document_words = set(document)
  atrib = {}

  last_word = ""
  for word in document_words:
    atrib['contains({})'.format(word)] = (word in document_words)

    has_spam_word = False
    has_spam_collocation = False

    for bigram_position_0, bigram_position_1 in email_spam_collocations:
      if word == bigram_position_0 or word == bigram_position_1:
        has_spam_word = True
        if has_spam_collocation and has_spam_word:
          break

      if last_word != "" and ( last_word == bigram_position_0 and word == bigram_position_0 ):
        has_spam_collocation = True
        if has_spam_collocation and has_spam_word:
          break
    
    atrib['has_spam_word({})'.format(word)] = True if has_spam_word else False
    atrib['has_spam_collocation({})'.format(word)] = True if has_spam_collocation else False

    if (len(word) > 3):
      lastWord = word
    
  return atrib

In [20]:
dataframe.size

65388

In [21]:
print(dataframe['tokens'].values[0])
print()
print("===="*100)
print()
print(document_attributes(dataframe['tokens'].values[0]))

['Subject', ':', 'dating', 'service', 'for', 'nauuughty', 'minded', 'people', 'hi', '.', 'real', 'women', 'in', 'a', 'city', 'by', 'city', 'database', 'of', 'chics', 'that', 'like', 'to', 'cheaaatt', '.', 'some', 'like', 'submissive', ',', 'some', 'like', 'kinky', ',', 'some', 'like', 'to', 'be', 'forced', '.', 'check', 'them', 'out', '.', 'it', "'", 's', 'absolutely', 'nnno', '0', 'cost', 'to', 'try', 'it', 'and', 'to', 'peruse', 'the', 'database', 'for', 'free', '.', 'http', ':', '/', '/', 'dateeemywifxxp', '.', 'com', 'to', 'get', 'the', 'hell', 'out', 'of', 'this', 'stuff', 'go', 'to', 'here', '.', '.', '.', '.', 'http', ':', '/', '/', 'safebuyinghouse', '.', 'com', '/', 'tx']


{'contains(forced)': True, 'has_spam_word(forced)': False, 'has_spam_collocation(forced)': False, 'contains(be)': True, 'has_spam_word(be)': False, 'has_spam_collocation(be)': False, 'contains(real)': True, 'has_spam_word(real)': False, 'has_spam_collocation(real)': False, 'contains(here)': True, 'has_spam_

In [22]:
fset = [(document_attributes(text), labels) for text, labels in zip(dataframe['tokens'], dataframe['labels_num'].values)]
random.shuffle(fset)
print(len(fset))

16347


In [23]:
train, test = fset[:13078], fset[13078:]

In [24]:
classifier = nltk.NaiveBayesClassifier.train(train)

In [25]:
print(nltk.classify.accuracy(classifier, test))

0.9816457632303457


In [26]:
# Save Model
import pickle
f = open('email_spam_ham_classifier.pickle', 'wb')
pickle.dump(classifier, f)
f.close()

In [27]:
# Load model
import pickle
f = open('email_spam_ham_classifier.pickle', 'rb')
classifierLoaded = pickle.load(f)
f.close()

In [28]:
print(nltk.classify.accuracy(classifierLoaded, test))

0.9816457632303457
