# Add libraries
---

In [1]:
import os
import nltk
import random
from nltk import word_tokenize
from nltk.collocations import *
import pandas as pd

In [2]:
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Load corpus datasets
---

In [None]:
!git clone https://github.com/pachocamacho1990/datasets
! unzip datasets/email/plaintext/corpus1.zip
! unzip datasets/email/plaintext/corpus2.zip
! unzip datasets/email/plaintext/corpus3.zip

In [4]:
# Get Text and labels from folders with plain text files
def get_text_labels_from_folders(folderBase, folderLabels):
  data = []
  labels = []

  for folderLabel in folderLabels:
    for file in os.listdir('{}/{}'.format(folderBase, folderLabel)):
      with open('{}/{}/{}'.format(folderBase, folderLabel, file), encoding='latin-1') as f:
        data.append(f.read())
        labels.append(folderLabel)

  return data, labels

In [5]:
def set_label_num(label_str):
  if label_str == "spam":
      return 1
  else:
      return 0
  

In [6]:
dataCorpus1, labelsCorpus1 = get_text_labels_from_folders('corpus1', ["spam", "ham"])
dataCorpus2, labelsCorpus2 = get_text_labels_from_folders('corpus2', ["spam", "ham"])
dataCorpus3, labelsCorpus3 = get_text_labels_from_folders('corpus3', ["spam", "ham"])

In [7]:
data = dataCorpus1 + dataCorpus2 + dataCorpus3
labels = labelsCorpus1 + labelsCorpus2 + labelsCorpus3

In [8]:
len(data)

16347

In [9]:
dataframe = pd.DataFrame({'text': data, 'labels': labels})

In [10]:
dataframe = dataframe.sample(frac = 1) 

In [11]:
dataframe['tokens'] = dataframe['text'].apply(lambda x: word_tokenize(x))

In [12]:
dataframe['labels_num'] = dataframe['labels'].apply(lambda x: set_label_num(x))

# Get spam collocations and top words
---

In [13]:
def filter_words_by_threshold(text_tokenized, threshold = 3, ):
  words = []
  words = [word for word in text_tokenized if len(word) > threshold]
  return words

In [14]:
def get_n_grams_collocations_from_words(words, freq_filter = 10, n_best= 10,
                                       n_gran_measure = nltk.collocations.BigramAssocMeasures()):
  finder = BigramCollocationFinder.from_words(words)
  finder.apply_freq_filter(freq_filter)
  email_spam_collocations = finder.nbest(n_gran_measure.pmi, n_best)
  return email_spam_collocations

In [15]:
spamCorpus1, _ = get_text_labels_from_folders('corpus1', ["spam"])
spamCorpus2, _ = get_text_labels_from_folders('corpus2', ["spam"])
spamCorpus3, _ = get_text_labels_from_folders('corpus3', ["spam"])
spamCorpuses = spamCorpus1 + spamCorpus2 + spamCorpus3

In [None]:
filtered_words = []
for text in spamCorpuses:
  filtered_words += filter_words_by_threshold(word_tokenize(text))
filtered_words

In [None]:
email_spam_collocations = get_n_grams_collocations_from_words(filtered_words, 120, 40)
email_spam_collocations

In [None]:
all_spam_words = nltk.FreqDist([w for w in filtered_words])
top_spam_words = all_spam_words.most_common(200)
top_spam_words

# Get ham collocations and top words
---

In [19]:
hamCorpus1, _ = get_text_labels_from_folders('corpus1', ["ham"])
hamCorpus2, _ = get_text_labels_from_folders('corpus2', ["ham"])
hamCorpus3, _ = get_text_labels_from_folders('corpus3', ["ham"])
hamCorpuses = hamCorpus1 + hamCorpus2 + hamCorpus3

In [None]:
filtered_words = []
for text in hamCorpuses:
  filtered_words += filter_words_by_threshold(word_tokenize(text))
filtered_words

In [None]:
email_ham_collocations = get_n_grams_collocations_from_words(filtered_words, 120, 40)
email_ham_collocations

In [None]:
all_ham_words = nltk.FreqDist([w for w in filtered_words])
top_ham_words = all_ham_words.most_common(200)
top_ham_words

# Filter top words
---

In [23]:
top_ham_words_iterator = top_ham_words
for word in top_ham_words_iterator:
  if word in top_ham_words and word in top_spam_words: 
    top_ham_words.remove(word)
    top_spam_words.remove(word)

In [24]:
len(top_ham_words)

200

# Get document attributes
---

In [25]:
dataframe.head()

Unnamed: 0,text,labels,tokens,labels_num
1553,Subject: tenaska iv gas\nare you going to make...,ham,"[Subject, :, tenaska, iv, gas, are, you, going...",0
16223,Subject: enron in action 08 . 07 . 00\nare you...,ham,"[Subject, :, enron, in, action, 08, ., 07, ., ...",0
919,Subject: legal operating systems for a third o...,spam,"[Subject, :, legal, operating, systems, for, a...",1
10410,Subject: re [ 7 ] : question with your health\...,spam,"[Subject, :, re, [, 7, ], :, question, with, y...",1
12932,Subject: want to enlarge your penis up to 5 in...,spam,"[Subject, :, want, to, enlarge, your, penis, u...",1


In [None]:
filtered_words = []
for text in data:
  filtered_words += filter_words_by_threshold(word_tokenize(text))
filtered_words
all_words = nltk.FreqDist([w for w in filtered_words])
top_words = all_words.most_common(200)
top_words

In [27]:
def document_attributes(document):
  document_words = set(document)
  atrib = {}
  for word in top_words:
    atrib['contains({})'.format(word)] = (word in document_words)
  
  for word in top_spam_words:
    atrib['contains_spam_word({})'.format(word)] = (word in document_words)

  for word in top_ham_words:
    atrib['contains_ham_word({})'.format(word)] = (word in document_words)

  
  for word in document_words:
    has_spam_word = False
    has_ham_word = False

    for bigram_position_0, bigram_position_1 in email_spam_collocations:
        if word == bigram_position_0 or word == bigram_position_1:
          has_spam_word = True
          break

    for bigram_position_0, bigram_position_1 in email_ham_collocations:
        if word == bigram_position_0 or word == bigram_position_1:
          has_ham_word = True
          break

      
    atrib['spam_word({})'.format(word)] = has_spam_word
    atrib['ham_word({})'.format(word)] = has_ham_word
    
    
  filtered_words = filter_words_by_threshold(document)
  bigrams = get_n_grams_collocations_from_words(filtered_words, n_best=10, freq_filter=5)

  for i in range(len(bigrams)):
    atrib['bigram_collocation({})'.format(i)] = bigrams[i]
    
  return atrib

In [28]:
dataframe.size

65388

In [None]:
print(dataframe['tokens'].values[0])
print()
print("===="*100)
print()
print(document_attributes(dataframe['tokens'].values[3]))

In [30]:
fset = [(document_attributes(text), labels) for text, labels in zip(dataframe['tokens'], dataframe['labels_num'].values)]
random.shuffle(fset)
print(len(fset))

16347


In [31]:
train, test = fset[:13078], fset[13078:]

In [32]:
classifier = nltk.NaiveBayesClassifier.train(train)

In [33]:
print(nltk.classify.accuracy(classifier, test))

0.9828693790149893


In [34]:
# Save Model
import pickle
f = open('email_spam_ham_classifier.pickle', 'wb')
pickle.dump(classifier, f)
f.close()

In [35]:
# Load model
import pickle
f = open('email_spam_ham_classifier.pickle', 'rb')
classifierLoaded = pickle.load(f)
f.close()

In [36]:
print(nltk.classify.accuracy(classifierLoaded, test))

0.9828693790149893


In [37]:
email_classification = classifier.classify(document_attributes(word_tokenize("""Subject: New porn web site
Hey, checkout the new best porn site: http://scam.com""")))

print("Spam" if email_classification == 1 else "Ham")

Spam


In [38]:
email_classification = classifier.classify(document_attributes(word_tokenize("""Subject: Congratulations
Hey Iram, congratulations for your new job, so well deserved!!""")))

print("Spam" if email_classification == 1 else "Ham")

Ham


In [39]:
email_classification = classifier.classify(document_attributes(word_tokenize("""Subject: Important news
Hi I'm an advocate for Mr. Marciano Martinez, my client who owns a large company.
He died a year ago and never had children. 
You have the same last name, if you help me, I can give you half of the heredity valued at 32,382,321.00 dollars""")))

print("Spam" if email_classification == 1 else "Ham")

Spam
