# Spam Detector

In [1]:
# for filesystem access
import os
# for Unix filename pattern matching
import fnmatch
# for data analysis
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
# for natural language processing
import nltk.tokenize as tokenizer
from nltk import pos_tag, word_tokenize, sent_tokenize
# for regular expression operations
import re
# for zipping lists
from functools import reduce
# for computing word syllables
import pyphen
# for spelling mistakes
import enchant
# for utilities
import numpy

In [2]:
LINGSPAM_BARE_DATASET_PATH = "datasources/lingspam/bare"
SPAM_TERM_LIST_PATH = "datasources/wcling/spam-term-list.txt"

In [3]:
documents = []
labels = []

In [4]:
def is_spam_file_name(file_name):
    return fnmatch.fnmatchcase(file_name, 'spmsg*')

## Reading and Preprocessing Data

### Read all the emails in the ten folders & save the labels (spam/not spam, or 0/1) of each email to a list

In [5]:
for root, dirs, file_names in os.walk(LINGSPAM_BARE_DATASET_PATH):
    for file_name in fnmatch.filter(file_names, '*.txt'):
        with open(os.path.join(root, file_name), 'r') as file:
            documents.append(file.read())
            labels.append(1 if is_spam_file_name(file_name) else 0)

In [6]:
documents_length = len(documents)

if documents_length > 0:
    print("✅ Read %i documents" % len(documents))
else:
    print("❌ Could not read any documents")

✅ Read 2893 documents


### Split the emails & labels into 80% training & 20% testing

In [7]:
training_documents_count = round(documents_length * 0.8)

training_documents = documents[:training_documents_count]
training_labels = labels[:training_documents_count]

testing_documents = documents[training_documents_count:]
testing_labels = labels[training_documents_count:]

### Fit and transform the training emails & transform the testing emails using a CountVectorizer

In [8]:
count_vectorizer = CountVectorizer()
count_vectorizer.fit(training_documents)

training_document_term_matrix = count_vectorizer.transform(training_documents)
testing_document_term_matrix = count_vectorizer.transform(testing_documents)

## Scikit-Learn Classifiers

### Multinomial Naive Bayes

In [9]:
naive_bayes_classifier = MultinomialNB(alpha=1)
naive_bayes_classifier.fit(training_document_term_matrix, training_labels)

naive_bayes_classifier_predictions = naive_bayes_classifier.predict(testing_document_term_matrix)

naive_bayes_classifier_precision_score = metrics.precision_score(testing_labels, naive_bayes_classifier_predictions, average='macro')
naive_bayes_classifier_recall_score = metrics.recall_score(testing_labels, naive_bayes_classifier_predictions, average='macro')
naive_bayes_classifier_f1_score = metrics.f1_score(testing_labels, naive_bayes_classifier_predictions, average='macro')

print("🔎 Naive Bayes classifier precision score: %f" % naive_bayes_classifier_precision_score)
print("🔎 Naive Bayes classifier recall score: %f" % naive_bayes_classifier_recall_score)
print("🔎 Naive Bayes classifier f-score: %f" % naive_bayes_classifier_f1_score)

🔎 Naive Bayes classifier precision score: 0.975248
🔎 Naive Bayes classifier recall score: 0.994824
🔎 Naive Bayes classifier f-score: 0.984708


### K Neighbors Classifier

In [10]:
kneighbors_classifier = KNeighborsClassifier(n_neighbors=3)
kneighbors_classifier.fit(training_document_term_matrix, training_labels)

kneighbors_classifier_predictions = kneighbors_classifier.predict(testing_document_term_matrix)

kneighbors_classifier_precision_score = metrics.precision_score(testing_labels, kneighbors_classifier_predictions, average='macro')
kneighbors_classifier_recall_score = metrics.recall_score(testing_labels, kneighbors_classifier_predictions, average='macro')
kneighbors_classifier_f1_score = metrics.f1_score(testing_labels, kneighbors_classifier_predictions, average='macro')

print("🔎 K Neighbors classifier precision score: %f" % kneighbors_classifier_precision_score)
print("🔎 K Neighbors classifier recall score: %f" % kneighbors_classifier_recall_score)
print("🔎 K Neighbors classifier f-score: %f" % kneighbors_classifier_f1_score)

🔎 K Neighbors classifier precision score: 0.912112
🔎 K Neighbors classifier recall score: 0.955325
🔎 K Neighbors classifier f-score: 0.931835


### Random Forest Classifier

In [11]:
random_forest_classifier = RandomForestClassifier(random_state=0)
random_forest_classifier.fit(training_document_term_matrix, training_labels)

random_forest_classifier_predictions = random_forest_classifier.predict(testing_document_term_matrix)

random_forest_classifier_precision_score = metrics.precision_score(testing_labels, random_forest_classifier_predictions, average='macro')
random_forest_classifier_recall_score = metrics.recall_score(testing_labels, random_forest_classifier_predictions, average='macro')
random_forest_classifier_f1_score = metrics.f1_score(testing_labels, random_forest_classifier_predictions, average='macro')

print("🔎 Random Forest classifier precision score: %f" % random_forest_classifier_precision_score)
print("🔎 Random Forest classifier recall score: %f" % random_forest_classifier_recall_score)
print("🔎 Random Forest classifier f-score: %f" % random_forest_classifier_f1_score)

🔎 Random Forest classifier precision score: 0.970471
🔎 Random Forest classifier recall score: 0.879173
🔎 Random Forest classifier f-score: 0.917266


## Classifying using Readability Features

### Preprocessing

In [12]:
documents_word_tokenized = [word_tokenize(document) for document in documents]
documents_sentence_tokenized =  [sent_tokenize(document) for document in documents]
documents_word_tagged = [pos_tag(document_word_tokenized) for document_word_tokenized in documents_word_tokenized]

spam_term_list = list(filter(lambda x: x, open(SPAM_TERM_LIST_PATH, "r").read().split('\n')))
spam_term_list = [spam_sentence.lower() for spam_sentence in spam_term_list]                  

dictionary = pyphen.Pyphen(lang='en_GB')
documents_syllabafied = list(map(lambda document_word_tokenized: list(map(lambda word: len(dictionary.inserted(word).split('-')), document_word_tokenized)), documents_word_tokenized))

spelling_dictionary = enchant.Dict('en_US')

#### F1: The number of sentences in an email

In [13]:
f1 = [len(document_sentence_tokenized) for document_sentence_tokenized in documents_sentence_tokenized]

In [14]:
print("🔎 Number of sentences of 1st 10 emails: %s" % f1[:10])
not spelling_dictionary.check('hello')

🔎 Number of sentences of 1st 10 emails: [22, 35, 19, 4, 12, 58, 31, 11, 14, 18]


False

#### F2: The number of verbs in an email

In [15]:
f2 = [len(list(filter(lambda word_tagged: word_tagged[1] == 'VB', document_word_tagged))) for document_word_tagged in documents_word_tagged]

In [16]:
print("🔎 Number of verbs in 1st 10 emails: %s" % f2[:10])

🔎 Number of verbs in 1st 10 emails: [6, 4, 10, 3, 10, 30, 11, 8, 16, 3]


#### F3: The number of words containing both numeric and alphabetical characters

In [17]:
f3 = [len(list(filter(lambda word: re.search(r"[a-zA-Z]+", word) and re.search(r"[1-9]+", word), document_word_tokenized))) for document_word_tokenized in documents_word_tokenized]

In [18]:
print("🔎 Number of words in 1st 10 emails containing alphabetic & numeric characters: %s" % f3[:10])

🔎 Number of words in 1st 10 emails containing alphabetic & numeric characters: [0, 0, 0, 0, 0, 1, 2, 1, 0, 2]


#### F4: The number of words in an email that are found in the spam list

In [19]:
f4 = [reduce(lambda accumulative, spam_term: accumulative + len(re.findall(r"\b" + spam_term + r"\b", document.lower())), spam_term_list, 0) for document in documents]

In [20]:
print("🔎 Number of words in 1st 10 emails that are found in the spam list: %s" % f4[:10])

🔎 Number of words in 1st 10 emails that are found in the spam list: [3, 2, 3, 2, 4, 8, 4, 3, 7, 2]


#### F5: The number of words in an email that have more than 3 syllables

In [21]:
f5 = list(map(lambda counts: len(counts), [list(filter(lambda count: count > 3, document_syllabafied)) for document_syllabafied in documents_syllabafied]))

In [22]:
print("🔎 Number of words in 1st 10 emails that have more than 3 syllables: %s" % f5[:10])

🔎 Number of words in 1st 10 emails that have more than 3 syllables: [30, 11, 5, 1, 4, 35, 10, 4, 4, 13]


#### F6: The average number of syllables of words in an email

In [23]:
f6 = list(map(lambda counts: numpy.mean(counts), documents_syllabafied))

In [24]:
print("🔎 Average number of syllables of words: %s" % f6[:10])

🔎 Average number of syllables of words: [1.6418918918918919, 1.5289017341040463, 1.4152249134948096, 1.3650793650793651, 1.2658227848101267, 1.4833524684270953, 1.4852941176470589, 1.5030674846625767, 1.288, 1.5103448275862068]


#### F7: The number of spelling mistakes in an email

In [25]:
f7 = [len(list(filter(lambda word: not spelling_dictionary.check(word), document_word_tokenized))) for document_word_tokenized in documents_word_tokenized]

In [26]:
print("🔎 Number of misspelled words: %s" % f7[:10])

🔎 Number of misspelled words: [252, 97, 67, 6, 63, 262, 83, 56, 35, 100]


#### Build a feature matrix (list of lists)

In [27]:
feature_matrix = [[f1[i], f2[i], f3[i], f4[i], f5[i], f6[i], f7[i]] for i in range(len(documents))]
print("🔎 Feature matrix of 1st document: %s" % feature_matrix[0])

🔎 Feature matrix of 1st document: [22, 6, 0, 3, 30, 1.6418918918918919, 252]


##### Feed the feature matrix and the labels to any of the sklearn classifiers

In [28]:
## seperate feature matrix to training and test sets
feature_train = feature_matrix[:training_documents_count]
feature_test = feature_matrix[training_documents_count:]

## feed into a classifier
naive_bayes_classifier = MultinomialNB(alpha=1)
naive_bayes_classifier.fit(feature_train, training_labels)

naive_bayes_classifier_predictions = naive_bayes_classifier.predict(feature_test)

naive_bayes_classifier_precision_score = metrics.precision_score(testing_labels, naive_bayes_classifier_predictions, average='macro')
naive_bayes_classifier_recall_score = metrics.recall_score(testing_labels, naive_bayes_classifier_predictions, average='macro')
naive_bayes_classifier_f1_score = metrics.f1_score(testing_labels, naive_bayes_classifier_predictions, average='macro')

print("🔎 Naive Bayes classifier precision score: %f" % naive_bayes_classifier_precision_score)
print("🔎 Naive Bayes classifier recall score: %f" % naive_bayes_classifier_recall_score)
print("🔎 Naive Bayes classifier f-score: %f" % naive_bayes_classifier_f1_score)

🔎 Naive Bayes classifier precision score: 0.834477
🔎 Naive Bayes classifier recall score: 0.877297
🔎 Naive Bayes classifier f-score: 0.853492


In [29]:
kneighbors_classifier = KNeighborsClassifier(n_neighbors=3)
kneighbors_classifier.fit(feature_train, training_labels)

kneighbors_classifier_predictions = kneighbors_classifier.predict(feature_test)

kneighbors_classifier_precision_score = metrics.precision_score(testing_labels, kneighbors_classifier_predictions, average='macro')
kneighbors_classifier_recall_score = metrics.recall_score(testing_labels, kneighbors_classifier_predictions, average='macro')
kneighbors_classifier_f1_score = metrics.f1_score(testing_labels, kneighbors_classifier_predictions, average='macro')

print("🔎 K Neighbors classifier precision score: %f" % kneighbors_classifier_precision_score)
print("🔎 K Neighbors classifier recall score: %f" % kneighbors_classifier_recall_score)
print("🔎 K Neighbors classifier f-score: %f" % kneighbors_classifier_f1_score)

🔎 K Neighbors classifier precision score: 0.902082
🔎 K Neighbors classifier recall score: 0.774004
🔎 K Neighbors classifier f-score: 0.819474


In [30]:
random_forest_classifier = RandomForestClassifier(random_state=0)
random_forest_classifier.fit(feature_train, training_labels)

random_forest_classifier_predictions = random_forest_classifier.predict(feature_test)

random_forest_classifier_precision_score = metrics.precision_score(testing_labels, random_forest_classifier_predictions, average='macro')
random_forest_classifier_recall_score = metrics.recall_score(testing_labels, random_forest_classifier_predictions, average='macro')
random_forest_classifier_f1_score = metrics.f1_score(testing_labels, random_forest_classifier_predictions, average='macro')

print("🔎 Random Forest classifier precision score: %f" % random_forest_classifier_precision_score)
print("🔎 Random Forest classifier recall score: %f" % random_forest_classifier_recall_score)
print("🔎 Random Forest classifier f-score: %f" % random_forest_classifier_f1_score)

🔎 Random Forest classifier precision score: 0.942254
🔎 Random Forest classifier recall score: 0.864616
🔎 Random Forest classifier f-score: 0.897578


#### Bonus: X9: TF-IDF

In [31]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(training_documents)

training_tfidf = tfidf_vectorizer.transform(training_documents)
testing_tfidf = tfidf_vectorizer.transform(testing_documents)

In [32]:
naive_bayes_classifier = MultinomialNB(alpha=1)
naive_bayes_classifier.fit(training_tfidf, training_labels)

naive_bayes_classifier_predictions = naive_bayes_classifier.predict(testing_tfidf)

naive_bayes_classifier_precision_score = metrics.precision_score(testing_labels, naive_bayes_classifier_predictions, average='macro')
naive_bayes_classifier_recall_score = metrics.recall_score(testing_labels, naive_bayes_classifier_predictions, average='macro')
naive_bayes_classifier_f1_score = metrics.f1_score(testing_labels, naive_bayes_classifier_predictions, average='macro')

print("🔎 Naive Bayes classifier precision score: %f" % naive_bayes_classifier_precision_score)
print("🔎 Naive Bayes classifier recall score: %f" % naive_bayes_classifier_recall_score)
print("🔎 Naive Bayes classifier f-score: %f" % naive_bayes_classifier_f1_score)

🔎 Naive Bayes classifier precision score: 0.920732
🔎 Naive Bayes classifier recall score: 0.526042
🔎 Naive Bayes classifier f-score: 0.506459


In [33]:
kneighbors_classifier = KNeighborsClassifier(n_neighbors=3)
kneighbors_classifier.fit(training_tfidf, training_labels)

kneighbors_classifier_predictions = kneighbors_classifier.predict(testing_tfidf)

kneighbors_classifier_precision_score = metrics.precision_score(testing_labels, kneighbors_classifier_predictions, average='macro')
kneighbors_classifier_recall_score = metrics.recall_score(testing_labels, kneighbors_classifier_predictions, average='macro')
kneighbors_classifier_f1_score = metrics.f1_score(testing_labels, kneighbors_classifier_predictions, average='macro')

print("🔎 K Neighbors classifier precision score: %f" % kneighbors_classifier_precision_score)
print("🔎 K Neighbors classifier recall score: %f" % kneighbors_classifier_recall_score)
print("🔎 K Neighbors classifier f-score: %f" % kneighbors_classifier_f1_score)

🔎 K Neighbors classifier precision score: 0.942866
🔎 K Neighbors classifier recall score: 0.982369
🔎 K Neighbors classifier f-score: 0.961174


In [34]:
random_forest_classifier = RandomForestClassifier(random_state=0)
random_forest_classifier.fit(training_tfidf, training_labels)

random_forest_classifier_predictions = random_forest_classifier.predict(testing_tfidf)

random_forest_classifier_precision_score = metrics.precision_score(testing_labels, random_forest_classifier_predictions, average='macro')
random_forest_classifier_recall_score = metrics.recall_score(testing_labels, random_forest_classifier_predictions, average='macro')
random_forest_classifier_f1_score = metrics.f1_score(testing_labels, random_forest_classifier_predictions, average='macro')

print("🔎 Random Forest classifier precision score: %f" % random_forest_classifier_precision_score)
print("🔎 Random Forest classifier recall score: %f" % random_forest_classifier_recall_score)
print("🔎 Random Forest classifier f-score: %f" % random_forest_classifier_f1_score)

🔎 Random Forest classifier precision score: 0.979167
🔎 Random Forest classifier recall score: 0.890625
🔎 Random Forest classifier f-score: 0.927958
