# Spam Detector

In [1]:
# for filesystem access
import os
# for Unix filename pattern matching
import fnmatch
# for data analysis
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [2]:
LINGSPAM_BARE_DATASET_PATH = "datasets/lingspam_public/bare"

In [3]:
documents = []
labels = []

In [4]:
def is_spam_file_name(file_name):
    return fnmatch.fnmatchcase(file_name, 'spmsg*')

## Reading and Preprocessing Data

### Read all the emails in the ten folders & save the labels (spam/not spam, or 0/1) of each email to a list

In [5]:
for root, dirs, file_names in os.walk(LINGSPAM_BARE_DATASET_PATH):
    for file_name in fnmatch.filter(file_names, '*.txt'):
        with open(os.path.join(root, file_name), 'r') as file:
            documents.append(file.read())
            labels.append(1 if is_spam_file_name(file_name) else 0)

In [6]:
documents_length = len(documents)

if documents_length > 0:
    print("✅ Read %i documents" % len(documents))
else:
    print("❌ Could not read any documents")

✅ Read 2893 documents


### Split the emails & labels into 80% training & 20% testing

In [7]:
training_documents_count = round(documents_length * 0.8)

training_documents = documents[:training_documents_count]
training_labels = labels[:training_documents_count]

testing_documents = documents[training_documents_count:]
testing_labels = labels[training_documents_count:]

### Fit and transform the training emails & transform the testing emails using a CountVectorizer

In [8]:
count_vectorizer = CountVectorizer()
count_vectorizer.fit(training_documents)

training_document_term_matrix = count_vectorizer.transform(training_documents)
testing_document_term_matrix = count_vectorizer.transform(testing_documents)

## Scikit-Learn Classifiers
##### For each classifier, print the precision, recall and f-score on the testing data

### Multinomial Naive Bayes

In [9]:
naive_bayes_classifier = MultinomialNB(alpha=1) # alpha: additive smoothing parameter
naive_bayes_classifier.fit(training_document_term_matrix, training_labels)

naive_bayes_classifier_predictions = naive_bayes_classifier.predict(testing_document_term_matrix)

naive_bayes_classifier_precision_score = metrics.precision_score(testing_labels, naive_bayes_classifier_predictions, average='macro')
naive_bayes_classifier_recall_score = metrics.recall_score(testing_labels, naive_bayes_classifier_predictions, average='macro')
naive_bayes_classifier_f1_score = metrics.f1_score(testing_labels, naive_bayes_classifier_predictions, average='macro')

print("🔎 Naive Bayes classifier precision score: %f" % naive_bayes_classifier_precision_score)
print("🔎 Naive Bayes classifier recall score: %f" % naive_bayes_classifier_recall_score)
print("🔎 Naive Bayes classifier f-score: %f" % naive_bayes_classifier_f1_score)

🔎 Naive Bayes classifier precision score: 0.970874
🔎 Naive Bayes classifier recall score: 0.993776
🔎 Naive Bayes classifier f-score: 0.981868


### K Neighbors Classifier

In [10]:
kneighbors_classifier = KNeighborsClassifier(n_neighbors=3)
kneighbors_classifier.fit(training_document_term_matrix, training_labels)

kneighbors_classifier_predictions = kneighbors_classifier.predict(testing_document_term_matrix)

kneighbors_classifier_precision_score = metrics.precision_score(testing_labels, kneighbors_classifier_predictions, average='macro')
kneighbors_classifier_recall_score = metrics.recall_score(testing_labels, kneighbors_classifier_predictions, average='macro')
kneighbors_classifier_f1_score = metrics.f1_score(testing_labels, kneighbors_classifier_predictions, average='macro')

print("🔎 K Neighbors classifier precision score: %f" % kneighbors_classifier_precision_score)
print("🔎 K Neighbors classifier recall score: %f" % kneighbors_classifier_recall_score)
print("🔎 K Neighbors classifier f-score: %f" % kneighbors_classifier_f1_score)

🔎 K Neighbors classifier precision score: 0.933640
🔎 K Neighbors classifier recall score: 0.908190
🔎 K Neighbors classifier f-score: 0.920282


### Random Forest Classifier
##### you can set random_state=0

In [11]:
random_forest_classifier = RandomForestClassifier(random_state=0)
random_forest_classifier.fit(training_document_term_matrix, training_labels)

random_forest_classifier_predictions = random_forest_classifier.predict(testing_document_term_matrix)

random_forest_classifier_precision_score = metrics.precision_score(testing_labels, random_forest_classifier_predictions, average='macro')
random_forest_classifier_recall_score = metrics.recall_score(testing_labels, random_forest_classifier_predictions, average='macro')
random_forest_classifier_f1_score = metrics.f1_score(testing_labels, random_forest_classifier_predictions, average='macro')

print("🔎 Random Forest classifier precision score: %f" % random_forest_classifier_precision_score)
print("🔎 Random Forest classifier recall score: %f" % random_forest_classifier_recall_score)
print("🔎 Random Forest classifier f-score: %f" % random_forest_classifier_f1_score)

🔎 Random Forest classifier precision score: 0.977228
🔎 Random Forest classifier recall score: 0.881443
🔎 Random Forest classifier f-score: 0.921097


## Classifying using Readability Features

Rather than using the whole text content of an email, some characteristic features can be extracted per email, that will be fed to the classifier. Extract some features. The features are:

    a) F1: The number of sentences in an email.
    b) F2: The number of verbs in an email.
    c) F3: The number of words containing both numeric and alphabetical characters.
    d) F4: The number of words in an email that are found in the spam list.
    e) F5: The number of words in an email that have more than 3 syllables.
    f) F6: The average number of syllables of words in an email.
    
For F2, you can find useful code in Lab Assignment 5 solution on the MET website. For F4, you will be checking how many words in a given email are found in a spam word-list. The word-list you will be using can be found here. For F5 and F6, you can use the library Pyphen (with lang=’en_GB’).

The steps are:

    a) Create a list for every feature, where every element is the feature value of a given email (or use a
    dictionary, key is feature name, value is feature list).
    b) Build a feature matrix (list of lists), where every row corresponds to an email, and every column
    corresponds to a feature value of this email.
    c) Feed the feature matrix and the labels to any of the sklearn classifiers.
    
On the MET website, you will find a file titled “feature-construction”. This is an example of building a
feature matrix (steps “a” and “b”). Note that this is just a sample, the documents and the features to be
extracted will be different in the project.

##### For the classifier, print the precision, recall and f-score on the testing data.

#### F1: The number of sentences in an email.

In [12]:
import nltk.tokenize as tokenizer

f1 = [len(tokenizer.sent_tokenize(document)) for document in documents]
print(f1[0])

42


#### F2: The number of verbs in an email.

In [13]:
from nltk import pos_tag, word_tokenize

docstags = [pos_tag(word_tokenize(document)) for document in documents]

In [14]:
def verbsNr (tags):
    filtered = set() 
    for (word, tag) in tags:
        if(tag == 'VB'):
            filtered.add(word)
    return len(filtered)

In [15]:
f2 = [verbsNr (tags) for tags in docstags]
print(f2[0])

19


#### F3: The number of words containing both numeric and alphabetical characters.

In [None]:
reg = '/^\w+$/'

#### Build a feature matrix (list of lists)

In [16]:
feat_matrix = [[f1[i], f2[i]] for i in range(len(documents))]
print(feat_matrix[0])

[42, 19]
