# Naive Bayes Classifier
\
This notebook contains the code to train a naive bayes for the Inbox Guardian classification task using both bad-of-words and Tf-Idf features.  We use a dataset of the most recent 500 email chains we recieved.  

In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# functions to convert numerical labels to class names - helps with error analysis outputs
def interpretBinaryLabel(x):
  if x == 1:
    return "relevant"
  else:
    return "irrelevant"

def interpretTrinaryLabel(x):
  if x == 0:
    return "irrelevant"
  elif x == 1:
    return "normal"
  else:
    return "urgent"

In [3]:
RANDOM_STATE = 42 # random seed to ensure results are reproducible
META = False # True if we only want to show the model senders and subjects, false if we want to pass in email body as well

In [4]:
df = pd.read_csv('fullDataset.csv')
# Shuffles the dataset, as it was ordered by label during construction
df = df.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)
df.head()

Unnamed: 0,Sender,Subject,Body,Meta,Full,Label
0,Can't Sell Culture Comedy Collective <Can't.Se...,CANT SELL CULTURE COMEDY SHOW AXA WEDNESDAY 8PM,Dear Gorgeous Gorgeous Gorgeous It’s been such...,Can't Sell Culture Comedy Collective <Can't.Se...,Can't Sell Culture Comedy Collective <Can't.Se...,0
1,Dartmouth Comedy Network <Dartmouth.Comedy.Net...,Standup Comedy Open Mic Wednesday @9PM,Did you catch the comedy bug after watching Ma...,Dartmouth Comedy Network <Dartmouth.Comedy.Net...,Dartmouth Comedy Network <Dartmouth.Comedy.Net...,0
2,Dartmouth Libertarians <Dartmouth.Libertarians...,YOU OWE OTHERS,[cid:8efe9b1a-1318-4bcd-a7c9-ab101c844447],Dartmouth Libertarians <Dartmouth.Libertarians...,Dartmouth Libertarians <Dartmouth.Libertarians...,0
3,Central Americans United Student Association <...,🇸🇻2024 Salvadorian Elections 🇸🇻- A Conversatio...,Join us for a conversation with … ~ Professor...,Central Americans United Student Association <...,Central Americans United Student Association <...,0
4,Hop Fellows <Hop.Fellows@dartmouth.edu>,Hanunder N.H. - Primer on being cool at Dartm...,[https://lh7-us.googleusercontent.com/bIftg2CE...,Hop Fellows <Hop.Fellows@dartmouth.edu> Hanund...,Hop Fellows <Hop.Fellows@dartmouth.edu> Hanund...,0


In [5]:
# define our docs based on the META flag
if META:
  docs = df["Meta"].tolist()
else:
  docs = df["Full"].tolist()

labels = df["Label"].tolist()

print(len(docs))
print(len(labels))

536
536


# Multinomial Naive Bayes
## Bag of Words vectors
## Three Classes

In [6]:
# generate BOW vectors for each document
# the vectorizer strips accents, lowercases, removes stopwords, and only considers the most frequent 7500 words
vectorizer = CountVectorizer(strip_accents='unicode', lowercase = True, stop_words='english', max_features=7500)
processed = vectorizer.fit_transform(docs)

In [8]:
docTermMatrix = processed.toarray()

# generate train, test sets
train, test, trainLabels, testLabels = train_test_split(docTermMatrix, labels, test_size=0.2, random_state=RANDOM_STATE)

In [9]:
print(len(train))
print(len(trainLabels))

print(len(test))
print(len(testLabels))

428
428
108
108


In [10]:
classifier = MultinomialNB(alpha = 1.0) # add one smoothing

# train the classifier
classifier.fit(train, trainLabels)

In [11]:
# evaluate classifier on the test set
predictions = classifier.predict(test)
print(classification_report(testLabels, predictions))

              precision    recall  f1-score   support

           0       0.94      0.60      0.73        57
           1       0.53      0.88      0.66        32
           2       0.68      0.68      0.68        19

    accuracy                           0.69       108
   macro avg       0.72      0.72      0.69       108
weighted avg       0.78      0.69      0.70       108



In [12]:
print(confusion_matrix(testLabels, predictions))

[[34 19  4]
 [ 2 28  2]
 [ 0  6 13]]


# Multinomial Naive Bayes
## BOW
## Two Classes

In [13]:
# Converts all "urgent" labels to "relevant"
# Allows us to collpase the trinary classification task into a relevant/irrelevant binary classification task
def changeLabels(x):
  if x == 2:
    return 1
  else:
    return x

# convert trinary labels to binary
new_labels = df['Label'].apply(changeLabels).tolist()
print(new_labels)

[0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 

In [14]:
# generate train, test sets
train, test, trainLabels, testLabels = train_test_split(docTermMatrix, new_labels, test_size=0.2, random_state=RANDOM_STATE)

In [15]:
classifier = MultinomialNB(alpha = 1.0) # add one smoothing

# train the naive bayes classifier
classifier.fit(train, trainLabels)

In [16]:
# evaluate the classifier on the test set
predictions = classifier.predict(test)
print(classification_report(testLabels, predictions))

              precision    recall  f1-score   support

           0       0.95      0.67      0.78        57
           1       0.72      0.96      0.82        51

    accuracy                           0.81       108
   macro avg       0.84      0.81      0.80       108
weighted avg       0.84      0.81      0.80       108



In [17]:
print(confusion_matrix(testLabels, predictions))

[[38 19]
 [ 2 49]]


# Multinomial Naive Bayes
## Tf-IDF
## Three Classes

In [18]:
# Tf-Idf vectorizer - converts docs to tf-idf vectors
# the vectorizer strips accents, lowercases, removes stopwords, and only considers the most frequent 7500 words
tfidfVectorizer = TfidfVectorizer(strip_accents='unicode', lowercase = True, stop_words='english', max_features=7500)
processed = tfidfVectorizer.fit_transform(docs)

In [20]:
docTermMatrix = processed.toarray()

# generate train, test sets for tf-idf features
train, test, trainLabels, testLabels = train_test_split(docTermMatrix, labels, test_size=0.2, random_state=RANDOM_STATE)

# copy of train/test sets with document text (as opposed to tf-idf vectors) for error analysis - random state ensures the data is split the same way every time
trainText, testText, _, _ = train_test_split(docs, labels, test_size=0.2, random_state=RANDOM_STATE)

In [21]:
print(len(train))
print(len(trainLabels))

print(len(test))
print(len(testLabels))

428
428
108
108


In [22]:
classifier = MultinomialNB(alpha = 0.1) # add 0.1 smoothing

# train the classifier
classifier.fit(train, trainLabels)

In [23]:
# evaluate the classifier on the test set
predictions = classifier.predict(test)
print(classification_report(testLabels, predictions))

              precision    recall  f1-score   support

           0       0.95      0.91      0.93        57
           1       0.69      0.84      0.76        32
           2       0.86      0.63      0.73        19

    accuracy                           0.84       108
   macro avg       0.83      0.80      0.81       108
weighted avg       0.85      0.84      0.84       108



In [24]:
print(confusion_matrix(testLabels, predictions))

[[52  5  0]
 [ 3 27  2]
 [ 0  7 12]]


In [None]:
# we perform error analysis on the if-idf naive bayes models as they are the best perfoming model across all the expiriments we ran
# see the write-up for details

# print the text, predicted label, and actual label of all misclassified dcuments
for i in range(len(predictions)):
  if predictions[i] != testLabels[i]:
    print("Email: " + str(testText[i]))
    print("Predicted Label: " + interpretTrinaryLabel(predictions[i]))
    print("Actual Label: " + interpretTrinaryLabel(testLabels[i]))
    print()

# note this output has been removed to protect the privacy of my emails - several examples are discussed in my write up

# Multinomial Naive Bayes
## Tf-IDF
## Two Classes

In [26]:
# Tf-Idf vectorizer - converts docs to tf-idf vectors
# the vectorizer strips accents, lowercases, removes stopwords, and only considers the most frequent 7500 words
tfidfVectorizer = TfidfVectorizer(strip_accents='unicode', lowercase = True, stop_words='english', max_features=7500)
processed = tfidfVectorizer.fit_transform(docs)
docTermMatrix = processed.toarray()

# generate train, test sets for tf-idf features
train, test, trainLabels, testLabels = train_test_split(docTermMatrix, new_labels, test_size=0.2, random_state=RANDOM_STATE)

# copy of train/test sets with document text (as opposed to tf-idf vectors) for error analysis - random state ensures the data is split the same way every time
trainText, testText, _, _ = train_test_split(docs, new_labels, test_size=0.2, random_state=RANDOM_STATE) # copy we can read the text of for error analysis - random state ensures the data is split the same way every time

In [27]:
print(len(train))
print(len(trainLabels))

print(len(test))
print(len(testLabels))

428
428
108
108


In [28]:
classifier = MultinomialNB(alpha = 0.1) # add 0.1 smoothing

# train the classifier
classifier.fit(train, trainLabels)

In [29]:
# evaluate the classifier on the test set
predictions = classifier.predict(test)
print(classification_report(testLabels, predictions))

              precision    recall  f1-score   support

           0       0.94      0.89      0.92        57
           1       0.89      0.94      0.91        51

    accuracy                           0.92       108
   macro avg       0.92      0.92      0.92       108
weighted avg       0.92      0.92      0.92       108



In [30]:
print(confusion_matrix(testLabels, predictions))

[[51  6]
 [ 3 48]]


In [None]:
# we perform error analysis on the if-idf naive bayes models as they are the best perfoming model across all the expiriments we ran
# see the write-up for details

# print the text, predicted label, and actual label of all misclassified dcuments
for i in range(len(predictions)):
  if predictions[i] != testLabels[i]:
    print("Email: " + str(testText[i]))
    print("Predicted Label: " + interpretBinaryLabel(predictions[i]))
    print("Actual Label: " + interpretBinaryLabel(testLabels[i]))
    print()

# note this output has been removed to protect the privacy of my emails - several examples are discussed in my write up