# Naive Bayes Classifier
## CS 72 Final Project
### John Guerrerio
### john.j.guerrerio.26@dartmouth.edu

This notebook contains the code to train a naive Bayes classifier for the Inbox Guardian classification task using both bag-of-words and Tf-Idf features.  We use a dataset of the 500 most recent email chains we received - see the write-up for details.  

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# functions to convert numerical labels to class names - helps with error analysis outputs
def interpretBinaryLabel(x):
  """Translate a binary numeric label into its class name.

  Returns "relevant" when x equals 1 and "irrelevant" for any other value.
  """
  return "relevant" if x == 1 else "irrelevant"

def interpretTrinaryLabel(x):
  """Translate a three-class numeric label into its class name.

  Returns "irrelevant" for 0, "normal" for 1, and "urgent" for any other value.
  """
  if x == 0:
    return "irrelevant"
  return "normal" if x == 1 else "urgent"

In [None]:
# Random seed reused by the dataset shuffle and every train/test split so results are reproducible.
RANDOM_STATE = 42

# Which text the model sees:
#   True  -> only the "Meta" column (senders and subjects)
#   False -> the "Full" column (email body passed in as well)
META = False

In [None]:
# Read the dataset and shuffle its rows in a single chain.
# The CSV was ordered by label during construction, so the rows are shuffled
# (with a fixed seed) and re-indexed before anything else uses them.
df = (
  pd.read_csv('fullDataset.csv')
  .sample(frac=1, random_state=RANDOM_STATE)
  .reset_index(drop=True)
)
df.head()

In [None]:
# choose the text column from the META flag: "Meta" when META is True, otherwise "Full"
textColumn = "Meta" if META else "Full"
docs = df[textColumn].tolist()

labels = df["Label"].tolist()

# sanity check - prints the number of documents, then the number of labels
for items in (docs, labels):
  print(len(items))

# Multinomial Naive Bayes
## Bag of Words vectors
## Three Classes

In [None]:
# Build bag-of-words count vectors for every document.
# Vectorizer settings: strip accents, lowercase, remove English stopwords,
# and keep only the 7500 most frequent words.
# NOTE(review): the vectorizer is fit on all of docs, before the train/test split
# further down, so the vocabulary is selected with the test documents included - confirm this is intended.
vectorizer = CountVectorizer(
  strip_accents='unicode',
  lowercase=True,
  stop_words='english',
  max_features=7500,
)
processed = vectorizer.fit_transform(docs)

In [None]:
# Printing the whole vocabulary mapping (up to 7500 entries) floods the cell output,
# so report its size and a short alphabetical sample of the kept terms instead.
print("Vocabulary size: " + str(len(vectorizer.vocabulary_)))
print(sorted(vectorizer.vocabulary_)[:25])

In [None]:
# convert the vectorizer output into a dense array
docTermMatrix = processed.toarray()

# hold out 20% of the documents as the test set; the fixed seed keeps the split reproducible
train, test, trainLabels, testLabels = train_test_split(
  docTermMatrix,
  labels,
  test_size=0.2,
  random_state=RANDOM_STATE,
)

In [None]:
# sizes of the train features/labels followed by the test features/labels, one per line
for split in (train, trainLabels, test, testLabels):
  print(len(split))

In [None]:
# multinomial naive bayes with add-one smoothing (alpha = 1.0), trained on the training split
classifier = MultinomialNB(alpha=1.0).fit(train, trainLabels)

# fit returns the estimator itself, so this still displays the fitted classifier as the cell output
classifier

In [None]:
# predict labels for the held-out test set and print the classification report
predictions = classifier.predict(test)
report = classification_report(testLabels, predictions)
print(report)

In [None]:
# confusion matrix of the true test labels against the predictions made above
confusionMatrix = confusion_matrix(testLabels, predictions)
print(confusionMatrix)

# Multinomial Naive Bayes
## BOW
## Two Classes

In [None]:
# Converts all "urgent" labels to "relevant"
# Allows us to collapse the trinary classification task into a relevant/irrelevant binary classification task
def changeLabels(x):
  """Map a trinary label onto the binary scheme.

  Returns 1 when x equals 2; every other value is returned unchanged.
  """
  return 1 if x == 2 else x

# convert trinary labels to binary
binaryLabelSeries = df['Label'].apply(changeLabels)
new_labels = binaryLabelSeries.tolist()

# Printing every label floods the cell output; show a short preview and the
# number of documents per binary class instead.
print(new_labels[:20])
print(binaryLabelSeries.value_counts().sort_index())

In [None]:
# 80/20 train/test split of the bag-of-words matrix against the binary labels
# NOTE: docTermMatrix must still hold the bag-of-words features here; the Tf-Idf
# section later reassigns the same name, so run the cells in order.
train, test, trainLabels, testLabels = train_test_split(
  docTermMatrix,
  new_labels,
  test_size=0.2,
  random_state=RANDOM_STATE,
)

In [None]:
# multinomial naive bayes with add-one smoothing (alpha = 1.0), trained on the binary-label training split
classifier = MultinomialNB(alpha=1.0).fit(train, trainLabels)

# fit returns the estimator itself, so this still displays the fitted classifier as the cell output
classifier

In [None]:
# predict labels for the held-out test set and print the classification report
predictions = classifier.predict(test)
report = classification_report(testLabels, predictions)
print(report)

In [None]:
# confusion matrix of the true test labels against the predictions made above
confusionMatrix = confusion_matrix(testLabels, predictions)
print(confusionMatrix)

# Multinomial Naive Bayes
## Tf-IDF
## Three Classes

In [None]:
# Convert every document to a tf-idf vector.
# Vectorizer settings: strip accents, lowercase, remove English stopwords,
# and keep only the 7500 most frequent words.
# NOTE(review): the vectorizer is fit on all of docs, before the train/test split
# further down, so its statistics include the test documents - confirm this is intended.
tfidfVectorizer = TfidfVectorizer(
  strip_accents='unicode',
  lowercase=True,
  stop_words='english',
  max_features=7500,
)
processed = tfidfVectorizer.fit_transform(docs)

In [None]:
# Printing the whole vocabulary mapping (up to 7500 entries) floods the cell output,
# so report its size and a short alphabetical sample of the kept terms instead.
print("Vocabulary size: " + str(len(tfidfVectorizer.vocabulary_)))
print(sorted(tfidfVectorizer.vocabulary_)[:25])

In [None]:
# convert the vectorizer output into a dense array
docTermMatrix = processed.toarray()

# Generate the train/test sets for the tf-idf features, the matching document text,
# and the labels in ONE call. train_test_split applies the same shuffled indices to
# every array it is given, so testText[i] is guaranteed to be the document behind
# test[i] (needed for error analysis) without relying on two separate calls
# happening to use identical arguments.
train, test, trainText, testText, trainLabels, testLabels = train_test_split(
  docTermMatrix,
  docs,
  labels,
  test_size=0.2,
  random_state=RANDOM_STATE,
)

In [None]:
# sizes of the train features/labels followed by the test features/labels, one per line
for split in (train, trainLabels, test, testLabels):
  print(len(split))

In [None]:
# multinomial naive bayes with add-0.1 smoothing (alpha = 0.1), trained on the tf-idf training split
classifier = MultinomialNB(alpha=0.1).fit(train, trainLabels)

# fit returns the estimator itself, so this still displays the fitted classifier as the cell output
classifier

In [None]:
# predict labels for the held-out test set and print the classification report
predictions = classifier.predict(test)
report = classification_report(testLabels, predictions)
print(report)

In [None]:
# confusion matrix of the true test labels against the predictions made above
confusionMatrix = confusion_matrix(testLabels, predictions)
print(confusionMatrix)

In [None]:
# we perform error analysis on the tf-idf naive bayes models as they are the best performing models across all the experiments we ran
# see the write-up for details

# walk the test documents alongside their predicted and actual labels;
# for every misclassified document print its text, predicted label, and actual label
for emailText, predicted, actual in zip(testText, predictions, testLabels):
  if predicted == actual:
    continue
  print("Email: " + str(emailText))
  print("Predicted Label: " + interpretTrinaryLabel(predicted))
  print("Actual Label: " + interpretTrinaryLabel(actual))
  print()

# Multinomial Naive Bayes
## Tf-IDF
## Two Classes

In [None]:
# Tf-Idf vectorizer - converts docs to tf-idf vectors
# the vectorizer strips accents, lowercases, removes stopwords, and only considers the most frequent 7500 words
# NOTE(review): the vectorizer is fit on all of docs, before the train/test split
# below, so its statistics include the test documents - confirm this is intended.
tfidfVectorizer = TfidfVectorizer(strip_accents='unicode', lowercase = True, stop_words='english', max_features=7500)
processed = tfidfVectorizer.fit_transform(docs)
docTermMatrix = processed.toarray()

# Generate the train/test sets for the tf-idf features, the matching document text,
# and the binary labels in ONE call. train_test_split applies the same shuffled
# indices to every array it is given, so testText[i] is guaranteed to be the document
# behind test[i] (needed for error analysis) without relying on two separate calls
# happening to use identical arguments.
train, test, trainText, testText, trainLabels, testLabels = train_test_split(
  docTermMatrix,
  docs,
  new_labels,
  test_size=0.2,
  random_state=RANDOM_STATE,
)

In [None]:
# sizes of the train features/labels followed by the test features/labels, one per line
for split in (train, trainLabels, test, testLabels):
  print(len(split))

In [None]:
# multinomial naive bayes with add-0.1 smoothing (alpha = 0.1), trained on the binary-label tf-idf training split
classifier = MultinomialNB(alpha=0.1).fit(train, trainLabels)

# fit returns the estimator itself, so this still displays the fitted classifier as the cell output
classifier

In [None]:
# predict labels for the held-out test set and print the classification report
predictions = classifier.predict(test)
report = classification_report(testLabels, predictions)
print(report)

In [None]:
# confusion matrix of the true test labels against the predictions made above
confusionMatrix = confusion_matrix(testLabels, predictions)
print(confusionMatrix)

In [None]:
# we perform error analysis on the tf-idf naive bayes models as they are the best performing models across all the experiments we ran
# see the write-up for details

# walk the test documents alongside their predicted and actual labels;
# for every misclassified document print its text, predicted label, and actual label
for emailText, predicted, actual in zip(testText, predictions, testLabels):
  if predicted == actual:
    continue
  print("Email: " + str(emailText))
  print("Predicted Label: " + interpretBinaryLabel(predicted))
  print("Actual Label: " + interpretBinaryLabel(actual))
  print()