# Using Naive Bayes to categorize emails

## Load the preprocessed email data

In [1]:
import pickle
import os

# Load the pickled author labels and the matching email texts.
# CAUTION: pickle.load can execute arbitrary code; only unpickle files from a
# trusted source.
with open("../data/email_authors.pkl", 'rb') as authors_file:
    with open("../data/word_data.pkl", 'rb') as word_file:
        email_authors = pickle.load(authors_file)
        word_data = pickle.load(word_file)

## Split into training and test sets, then vectorize and select features

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Hold out 10% of the emails for testing; the fixed random_state makes the
# split reproducible across runs.
(
    features_train,
    features_test,
    labels_train,
    labels_test,
) = train_test_split(
    word_data,
    email_authors,
    test_size=0.1,
    random_state=42,
)

In [4]:
# Turn the raw email text into TF-IDF feature vectors.
vectorizer = TfidfVectorizer(
    sublinear_tf=True,     # sublinear (logarithmic) term-frequency scaling
    max_df=0.5,            # drop terms appearing in more than half of the documents
    stop_words='english',  # drop built-in English stop words
)

# Fit the vocabulary and IDF weights on the training emails only, then reuse
# the fitted vectorizer to transform the test emails.
features_train_transformed = vectorizer.fit_transform(features_train)
features_test_transformed = vectorizer.transform(features_test)

In [5]:
# Keep only the top 10% of features, ranked by f_classif (the scoring function
# SelectPercentile uses by default, spelled out here for clarity).
selector = SelectPercentile(score_func=f_classif, percentile=10)

# The selector is fit on the training data only and then applied to the test
# data; .toarray() converts each result to a dense array.
# NOTE: this cell rebinds the names produced by the TF-IDF step, so re-running
# it on its own would operate on the already-reduced dense arrays rather than
# on the TF-IDF output. Re-run the TF-IDF cell first.
features_train_transformed = (
    selector
    .fit_transform(features_train_transformed, labels_train)
    .toarray()
)
features_test_transformed = (
    selector
    .transform(features_test_transformed)
    .toarray()
)

## Train Gaussian Naive Bayes model

In [6]:
from sklearn.naive_bayes import GaussianNB

In [7]:
# Fit a Gaussian Naive Bayes classifier on the reduced training features.
# The fit call is left as the cell's last expression so the notebook still
# displays the fitted estimator.
gnb = GaussianNB()
gnb.fit(
    X=features_train_transformed,
    y=labels_train,
)

GaussianNB(priors=None, var_smoothing=1e-09)

## Measure effectiveness

In [8]:
# Predict a label for every held-out test email with the fitted classifier.
labels_pred = gnb.predict(X=features_test_transformed)

In [9]:
from sklearn.metrics import accuracy_score

# Count the test rows and the predictions that disagree with the true labels.
total_points = features_test_transformed.shape[0]
mislabeled_points = (labels_test != labels_pred).sum()

# Report the raw error count, then the overall accuracy on the test set.
print("Number of mislabeled points out of a total %d points : %d" % (total_points, mislabeled_points))
print("Accuracy of:", accuracy_score(labels_test, labels_pred))

Number of mislabeled points out of a total 1758 points : 47
Accuracy of: 0.9732650739476678
