# Random Forest Classifier
## CS 72 Final Project
### John Guerrerio
### john.j.guerrerio.26@dartmouth.edu

This notebook contains the code to tune the hyperparameters of a random forest classifier for the Inbox Guardian classification task.  We use a dataset of the 500 most recent email chains we received - see the write-up for details.  

In [None]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Random seed used throughout the notebook so that results are reproducible.
RANDOM_STATE = 42

# True:  the model only sees the sender and subject of each email.
# False: the model is given the email body as well.
META = False

# True:  trinary classification task.
# False: binary classification task.
THREE_CLASSES = True

In [None]:
# Load the dataset and shuffle it in one pass: the rows were ordered by label
# during construction, so they must be shuffled before any split is made.
# The seed keeps the shuffle reproducible; the index is rebuilt as 0..n-1.
df = (
  pd.read_csv('fullDataset.csv')
  .sample(frac=1, random_state=RANDOM_STATE)
  .reset_index(drop=True)
)
df.head()

In [None]:
# Converts all "urgent" labels to "relevant"
# Allows us to collapse the trinary classification task into a relevant/irrelevant binary classification task
def changeLabels(x):
  """Map the "urgent" label (2) onto the "relevant" label (1).

  Args:
    x: a single label value.

  Returns:
    1 if x equals 2, otherwise x unchanged.
  """
  return 1 if x == 2 else x

In [None]:
# Pick the text column according to the META flag:
# "Meta" when the model should only see metadata, "Full" otherwise.
docs = df["Meta" if META else "Full"].tolist()

# Pick the labels according to the THREE_CLASSES flag:
# the labels as stored for the trinary task, or with label 2 folded into
# label 1 (via changeLabels) for the binary task.
labels = (df["Label"] if THREE_CLASSES else df['Label'].apply(changeLabels)).tolist()

# Sanity check: both counts printed here should be identical.
print(len(docs))
print(len(labels))

In [None]:
# Convert every document into a Tf-Idf vector.
# Accents are stripped, text is lower-cased, English stop words are dropped
# and the vocabulary is capped at 7500 features.
tfidfVectorizer = TfidfVectorizer(
  strip_accents='unicode',
  lowercase=True,
  stop_words='english',
  max_features=7500,
)

# fit_transform returns a sparse matrix; densify it into a regular array
processed = tfidfVectorizer.fit_transform(docs)
docTermMatrix = processed.toarray()

In [None]:
# Generate the train / validation / test sets.
# First hold out 30% of the data, then cut that held-out portion in half,
# giving a 70% / 15% / 15% train / validation / test split.
train, validAndTest, trainLabels, validAndTestLabels = train_test_split(
  docTermMatrix,
  labels,
  test_size=0.3,
  random_state=RANDOM_STATE,
)
valid, test, validLabels, testLabels = train_test_split(
  validAndTest,
  validAndTestLabels,
  test_size=0.5,
  random_state=RANDOM_STATE,
)

In [None]:
# Candidate values for each hyperparameter - the grid search below tries
# every combination of these values.

# number of decision trees in the forest: 50, 75, 100, 125, 150
NUM_TREES = list(range(50, 151, 25))

# function used to measure the homogeneity of a split
CRITERION = ["gini", "entropy", "log_loss"]

# minimum number of samples required to split a node: 2, 3, 4
MIN_SAMPLES_SPLIT = list(range(2, 5))

# rule that sets how many features are considered when looking for a split
MAX_FEATURES = ["sqrt", "log2"]

In [None]:
# We grid search to optimize macro-averaged F1 on the validation set

# Best validation f1/hyperparameters we've seen so far.
# F1 always lies in [0, 1], so starting below that range guarantees the first
# configuration evaluated is recorded even if every configuration scores 0
# (otherwise bestTrees would stay at 0, which is not a valid forest size).
bestF1 = -1.0
bestTrees = 0
bestCriterion = ""
bestMinSamplesSplit = 0
bestMaxFeatures = ""

# grid search over every combination of the candidate hyperparameter values
for tree in NUM_TREES:
  for criterion in CRITERION:
    for split in MIN_SAMPLES_SPLIT:
      for features in MAX_FEATURES:
        # train a random forest classifier (seeded so each run is reproducible)
        forest = RandomForestClassifier(n_estimators=tree, criterion=criterion, min_samples_split=split, max_features=features, random_state = RANDOM_STATE)
        forest.fit(train, trainLabels)

        # evaluation on the validation set
        predictions = forest.predict(valid)
        f1 = f1_score(validLabels, predictions, average="macro")

        # if we get a better validation f1, update the best f1/hyperparameters we've seen
        # (strict comparison: on ties the earliest configuration is kept)
        if f1 > bestF1:
          bestF1 = f1
          bestTrees = tree
          bestCriterion = criterion
          bestMinSamplesSplit = split
          bestMaxFeatures = features

# best performing hyperparameters
# NOTE: report bestF1, not f1 - f1 only holds the score of the last configuration tried
print("Best f1: " + str(bestF1))
print("Best number of trees: " + str(bestTrees))
print("Best criterion: " + str(bestCriterion))
print("Best minimum number samples to split a tree: " + str(bestMinSamplesSplit))
print("Best max features determiner: " + str(bestMaxFeatures))

In [None]:
# Train a random forest classifier on the best hyperparameters we saw.
# The same seed as in the grid search is passed so that this is the same model
# that achieved the best validation F1, and so that the test-set results below
# are reproducible from run to run.
forest = RandomForestClassifier(
  n_estimators=bestTrees,
  criterion=bestCriterion,
  min_samples_split=bestMinSamplesSplit,
  max_features=bestMaxFeatures,
  random_state=RANDOM_STATE,
)
forest.fit(train, trainLabels)

In [None]:
# Evaluate the final random forest classifier on the held-out test set
# and print the per-class precision / recall / F1 report.
predictions = forest.predict(test)
print(classification_report(y_true=testLabels, y_pred=predictions))

In [None]:
# Confusion matrix on the test set: rows are true labels, columns are predicted labels
print(confusion_matrix(y_true=testLabels, y_pred=predictions))