# Random Forest Classifier
\
This notebook contains the code to train a random forest classifier for the Inbox Guardian classification task.  We use a dataset of the most recent 500 email chains we recieved.  

In [4]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
RANDOM_STATE = 42 # random seed to ensure results are reproducible
META = False # True if we only want to show the model senders and subjects, false if we want to pass in email body as well
THREE_CLASSES = True # True for trinary classification task, false for binary

In [6]:
df = pd.read_csv('fullDataset.csv')
# Shuffles the dataset, as it was ordered by label during construction
df = df.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)
df.head()

Unnamed: 0,Sender,Subject,Body,Meta,Full,Label
0,Can't Sell Culture Comedy Collective <Can't.Se...,CANT SELL CULTURE COMEDY SHOW AXA WEDNESDAY 8PM,Dear Gorgeous Gorgeous Gorgeous It’s been such...,Can't Sell Culture Comedy Collective <Can't.Se...,Can't Sell Culture Comedy Collective <Can't.Se...,0
1,Dartmouth Comedy Network <Dartmouth.Comedy.Net...,Standup Comedy Open Mic Wednesday @9PM,Did you catch the comedy bug after watching Ma...,Dartmouth Comedy Network <Dartmouth.Comedy.Net...,Dartmouth Comedy Network <Dartmouth.Comedy.Net...,0
2,Dartmouth Libertarians <Dartmouth.Libertarians...,YOU OWE OTHERS,[cid:8efe9b1a-1318-4bcd-a7c9-ab101c844447],Dartmouth Libertarians <Dartmouth.Libertarians...,Dartmouth Libertarians <Dartmouth.Libertarians...,0
3,Central Americans United Student Association <...,🇸🇻2024 Salvadorian Elections 🇸🇻- A Conversatio...,Join us for a conversation with … ~ Professor...,Central Americans United Student Association <...,Central Americans United Student Association <...,0
4,Hop Fellows <Hop.Fellows@dartmouth.edu>,Hanunder N.H. - Primer on being cool at Dartm...,[https://lh7-us.googleusercontent.com/bIftg2CE...,Hop Fellows <Hop.Fellows@dartmouth.edu> Hanund...,Hop Fellows <Hop.Fellows@dartmouth.edu> Hanund...,0


In [7]:
# Converts all "urgent" labels to "relevant"
# Allows us to collpase the trinary classification task into a relevant/irrelevant binary classification task
def changeLabels(x):
  if x == 2:
    return 1
  else:
    return x

In [8]:
# define our docs based on the META flag
if META:
  docs = df["Meta"].tolist()
else:
  docs = df["Full"].tolist()

# define our labels based on the THREE_CLASSES flags
if THREE_CLASSES:
  labels = df["Label"].tolist()
else:
  labels = df['Label'].apply(changeLabels).tolist()

print(len(docs))
print(len(labels))

536
536


In [9]:
# convert all documents to Tf-Idf vectors
tfidfVectorizer = TfidfVectorizer(strip_accents='unicode', lowercase = True, stop_words='english', max_features=7500)
processed = tfidfVectorizer.fit_transform(docs)
docTermMatrix = processed.toarray()

In [10]:
# generate train, test, validation sets
train, validAndTest, trainLabels, validAndTestLabels = train_test_split(docTermMatrix, labels, test_size=0.3, random_state=RANDOM_STATE)
valid, test, validLabels, testLabels = train_test_split(validAndTest, validAndTestLabels, test_size=0.5, random_state=RANDOM_STATE)

In [11]:
# list of possible values for hyperparameters - we will grid search across all of these values
NUM_TREES = [50, 75, 100, 125, 150] # number of decision trees in the forest
CRITERION = ["gini", "entropy", "log_loss"] # function to determine homegeneity
MIN_SAMPLES_SPLIT = [2, 3, 4] # minimum number of samples required to split a node
MAX_FEATURES = ["sqrt", "log2"] # function to consider the number of features to look for when looking for a split

In [12]:
# We grid search to optimize macro-averaged F1 on the validation set

# Best validation f1/hyperparameters we've seen so far
bestF1 = 0
bestTrees = 0
bestCriterion = ""
bestMinSamplesSplit = 0
bestMaxFeatures = ""

 # grid search
for tree in NUM_TREES:
  for criterion in CRITERION:
    for split in MIN_SAMPLES_SPLIT:
      for features in MAX_FEATURES:
        # train a random forest classifier
        forest = RandomForestClassifier(n_estimators=tree, criterion=criterion, min_samples_split=split, max_features=features, random_state = RANDOM_STATE)
        forest.fit(train, trainLabels)

        # evaluation on the validation set
        predictions = forest.predict(valid)
        f1 = f1_score(validLabels, predictions, average="macro")

        # if we get a better validation f1, update the best f1/hyperparameters we've seen
        if f1 > bestF1:
          bestF1 = f1
          bestTrees = tree
          bestCriterion = criterion
          bestMinSamplesSplit = split
          bestMaxFeatures = features

# best performing hyperparameters
print("Best f1: " + str(f1))
print("Best number of trees: " + str(bestTrees))
print("Best criterion: " + str(bestCriterion))
print("Best minimum number samples to split a tree: " + str(bestMinSamplesSplit))
print("Best max features determiner: " + str(bestMaxFeatures))

Best f1: 0.8277777777777778
Best number of trees: 100
Best criterion: gini
Best minimum number samples to split a tree: 3
Best max features determiner: sqrt


In [13]:
# train a random forest classifier on the best hyperparameters we saw
forest = RandomForestClassifier(n_estimators=bestTrees, criterion=bestCriterion, min_samples_split=bestMinSamplesSplit, max_features=bestMaxFeatures)
forest.fit(train, trainLabels)

In [14]:
# evaluate that random forest classifier on the test set
predictions = forest.predict(test)
print(classification_report(testLabels, predictions))

              precision    recall  f1-score   support

           0       0.81      0.93      0.87        45
           1       0.86      0.82      0.84        22
           2       1.00      0.57      0.73        14

    accuracy                           0.84        81
   macro avg       0.89      0.77      0.81        81
weighted avg       0.85      0.84      0.83        81



In [15]:
print(confusion_matrix(testLabels, predictions))

[[42  3  0]
 [ 4 18  0]
 [ 6  0  8]]
