# Stochastic Gradient Descent Classifier
## CS 72 Final Project
### John Guerrerio
### john.j.guerrerio.26@dartmouth.edu

This notebook contains the code to fine-tune a logistic regression classifier and a support vector machine for the Inbox Guardian classification task.  We use a dataset of the 500 most recent email chains we received - see the write-up for details.  

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# random seed to ensure results are reproducible
RANDOM_STATE = 42

# True: the model only sees senders and subjects; False: the email body is passed in as well
META = False

# True: trinary classification task; False: binary classification task
THREE_CLASSES = False

In [None]:
# Load the dataset and shuffle its rows, as it was ordered by label during construction.
# The fixed seed keeps the shuffle reproducible, and the index is rebuilt after shuffling.
df = (
  pd.read_csv('fullDataset.csv')
  .sample(frac=1, random_state=RANDOM_STATE)
  .reset_index(drop=True)
)
df.head()

In [None]:
# Maps every "urgent" label (2) onto the "relevant" label (1) and leaves all other labels untouched
# This collapses the trinary classification task into a relevant/irrelevant binary classification task
def changeLabels(x):
  """Return 1 when x equals 2; otherwise return x unchanged."""
  return 1 if x == 2 else x

In [None]:
# choose the text column from the META flag: "Meta" when META is set, "Full" otherwise
textColumn = "Meta" if META else "Full"
docs = df[textColumn].tolist()

# choose the label encoding from the THREE_CLASSES flag:
# keep the labels as-is for the trinary task, otherwise remap them with changeLabels
labelColumn = df["Label"]
if not THREE_CLASSES:
  labelColumn = labelColumn.apply(changeLabels)
labels = labelColumn.tolist()

# sanity check: both lists should have the same length
print(len(docs))
print(len(labels))

# Logistic Regression
## Tf-idf

In [None]:
# convert all documents to Tf-Idf vectors
# (accents stripped, text lowercased, English stop words removed, at most 7500 features kept)
tfidfVectorizer = TfidfVectorizer(
  strip_accents='unicode',
  lowercase=True,
  stop_words='english',
  max_features=7500,
)
processed = tfidfVectorizer.fit_transform(docs)

# dense copy of the document-term matrix
docTermMatrix = processed.toarray()

In [None]:
# generate train, test, validation sets (70% / 15% / 15%)
# We split the raw documents first and only then fit Tf-Idf, so the vocabulary and IDF weights are learned
# from the training documents alone - fitting the vectorizer on every document would leak information
# about the validation and test documents into the features.
trainDocs, validAndTestDocs, trainLabels, validAndTestLabels = train_test_split(docs, labels, test_size=0.3, random_state=RANDOM_STATE)

# re-fit the vectorizer on the training documents only, then reuse that fitted vocabulary for the held-out documents
train = tfidfVectorizer.fit_transform(trainDocs).toarray()
validAndTest = tfidfVectorizer.transform(validAndTestDocs).toarray()

# split the held-out documents evenly into validation and test sets
valid, test, validLabels, testLabels = train_test_split(validAndTest, validAndTestLabels, test_size=0.5, random_state=RANDOM_STATE)

In [None]:
# list of possible values for hyperparameters - we will grid search across all of these values

# determines the penalty (regularization) term
LR_PENALTY = ["l2", "l1", "elasticnet"]

# constant that multiplies the penalty term (regularization strength); sklearn also uses it to
# compute the learning rate under the default 'optimal' learning-rate schedule
LR_ALPHA = [1e-2, 1e-3, 2e-4, 1e-4, 5e-4, 1e-5]

# maximum number of epochs for training
LR_EPOCHS = [500, 750, 1000, 1250, 1500]

# sklearn partitions its own validation set from the train data we pass in and stops training once
# performance on its validation set no longer improves after 5 epochs
# this early stopping is important to prevent overfitting
EARLY_STOPPING = True

# fraction of training data to use for validation
VALIDATION_FRACTION = 0.15

In [None]:
# We grid search to optimize macro-averaged F1 on the validation set

# Best validation f1/hyperparameters we've seen so far
# The starting values are valid SGDClassifier settings (the same fallbacks the SVM grid search uses), so the
# final-training cell still receives usable arguments even if no configuration scores above an F1 of 0.
# An empty penalty string or 0 epochs would make that cell raise instead.
lr_bestPenalty = "l2"
lr_bestAplha = 0.0001
lr_bestEpochs = 1000
lr_bestF1 = 0

# grid search over every (penalty, alpha, epochs) combination
for penalty in LR_PENALTY:
  for alpha in LR_ALPHA:
    for epoch in LR_EPOCHS:
      # we specify log_loss to train a logistic regression classifier (see sklearn documentation)
      LogReg = SGDClassifier(loss = 'log_loss', penalty=penalty, alpha=alpha, max_iter=epoch, early_stopping=EARLY_STOPPING, validation_fraction=VALIDATION_FRACTION, random_state=RANDOM_STATE)
      LogReg.fit(train, trainLabels)

      # evaluation on the validation set
      predictions = LogReg.predict(valid)
      f1 = f1_score(validLabels, predictions, average="macro")

      # if we get a better validation f1, update the best f1/hyperparameters we've seen
      # (strict comparison: on a tie the earlier combination is kept)
      if f1 > lr_bestF1:
        lr_bestF1 = f1
        lr_bestPenalty = penalty
        lr_bestAplha = alpha
        lr_bestEpochs = epoch

# best performing hyperparameters
print("Best f1: " + str(lr_bestF1))
print("Best epochs: " + str(lr_bestEpochs))
print("Best alpha: " + str(lr_bestAplha))
print("Best penalty: " + str(lr_bestPenalty))

In [None]:
# re-train a logistic regression classifier using the hyperparameters that scored best in the grid search
LogReg = SGDClassifier(
  loss='log_loss',
  penalty=lr_bestPenalty,
  alpha=lr_bestAplha,
  max_iter=lr_bestEpochs,
  early_stopping=EARLY_STOPPING,
  validation_fraction=VALIDATION_FRACTION,
  random_state=RANDOM_STATE,
)
LogReg.fit(train, trainLabels)

# predict labels for the test set
predictions = LogReg.predict(test)

In [None]:
# evaluate the logistic regression classifier on the test set
lrReport = classification_report(testLabels, predictions)
print(lrReport)

In [None]:
# confusion matrix of the test labels against the current predictions
lrConfusion = confusion_matrix(testLabels, predictions)
print(lrConfusion)

# SVM
## Tf-Idf

In [None]:
# list of possible values for hyperparameters - we will grid search across all of these values

# determines the penalty (regularization) term
PENALTY = ["l2", "l1", "elasticnet"]

# constant that multiplies the penalty term (regularization strength); sklearn also uses it to
# compute the learning rate under the default 'optimal' learning-rate schedule
ALPHA = [1e-2, 1e-3, 2e-4, 1e-4, 5e-4, 1e-5]

# maximum number of epochs for training
EPOCHS = [500, 750, 1000, 1250, 1500]

# sklearn partitions its own validation set from the train data we pass in and stops training once
# performance on its validation set no longer improves after 5 epochs
# this early stopping is important to prevent overfitting
EARLY_STOPPING = True

# fraction of training data to use for validation
VALIDATION_FRACTION = 0.15

In [None]:
# Grid search over every (penalty, alpha, epochs) combination, keeping whichever one
# gives the highest macro-averaged F1 on the validation set

# running best validation f1 and the hyperparameters that produced it
bestPenalty, bestAplha, bestEpochs = "l2", 0.0001, 1000
bestF1 = 0

# combinations are visited in nested-loop order: penalty outermost, epochs innermost
searchSpace = ((p, a, e) for p in PENALTY for a in ALPHA for e in EPOCHS)
for penalty, alpha, epoch in searchSpace:
  # the default loss function for SGDClassifier generates an SVM (see sklearn documentation)
  SVM = SGDClassifier(
    penalty=penalty,
    alpha=alpha,
    max_iter=epoch,
    early_stopping=EARLY_STOPPING,
    validation_fraction=VALIDATION_FRACTION,
    random_state=RANDOM_STATE,
  )
  SVM.fit(train, trainLabels)

  # evaluate on the validation set
  predictions = SVM.predict(valid)
  f1 = f1_score(validLabels, predictions, average="macro")

  # a strictly better validation f1 replaces the best f1/hyperparameters seen so far
  if f1 > bestF1:
    bestF1, bestPenalty, bestAplha, bestEpochs = f1, penalty, alpha, epoch

# best performing hyperparameters
print(f"Best f1: {bestF1}")
print(f"Best epochs: {bestEpochs}")
print(f"Best alpha: {bestAplha}")
print(f"Best penalty: {bestPenalty}")

In [None]:
# re-train an SVM using the hyperparameters that scored best in the grid search
# (no loss is passed, so SGDClassifier's default loss is used - this is a linear SVM)
SVM = SGDClassifier(
  penalty=bestPenalty,
  alpha=bestAplha,
  max_iter=bestEpochs,
  early_stopping=EARLY_STOPPING,
  validation_fraction=VALIDATION_FRACTION,
  random_state=RANDOM_STATE,
)
SVM.fit(train, trainLabels)

# predict labels for the test set
predictions = SVM.predict(test)

In [None]:
# evaluate the SVM on the test set
svmReport = classification_report(testLabels, predictions)
print(svmReport)

In [None]:
# confusion matrix of the test labels against the current predictions
svmConfusion = confusion_matrix(testLabels, predictions)
print(svmConfusion)