# Stochastic Gradient Descent Classifier
\
This notebook contains the code to train a logistic regression classifier and a support vector machine for the Inbox Guardian classification task.  We use a dataset of the most recent 500 email chains we recieved.  

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
RANDOM_STATE = 42 # random seed to ensure results are reproducible
META = False # True if we only want to show the model senders and subjects, false if we want to pass in email body as well
THREE_CLASSES = False # True for trinary classification task, false for binary

In [6]:
df = pd.read_csv('fullDataset.csv')
# Shuffles the dataset, as it was ordered by label during construction
df = df.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)
df.head()

Unnamed: 0,Sender,Subject,Body,Meta,Full,Label
0,Can't Sell Culture Comedy Collective <Can't.Se...,CANT SELL CULTURE COMEDY SHOW AXA WEDNESDAY 8PM,Dear Gorgeous Gorgeous Gorgeous It’s been such...,Can't Sell Culture Comedy Collective <Can't.Se...,Can't Sell Culture Comedy Collective <Can't.Se...,0
1,Dartmouth Comedy Network <Dartmouth.Comedy.Net...,Standup Comedy Open Mic Wednesday @9PM,Did you catch the comedy bug after watching Ma...,Dartmouth Comedy Network <Dartmouth.Comedy.Net...,Dartmouth Comedy Network <Dartmouth.Comedy.Net...,0
2,Dartmouth Libertarians <Dartmouth.Libertarians...,YOU OWE OTHERS,[cid:8efe9b1a-1318-4bcd-a7c9-ab101c844447],Dartmouth Libertarians <Dartmouth.Libertarians...,Dartmouth Libertarians <Dartmouth.Libertarians...,0
3,Central Americans United Student Association <...,🇸🇻2024 Salvadorian Elections 🇸🇻- A Conversatio...,Join us for a conversation with … ~ Professor...,Central Americans United Student Association <...,Central Americans United Student Association <...,0
4,Hop Fellows <Hop.Fellows@dartmouth.edu>,Hanunder N.H. - Primer on being cool at Dartm...,[https://lh7-us.googleusercontent.com/bIftg2CE...,Hop Fellows <Hop.Fellows@dartmouth.edu> Hanund...,Hop Fellows <Hop.Fellows@dartmouth.edu> Hanund...,0


In [7]:
# Converts all "urgent" labels to "relevant"
# Allows us to collpase the trinary classification task into a relevant/irrelevant binary classification task
def changeLabels(x):
  if x == 2:
    return 1
  else:
    return x

In [8]:
# define our docs based on the META flag
if META:
  docs = df["Meta"].tolist()
else:
  docs = df["Full"].tolist()

# define our labels based on the THREE_CLASSES flags
if THREE_CLASSES:
  labels = df["Label"].tolist()
else:
  labels = df['Label'].apply(changeLabels).tolist()

print(len(docs))
print(len(labels))

536
536


# Logistic Regression
## Tf-idf

In [9]:
# convert all documents to Tf-Idf vectors
tfidfVectorizer = TfidfVectorizer(strip_accents='unicode', lowercase = True, stop_words='english', max_features=7500)
processed = tfidfVectorizer.fit_transform(docs)
docTermMatrix = processed.toarray()

In [10]:
# generate train, test, validation sets
train, validAndTest, trainLabels, validAndTestLabels = train_test_split(docTermMatrix, labels, test_size=0.3, random_state=RANDOM_STATE)
valid, test, validLabels, testLabels = train_test_split(validAndTest, validAndTestLabels, test_size=0.5, random_state=RANDOM_STATE)

In [11]:
# list of possible values for hyperparameters - we will grid search across all of these values
LR_PENALTY = ["l2", "l1", "elasticnet"] # determines the penalty term
LR_ALPHA = [0.01, 0.001, 0.0002, 0.0001, 0.0005, 0.00001] # determines the learning rate
LR_EPOCHS = [500, 750, 1000, 1250, 1500] # number of epochs for training

# sklearn paritions its own validation set from the train data we pass in an and stops training once performance on
# its validation set no longer improves after 5 epochs
# this early stopping is important to prevent overfitting
EARLY_STOPPING = True
VALIDATION_FRACTION = 0.15 # fraction of training data to use for validation

In [12]:
# We grid search to optimize macro-averaged F1 on the validation set

# Best validation f1/hyperparameters we've seen so far
lr_bestPenalty = ""
lr_bestAplha = 0
lr_bestEpochs = 0
lr_bestF1 = 0

 # grid search
for penalty in LR_PENALTY:
  for alpha in LR_ALPHA:
    for epoch in LR_EPOCHS:
      # we specify log_loss to train a logistic regression classifier (see sklearn documentation)
      LogReg = SGDClassifier(loss = 'log_loss', penalty=penalty, alpha=alpha, max_iter=epoch, early_stopping=EARLY_STOPPING, validation_fraction=VALIDATION_FRACTION, random_state=RANDOM_STATE)
      LogReg.fit(train, trainLabels)

      # evaluation on the validation set
      predictions = LogReg.predict(valid)
      f1 = f1_score(validLabels, predictions, average="macro")

      # if we get a better validation f1, update the best f1/hyperparameters we've seen
      if f1 > lr_bestF1:
        lr_bestF1 = f1
        lr_bestPenalty = penalty
        lr_bestAplha = alpha
        lr_bestEpochs = epoch

# best performing hyperparameters
print("Best f1: " + str(lr_bestF1))
print("Best epochs: " + str(lr_bestEpochs))
print("Best alpha: " + str(lr_bestAplha))
print("Best penalty: " + str(lr_bestPenalty))

Best f1: 0.9124863259884357
Best epochs: 500
Best alpha: 0.001
Best penalty: l2


In [13]:
# train a logistic regression classifier with the best-performing hyperparameters
LogReg = SGDClassifier(loss = 'log_loss', penalty=lr_bestPenalty, alpha=lr_bestAplha, max_iter=lr_bestEpochs, early_stopping=EARLY_STOPPING, validation_fraction=VALIDATION_FRACTION, random_state=RANDOM_STATE)
LogReg.fit(train, trainLabels)
predictions = LogReg.predict(test)

In [14]:
# evaluate logistic regression classifier on the test set
print(classification_report(testLabels, predictions))

              precision    recall  f1-score   support

           0       0.89      0.91      0.90        45
           1       0.89      0.86      0.87        36

    accuracy                           0.89        81
   macro avg       0.89      0.89      0.89        81
weighted avg       0.89      0.89      0.89        81



In [15]:
print(confusion_matrix(testLabels, predictions))

[[41  4]
 [ 5 31]]


# SVM
## Tf-Idf

In [16]:
# list of possible values for hyperparameters - we will grid search across all of these values
PENALTY = ["l2", "l1", "elasticnet"] # determines the penalty term
ALPHA = [0.01, 0.001, 0.0002, 0.0001, 0.0005, 0.00001] # determines the learning rate
EPOCHS = [500, 750, 1000, 1250, 1500] # number of epochs for training

# sklearn paritions its own validation set from the train data we pass in an and stops training once performance on
# its validation set no longer improves after 5 epochs
# this early stopping is important to prevent overfitting
EARLY_STOPPING = True
VALIDATION_FRACTION = 0.15 # fraction of training data to use for validation

In [17]:
# We grid search to optimize macro-averaged F1 on the validation set

# Best validation f1/hyperparameters we've seen so far
bestPenalty = "l2"
bestAplha = 0.0001
bestEpochs = 1000
bestF1 = 0

# grid search
for penalty in PENALTY:
  for alpha in ALPHA:
    for epoch in EPOCHS:
      # the default loss function for SGDClassifier generates an SVM (see sklearn documentation)
      SVM = SGDClassifier(penalty=penalty, alpha=alpha, max_iter=epoch, early_stopping=EARLY_STOPPING, validation_fraction=VALIDATION_FRACTION, random_state=RANDOM_STATE)
      SVM.fit(train, trainLabels)

      # evalaute on validation set
      predictions = SVM.predict(valid)
      f1 = f1_score(validLabels, predictions, average="macro")

      # if we get a better validation f1, update the best f1/hyperparameters we've seen
      if f1 > bestF1:
        bestF1 = f1
        bestPenalty = penalty
        bestAplha = alpha
        bestEpochs = epoch

# best performing hyperparameters
print("Best f1: " + str(bestF1))
print("Best epochs: " + str(bestEpochs))
print("Best alpha: " + str(bestAplha))
print("Best penalty: " + str(bestPenalty))

Best f1: 0.9124863259884357
Best epochs: 500
Best alpha: 0.0005
Best penalty: elasticnet


In [18]:
# train a SVM with the best-performing hyperparameters
SVM = SGDClassifier(penalty=bestPenalty, alpha=bestAplha, max_iter=bestEpochs, early_stopping=EARLY_STOPPING, validation_fraction=VALIDATION_FRACTION, random_state=RANDOM_STATE) # this is a linear SVM
SVM.fit(train, trainLabels)
predictions = SVM.predict(test)

In [19]:
# evaluation the SVM on the test set
print(classification_report(testLabels, predictions))

              precision    recall  f1-score   support

           0       0.89      0.87      0.88        45
           1       0.84      0.86      0.85        36

    accuracy                           0.86        81
   macro avg       0.86      0.86      0.86        81
weighted avg       0.86      0.86      0.86        81



In [20]:
print(confusion_matrix(testLabels, predictions))

[[39  6]
 [ 5 31]]
