# **<u>Naive Bayes Sentence Classifier</u>**
**Class**: Northwestern CS 349 Fall 2024<br>
**Professor**: David Demeter<br>
**Contributers**: Raymond Gu, Mimi Zhang, Alvin Xu, Rhema Phiri, Eshan Haq

## **<u>Import Libraries & Modules</u>**

In [8]:
# Import Pandas Module
import pandas as pd

# Import SKLearn
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.feature_extraction.text import TfidfVectorizer

# Import NLTK (Parsing Text)
import nltk

# Import Numpy Module
import numpy as np

# Download NLTK data
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## **<u>Creating Train, Validation, and Test Subsets</u>**
The original dataset is labelled `sentences.csv` and this function produces `train.csv`, `valid.csv`, and `test.csv`.

In [9]:
def get_datasets(dataset_name):

    # Get the paths of each of the datasets
    dataset = dataset_name + ".csv"

    df = pd.read_csv(dataset)

    # Drop Kaggle Index Column
    df = df.drop('Unnamed: 0', axis=1)

    # Removing duplicates
    df = df.drop_duplicates()

    df.columns = ['Sentence', 'Label']

    # Ensuring there is even distribution of labels
    sample_size = 37500
    num_labels = df['Label'].nunique()
    samples_per_label = sample_size // num_labels
    even_df = pd.concat([resample(group, n_samples=samples_per_label, random_state=42) for _, group in df.groupby('Label')])

    even_df = even_df.sample(frac=1, random_state=42).reset_index(drop=True)

    X = even_df['Sentence']
    y = even_df['Label']

    # Splitting dataset
    X_train, X_other, y_train, y_other = train_test_split( X,y , random_state=104,test_size=0.2, stratify=y)

    X_val, X_test, y_val, y_test = train_test_split(X_other, y_other, random_state=42,test_size=0.5, stratify=y_other)

    # Convert to Dataframe and then convert to csv files
    train_data = {'Label' : y_train, 'Sentence' : X_train}
    train_data = pd.DataFrame(train_data)
    train_data.to_csv('train.csv', index=False)

    test_data = {'Label' : y_test, 'Sentence' : X_test}
    test_data = pd.DataFrame(test_data)
    test_data.to_csv('test.csv', index=False)

    valid_data = {'Label' : y_val, 'Sentence' : X_val}
    valid_data = pd.DataFrame(valid_data)
    valid_data.to_csv('valid.csv', index=False)

    return None

get_datasets('sentences')

## **<u>Naive Bayes Classifier Code</u>**
This section contains all the code for our Naive Bayes Classifier.

> ### <u>Section 1: Naive Bayes Class</u>
This code defines the structure of our Naive Bayes classifier.

In [10]:
class Naive_Bayes():

  # Initializer
  # --------------------------------------------------------------------------------
  def __init__(self):
      # Create a dictionary that maps label to the classification
      self.emotionLabels = {0: 'sadness',
                            1: 'joy',
                            2: 'love',
                            3: 'anger',
                            4: 'fear',
                            5: 'surprise'}

      # Initialize TF-IDF Vectorizer to be trained
      self.vectorizer = TfidfVectorizer()

      # Class attribute to store probability of each word given the class
      self.word_probs = None

      # Default Additive Smoothing parameter (so no probability is 0)
      self.alpha = 1

  # Train Function
  # --------------------------------------------------------------------------------
  def train(self, train, valid):
      '''
      Purpose: Trains the model using BOTH the training data and validation data.

      Arguments:
      - Train: The path to the training data.
      - Valid: The path to the validation data.
      '''
      # Read the training data and load the sentences into a dataframe
      train_data = pd.read_csv(train)
      train_data.columns = ['Label', 'Sentence']
      train_sentences = train_data['Sentence'].tolist()

      # Read the validation data and load the sentences into a dataframe
      valid_data = pd.read_csv(valid)
      valid_data.columns = ['Label', 'Sentence']
      valid_sentences = valid_data['Sentence'].tolist()

      # Combine all sentences and fit the vectorizer
      corpus = train_sentences + valid_sentences
      self.vectorizer.fit(corpus)

      # Transform train and valid sentences using the fitted vectorizer
      x_train = self.vectorizer.transform(train_sentences).toarray()
      x_valid = self.vectorizer.transform(valid_sentences).toarray()

      # Extract the labels from the training and validation dataframes
      y_train = train_data['Label'].values
      y_valid = valid_data['Label'].values

      # Tune the value of our Additive smoothing parameter
      best_alpha = 1
      best_accuracy = 0
      for alpha in np.linspace(0.02, 1, 50):
          self._fit(x_train, y_train, alpha)
          accuracy = self._validate(x_valid, y_valid)
          if accuracy > best_accuracy:
              best_accuracy = accuracy
              best_alpha = alpha

      # Train the final model with the best alpha
      self._fit(x_train, y_train, best_alpha)

  # Fit Function
  # --------------------------------------------------------------------------------
  def _fit(self, x_train, y_train, alpha):
      '''
      Purpose: This function helps the model learn conditional probabilities by
               calculating the probability of each word occuring in each class.

      Note: The smoothing parameter (alpha) is used to handle the issue of zero
            probabilities. This issue can happen if a word does not occur in the
            training dataset.

      Arguments:
      - X_Train: The training data.
      - Y_Train: The labels for the training data.
      - Alpha: The Laplace smoothing parameter.
      '''
      # Set the Laplace Smoothing constant to the inputted argument
      self.alpha = alpha
      n_classes = 6
      n_features = x_train.shape[1]

      # Initialize word probabilities for each class
      self.word_probs = np.zeros((n_classes, n_features))

      # Compute word probabilities for each class
      for c in range(n_classes):
          class_indices = (y_train == c)
          class_data = x_train[class_indices]

          # Get the total number occurances a word occurs in the class
          total_occurances_word = np.sum(class_data, axis=0)

          # Get the total number of words that occur in the class
          total_words_class = np.sum(class_data)

          # Calculate the smooth probability for the word
          numerator = total_occurances_word + alpha
          denominator = total_words_class + alpha * n_features
          self.word_probs[c, :] = numerator / denominator

  # Validate Function
  # --------------------------------------------------------------------------------
  def _validate(self, x_valid, y_valid):
      '''
      Purpose: Evaluates how well the model generalizes to unseen validation data.
      '''
      y_pred = self._predict(x_valid)
      return accuracy_score(y_valid, y_pred)

  # Predict Function
  # --------------------------------------------------------------------------------
  def _predict(self, x_test):
      '''
      Purpose: Predicts the classes of a list of a sentences.
      '''
      log_word_probs = np.dot(x_test, np.log(self.word_probs.T))
      return np.argmax(log_word_probs, axis=1)

  # Predict Report Function
  # --------------------------------------------------------------------------------
  def printReport(self, test):
      '''
      Purpose: Prints the performance report of the model that includes metrics such
               as precision, recall, and f1-score.
      '''
      # Read the test data and load the sentences into a dataframe
      test_data = pd.read_csv(test)
      test_data.columns = ['Label', 'Sentence']
      test_sentences = test_data['Sentence'].tolist()

      # Transform test sentences using the model's vectorizer
      x_test = self.vectorizer.transform(test_sentences).toarray()

      # Extract the labels from the dataframe
      y_test = test_data['Label'].values

      # Get the predictions of the model
      y_pred = self._predict(x_test)

      # Print the performance report
      print(classification_report(y_test, y_pred))


> ### <u>Section 2: Performance Report</u>
This code tests the model on the test dataset and returns an accuracy report.

In [11]:
# Initialize the model
emotion_classifier = Naive_Bayes()

# Train the model using the training and validation sets
emotion_classifier.train("train.csv", "valid.csv")

# Print the performance report for the model
emotion_classifier.printReport("test.csv")

              precision    recall  f1-score   support

           0       0.89      0.85      0.87       625
           1       0.87      0.80      0.83       625
           2       0.85      0.90      0.88       625
           3       0.92      0.86      0.89       625
           4       0.85      0.84      0.85       625
           5       0.81      0.92      0.86       625

    accuracy                           0.86      3750
   macro avg       0.87      0.86      0.86      3750
weighted avg       0.87      0.86      0.86      3750

