### Import Relevant Libraries


In [1]:
import numpy as np
import pandas as pd
import re

from split_data_utils import train_test_spliting
from data_preprocessing import lemmatize_text_with_pos, tokens

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\maryk\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\maryk\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\maryk\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


#### Read Dataset

In [2]:
df = pd.read_excel("Humour_style.xlsx")   # Read Excel dataset 
df = df[['JOKES', 'LABELS']]              # Extract Only the Jokes and Labels Column
df = df[:1263]                            # Read all data from row one to row 1263

#### Split Dataset Into Train and Test

In [6]:
train_ratio = 0.8
seed = 100
x_train, x_test, y_train, y_test = train_test_spliting(df,train_ratio,seed)

print("x_train",x_train.shape)   # Get the shape of the training features (Number of instance, number of features/column)
print("y_train",y_train.shape)   # Get the shape of the training label (Number of instance, number of column)
print("x_test",x_test.shape)
print("y_test",y_test.shape)

print(x_train)


x_train (1010,)
y_train (1010,)
x_test (253,)
y_test (253,)
['4 ways to become a better risk taker'
 '“Never argue with stupid people, they will drag you down to their level and then beat you with experience.”'
 '“Worrying is like paying a debt you don’t owe.”' ...
 "Worker dies at minnesota vikings' stadium construction site"
 "sharps' injuries could pose hiv, hepatitis risk to surgeons"
 "My set is full of them, but I have a go to bit about how awful it is being a fat chick with small tits that almost always saves me when I'm faltering."]


#### Lemmatize Train and Test datasets 

In [7]:
# Lemmatize each example in the train dataset
lemmatized_x_train  = [lemmatize_text_with_pos(example) for example in x_train]

# Lemmatize each example in the test dataset
lemmatized_x_test  = [lemmatize_text_with_pos(example) for example in x_test]

x_train = np.array(lemmatized_x_train )   # Convert Train data to Numpy Array 
x_test = np.array(lemmatized_x_test)      # Convert Test data to Numpy Array


#### Build Vocabulary

In [8]:
def build_vocab(data):
    """
    Build a vocabulary from the given data.

    Args:
        data (list): List of text examples.

    Returns:
        set: A set containing unique words in the vocabulary.
    """
    all_words = set()
    for example in data:
        all_words.update(tokens(example))
    return all_words

# Example usage
x_train_vocab = build_vocab(x_train)
print(len(x_train_vocab))


3040


#### Naive Bayes Model

In [10]:
def naive_bayes_multi(x, y, smoothing=1):
    """
    Implement a naive Bayes classifier for multi-class classification.

    Args:
        x (list): List of text examples.
        y (list or array): List or array of class labels.
        smoothing (int, optional): Laplace smoothing parameter. Default is 1.

    Returns:
        tuple: Tuple containing log probabilities, word counts, and word probabilities for each class.
    """

    # Given that y contains values 0, 1, 2, 3, 4 for the five classes
    classes = np.unique(y)        # Get all unique values in y
    vocabulary = build_vocab(x)   # Build Vocabulary from the Train data
    N_doc = len(x)            # Get the total number of instances/row

    log_probs = {}              # Dictionary to store the log prior probability of each Class
    ex_dics = {}                # Dictionary to Store count of each word belonging to a class
    prob_words = {}             # Store Log probability of each word belonging to a class

    for class_label in classes:               #Loop through each class
        #Get the total number of examples that belong to each class
        N_cat = sum(y == class_label) 

        # Extract and concatenate examples that belong to same class. 
        # This is done for easy count of words occuring in a class       
        examples = " ".join(x[y == class_label]) # Extract features/text

        log_prob = np.log(N_cat / N_doc)  # Get Log Prior probability (LPP) of each class
        log_probs[class_label] = log_prob # Assign each class their LPP

        ex_dic = {}
        prob_word = {}

        # Loop through Word in the Vocabulary
        for word in vocabulary:
            escaped_word = re.escape(word) # Escape special regex characters
            
            word_count = len(re.findall(escaped_word, examples)) # Using findall to get the total count of each word
            ex_dic[word] = word_count  #Store word and their count in Dictionary 

            # Store Log prob of word. Round Up to 5 Decimal points
            prob_word[word] = np.round(np.log((word_count + smoothing) / (len(tokens(examples)) + len(vocabulary))), 5)

        ex_dics[class_label] = ex_dic           # Stores words and their counts for each class
        prob_words[class_label] = prob_word     # Stores Log prob of words for each class

        # Uncomment the following lines for debugging or detailed output
        #print(f'Class {class_label}:')
        #print(f'Examples: {tokens(examples)}')
        #print(f'Word Counts: {ex_dic}')
        #print(f'Word Probabilities: {prob_word}\n')

    return log_probs, ex_dics, prob_words

# Example usage
#x_train = ["This is a positive example.", "Another positive example.", "A negative example.", "A neutral example."]
#y_train = [0, 0, 1, 2]  # Assuming y contains values 0, 1, 2, 3 for the four classes

log_probs, ex_dics, prob_words = naive_bayes_multi(x_train,y_train)

In [12]:
print(log_probs)

{0: -1.5150282279630256, 1: -1.7027698522263195, 2: -1.764414015337526, 3: -1.6094379124341003, 4: -1.4839836062810654}


#### Prediction using the Naive Bayes Model

In [19]:
def predict_naive_bayes_batch(examples, log_probs, prob_words):
    """
    Predicts using the Naive Bayes classifier for multi-class classification.

    Args:
        x (list): List of text examples.
        log_probs (Dic): Log Prior Probability of each class
        prob_words (Dic): Log Probability of each word belonging to a class

    Returns:
        List: List containing prediction of each test example.
    """
    predictions = []

    for example in examples:
        # Tokenize the example
        example_tokens = tokens(example)

        # Calculate the log likelihoods for each class
        class_likelihoods = {}
        for class_label, log_prob in log_probs.items():
            class_likelihood = log_prob + sum(prob_words[class_label].get(word, 0) for word in example_tokens)
            class_likelihoods[class_label] = class_likelihood

        # Make a prediction based on the class with the highest likelihood
        prediction = max(class_likelihoods, key=class_likelihoods.get)
        predictions.append(prediction)

    return predictions

# Calling the Train and Predict Method
log_probs, ex_dics, prob_words = naive_bayes_multi(x_train, y_train)
predicted_labels = predict_naive_bayes_batch(x_test, log_probs, prob_words)

#### Evaluate Model

In [20]:
def calculate_metrics(predicted_labels, actual_labels):
    """
    Calculate accuracy, precision, recall, and F1-score.

    Args:
        predicted_labels (list): List of predicted class labels.
        actual_labels (list): List of actual class labels.

    Returns:
        dict: Dictionary containing accuracy, precision, recall, F1-score, true_positives, false_positives, and false_negatives.
    """
    correct_predictions = sum(1 for pred, actual in zip(predicted_labels, actual_labels) if pred == actual)
    accuracy = correct_predictions / len(actual_labels)

    precision = {
        class_label: sum(1 for pred, actual in zip(predicted_labels, actual_labels) if pred == class_label and actual == class_label) / predicted_labels.count(class_label)
        for class_label in set(actual_labels)
    }

    recall = {
        class_label: sum(1 for pred, actual in zip(predicted_labels, actual_labels) if pred == class_label and actual == class_label) / sum(1 for actual in actual_labels if actual == class_label)
        for class_label in set(actual_labels)
    }

    f1_score = {
        class_label: 2 * (precision[class_label] * recall[class_label]) / (precision[class_label] + recall[class_label])
        for class_label in set(actual_labels)
    }

    true_positives = {
        class_label: sum(1 for pred, actual in zip(predicted_labels, actual_labels) if pred == class_label and actual == class_label)
        for class_label in set(actual_labels)
    }

    false_positives = {
        class_label: sum(1 for pred, actual in zip(predicted_labels, actual_labels) if pred == class_label and actual != class_label)
        for class_label in set(actual_labels)
    }

    false_negatives = {
        class_label: sum(1 for pred, actual in zip(predicted_labels, actual_labels) if pred != class_label and actual == class_label)
        for class_label in set(actual_labels)
    }


    metrics = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score,
        'true_positives': true_positives,
        'false_positives': false_positives,
        'false_negatives': false_negatives
    }

    return metrics

# Usage:
metrics = calculate_metrics(predicted_labels, y_test)
print(f'Accuracy: {metrics["accuracy"]:.2%}')

# Print precision, recall, and F1-score for each class
for class_label in set(y_test):
    print(f'Class {class_label}: Precision={metrics["precision"][class_label]:.4f}, Recall={metrics["recall"][class_label]:.4f}, F1-score={metrics["f1_score"][class_label]:.4f}')

# Calculate macro-averaged precision, recall, and F1-score
macro_precision = sum(metrics['precision'].values()) / len(set(y_test))
macro_recall = sum(metrics['recall'].values()) / len(set(y_test))
macro_f1_score = sum(metrics['f1_score'].values()) / len(set(y_test))

# Print macro-averaged metrics
print(f'\nMacro-Averaged Metrics:')
print(f'Precision: {macro_precision:.2%}, Recall: {macro_recall:.2%}, F1-Score: {macro_f1_score:.2%}')

# Calculate micro-averaged precision, recall, and F1-score
micro_true_positives = sum(metrics['true_positives'].values())
micro_false_positives = sum(metrics['false_positives'].values())
micro_false_negatives = sum(metrics['false_negatives'].values())

micro_precision = micro_true_positives / (micro_true_positives + micro_false_positives) if (micro_true_positives + micro_false_positives) > 0 else 0
micro_recall = micro_true_positives / (micro_true_positives + micro_false_negatives) if (micro_true_positives + micro_false_negatives) > 0 else 0
micro_f1_score = 2 * (micro_precision * micro_recall) / (micro_precision + micro_recall) if (micro_precision + micro_recall) > 0 else 0

# Print micro-averaged metrics
print(f'\nMicro-Averaged Metrics:')
print(f'Precision: {micro_precision:.2%}, Recall: {micro_recall:.2%}, F1-Score: {micro_f1_score:.2%}')

Accuracy: 70.75%
Class 0: Precision=0.6508, Recall=0.7885, F1-score=0.7130
Class 1: Precision=0.7273, Recall=0.6667, F1-score=0.6957
Class 2: Precision=0.5750, Recall=0.5476, F1-score=0.5610
Class 3: Precision=0.6667, Recall=0.7667, F1-score=0.7132
Class 4: Precision=1.0000, Recall=0.7255, F1-score=0.8409

Macro-Averaged Metrics:
Precision: 72.39%, Recall: 69.90%, F1-Score: 70.48%

Micro-Averaged Metrics:
Precision: 70.75%, Recall: 70.75%, F1-Score: 70.75%
