<a href="https://colab.research.google.com/github/Lymamadou/demo-repo2/blob/main/miniproject3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Notebook Setup

In [45]:
!pip install datasets
!pip install transformers



In [46]:
from datasets import load_dataset
from typing import Union
from transformers import AutoTokenizer
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import sklearn.preprocessing

# Load DAIR-AI/Emotion Split Dataset
Split dataset configuration has a total of 20,000 examples split into train (16,000), validation (2,000)
and test (2,000)

In [47]:
split_train_ds = load_dataset("dair-ai/emotion", "split", split="train")
split_validation_ds = load_dataset("dair-ai/emotion", "split", split="validation")
split_test_ds = load_dataset("dair-ai/emotion", "split", split="test")

# Load DAIR-AI/Emotion Unsplit Dataset
The unsplit dataset configuration has a total of 416,809 examples in a single set. Need to split into train,
validation and test splits manually if making use of this set. Note that in the handout it spcificies
'You need to use only data in the “train” category for training and report the performance from the “test” category'

In [48]:

unsplit_ds = load_dataset("dair-ai/emotion", "unsplit", split="train")

# Task 1: Preprocess dataset

## Naive Bayes Pre-processing

In [49]:
# some keyword params that could be tuned to our problem set. Currently using default values
# can normalize tuning the min_df and max_df args, which are in and max thresholds for terms observed in corpus
vectorizer = CountVectorizer(lowercase=True, max_df=1.0, min_df=1, max_features=None, stop_words='english')

# fit our vectorizer to our train, validation and test data splits
X_all = vectorizer.fit_transform(split_train_ds["text"] + split_validation_ds["text"] + split_test_ds["text"])
X_all = X_all.toarray()
y_all = split_train_ds["label"] + split_validation_ds["label"] + split_test_ds["label"]
y_all = np.array(y_all)

i = len(split_train_ds)
j = len(split_validation_ds)
k = len(split_test_ds)

X_train, X_validation, X_test = X_all[0: i], X_all[i: i + j], X_all[i + j: i + j + k]
y_train, y_validation, y_test = y_all[0: i], y_all[i: i + j], y_all[i + j: i + j + k]

In [50]:
print(X_train)
X_test.shape

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


(2000, 16774)

## BERT Model Pre-processing

In [55]:
!pip install accelerate -U



In [56]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, Trainer, TrainingArguments
import torch
import tempfile

# Load pretrained model and tokenizer
model_name = "distilbert-base-uncased"
model = DistilBertForSequenceClassification.from_pretrained(model_name)
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

# Prepare your dataset (replace this with your dataset preparation code)
train_texts = split_train_ds["text"]  # List of training text examples
train_labels = split_train_ds["label"]  # List of corresponding training labels

# Tokenize the text data
train_encodings = tokenizer(train_texts, truncation=True, padding=True)

# Convert labels to tensors
train_labels = torch.tensor(train_labels)

# Define training arguments
training_args = TrainingArguments(
    tempfile.mkdtemp(),  # Replace with your desired output directory
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    logging_dir="logs",
)

# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_encodings,
    #eval_dataset=val_encodings,  # Prepare a validation set for evaluation
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
#model.save_pretrained("fine_tuned_model")

# Now you can use this fine-tuned model for prediction

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ImportError: ignored

# Task 2: Implement Naive Bayes and BERT models

## Naive Bayes Implementation

In [None]:
class BayesianNaiveBayes:
    """Implementation of a Bayesian Naive Bayes for a multinomial distribution. Assumes a Dirichlet-Multinomial
    Distribution"""

    def __init__(self, class_alpha=1.0, feature_alpha=1.0):
        """
        :param class_alpha: The symmetric alpha to use for our class prior
        :param feature_alpha: The symmetric alpha to use for our features prior

        """
        self.class_probs = None
        self.feature_probs = None
        self.class_alpha = class_alpha
        self.feature_alpha = feature_alpha

    def fit(self, X:np.ndarray, y:np.ndarray):
        """Fits the model to the data by computing the MLEs for the classes and the features, also employing
        priors using the class and feature alphas
        :param X: A document-term matrix (rows as documents, and term frequencies as columns)
        :param y: A list of labels, ordered with respect to the rows of X
        """
        num_samples, num_features = X.shape
        classes = np.unique(y)
        num_classes = len(classes)

        # computing class probabilities
        self.class_probs = (np.bincount(y) + self.class_alpha) / (len(y) + self.class_alpha * num_classes)

        # compute feature probabilities per class
        self.feature_probs = np.zeros((num_classes, num_features))
        for i, c in enumerate(classes):
            class_samples = X[y == c]  # get all documents which are labeled as class c

            # total word count in all documents labeled as class c. Must account for the Dirichlet prior per feature
            total_count = np.sum(class_samples) + num_features * self.feature_alpha
            for j in range(num_features):
                feature_count = np.sum(class_samples[:, j]) + self.feature_alpha  # apply Dirichlet prior
                # probability of feature given class c
                self.feature_probs[i, j] = feature_count / total_count

    def predict(self, X: np.ndarray):
        """Predicts the labels for the data passed.
        :param X: np.ndarray Expects as input a 2D list with rows as documents and columns as word frequencies in the document
        """
        num_samples, num_features = X.shape
        num_classes = len(self.class_probs)
        predictions = np.zeros(num_samples)

        for sample_idx in range(num_samples):
            sample_loglikelihoods = np.zeros(num_classes)

            # Compute the log likelihood of the sample for each class
            for class_idx in range(num_classes):
                sample_loglikelihoods[class_idx] = np.log(self.class_probs[class_idx])  # begin with prior for class
                for feature_idx in range(num_features):
                    # Use the Dirichlet-Multinomial probability mass function for count features
                    # Go slides 12-pg19 log(p-tilde) to see why we do this
                    # TODO Verify correct computation of loglikelihood for multinomial, multiclass problems
                    sample_loglikelihoods[class_idx] += X[sample_idx, feature_idx] * np.log(
                        self.feature_probs[class_idx, feature_idx])

            # TODO: Double check logic for log-sum-exp
            # Utilize log-sum-exp trick to prevent underflow
            max_loglikelihood = np.max(sample_loglikelihoods)
            exp_shift_loglikelihoods = np.exp(sample_loglikelihoods - max_loglikelihood)
            log_sum_exp_maxshifted_loglikelihood = max_loglikelihood + np.log(np.sum(exp_shift_loglikelihoods))
            normalized_loglikelihoods = sample_loglikelihoods - log_sum_exp_maxshifted_loglikelihood

            # Assign the class with the highest log likelihood
            predictions[sample_idx] = np.argmax(normalized_loglikelihoods)

        return predictions

    def evaluate_acc(self, y_pred, y_true):
        """
        Evaluate the accuracy of the classifier.

        Parameters:
        - y_pred: Predicted class labels
        - y_true: True class labels

        Returns:
        - accuracy: Classification accuracy
        """
        correct_predictions = np.sum(y_pred == y_true)
        total_samples = len(y_true)
        accuracy = correct_predictions / total_samples


        return accuracy

NameError: ignored