In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

In [3]:
# --- Features ---------------------------------------------------------------
# Training and test matrices are read straight from their .npy dumps.
X_train = np.load('data_train.npy')
X_test = np.load('data_test.npy')

# --- Labels -----------------------------------------------------------------
# Only the 'label' column of the CSV is kept, as a NumPy array.
y_train = pd.read_csv('label_train.csv')['label'].values

# --- Vocabulary -------------------------------------------------------------
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python
# objects; only use it on files that come from a trusted source.
vocab_map_raw = np.load('vocab_map.npy', allow_pickle=True)

# Position -> entry lookup, built by enumerating the loaded array.
vocab_map = dict(enumerate(vocab_map_raw))

# --- Hold-out split ---------------------------------------------------------
# 80% train / 20% validation, with a fixed seed so the split is repeatable.
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Naive Bayes Classifier
class NaiveBayes:
    """Multinomial-style Naive Bayes classifier with additive smoothing.

    For every class ``c`` seen during :meth:`fit` the model stores

    * a prior ``P(c)`` = fraction of training rows labelled ``c``, and
    * a per-feature log-likelihood
      ``log((sum_c[j] + smoothing) / (sum_c.sum() + smoothing * n_features))``
      where ``sum_c`` is the column-wise sum of the rows labelled ``c``.

    Prediction picks the class maximising
    ``log P(c) + sum_j x[j] * log_likelihood_c[j]``.

    Attributes:
        class_priors: dict mapping class label -> prior probability.
        feature_log_likelihoods: dict mapping class label -> 1-D array of
            per-feature log-likelihoods.
        smoothing: additive smoothing constant. It should be > 0: with 0, a
            feature whose column sum is 0 for a class gets log-likelihood
            ``-inf``, and ``0 * -inf`` evaluates to NaN during scoring.
    """

    def __init__(self, smoothing=1.0):
        """Create an unfitted classifier.

        Args:
            smoothing: additive (Laplace/Lidstone) smoothing factor.
        """
        self.class_priors = {}
        self.feature_log_likelihoods = {}
        self.smoothing = smoothing  # Adjustable smoothing factor

    def fit(self, X, y):
        """Estimate class priors and smoothed feature log-likelihoods.

        Args:
            X: 2-D array-like of shape (n_documents, n_features).
            y: 1-D array-like of n_documents class labels.
        """
        # Coerce so that plain lists (and other array-likes) are accepted;
        # boolean-mask row selection below needs real ndarrays.
        X = np.asarray(X)
        y = np.asarray(y)

        n_documents, n_features = X.shape

        # Class priors: relative frequency of each label.
        unique_classes, class_counts = np.unique(y, return_counts=True)
        self.class_priors = {
            c: count / n_documents
            for c, count in zip(unique_classes, class_counts)
        }

        # Smoothed per-class feature likelihoods, stored in log-space.
        self.feature_log_likelihoods = {}
        for c in unique_classes:
            class_feature_sum = X[y == c].sum(axis=0)
            numerator = class_feature_sum + self.smoothing
            denominator = class_feature_sum.sum() + self.smoothing * n_features
            self.feature_log_likelihoods[c] = np.log(numerator / denominator)

    def _conditional_log_prob(self, x, c):
        """Return the unnormalised log posterior of class ``c`` for one row.

        Args:
            x: 1-D feature vector with n_features entries.
            c: a class label present in ``self.class_priors``.

        Returns:
            ``log P(c) + sum(x * log_likelihood_c)`` as a scalar.
        """
        log_prior = np.log(self.class_priors[c])
        log_likelihood = np.sum(x * self.feature_log_likelihoods[c])
        return log_prior + log_likelihood

    def predict(self, X):
        """Predict the most probable class for every row of ``X``.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).

        Returns:
            1-D NumPy array with one predicted label per row.

        Raises:
            ValueError: if the model has not been fitted or ``X`` is not 2-D.
        """
        if not self.class_priors:
            raise ValueError(
                "NaiveBayes is not fitted yet; call fit() before predict()."
            )

        X = np.asarray(X)
        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-D (n_samples, n_features); got {X.ndim}-D input."
            )

        # Stack the per-class parameters in dict order so that one matrix
        # product scores every (sample, class) pair at once instead of
        # looping over rows in Python.
        classes = list(self.class_priors)
        log_priors = np.log([self.class_priors[c] for c in classes])
        log_likelihoods = np.stack(
            [self.feature_log_likelihoods[c] for c in classes]
        )  # shape: (n_classes, n_features)

        scores = X @ log_likelihoods.T + log_priors  # (n_samples, n_classes)

        # argmax returns the first maximum, so ties resolve to the earliest
        # class in dict order.
        return np.asarray(classes)[np.argmax(scores, axis=1)]

In [7]:
# Train the classifier (default smoothing) on the training portion of the split.
nb_model = NaiveBayes()
nb_model.fit(X_train_split, y_train_split)

# Score the held-out validation rows.
y_val_pred = nb_model.predict(X_val)

# Accuracy = fraction of validation rows whose prediction matches the label.
accuracy = (y_val_pred == y_val).mean()
print("Validation Accuracy with TF-IDF and N-grams:", accuracy)

# Micro-averaged F1; for single-label predictions like these it coincides
# with accuracy, so the per-class report below is the more informative view.
print("f1 score:", f1_score(y_val, y_val_pred, average='micro'))
print(classification_report(y_val, y_val_pred))

Validation Accuracy with TF-IDF and N-grams: 0.7724137931034483
f1 score: 0.7724137931034483
              precision    recall  f1-score   support

           0       0.88      0.81      0.84      1416
           1       0.53      0.65      0.59       469

    accuracy                           0.77      1885
   macro avg       0.71      0.73      0.72      1885
weighted avg       0.79      0.77      0.78      1885

