# Naive Bayes Classifier implementation for Kaggle competition 1
## Milestone 1: Beat the coin-flip and logistic regression prediction models

### By Emiliano Aviles and Cassandre Hamel

## I) Load and process data

In [1]:
import numpy as np

In [3]:
# Feature arrays for the labelled (train) and unlabelled (test) sets.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python
# objects, which can execute code -- only load .npy files from a trusted source.
data_train = np.load('data_train.npy', allow_pickle=True)
data_test = np.load('data_test.npy', allow_pickle=True)

# Training labels: drop the header row and keep only the column at index 1.
labels_train = np.loadtxt(
    'label_train.csv',
    usecols=1,
    skiprows=1,
    delimiter=',',
)

In [4]:
# Fix the global NumPy seed so the shuffle below is reproducible.
np.random.seed(12345)

# Fraction of the labelled samples used for fitting; the rest is held out.
split_ratio = 0.8
n_samples = data_train.shape[0]
n_train = int(n_samples * split_ratio)

# Randomly permute the row indices, then cut the permutation at n_train:
# the head is the training part, the tail is the validation part.
shuffled_indices = np.random.permutation(n_samples)
train_indices, val_indices = shuffled_indices[:n_train], shuffled_indices[n_train:]

# Apply the same index sets to features and labels so rows stay aligned.
X_train = data_train[train_indices]
X_val = data_train[val_indices]
y_train = labels_train[train_indices]
y_val = labels_train[val_indices]

## II) Naive Bayes classifier

In [5]:
class NaiveBayesClassifier:
    """Multinomial-style Naive Bayes classifier with additive smoothing.

    For every class the per-feature totals of the training rows are summed,
    smoothed by adding ``alpha`` and normalised to a probability vector.
    Prediction picks the class maximising
    ``log prior + X . log P(feature | class)``.

    Parameters
    ----------
    alpha : float, default 1.0
        Additive smoothing constant added to every per-class feature total
        (``alpha=1`` is Laplace smoothing). Must be non-negative.

    Attributes (set by ``fit``; ``None`` before fitting)
    ----------------------------------------------------
    classes_ : sorted unique class labels found in ``y``.
    class_log_prior_ : log of the relative frequency of each class.
    feature_log_prob_ : array of shape (n_classes, n_features) holding the
        log of the smoothed, normalised per-class feature totals.
    """

    def __init__(self, alpha=1.0):
        if alpha < 0:
            raise ValueError("alpha must be non-negative")
        self.alpha = alpha
        self.class_log_prior_ = None  # Log prior for each class
        self.feature_log_prob_ = None  # Log probability of each feature given the class
        self.classes_ = None  # The unique classes

    def fit(self, X, y):
        """Estimate class priors and per-class feature log-probabilities.

        Parameters
        ----------
        X : 2-D array, one row per sample and one column per feature.
        y : 1-D array-like of class labels, one per row of ``X``.
        """
        # Accept any array-like for the labels (e.g. a plain list).
        y = np.asarray(y)

        # Unique class labels (sorted) and how many samples each one has.
        self.classes_, class_counts = np.unique(y, return_counts=True)

        # Log prior = log of the fraction of samples belonging to each class.
        self.class_log_prior_ = np.log(class_counts / y.shape[0])

        n_features = X.shape[1]
        feature_count = np.zeros((len(self.classes_), n_features))

        # Per-class feature totals; adding alpha keeps features never seen
        # with a class from getting probability zero (log(0) = -inf).
        for i, c in enumerate(self.classes_):
            feature_count[i, :] = X[y == c].sum(axis=0) + self.alpha

        # Normalise each class row so it sums to one, then move to log space.
        feature_totals = feature_count.sum(axis=1, keepdims=True)
        self.feature_log_prob_ = np.log(feature_count / feature_totals)

    def predict(self, X):
        """Return the most probable class label for each row of ``X``.

        Parameters
        ----------
        X : 2-D array with the same number of columns as the data passed
            to ``fit``.

        Returns
        -------
        1-D array of labels taken from ``classes_``, one per row of ``X``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.classes_ is None:
            raise RuntimeError("NaiveBayesClassifier.predict called before fit")

        # One matrix product scores all classes at once:
        # (n_samples, n_features) . (n_features, n_classes) + (n_classes,)
        log_probs = np.dot(X, self.feature_log_prob_.T) + self.class_log_prior_

        # Return the class with the highest log probability
        return self.classes_[np.argmax(log_probs, axis=1)]

## III) Model fitting and predictions

In [6]:
# Instantiate the Naive Bayes classifier and fit it on the training split
# only; the validation split is kept aside for evaluation.
nb_classifier = NaiveBayesClassifier()
nb_classifier.fit(X_train, y_train)

In [7]:
# Label every sample of the held-out validation split.
y_val_pred = nb_classifier.predict(X_val)

# Accuracy = fraction of validation samples whose predicted label matches
# the true label (mean of the element-wise boolean comparison).
accuracy = (y_val_pred == y_val).mean()
print(f"Validation Accuracy: {accuracy}")


Validation Accuracy: 0.7628647214854112


In [8]:
# Macro-averaged F1 score (unweighted mean of the per-class F1 scores)
def f1_score_macro(y_true, y_pred):
    """Return the macro-averaged F1 score of ``y_pred`` against ``y_true``.

    A per-class F1 (harmonic mean of precision and recall) is computed for
    every label that occurs in ``y_true`` or ``y_pred`` and the scores are
    averaged with equal weight. A label that is only ever predicted counts
    as a class with F1 = 0 instead of being ignored, so spurious predictions
    lower the score.

    Parameters
    ----------
    y_true : 1-D array-like of ground-truth labels.
    y_pred : 1-D array-like of predicted labels, same shape as ``y_true``.

    Returns
    -------
    float
        Macro F1 in [0, 1]; 0.0 when the inputs are empty.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` have different shapes.
    """
    # Convert first: comparing a plain list with a scalar yields a single
    # bool instead of an element-wise mask.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {y_true.shape} and {y_pred.shape}"
        )

    # Score every label seen in either array, not just the true ones.
    labels = np.union1d(y_true, y_pred)
    if labels.size == 0:
        return 0.0

    f1_scores = []
    for cls in labels:
        is_true = y_true == cls
        is_pred = y_pred == cls

        tp = np.sum(is_pred & is_true)    # True Positives
        fp = np.sum(is_pred & ~is_true)   # False Positives
        fn = np.sum(~is_pred & is_true)   # False Negatives

        # Guard the divisions: a class never predicted has no precision,
        # a class never present has no recall; both are scored as 0.
        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0

        if precision + recall > 0:
            f1 = 2 * (precision * recall) / (precision + recall)
        else:
            f1 = 0

        f1_scores.append(f1)

    # Macro F1 score is the average of the F1 scores for each class
    return float(np.mean(f1_scores))

# Macro F1 on the validation split: the unweighted mean of the per-class
# F1 scores, comparing true labels against the model's predictions.
macro_f1 = f1_score_macro(y_val, y_val_pred)
print(f"Macro F1 Score: {macro_f1}")


Macro F1 Score: 0.7085080790374689


In [32]:
# Predict a label for every row of the test set.
y_test_pred = nb_classifier.predict(data_test)

# Submission IDs are simply the row positions 0 .. n-1.
IDs = np.arange(len(y_test_pred))

# Two-column table: ID in the first column, predicted label in the second.
output = np.hstack((IDs.reshape(-1, 1), y_test_pred.reshape(-1, 1)))

# Write the submission file: integer-formatted values, with an
# 'ID,label' header row (comments='' stops NumPy prefixing it with '# ').
np.savetxt(
    'test_predictions.csv',
    output,
    fmt='%d',
    delimiter=',',
    header='ID,label',
    comments='',
)