In [75]:
import numpy as np

# Load the spam datasets. The path is handed straight to np.loadtxt so that
# NumPy opens and closes the file itself and no file handle is left open.
training_data = np.loadtxt("data/training_spam.csv", delimiter=",")
testing_data = np.loadtxt("data/testing_spam.csv", delimiter=",")
print("Shape of training data:", training_data.shape)
print("Shape of testing data:", testing_data.shape)

# Split each dataset into features and labels:
# column 0 is taken as the class label, all remaining columns as features.
X_tr = training_data[:, 1:]  # Training features
y_tr = training_data[:, 0]  # Training labels

X_tes = testing_data[:, 1:]  # Test features
y_tes = testing_data[:, 0]  # Test labels



# --- Class priors P(c) ---
# Distinct class labels present in the training labels.
class_labels = np.unique(y_tr)
# Prior of a class = fraction of training samples that carry that label.
prior_probabilities = {label: np.mean(y_tr == label) for label in class_labels}

# --- Per-class feature likelihoods P(feature | c) with Laplace smoothing ---
# Additive smoothing constant: a feature whose total is zero within a class
# still receives a non-zero likelihood (its log is taken at prediction time).
laplace_alpha = 1
feature_likelihoods = {}  # maps class label -> array of per-feature likelihoods
n_features = X_tr.shape[1]

for label in class_labels:
    # Rows of the training features whose label is the current class.
    feature_subset = X_tr[y_tr == label]

    # Smoothed estimate for every feature:
    #   (per-feature total in this class + alpha)
    #   / (grand total over all features in this class + alpha * number of features)
    smoothed_feature_totals = feature_subset.sum(axis=0) + laplace_alpha
    smoothed_class_total = feature_subset.sum() + laplace_alpha * n_features
    likelihood = smoothed_feature_totals / smoothed_class_total

    # Keep the likelihood vector for this class.
    feature_likelihoods[label] = likelihood




def predict(X_new, prior_probabilities, feature_likelihoods):
    """Predict a class label for every sample in ``X_new``.

    Each class ``c`` is scored with its log posterior (up to an additive
    constant)::

        log P(c) + sum_i x_i * log P(feature_i | c)

    and the class with the highest score is chosen. Scores are computed in
    log space so that the product of many small probabilities does not
    underflow to zero.

    Parameters
    ----------
    X_new : iterable of array-like
        Samples to classify; each sample is a vector of feature values with
        the same length as the likelihood vectors.
    prior_probabilities : dict
        Maps each class label to its prior probability.
    feature_likelihoods : dict
        Maps each class label (same keys as ``prior_probabilities``) to an
        array of per-feature likelihoods.

    Returns
    -------
    numpy.ndarray
        The predicted class label for each sample, in input order. Labels are
        the keys of ``prior_probabilities`` (not positional indices). An empty
        ``X_new`` yields an empty array.

    Raises
    ------
    KeyError
        If a label in ``prior_probabilities`` is missing from
        ``feature_likelihoods``.
    """
    # Take the class order from the priors dict itself so that priors,
    # likelihoods and the returned labels are guaranteed to line up, without
    # depending on any module-level variable.
    labels = list(prior_probabilities)
    log_priors = np.log([prior_probabilities[label] for label in labels])

    # The log of each class's likelihood vector does not depend on the sample,
    # so compute it once instead of once per sample.
    log_feature_likelihoods = [np.log(feature_likelihoods[label]) for label in labels]

    predictions = []
    for x in X_new:
        # Log likelihood of the sample under each class: feature log
        # likelihoods weighted by the sample's feature values.
        log_likelihoods = np.array(
            [np.sum(class_log_lik * x) for class_log_lik in log_feature_likelihoods]
        )

        # Unnormalised log posterior; the normalising constant is the same for
        # every class and therefore irrelevant to the argmax.
        log_posterior = log_priors + log_likelihoods

        # Map the winning position back to its actual class label.
        predictions.append(labels[np.argmax(log_posterior)])

    return np.array(predictions)



def calculate_accuracy(predictions, actual_classes):
    """Return the fraction of predictions that equal the actual classes.

    Both arguments are converted to NumPy arrays and compared element-wise;
    the number of matches is divided by the number of actual classes.
    """
    predicted = np.asarray(predictions)
    expected = np.asarray(actual_classes)

    # Count of positions where prediction and ground truth agree.
    n_correct = np.sum(predicted == expected)

    return n_correct / len(expected)



# Classify the test samples, then report the share that was classified correctly.
predictions = predict(
    X_tes,
    prior_probabilities=prior_probabilities,
    feature_likelihoods=feature_likelihoods,
)
accuracy = calculate_accuracy(predictions=predictions, actual_classes=y_tes)
print(f"Accuracy: {accuracy:.2f}")

Shape of training data: (1000, 55)
Shape of testing data: (500, 55)
Accuracy: 0.90


In [76]:
# Display the per-class feature likelihood arrays stored in feature_likelihoods.
feature_likelihoods

{0.0: array([0.02152484, 0.01227117, 0.03419835, 0.00040233, 0.02816335,
        0.01287467, 0.00261517, 0.00824784, 0.01086301, 0.02051901,
        0.00764434, 0.05310803, 0.01629451, 0.00563267, 0.002414  ,
        0.01227117, 0.01307584, 0.01629451, 0.06920137, 0.00261517,
        0.04224502, 0.001207  , 0.00341984, 0.00261517, 0.04667069,
        0.03460068, 0.03299135, 0.01931201, 0.01629451, 0.01830618,
        0.01247234, 0.009656  , 0.01367934, 0.00985717, 0.01911084,
        0.02072018, 0.03540535, 0.002414  , 0.01408167, 0.01227117,
        0.00563267, 0.01528867, 0.01307584, 0.01247234, 0.03641118,
        0.02112251, 0.002414  , 0.00744317, 0.02273184, 0.0683967 ,
        0.01810501, 0.03580768, 0.01287467, 0.01066184]),
 1.0: array([0.02892562, 0.02380952, 0.04565132, 0.00137741, 0.04998032,
        0.03168044, 0.03109012, 0.02754821, 0.02282566, 0.03620622,
        0.02282566, 0.04781582, 0.02459662, 0.00964187, 0.01082251,
        0.04348682, 0.0334514 , 0.02676112, 0.06

In [77]:
# Display the prior probability stored for each class label.
prior_probabilities

{0.0: 0.613, 1.0: 0.387}