### Part a

#### Read the data set

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Read the three BBC files: sparse (term, doc, count) triples, the
# per-document class table, and the vocabulary.
freq = (
    pd.read_table("bbc/bbc.mtx", skiprows=2, sep=' ', header=None)
    .astype(int)
    .to_numpy()
)
classes = (
    pd.read_table("bbc/bbc.classes", index_col=False, skiprows=4, sep=' ', header=None)
    .astype(int)
    .to_numpy()
)
terms = pd.read_table("bbc/bbc.terms", header=None).to_numpy()

# Dense matrix with one row per entry of `classes` and one column per entry
# of `terms`, plus a trailing column reserved for the class label.
doc_num, feature_num = classes.shape[0], terms.shape[0]
data = np.zeros((doc_num, feature_num + 1), dtype=int)

# Scatter the sparse counts into the dense matrix. The ids in the file are
# assumed to be 1-indexed, hence the "- 1" on both coordinates.
for term_id, doc_id, frequency in freq:
    data[int(doc_id) - 1, int(term_id) - 1] = int(frequency)

# Column 1 of the class table supplies the label stored in the last column.
data[:, -1] = classes[:, 1]

In [None]:
# Hold out 30% of the rows for testing.
X_train, X_test = train_test_split(data, test_size=0.3)

# The last column is the class label; every column before it is a feature.
X_train_features, y_train = X_train[:, :-1], X_train[:, -1]
X_test_features, y_test = X_test[:, :-1], X_test[:, -1]

#### Training

In [None]:
from collections import defaultdict

def train_naive_bayes(X_train, y_train, num_classes=5):
    """Fit a naive Bayes model on binarised (present/absent) term features.

    Every count in ``X_train`` is reduced to 1 if it is positive and 0
    otherwise, then per-class term probabilities are estimated with
    Laplace (add-one) smoothing applied uniformly:

        P(term i present | class c) = (n_ci + 1) / (n_c + 2)

    Smoothing every entry keeps all probabilities strictly inside (0, 1),
    so neither ``log(p)`` nor ``log(1 - p)`` is ever ``-inf`` at prediction
    time. Smoothing only the zero counts would leave a term seen in every
    document of a class at probability exactly 1.

    Args:
        X_train: 2-D array of term counts, one row per document.
        y_train: 1-D array of integer class labels in ``[0, num_classes)``.
        num_classes: Number of classes to model. Defaults to 5.

    Returns:
        A ``(class_probs, term_probs)`` tuple. ``class_probs`` is a list of
        ``num_classes`` prior probabilities (class frequency in
        ``y_train``). ``term_probs`` is a ``(num_classes, num_features)``
        float array of smoothed term-presence probabilities.

    Raises:
        ValueError: If ``y_train`` is empty.
        IndexError: If a label is ``>= num_classes``.
    """
    presence = (np.asarray(X_train) > 0).astype(float)
    labels = np.asarray(y_train)
    total_docs = labels.shape[0]
    if total_docs == 0:
        raise ValueError("cannot train naive Bayes on an empty training set")

    # One-hot membership matrix: membership[d, c] == 1 iff document d has
    # label c. Column sums give class sizes, and a single matrix product
    # gives the per-class document frequency of every term.
    membership = np.zeros((total_docs, num_classes))
    membership[np.arange(total_docs), labels] = 1.0

    class_count = membership.sum(axis=0)
    class_probs = (class_count / total_docs).tolist()

    term_occ = membership.T @ presence
    term_probs = (term_occ + 1.0) / (class_count[:, np.newaxis] + 2.0)

    return class_probs, term_probs


#### Predict

In [None]:
def predict_naive_bayes(X_test, class_probs, term_probs):
    """Predict the most probable class for each document.

    Counts in ``X_test`` are binarised (positive -> present). For every
    class the log joint probability

        log P(c) + sum_i [ b_i * log p_ci + (1 - b_i) * log(1 - p_ci) ]

    is computed with matrix products, and the class with the highest score
    is returned. The scores are compared directly in log space: normalising
    them into posteriors does not change the arg-max, and exponentiating
    log-scores that lie far apart overflows to ``inf``/``nan``.

    Args:
        X_test: 2-D array of term counts, one row per document.
        class_probs: Sequence of class prior probabilities; its length
            defines the number of classes.
        term_probs: ``(num_classes, num_features)`` array with the
            probability that each term is present given the class.

    Returns:
        1-D integer array with the predicted class index of each row.
    """
    presence = (np.asarray(X_test) > 0).astype(float)
    absence = 1.0 - presence
    term_probs = np.asarray(term_probs, dtype=float)
    priors = np.asarray(class_probs, dtype=float)

    # Probabilities of exactly 0 or 1 legitimately give log(0) = -inf.
    with np.errstate(divide="ignore"):
        log_present = np.log(term_probs)
        log_absent = np.log(1.0 - term_probs)
        log_prior = np.log(priors)

    # 0 * -inf is NaN inside a matrix product, so infinite entries are
    # zeroed out here and accounted for separately: a document that
    # triggers any "impossible" entry for a class scores -inf for it.
    impossible_present = np.isneginf(log_present)
    impossible_absent = np.isneginf(log_absent)
    log_present = np.where(impossible_present, 0.0, log_present)
    log_absent = np.where(impossible_absent, 0.0, log_absent)

    scores = presence @ log_present.T + absence @ log_absent.T
    violations = (
        presence @ impossible_present.T.astype(float)
        + absence @ impossible_absent.T.astype(float)
    )
    scores = np.where(violations > 0, -np.inf, scores) + log_prior

    return np.argmax(scores, axis=1)


# Fit the naive Bayes parameters on the training split.
class_probs, term_probs = train_naive_bayes(
    X_train=X_train_features,
    y_train=y_train,
)

# Label every document of the held-out split with the fitted parameters.
y_pred = predict_naive_bayes(
    X_test=X_test_features,
    class_probs=class_probs,
    term_probs=term_probs,
)

#### Accuracy

In [None]:
# Run the fitted model over both splits (training first, then test).
train_predictions, test_predictions = (
    predict_naive_bayes(split, class_probs, term_probs)
    for split in (X_train_features, X_test_features)
)

# Accuracy is the fraction of rows whose prediction equals the true label.
train_accuracy = np.mean(np.equal(train_predictions, y_train))
test_accuracy = np.mean(np.equal(test_predictions, y_test))

# Report both accuracies as percentages.
print(f"Training Accuracy: {train_accuracy * 100:.2f}%")
print(f"Testing Accuracy: {test_accuracy * 100:.2f}%")


### Part b

#### Load


In [52]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Reload the raw BBC files: sparse (term, doc, count) triples, the class
# table, and the vocabulary.
freq = (
    pd.read_table("bbc/bbc.mtx", skiprows=2, sep=' ', header=None)
    .astype(int)
    .to_numpy()
)
classes = (
    pd.read_table("bbc/bbc.classes", index_col=False, skiprows=4, sep=' ', header=None)
    .astype(int)
    .to_numpy()
)
terms = pd.read_table("bbc/bbc.terms", header=None).to_numpy()

# Dense count matrix with an extra final column for the class label.
doc_num, feature_num = classes.shape[0], terms.shape[0]
data = np.zeros((doc_num, feature_num + 1), dtype=int)

# Fill in the non-zero counts; file ids are assumed 1-indexed, so both
# coordinates are shifted down by one.
for term_id, doc_id, frequency in freq:
    data[int(doc_id) - 1, int(term_id) - 1] = int(frequency)

# Attach the labels, then hold out 20% of the rows for testing.
data[:, -1] = classes[:, 1]
X_train, X_test = train_test_split(data, test_size=0.2)

# Split each partition into its feature columns and its label column.
X_train_features, y_train = X_train[:, :-1], X_train[:, -1]
X_test_features, y_test = X_test[:, :-1], X_test[:, -1]

#### Training

In [53]:
# Bare expression: the notebook echoes the test-split feature matrix below.
X_test_features

array([[0, 0, 0, ..., 0, 0, 0],
       [2, 0, 3, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [59]:
def calculate_class_statistics(X_train, y_train, num_classes=5, reg=1e-4):
    """Estimate the mean and regularised inverse covariance of each class.

    For every class id in ``range(num_classes)`` the rows of ``X_train``
    carrying that label are selected, their feature-wise mean and sample
    covariance are computed, ``reg`` is added to the covariance diagonal so
    the matrix is invertible, and the inverse is stored.

    Only the means and the inverses are kept. The intermediate covariance
    matrices are dropped as soon as they have been inverted, because each
    one is ``num_features x num_features``.

    Args:
        X_train: 2-D array of features, one row per sample.
        y_train: 1-D array of integer class labels.
        num_classes: Number of classes to process. Defaults to 5.
        reg: Value added to each covariance diagonal entry before
            inversion. Defaults to 1e-4.

    Returns:
        A ``(class_means, inv_covs)`` tuple of arrays with shapes
        ``(num_classes, num_features)`` and
        ``(num_classes, num_features, num_features)``.

    Raises:
        ValueError: If a class has fewer than two samples, since its sample
            covariance is then undefined.
    """
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)

    class_means = []
    inv_covs = []

    for class_ in range(num_classes):
        class_features = X_train[y_train == class_]
        if class_features.shape[0] < 2:
            raise ValueError(
                f"class {class_} has {class_features.shape[0]} sample(s); "
                "at least 2 are needed to estimate a covariance"
            )

        class_means.append(np.mean(class_features, axis=0))

        # np.cov returns a 0-d array for a single feature; force 2-D so the
        # diagonal update and the inversion work for any feature count.
        covariance = np.atleast_2d(np.cov(class_features, rowvar=False))

        # Regularise in place instead of materialising reg * identity.
        covariance[np.diag_indices_from(covariance)] += reg

        inv_covs.append(np.linalg.inv(covariance))

    return np.array(class_means), np.array(inv_covs)




In [60]:
def calculate_priors(y_train, num_classes=None):
    """Return the log prior probability of every class id.

    Entry ``c`` of the result is ``log(count(y_train == c) / len(y_train))``.
    The result is indexed by class id, so a class that does not occur in
    ``y_train`` still occupies its slot (with value ``-inf``) and the
    entries after it are not shifted down.

    Args:
        y_train: 1-D array of non-negative integer class labels.
        num_classes: Minimum length of the returned array. When omitted,
            the length is ``max(y_train) + 1``.

    Returns:
        1-D float array of log-priors indexed by class id.
    """
    labels = np.asarray(y_train)
    counts = np.bincount(labels, minlength=num_classes or 0)
    priors = counts / len(y_train)

    # Absent classes have prior 0; log(0) = -inf is the intended value.
    with np.errstate(divide="ignore"):
        log_priors = np.log(priors)
    return log_priors

# Compute the log-priors of the current training labels and print them.
y = calculate_priors(y_train)
print(y)

[-1.47555546 -1.76405687 -1.69135503 -1.46577543 -1.68831089]


In [78]:
def log_multivariate_gaussian_pdf(x, mu, inv_sigma, log_det_sigma=None):
    """Evaluate the log of a multivariate Gaussian density at ``x``.

    By default only the quadratic (Mahalanobis) term
    ``-0.5 * (x - mu)^T inv_sigma (x - mu)`` is returned. This equals the
    log-density up to an additive term that depends on the covariance, so
    comparing the default result across Gaussians with different
    covariances ignores that term.

    When ``log_det_sigma`` is supplied the full log-density is returned:

        -0.5 * (n * log(2*pi) + log_det_sigma + quadratic)

    Args:
        x: 1-D array, the point to evaluate.
        mu: 1-D array, the mean (same length as ``x``).
        inv_sigma: 2-D array, the inverse covariance matrix.
        log_det_sigma: Optional log-determinant of the covariance matrix.

    Returns:
        A float: the quadratic term, or the full log-density when
        ``log_det_sigma`` is given.
    """
    diff = x - mu
    quadratic = diff.T @ inv_sigma @ diff

    if log_det_sigma is None:
        return -0.5 * quadratic

    n = mu.shape[0]
    return -0.5 * (n * np.log(2.0 * np.pi) + log_det_sigma + quadratic)


In [79]:
def gcc_classify(X, class_means, inv_covs, priors):
    """Assign each row of ``X`` to the class with the highest score.

    The score of class ``i`` for a row is
    ``priors[i] + log_multivariate_gaussian_pdf(row, class_means[i], inv_covs[i])``,
    so ``priors`` is expected to hold log-priors. The number of classes is
    taken from ``len(class_means)``.

    Args:
        X: Iterable of 1-D feature rows (e.g. a 2-D array).
        class_means: Per-class mean vectors, indexed by class id.
        inv_covs: Per-class inverse covariance matrices, indexed by class id.
        priors: Per-class log-priors, indexed by class id.

    Returns:
        1-D integer array with the predicted class id of each row.
    """
    num_classes = len(class_means)

    # Collect into a list and convert once: np.append copies the whole
    # array on every call, which is quadratic in the number of rows.
    predictions = []
    for row in X:
        scores = [
            priors[i] + log_multivariate_gaussian_pdf(row, class_means[i], inv_covs[i])
            for i in range(num_classes)
        ]
        predictions.append(int(np.argmax(scores)))

    return np.array(predictions, dtype=int)


In [84]:
# Estimate the log-priors and per-class Gaussian statistics from the
# training split.
priors = calculate_priors(y_train)
class_means, invcovs = calculate_class_statistics(X_train_features, y_train)

# Classify both splits with the fitted statistics.
X_test_predictions = gcc_classify(X_test_features, class_means, invcovs, priors)
X_train_predictions = gcc_classify(X_train_features, class_means, invcovs, priors)

# Accuracy is the fraction of rows whose prediction equals the true label.
# The training accuracy is stored under the same name that is printed, so
# the report cannot pick up a stale `train_accuracy` from an earlier cell.
test_accuracy = np.mean(X_test_predictions == y_test)
train_accuracy = np.mean(X_train_predictions == y_train)
print("Test set GCC Classifier Accuracy:", test_accuracy)
print("Train set GCC Classifier Accuracy:", train_accuracy)


#### Accuracy

[2. 2. 3. 3. 2. 4. 4. 0. 0. 1. 4. 1. 0. 3. 1. 1. 3. 2. 3. 3. 3. 3. 2. 4.
 3. 3. 2. 0. 1. 3. 1. 4. 0. 2. 3. 3. 3. 3. 4. 2. 1. 3. 2. 3. 1. 3. 2. 4.
 2. 3. 3. 2. 3. 3. 0. 0. 4. 1. 3. 3. 1. 3. 2. 2. 1. 4. 3. 2. 0. 3. 2. 4.
 3. 1. 1. 0. 0. 3. 1. 0. 1. 2. 3. 3. 0. 1. 2. 0. 1. 2. 2. 3. 2. 2. 3. 0.
 2. 1. 4. 3. 0. 2. 2. 0. 3. 0. 2. 0. 0. 2. 1. 2. 4. 4. 4. 1. 2. 2. 2. 2.
 3. 1. 0. 3. 3. 0. 2. 1. 3. 2. 0. 0. 4. 1. 1. 3. 0. 1. 3. 4. 3. 4. 0. 0.
 2. 1. 3. 2. 4. 4. 0. 0. 0. 4. 0. 2. 1. 1. 3. 3. 4. 0. 1. 2. 0. 0. 1. 1.
 4. 4. 0. 1. 0. 2. 3. 3. 1. 2. 3. 2. 0. 4. 3. 1. 0. 0. 3. 3. 2. 3. 0. 4.
 4. 1. 0. 4. 3. 2. 0. 3. 4. 3. 2. 0. 2. 0. 0. 0. 1. 2. 0. 4. 2. 4. 1. 1.
 0. 1. 3. 1. 3. 1. 0. 0. 0. 0. 3. 3. 3. 4. 2. 2. 1. 0. 4. 4. 0. 2. 2. 0.
 4. 3. 3. 3. 1. 3. 2. 3. 2. 1. 4. 0. 1. 0. 1. 3. 4. 3. 1. 4. 3. 3. 1. 3.
 1. 3. 0. 4. 0. 4. 4. 2. 3. 0. 4. 4. 2. 4. 3. 2. 1. 2. 3. 4. 2. 0. 3. 3.
 1. 0. 4. 2. 3. 2. 4. 4. 1. 0. 0. 2. 1. 3. 4. 0. 1. 2. 0. 0. 1. 4. 0. 3.
 2. 4. 3. 0. 1. 0. 1. 0. 3. 3. 0. 2. 0. 2. 0. 4. 4.