### Part a

#### Reads data set

In [119]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

freq = pd.read_table("bbc/bbc.mtx", skiprows=2, sep=' ', header=None).astype(int).to_numpy()
classes = pd.read_table("bbc/bbc.classes", index_col=False, skiprows=4, sep=' ', header=None).astype(int).to_numpy()
terms = pd.read_table("bbc/bbc.terms", header=None).to_numpy()

doc_num = classes.shape[0]
feature_num = terms.shape[0]
data = np.zeros((doc_num, feature_num + 1), dtype=int)

# Populate data array
for row in freq:
    term_id, doc_id, frequency = row
    # Assuming doc_id and term_id are 1-indexed, decrement them to make them 0-indexed
    doc_id -= 1
    term_id -= 1
    data[int(doc_id), int(term_id)] = int(frequency)

# Populate the last column of data with class labels
data[:, -1] = classes[:, 1]

X_train, X_test = train_test_split(data, test_size=0.3)

# Separate features and target
X_train_features = X_train[:, :-1]
y_train = X_train[:, -1]

X_test_features = X_test[:, :-1]
y_test = X_test[:, -1]



#### Training

In [120]:
from collections import defaultdict

def train_naive_bayes(X_train, y_train):

    num_classes = 5
    num_features = X_train.shape[1]
    # Convert term frequencies to binary representation
    binary_train_data =  np.where(X_train > 0, 1, 0)

    
    class_probs = [1/2,1/2,1/2,1/2,1/2]
    class_count=[0,0,0,0,0]
    for label in y_train:
        class_count[label] += 1
    total_docs = len(y_train)

    for c in range(5):
        class_probs[c] =  class_count[c] / total_docs


    # # Calculate term probabilities with Laplace smoothing
    term_occ = np.zeros((num_classes, num_features))
    aij = np.zeros((num_classes, num_features))

    for i in range(len(binary_train_data)):
        label = y_train[i]
        term_occ[label] += binary_train_data[i]

    
    for label in range(5):
        for i in range(num_features):
            if term_occ[label][i] == 0:
                # Apply Laplace smoothing only if the term probability is 0
                aij[label][i] = 1 / (class_count[label] + 2)
            else:
                # If the term probability is not 0, use the original probability
                aij[label][i] = term_occ[label][i] / class_count[label]

    
    return class_probs, aij


#### Predict

In [121]:
def predict_naive_bayes(X_test, class_probs, term_probs):
    predictions = []
    num_features = X_test.shape[1]
    # Convert term frequencies to binary representation
    binary_test_data =  np.where(X_test > 0, 1, 0)


    for k in range(len(binary_test_data)):
        alpha = [0,0,0,0,0]
        for j in range(5):
            for i in range(num_features):
                if binary_test_data[k][i] != 0:
                    alpha[j] += np.log(term_probs[j][i])
                else:
                    alpha[j] += np.log(1-term_probs[j][i])
            alpha[j] += np.log(class_probs[j])
        gamma = np.min(alpha)
        posterier = [0,0,0,0,0]
        exp = np.sum(np.exp(alpha-gamma))
        for j in range(5):
            posterier[j] = np.exp(alpha[j] - gamma)/exp
        predictions.append(np.argmax(posterier))
    return np.array(predictions)


# Train the model
class_probs, term_probs = train_naive_bayes(X_train_features, y_train)

# Predict on the test set
y_pred = predict_naive_bayes(X_test_features, class_probs, term_probs)

#### Accuracy

In [122]:
train_predictions = predict_naive_bayes(X_train_features, class_probs, term_probs)
test_predictions = predict_naive_bayes(X_test_features, class_probs, term_probs)

# Compare predictions with actual labels
train_accuracy = np.mean(train_predictions == y_train)
test_accuracy = np.mean(test_predictions == y_test)

# Print classification accuracy
print(f"Training Accuracy: {train_accuracy * 100:.2f}%")
print(f"Testing Accuracy: {test_accuracy * 100:.2f}%")


Training Accuracy: 98.46%
Testing Accuracy: 95.36%


### Part b

#### Load


In [123]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

freq = pd.read_table("bbc/bbc.mtx", skiprows=2, sep=' ', header=None).astype(int).to_numpy()
classes = pd.read_table("bbc/bbc.classes", index_col=False, skiprows=4, sep=' ', header=None).astype(int).to_numpy()
terms = pd.read_table("bbc/bbc.terms", header=None).to_numpy()

doc_num = classes.shape[0]
feature_num = terms.shape[0]
data = np.zeros((doc_num, feature_num + 1), dtype=int)

# Populate data array
for row in freq:
    term_id, doc_id, frequency = row
    # Assuming doc_id and term_id are 1-indexed, decrement them to make them 0-indexed
    doc_id -= 1
    term_id -= 1
    data[int(doc_id), int(term_id)] = int(frequency)

# Populate the last column of data with class labels
data[:, -1] = classes[:, 0]

X_train, X_test = train_test_split(data, test_size=0.3)



#### Training

In [124]:
def predict_naive_bayes(X, class_probabilities, conditional_probabilities):
    predictions = []
    for x in X:
        posteriors = []
        for c in class_probabilities.keys():
            prior = np.log(class_probabilities[c])
            conditional = np.log(conditional_probabilities[c] * x + (1 - conditional_probabilities[c]) * (1 - x)).sum()
            posterior = prior + conditional
            posteriors.append((c, posterior))
        predictions.append(max(posteriors, key=lambda x: x[1])[0])
    return np.array(predictions)

# Train the model
class_probabilities, conditional_probabilities = train_naive_bayes(X_train_features, y_train)

# Predict on the test set


#### Predict

#### Accuracy