### Part a

#### Reads data set

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

freq = pd.read_table("bbc/bbc.mtx", skiprows=2, sep=' ', header=None).astype(int).to_numpy()
classes = pd.read_table("bbc/bbc.classes", index_col=False, skiprows=4, sep=' ', header=None).astype(int).to_numpy()
terms = pd.read_table("bbc/bbc.terms", header=None).to_numpy()

In [3]:
freq

array([[   1,    1,    1],
       [   1,    7,    2],
       [   1,   11,    1],
       ...,
       [9635, 2184,    1],
       [9635, 2186,    1],
       [9635, 2218,    1]])

In [4]:
classes

array([[   0,    0],
       [   1,    0],
       [   2,    0],
       ...,
       [2222,    4],
       [2223,    4],
       [2224,    4]])

In [5]:
terms

array([['ad'],
       ['sale'],
       ['boost'],
       ...,
       ['peripher'],
       ['headphon'],
       ['flavour']], dtype=object)

In [1]:
doc_num = classes.shape[0]
feature_num = terms.shape[0]
data = np.zeros((doc_num, feature_num + 1), dtype=int)

# Populate data array
for row in freq:
    term_id, doc_id, frequency = row
    # Assuming doc_id and term_id are 1-indexed, decrement them to make them 0-indexed
    doc_id -= 1
    term_id -= 1
    data[int(doc_id), int(term_id)] = int(frequency)

# Populate the last column of data with class labels
data[:, -1] = classes[:, 1]

X_train, X_test = train_test_split(data, test_size=0.3)

# Separate features and target
X_train_features = X_train[:, :-1]
y_train = X_train[:, -1]

#print(X_train)
#print(y_train)

#### Training

In [103]:
from collections import defaultdict

def train_naive_bayes(X_train, y_train):

    num_classes = 5
    num_features = X_train.shape[1] - 1
    
    # Convert term frequencies to binary representation
    binary_train_data = np.where(X_train[:, :-1] > 0, 1, 0)


     # Calculate class probabilities
    class_probs = defaultdict(int)

    for label in y_train:
        class_probs[label] += 1
    total_docs = len(y_train)

    for label in class_probs:
        class_probs[label] /= total_docs


    # # Calculate term probabilities with Laplace smoothing
    term_probs = np.zeros((num_classes, num_features))


    for i in range(len(binary_train_data)):
        label = y_train[i]
        term_probs[label] += binary_train_data[i]
    

    for label in range(num_classes):
        total_terms = num_features + 2  # Laplace smoothing
        term_probs[label] = (term_probs[label] + 1) / total_terms
    
    print(term_probs)
    return class_probs, term_probs


#### Predict

In [104]:
def predict_naive_bayes(X_test, class_probs, term_probs):
    predictions = []
    
    # Convert term frequencies to binary representation
    binary_test_data = np.where(X_test[:, :-1] > 0, 1, 0)

    for i in range(len(binary_test_data)):
        scores = defaultdict(float)
        for label, class_prob in class_probs.items():
            scores[label] += np.log(class_prob)
            for j in range(len(binary_test_data[i])):
                scores[label] += binary_test_data[i][j] * np.log(term_probs[label][j])

        predicted_label = max(scores, key=scores.get)
        predictions.append(predicted_label)

    return predictions



# Train the model
class_probabilities, conditional_probabilities = train_naive_bayes(X_train_features, y_train)

# Predict on the test set
y_pred = predict_naive_bayes(X_test[:, :-1], class_probabilities, conditional_probabilities)

[[2.31783283e-03 2.60756193e-03 1.44864552e-03 ... 2.41440919e-05
  2.41440919e-05 2.41440919e-05]
 [2.07373272e-03 1.61290323e-03 2.63331139e-04 ... 3.29163924e-05
  3.29163924e-05 3.29163924e-05]
 [2.48968395e-03 2.07473662e-04 5.53263100e-04 ... 2.30526292e-05
  2.30526292e-05 2.30526292e-05]
 [2.76536382e-03 5.43196464e-04 3.20979729e-04 ... 2.46907484e-05
  2.46907484e-05 2.46907484e-05]
 [1.78746594e-03 1.24250681e-03 2.83378747e-04 ... 8.71934605e-05
  6.53950954e-05 6.53950954e-05]]


#### Accuracy

In [None]:
def calculate_accuracy(expected, actual):
    correct_num=0
    for i in range(expected.shape[0]):
        if (expected[i,0] == actual[i,0]):
            correct_num += 1
    return correct_num/expected.shape[0]


# calculate accuracy of training data
accuracy_train = calculate_accuracy(NB_result_train, X_train[:,X_train.shape[1]-1].reshape((X_train.shape[0], 1)))
print("The accuracy of training data is ", accuracy_train, "\n")

# calculate accuracy of testing data
accuracy_test = calculate_accuracy(NB_result_test, X_test[:,X_test.shape[1]-1].reshape((X_test.shape[0], 1)))
print("The accuracy of testing data is ", accuracy_test, "\n")


### Part b

#### Load


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

freq = pd.read_table("bbc/bbc.mtx", skiprows=2, sep=' ', header=None).astype(int).to_numpy()
classes = pd.read_table("bbc/bbc.classes", index_col=False, skiprows=4, sep=' ', header=None).astype(int).to_numpy()
terms = pd.read_table("bbc/bbc.terms", header=None).to_numpy()

doc_num = classes.shape[0]
feature_num = terms.shape[0]
data = np.zeros((doc_num, feature_num + 1), dtype=int)

# Populate data array
for row in freq:
    term_id, doc_id, frequency = row
    # Assuming doc_id and term_id are 1-indexed, decrement them to make them 0-indexed
    doc_id -= 1
    term_id -= 1
    data[int(doc_id), int(term_id)] = int(frequency)

# Populate the last column of data with class labels
data[:, -1] = classes[:, 0]

X_train, X_test = train_test_split(data, test_size=0.3)



#### Training

In [None]:
def predict_naive_bayes(X, class_probabilities, conditional_probabilities):
    predictions = []
    for x in X:
        posteriors = []
        for c in class_probabilities.keys():
            prior = np.log(class_probabilities[c])
            conditional = np.log(conditional_probabilities[c] * x + (1 - conditional_probabilities[c]) * (1 - x)).sum()
            posterior = prior + conditional
            posteriors.append((c, posterior))
        predictions.append(max(posteriors, key=lambda x: x[1])[0])
    return np.array(predictions)

# Train the model
class_probabilities, conditional_probabilities = train_naive_bayes(X_train_features, y_train)

# Predict on the test set


#### Predict

#### Accuracy