### Part a

#### Reads data set

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

freq = pd.read_table("bbc/bbc.mtx", skiprows=2, sep=' ', header=None).astype(int).to_numpy()
classes = pd.read_table("bbc/bbc.classes", index_col=False, skiprows=4, sep=' ', header=None).astype(int).to_numpy()
terms = pd.read_table("bbc/bbc.terms", header=None).to_numpy()

doc_num = classes.shape[0]
feature_num = terms.shape[0]
data = np.zeros((doc_num, feature_num + 1), dtype=int)

# Populate data array
for row in freq:
    term_id, doc_id, frequency = row
    # Assuming doc_id and term_id are 1-indexed, decrement them to make them 0-indexed
    doc_id -= 1
    term_id -= 1
    data[int(doc_id), int(term_id)] = int(frequency)

# Populate the last column of data with class labels
data[:, -1] = classes[:, 1]

In [2]:
X_train, X_test = train_test_split(data, test_size=0.3)

# Separate features and target
X_train_features = X_train[:, :-1]
y_train = X_train[:, -1]

X_test_features = X_test[:, :-1]
y_test = X_test[:, -1]

#### Training

In [3]:
from collections import defaultdict

def train_naive_bayes(X_train, y_train):

    num_classes = 5
    num_features = X_train.shape[1]
    # Convert term frequencies to binary representation
    binary_train_data =  np.where(X_train > 0, 1, 0)

    
    class_probs = [1/2,1/2,1/2,1/2,1/2]
    class_count=[0,0,0,0,0]
    for label in y_train:
        class_count[label] += 1
    total_docs = len(y_train)

    for c in range(5):
        class_probs[c] =  class_count[c] / total_docs


    # # Calculate term probabilities with Laplace smoothing
    term_occ = np.zeros((num_classes, num_features))
    aij = np.zeros((num_classes, num_features))

    for i in range(len(binary_train_data)):
        label = y_train[i]
        term_occ[label] += binary_train_data[i]

    
    for label in range(5):
        for i in range(num_features):
            if term_occ[label][i] == 0:
                # Apply Laplace smoothing only if the term probability is 0
                aij[label][i] = 1 / (class_count[label] + 2)
            else:
                # If the term probability is not 0, use the original probability
                aij[label][i] = term_occ[label][i] / class_count[label]

    
    return class_probs, aij


#### Predict

In [4]:
def predict_naive_bayes(X_test, class_probs, term_probs):
    predictions = []
    num_features = X_test.shape[1]
    # Convert term frequencies to binary representation
    binary_test_data =  np.where(X_test > 0, 1, 0)


    for k in range(len(binary_test_data)):
        alpha = [0,0,0,0,0]
        for j in range(5):
            for i in range(num_features):
                if binary_test_data[k][i] != 0:
                    alpha[j] += np.log(term_probs[j][i])
                else:
                    alpha[j] += np.log(1-term_probs[j][i])
            alpha[j] += np.log(class_probs[j])
        gamma = np.min(alpha)
        posterier = [0,0,0,0,0]
        exp = np.sum(np.exp(alpha-gamma))
        for j in range(5):
            posterier[j] = np.exp(alpha[j] - gamma)/exp
        predictions.append(np.argmax(posterier))
    return np.array(predictions)


# Train the model
class_probs, term_probs = train_naive_bayes(X_train_features, y_train)

# Predict on the test set
y_pred = predict_naive_bayes(X_test_features, class_probs, term_probs)

#### Accuracy

In [5]:
train_predictions = predict_naive_bayes(X_train_features, class_probs, term_probs)
test_predictions = predict_naive_bayes(X_test_features, class_probs, term_probs)

# Compare predictions with actual labels
train_accuracy = np.mean(train_predictions == y_train)
test_accuracy = np.mean(test_predictions == y_test)

# Print classification accuracy
print(f"Training Accuracy: {train_accuracy * 100:.2f}%")
print(f"Testing Accuracy: {test_accuracy * 100:.2f}%")


Training Accuracy: 98.65%
Testing Accuracy: 96.26%


### Part b

#### Load


In [7]:
freq = pd.read_table("bbc/bbc.mtx", skiprows=2, sep=' ', header=None).astype(int).to_numpy()
classes = pd.read_table("bbc/bbc.classes", index_col=False, skiprows=4, sep=' ', header=None).astype(int).to_numpy()
terms = pd.read_table("bbc/bbc.terms", header=None).to_numpy()

doc_num = classes.shape[0]
feature_num = terms.shape[0]
data = np.zeros((doc_num, feature_num + 1), dtype=int)

# Populate data array
for row in freq:
    term_id, doc_id, frequency = row
    # Assuming doc_id and term_id are 1-indexed, decrement them to make them 0-indexed
    doc_id -= 1
    term_id -= 1
    data[int(doc_id), int(term_id)] = int(frequency)

# Populate the last column of data with class labels
data[:, -1] = classes[:, 1]
X_train, X_test = train_test_split(data, test_size=0.3)

# Separate features and target
X_train_features = X_train[:, :-1]
y_train = X_train[:, -1]

X_test_features = X_test[:, :-1]
y_test = X_test[:, -1]

#### Training

In [13]:
X_test_features

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [2, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [88]:
def calculate_class_statistics(X_train, y_train):
    unique_classes = np.unique(y_train)
    class_stats = {}

    for class_ in unique_classes:
        class_features = X_train[y_train == class_]
        class_stats[class_] = {
            'mean': np.mean(class_features, axis=0),
            'covariance': np.cov(class_features, rowvar=False)
        }

    return class_stats


In [89]:
def log_multivariate_gaussian_pdf(x, mu, sigma):
    n = mu.shape[0]
    log_det_sigma = calculate_log_det(sigma)
    sigma_inv = np.linalg.inv(sigma + np.eye(sigma.shape[0]) * 0.0001)  # Regularized inverse
    log_normalization_factor = -0.5 * (n * np.log(2 * np.pi) + log_det_sigma)

    diff = x - mu
    exponent = -0.5 * np.dot(np.dot(diff.T, sigma_inv), diff)

    return log_normalization_factor + exponent


In [90]:
def gcc_classify(X, class_stats, priors):
    predictions = []
    for doc in X:
        class_probs = {}
        for class_, stats in class_stats.items():
            class_prob = np.log(priors[class_])
            mean, covariance = stats['mean'], stats['covariance']
            # Regularize the covariance matrix
            class_prob += log_multivariate_gaussian_pdf(doc, mean, covariance)
            class_probs[class_] = class_prob
        predictions.append(max(class_probs, key=class_probs.get))
    return np.array(predictions)


In [91]:
def calculate_priors(y_train):
    unique_classes, counts = np.unique(y_train, return_counts=True)
    total_docs = len(y_train)
    priors = {class_: count / total_docs for class_, count in zip(unique_classes, counts)}
    return priors


In [92]:
def calculate_log_det(sigma):
    # Regularize the covariance matrix
    regularized_sigma = sigma + np.eye(sigma.shape[0]) * 0.0001
    det = np.linalg.det(regularized_sigma)
    if det <= 0:
        # Handling non-positive determinant by returning a very large negative number
        return -np.inf
    return np.log(det)


In [93]:
# Calculate priors, class statistics
priors = calculate_priors(y_train)
class_stats = calculate_class_statistics(X_train_features, y_train)

# Predict on test set
gcc_predictions = gcc_classify(X_test_features, class_stats, priors)

# Evaluate the classifier
accuracy = np.mean(gcc_predictions == y_test)
print("GCC Classifier Accuracy:", accuracy)


KeyboardInterrupt: 

#### Accuracy