In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn import metrics
from collections import defaultdict

In [2]:
# Load the previously pickled dataset into `df` (the later cells read its
# 'title' and 'ENCODED_CATEGORY' columns). The path is relative to the
# notebook's working directory.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only run this cell against a file from a trusted source.
file = 'pklFiles/df.pkl'
with open(file, 'rb') as fileobj:
    df = pickle.load(fileobj)

In [3]:
# Full (unsplit) inputs and targets: headline text and its integer-encoded label.
X, y = df['title'], df['ENCODED_CATEGORY']

In [4]:
# Seed NumPy's global RNG before shuffling so the row order (and therefore the
# split) is repeatable from a fresh kernel.
np.random.seed(42)
shuffled_df = df.sample(frac=1).reset_index(drop=True)

# The first `split_ratio` share of the shuffled rows is used for training,
# the remainder is held out for evaluation.
split_ratio = 0.8
split_index = int(len(shuffled_df) * split_ratio)

train_rows = shuffled_df.iloc[:split_index]
test_rows = shuffled_df.iloc[split_index:]

X_train, y_train = train_rows['title'], train_rows['ENCODED_CATEGORY']
X_test, y_test = test_rows['title'], test_rows['ENCODED_CATEGORY']

In [5]:
class CountVectorizer:
    """Minimal bag-of-words vectorizer.

    Each document is lower-cased and split on whitespace. Every distinct
    token gets one column; cell ``(i, j)`` of the output matrix holds how
    many times token ``j`` occurs in document ``i``.

    Attributes:
        vocabulary: dict mapping token -> column index, numbered in the
            order tokens were first seen during fitting.
    """

    def __init__(self):
        self.vocabulary = {}

    def _process_document(self, doc):
        """Return the lower-cased whitespace tokens of ``doc``.

        A missing value (``None`` / NaN, as detected by ``pd.isna``) yields
        an empty token list.
        """
        if pd.isna(doc):
            return []
        return str(doc).lower().split()

    def fit(self, documents):
        """Learn the vocabulary from ``documents`` and return ``self``.

        The vocabulary is rebuilt from scratch on every call, so refitting an
        existing instance does not keep columns left over from an earlier
        corpus.
        """
        vocabulary = {}
        for doc in documents:
            for word in self._process_document(doc):
                # len() is evaluated before insertion, so a new token gets
                # the next free column; known tokens are left untouched.
                vocabulary.setdefault(word, len(vocabulary))
        self.vocabulary = vocabulary
        return self

    def fit_transform(self, documents):
        """Learn the vocabulary from ``documents`` and return their count matrix.

        Args:
            documents: iterable of documents (strings or missing values).

        Returns:
            Float ndarray of shape ``(n_documents, len(self.vocabulary))``.
        """
        # Materialise once: the input is walked twice (fit, then transform)
        # and may be a one-shot iterator.
        documents = list(documents)
        return self.fit(documents).transform(documents)

    def transform(self, documents):
        """Return the count matrix of ``documents`` under the fitted vocabulary.

        Tokens that are not in the vocabulary are ignored.

        Args:
            documents: iterable of documents (strings or missing values).

        Returns:
            Float ndarray of shape ``(n_documents, len(self.vocabulary))``.
        """
        documents = list(documents)
        matrix = np.zeros((len(documents), len(self.vocabulary)))

        for i, doc in enumerate(documents):
            for word in self._process_document(doc):
                j = self.vocabulary.get(word)
                if j is not None:
                    matrix[i, j] += 1

        return matrix

In [6]:
class MultinomialNB:
    """Multinomial Naive Bayes classifier over count features.

    Args:
        alpha: additive smoothing constant applied to every feature count.
    """

    def __init__(self, alpha=1.0):
        self.alpha = alpha

    def fit(self, X, y):
        """Estimate log class priors and smoothed log feature probabilities.

        Args:
            X: 2-D count array, one row per sample and one column per feature.
            y: labels, one per row of ``X``.

        Sets ``classes``, ``class_count``, ``feature_count``,
        ``class_log_prior`` and ``feature_log_prob`` on the instance.
        """
        self.classes = np.unique(y)
        n_classes = len(self.classes)
        n_features = X.shape[1]

        self.class_count = np.zeros(n_classes)
        self.feature_count = np.zeros((n_classes, n_features))

        # Accumulate, per class, the number of samples and the summed counts
        # of each feature over that class's rows.
        for row, label in enumerate(self.classes):
            members = X[y == label]
            self.class_count[row] = members.shape[0]
            self.feature_count[row, :] = np.sum(members, axis=0)

        total_samples = np.sum(self.class_count)
        self.class_log_prior = np.log(self.class_count / total_samples)

        # Add alpha to every count; the denominator grows by alpha per
        # feature so each class's probabilities still sum to one.
        smoothed_counts = self.feature_count + self.alpha
        smoothed_totals = (
            self.feature_count.sum(axis=1, keepdims=True) + self.alpha * n_features
        )
        self.feature_log_prob = np.log(smoothed_counts / smoothed_totals)

    def predict(self, X):
        """Return, for each row of ``X``, the class with the highest joint log-probability."""
        joint_log_prob = X @ self.feature_log_prob.T + self.class_log_prior
        winners = np.argmax(joint_log_prob, axis=1)
        return self.classes[winners]

In [7]:
# Create a fresh vectorizer and classifier so re-running this cell starts
# from unfitted objects.
count_vectorizer = CountVectorizer()
nb_classifier = MultinomialNB()

# Learn the vocabulary from the training headlines only, then encode the test
# headlines with that same vocabulary (test-only words are ignored by
# transform), so both matrices share one column layout.
count_X_train = count_vectorizer.fit_transform(X_train)
count_X_test = count_vectorizer.transform(X_test)

# Train the classifier on the training count matrix and its labels.
nb_classifier.fit(count_X_train, y_train)

# Predict a label for every held-out headline.
pred = nb_classifier.predict(count_X_test)

# Report overall accuracy on the held-out split.
print("Multinomial Naive Bayes: \n")
a_score = metrics.accuracy_score(y_test, pred)
print(f"Accuracy Score: {a_score:.2%}\n")

# Confusion matrix of true labels (y_test) against predictions (pred).
c_matrix = metrics.confusion_matrix(y_test, pred)
print("Confusion Matrix: \n", c_matrix)

Multinomial Naive Bayes: 

Accuracy Score: 92.61%

Confusion Matrix: 
 [[105   0   1   1   7   0]
 [  1  74   2   2   7   0]
 [  2   0  82   2   2   0]
 [  0   1   0  95   1   0]
 [  2   2   0   2  95   0]
 [  0   0   0   1   0   0]]


In [8]:
# Display names for the integer labels stored in ENCODED_CATEGORY.
encoded_categories = {
    0: 'business',
    1: 'entertainment',
    2: 'politics',
    3: 'sport', 
    4: 'tech'
}

# Classify one headline typed by the user, reusing the fitted vectorizer and
# classifier from the training cell.
user_headline = input("Enter News Headline: ")
headline_counts = count_vectorizer.transform([user_headline])
predictions = nb_classifier.predict(headline_counts)

predicted_index = predictions[0]
# The classifier can return any label it saw during training, and the 6x6
# confusion matrix above shows a label beyond the five named here. Fall back
# to a placeholder instead of raising KeyError for an unmapped label.
predicted_category = encoded_categories.get(
    predicted_index, f"unknown (label {predicted_index})"
)
print("Predicted Category:", predicted_category)

Predicted Category: sport
