In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Twenty newsgroup labels, listed explicitly and passed to both the train and
# the test fetch so the two splits are restricted to the same label set.
categories = [
    'alt.atheism',
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
    'misc.forsale',
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'sci.crypt',
    'sci.electronics',
    'sci.med',
    'sci.space',
    'soc.religion.christian',
    'talk.politics.guns',
    'talk.politics.mideast',
    'talk.politics.misc',
    'talk.religion.misc',
]
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)

# Bag-of-words counts: the vocabulary is learned from the training documents
# only, and the test documents are mapped onto that same vocabulary.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(newsgroups_train.data)
X_test = vectorizer.transform(newsgroups_test.data)
y_train, y_test = newsgroups_train.target, newsgroups_test.target

# Reference model: scikit-learn's multinomial Naive Bayes with default settings.
clf = MultinomialNB().fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Overall accuracy on the held-out test split.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Per-class precision / recall / F1, labelled with the newsgroup names.
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=newsgroups_test.target_names))


Accuracy: 0.7728

Classification Report:
                          precision    recall  f1-score   support

             alt.atheism       0.79      0.77      0.78       319
           comp.graphics       0.67      0.74      0.70       389
 comp.os.ms-windows.misc       0.20      0.00      0.01       394
comp.sys.ibm.pc.hardware       0.56      0.77      0.65       392
   comp.sys.mac.hardware       0.84      0.75      0.79       385
          comp.windows.x       0.65      0.84      0.73       395
            misc.forsale       0.93      0.65      0.77       390
               rec.autos       0.87      0.91      0.89       396
         rec.motorcycles       0.96      0.92      0.94       398
      rec.sport.baseball       0.96      0.87      0.91       397
        rec.sport.hockey       0.93      0.96      0.95       399
               sci.crypt       0.67      0.95      0.78       396
         sci.electronics       0.79      0.66      0.72       393
                 sci.med       0.8

In [2]:
import numpy as np

class NaiveBayes:
    """Multinomial Naive Bayes classifier with additive smoothing.

    The model estimates, for every class ``c``:

    * a prior ``P(c)`` as the fraction of training rows labelled ``c``;
    * a per-feature likelihood
      ``P(f | c) = (count(f, c) + alpha) / (sum_f count(f, c) + alpha * n_features)``.

    Prediction picks the class with the highest log-posterior
    ``log P(c) + sum_f x_f * log P(f | c)``.

    Both dense array-likes and SciPy sparse matrices are accepted by
    ``fit`` and ``predict``, so a document-term matrix does not have to be
    densified first.

    Parameters
    ----------
    alpha : float, default=1.0
        Additive smoothing strength (1.0 is Laplace smoothing). Must be
        strictly positive so that no likelihood is zero.

    Attributes
    ----------
    classes : ndarray of shape (n_classes,) or None
        Sorted unique labels seen in ``fit``; ``None`` until fitted.
    class_probabilities : ndarray of shape (n_classes,) or None
        Class priors, aligned with ``classes``.
    feature_probabilities : ndarray of shape (n_classes, n_features) or None
        Smoothed per-class feature likelihoods, aligned with ``classes``.
    """

    def __init__(self, alpha=1.0):
        if alpha <= 0:
            raise ValueError("alpha must be strictly positive")
        self.alpha = alpha
        self.classes = None
        self.class_probabilities = None
        self.feature_probabilities = None

    @staticmethod
    def _as_matrix(X):
        """Return X as CSR if it is a SciPy sparse container, else as an ndarray."""
        # Duck-typed so this class does not need to import scipy itself.
        if hasattr(X, "tocsr"):
            return X.tocsr()
        return np.asarray(X)

    def fit(self, X, y):
        """Estimate class priors and smoothed feature likelihoods.

        Parameters
        ----------
        X : array-like or sparse matrix of shape (n_samples, n_features)
            Non-negative feature counts.
        y : array-like of shape (n_samples,)
            Class label of each row of ``X``.
        """
        X = self._as_matrix(X)
        y = np.asarray(y)
        n_samples, n_features = X.shape
        self.classes = np.unique(y)
        n_classes = len(self.classes)

        class_probabilities = np.zeros(n_classes)
        feature_probabilities = np.zeros((n_classes, n_features))
        for i, c in enumerate(self.classes):
            mask = y == c
            class_probabilities[i] = np.sum(mask) / n_samples

            # Sparse matrices return a (1, n_features) np.matrix from sum();
            # asarray + ravel normalises both the dense and sparse cases.
            counts = np.asarray(X[mask].sum(axis=0), dtype=float).ravel()
            feature_probabilities[i] = (counts + self.alpha) / (
                counts.sum() + self.alpha * n_features
            )

        self.class_probabilities = class_probabilities
        self.feature_probabilities = feature_probabilities

    def predict(self, X):
        """Return the most probable class label for each row of ``X``.

        Parameters
        ----------
        X : array-like or sparse matrix of shape (n_samples, n_features)
            Feature counts laid out like the matrix passed to ``fit``.

        Returns
        -------
        ndarray of shape (n_samples,)
            Predicted labels, drawn from ``classes``.

        Raises
        ------
        AttributeError
            If called before ``fit``.
        """
        if (
            getattr(self, "classes", None) is None
            or self.class_probabilities is None
            or self.feature_probabilities is None
        ):
            # AttributeError keeps the exception type an unfitted instance
            # raised before `classes` was initialised in __init__.
            raise AttributeError(
                "This NaiveBayes instance is not fitted yet; call fit() first."
            )

        X = self._as_matrix(X)

        # Take the logs once per call instead of once per (sample, class) pair.
        log_prior = np.log(self.class_probabilities)
        log_likelihood = np.log(self.feature_probabilities)

        # (n_samples, n_features) @ (n_features, n_classes): every
        # sum_f x_f * log P(f | c) in a single matrix product.
        scores = np.asarray(X @ log_likelihood.T) + log_prior
        return np.asarray(self.classes)[np.argmax(scores, axis=1)]


In [4]:
# Train the from-scratch NaiveBayes class on the same bag-of-words features
# and labels that the scikit-learn baseline was fitted on.
# NOTE(review): .toarray() expands the sparse count matrix into a dense array;
# presumably this is very large for the full vocabulary — confirm it fits in memory.
nb_scratch = NaiveBayes()
nb_scratch.fit(X_train.toarray(), y_train)

# Predict labels for the held-out test documents (densified the same way).
y_pred_scratch = nb_scratch.predict(X_test.toarray())

# Overall accuracy, for direct comparison with the baseline figure above.
accuracy_scratch = accuracy_score(y_test, y_pred_scratch)
print(f"Accuracy (Your Naive Bayes Implementation): {accuracy_scratch:.4f}")

# Per-class precision / recall / F1 for the from-scratch classifier.
print("\nClassification Report (Your Naive Bayes Implementation):")
print(classification_report(y_test, y_pred_scratch, target_names=newsgroups_test.target_names))


Accuracy (Your Naive Bayes Implementation): 0.7728

Classification Report (Your Naive Bayes Implementation):
                          precision    recall  f1-score   support

             alt.atheism       0.79      0.77      0.78       319
           comp.graphics       0.67      0.74      0.70       389
 comp.os.ms-windows.misc       0.20      0.00      0.01       394
comp.sys.ibm.pc.hardware       0.56      0.77      0.65       392
   comp.sys.mac.hardware       0.84      0.75      0.79       385
          comp.windows.x       0.65      0.84      0.73       395
            misc.forsale       0.93      0.65      0.77       390
               rec.autos       0.87      0.91      0.89       396
         rec.motorcycles       0.96      0.92      0.94       398
      rec.sport.baseball       0.96      0.87      0.91       397
        rec.sport.hockey       0.93      0.96      0.95       399
               sci.crypt       0.67      0.95      0.78       396
         sci.electronics       0