In [1]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the 20 Newsgroups corpus with default arguments; this cell is only used
# to inspect the available category names (displayed as the cell output).
from sklearn.datasets import fetch_20newsgroups

data = fetch_20newsgroups()
# Last expression of the cell, so the list of category names is rendered.
data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [3]:
# Restrict the experiment to four newsgroups and fetch the train and test
# subsets for exactly that selection.
categories = [
    'talk.religion.misc',
    'soc.religion.christian',
    'sci.space',
    'comp.graphics',
]
train = fetch_20newsgroups(categories=categories, subset='train')
test = fetch_20newsgroups(categories=categories, subset='test')

In [4]:
# Show the first training post in full (no slicing/truncation is applied here).
print(train.data[0])
# Integer class id of that post.
# NOTE(review): the previous note claimed this is 1 / soc.religion.christian;
# confirm with train.target_names[train.target[0]] before relying on it.
print("Target: ", train.target[0])

From: jono@mac-ak-24.rtsg.mot.com (Jon Ogden)
Subject: Re: Losing your temper is not a Christian trait
Organization: Motorola LPA Development
Lines: 26

In article <Apr.23.02.55.47.1993.3138@geneva.rutgers.edu>, jcj@tellabs.com
(jcj) wrote:

> I'd like to remind people of the withering of the fig tree and Jesus
> driving the money changers et. al. out of the temple.  I think those
> were two instances of Christ showing anger (as part of His human side).
> 
Yes, and what about Paul saying:

26 Be ye angry, and sin not: let not the sun go down upon your wrath:
(Ephesians 4:26).

Obviously then, we can be angry w/o sinning.

Jon

------------------------------------------------
Jon Ogden         - jono@mac-ak-24.rtsg.mot.com
Motorola Cellular - Advanced Products Division
Voice: 708-632-2521      Data: 708-632-6086
------------------------------------------------

They drew a circle and shut him out.
Heretic, Rebel, a thing to flout.
But Love and I had the wit to win;
We drew a circle and 

In [5]:
class Multinomial_Naive:
    """Multinomial Naive Bayes classifier for bag-of-words style features.

    The class keeps no state: ``fit`` returns the learned parameters
    (``priors``, ``likelihoods``) and ``predict`` takes them back as
    arguments.
    """

    def likelihood(self, X_train, laplace=1):
        """Return smoothed per-feature probabilities for one class.

        Parameters
        ----------
        X_train : 2-D array or sparse matrix
            The rows that belong to a single class.
        laplace : number, default 1
            Additive smoothing constant added to every feature total.

        Returns
        -------
        The smoothed column totals divided by their overall sum, so the
        entries add up to 1.
        """
        # Compute the smoothed totals once and reuse them for the normaliser.
        smoothed = X_train.sum(axis=0) + laplace
        return smoothed / np.sum(smoothed)

    def prior(self, X_train, m):
        """Return the fraction of the ``m`` training rows found in ``X_train``."""
        return X_train.shape[0] / m

    def select_method(self, train, test, method=None):
        """Vectorize the train/test documents with the requested vectorizer.

        Parameters
        ----------
        train, test : objects exposing ``.data`` (documents) and ``.target``
            (labels).
        method : {"TfidfVectorizer", "CountVectorizer"}
            Name of the vectorizer to use.

        Returns
        -------
        X_train, y_train, X_test, y_test
            ``X_train`` is returned as produced by the vectorizer; ``X_test``
            is converted with ``.toarray()``.

        Raises
        ------
        ValueError
            If ``method`` is not one of the supported names (previously this
            surfaced as an UnboundLocalError at the ``return`` statement).
        """
        if method == "TfidfVectorizer":
            vectorizer = TfidfVectorizer()
        elif method == "CountVectorizer":
            vectorizer = CountVectorizer()
        else:
            raise ValueError(
                "method must be 'TfidfVectorizer' or 'CountVectorizer', "
                f"got {method!r}"
            )

        # The vocabulary is learned on the training documents only and then
        # re-used for the test documents.
        X_train = vectorizer.fit_transform(train.data)
        X_test = vectorizer.transform(test.data).toarray()

        return X_train, train.target, X_test, test.target

    def fit(self, X_train, y_train):
        """Estimate class priors and per-class feature likelihoods.

        Parameters
        ----------
        X_train : 2-D array or sparse matrix of shape (m, n)
        y_train : array-like of length m holding the class label of each row

        Returns
        -------
        priors : ndarray of shape (k,)
        likelihoods : ndarray of shape (k, n)
            Row ``i`` of both arrays corresponds to ``np.unique(y_train)[i]``.
        """
        y_train = np.asarray(y_train)  # makes the boolean mask below work for lists too
        m, n = X_train.shape
        classes = np.unique(y_train)
        k = len(classes)

        priors = np.zeros(k)
        likelihoods = np.zeros((k, n))

        for idx, label in enumerate(classes):
            X_train_c = X_train[y_train == label]
            priors[idx] = self.prior(X_train_c, m)
            # Flatten so a (1, n) result (sparse input) and an (n,) result
            # (dense input) are stored the same way.
            likelihoods[idx, :] = np.asarray(self.likelihood(X_train_c)).ravel()

        return priors, likelihoods

    def predict(self, X_test, priors, likelihoods, classes=None):
        """Return the unnormalised log-posterior score of every class.

        Parameters
        ----------
        X_test : 2-D array of shape (rows, n)
        priors, likelihoods : the values returned by ``fit``
        classes : unused; accepted only so existing call sites keep working.

        Returns
        -------
        Array of shape (rows, k). The caller picks the predicted class with
        ``np.argmax(..., axis=1)``; the resulting index refers to the class
        order used by ``fit``.
        """
        return np.log(priors) + X_test @ np.log(likelihoods.T)

In [6]:
# Train and score the classifier on raw token counts (CountVectorizer).
model = Multinomial_Naive()
X_train, y_train, X_test, y_test = model.select_method(train, test, method = "CountVectorizer")
priors, likelihoods = model.fit(X_train, y_train)

# fit() orders its rows by np.unique(y_train), so the label lookup has to be
# built from the training labels rather than from the test labels.
classes = np.unique(y_train)
yhat = model.predict(X_test, priors, likelihoods, classes)  # per-class log scores
# argmax yields a column index; map it back to the actual class label.
yhat = classes[np.argmax(yhat, axis=1)]

In [7]:
from sklearn.preprocessing import label_binarize
from sklearn.metrics import average_precision_score, classification_report

# Derive the label set once from the data instead of hard-coding [0, 1, 2, 3],
# so the binarised columns and n_classes always agree.
class_labels = np.unique(y_test)
n_classes = len(class_labels)

print("Accuracy: ", np.sum(yhat == y_test)/len(y_test))

print("Average precision score : ")
# One indicator column per class, for the true and for the predicted labels.
y_test_binarized = label_binarize(y_test, classes=class_labels)
yhat_binarized = label_binarize(yhat, classes=class_labels)

# yhat holds hard labels, so each score below is computed from 0/1 predictions
# rather than from continuous confidence scores.
for i in range(n_classes):
    class_score = average_precision_score(y_test_binarized[:, i], yhat_binarized[:, i])
    print(f"Class {i} score: ", class_score)
    
print("Classification Report: ", classification_report(y_test, yhat))

Accuracy:  0.9168994413407822
Average precision score : 
Class 0 score:  0.9152047938418233
Class 1 score:  0.9069918620723723
Class 2 score:  0.8429395016564877
Class 3 score:  0.7277310085946386
Classification Report:                precision    recall  f1-score   support

           0       0.95      0.95      0.95       389
           1       0.94      0.96      0.95       394
           2       0.87      0.95      0.91       398
           3       0.92      0.74      0.82       251

    accuracy                           0.92      1432
   macro avg       0.92      0.90      0.91      1432
weighted avg       0.92      0.92      0.92      1432



In [8]:
# Train and score the classifier on TF-IDF weighted features (TfidfVectorizer).
model = Multinomial_Naive()
X_train, y_train, X_test, y_test = model.select_method(train, test, method = "TfidfVectorizer")
priors, likelihoods = model.fit(X_train, y_train)

# fit() orders its rows by np.unique(y_train), so the label lookup has to be
# built from the training labels rather than from the test labels.
classes = np.unique(y_train)
yhat = model.predict(X_test, priors, likelihoods, classes)  # per-class log scores
# argmax yields a column index; map it back to the actual class label.
yhat = classes[np.argmax(yhat, axis=1)]

In [30]:
from sklearn.preprocessing import label_binarize
from sklearn.metrics import average_precision_score, classification_report

# Derive the label set once from the data instead of hard-coding [0, 1, 2, 3],
# so the binarised columns and n_classes always agree.
class_labels = np.unique(y_test)
n_classes = len(class_labels)

print("Accuracy: ", np.sum(yhat == y_test)/len(y_test))

print("Average precision score : ")
# One indicator column per class, for the true and for the predicted labels.
y_test_binarized = label_binarize(y_test, classes=class_labels)
yhat_binarized = label_binarize(yhat, classes=class_labels)

# yhat holds hard labels, so each score below is computed from 0/1 predictions
# rather than from continuous confidence scores.
for i in range(n_classes):
    class_score = average_precision_score(y_test_binarized[:, i], yhat_binarized[:, i])
    print(f"Class {i} score: ", class_score)
    
print("Classification Report: ", classification_report(y_test, yhat))

Accuracy:  0.8016759776536313
Average precision score : 
Class 0 score:  0.888341920518241
Class 1 score:  0.8744630809734135
Class 2 score:  0.6122064043881043
Class 3 score:  0.332994836297269
Classification Report:                precision    recall  f1-score   support

           0       0.97      0.88      0.92       389
           1       0.92      0.92      0.92       394
           2       0.62      0.98      0.76       398
           3       1.00      0.19      0.32       251

    accuracy                           0.80      1432
   macro avg       0.88      0.75      0.73      1432
weighted avg       0.86      0.80      0.77      1432

