In [None]:
import re
import string
from collections import Counter

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
# Fetching the NLTK 'stopwords' corpus that string_clean() reads through stopwords.words("english").
nltk.download('stopwords')

In [None]:
# Loading the 20 Newsgroups corpus; the 'train' and 'test' subsets are kept separate for the whole notebook.
ng_trn = fetch_20newsgroups(subset = 'train')   # Importing the TRAINING SET
ng_tst = fetch_20newsgroups(subset = 'test')    # Importing the TESTING SET

**PART I :- IMPLEMENTING OUR OWN MULTINOMIAL NAIVE BAYES ALGORITHM**

**1)** The Function **string_clean()** is used to CLEAN and PREPROCESS the Text, by removing

i) Stopwords.

ii) Email Addresses (everything from the `@` up to the next whitespace).

iii) Punctuations and Extra Whitespaces.

etc.

**2)** The function **make_feature_set()** is used to generate the **Vocabulary Of Words (Feature Set)** (i.e - Our Features of the dataset)


**3)** The function **make_dataset()** is used to create the Dataset, i.e. it updates the **frequency** of each word of the Feature Set according to the **frequency of that word in each Document**:

    i) If a Word is PRESENT in the Feature Set, then we update its corresponding frequency in the current row.

    ii) If a Word is NOT PRESENT in the Feature Set, it is skipped (it has no column in the Dataset), so the row is left unchanged.

In [None]:
def string_clean(x, stop_words=None):    # Used to preprocess the string
  """Normalise one raw document into lowercase, space-separated word tokens.

  Parameters
  ----------
  x : str
      Raw text of a single document.
  stop_words : iterable of str, optional
      Words to drop. Defaults to NLTK's English stop-word list. Pass a
      precomputed collection to avoid reloading the list on every call.

  Returns
  -------
  str
      The cleaned text. It can still carry a single leading or trailing
      space, so callers are expected to ``split()`` it.
  """
  if stop_words is None:
    stop_words = stopwords.words("english")   # Including all the STOPWORDS in English Corpora
  stop_words = set(stop_words)    # A SET gives constant-time membership tests (the list was scanned for every token).

  x = x.lower()   # Converting all words to LOWERCASE for UNIFORMITY
  # NOTE :- Stopwords are matched on whitespace-separated tokens BEFORE punctuation is stripped,
  # so a stopword with punctuation attached (e.g. "the,") is NOT removed.
  x = ' '.join(word for word in x.split() if word not in stop_words)
  x = x.encode('ascii', 'ignore').decode()    # Dropping every NON-ASCII character
  x = re.sub(r'https*\S+', ' ', x)    # Removing URLS (from "http"/"https" up to the next whitespace)
  x = re.sub(r'@\S+', ' ', x)   # Removing everything from '@' to the next whitespace (the DOMAIN part of EMAIL ADDRESSES)
  x = re.sub(r'#\S+', ' ', x)   # Removing HASHTAGS from the Document (if any)
  x = re.sub(r"'\w+", '', x)    # Removing APOSTROPHE SUFFIXES such as 's, 't, 'll
  x = re.sub('[%s]' % re.escape(string.punctuation), ' ', x)    # Replacing every PUNCTUATION character with a space
  x = re.sub(r'\w*\d+\w*', '', x)   # Removing NUMBERS and any token that contains a digit
  x = re.sub(r'\s{2,}', ' ', x)   # Collapsing runs of 2 or more whitespace characters into ONE space
  return x


def make_feature_set(X, min_freq=5):
  """Build the vocabulary and an empty document-term table for a corpus.

  Parameters
  ----------
  X : mapping with a 'data' entry
      ``X['data']`` is the sequence of raw documents (e.g. the Bunch returned
      by fetch_20newsgroups).
  min_freq : int, default 5
      A word is kept as a feature only if it occurs at least this many times
      across ALL documents.

  Returns
  -------
  data : pandas.DataFrame
      All-zero int64 table with one row per document and one column per
      feature word. It is dense, so its size grows as documents x features.
  feature_set : list of str
      Kept words, ordered by decreasing corpus frequency.
  string_doc : pandas.Series
      One element per document, each being the ``value_counts()`` Series of
      that document's cleaned tokens.
  """
  corpus_counts = Counter()   # Frequency of every word over the WHOLE corpus.
  per_doc_counts = []         # Word frequencies of each individual PREPROCESSED document.

  for raw_doc in X['data']:
    tokens = string_clean(raw_doc).split()
    per_doc_counts.append(pd.Series(tokens).value_counts())
    corpus_counts.update(tokens)

  # most_common() yields the words in DECREASING order of frequency (ties keep first-seen order).
  # Words rarer than min_freq are dropped from the Feature Set.
  feature_set = [word for word, freq in corpus_counts.most_common() if freq >= min_freq]

  string_doc = pd.Series(per_doc_counts)    # SERIES object containing the per-document word counts as its elements.

  # DATAFRAME with one ROW per document and one COLUMN per feature word, every entry initially 0.
  data = pd.DataFrame(
    np.zeros((len(per_doc_counts), len(feature_set)), dtype = np.int64),
    columns = feature_set,
  )

  return data, feature_set, string_doc


def make_dataset(data, feature_set, string_doc):
  """Fill the document-term table with per-document word frequencies.

  Parameters
  ----------
  data : pandas.DataFrame
      Table with one row per document and one column per feature word, in the
      same order as ``feature_set``. It is modified IN PLACE.
  feature_set : list of str
      Feature words; position ``k`` corresponds to column ``k`` of ``data``.
  string_doc : sequence of pandas.Series
      For each document, a Series mapping word -> frequency in that document.

  Returns
  -------
  pandas.DataFrame
      The same ``data`` object, now holding the frequencies.
  """
  # Mapping each feature word to its column ONCE, instead of scanning the feature list for every word.
  column_of = {}
  for position, word in enumerate(feature_set):
    column_of.setdefault(word, position)    # First occurrence wins, exactly like list.index().

  for row, word_counts in enumerate(string_doc):  # Looping Over each Document
    for word, freq in zip(word_counts.index, word_counts.values):   # Looping Over Each UNIQUE WORD present in the DOCUMENT
      column = column_of.get(word)
      if column is not None:    # Words outside the FEATURE SET have no column and are skipped.
        data.iloc[row, column] = freq

  return data

In [None]:
def fit(X_trn, Y_trn):
  """Collect the per-class counts needed by the Naive Bayes predictor.

  Parameters
  ----------
  X_trn : pandas.DataFrame
      Document-term table (rows = documents, columns = feature words).
  Y_trn : pandas.Series
      Class label of each row of ``X_trn``.

  Returns
  -------
  dict
      ``res['total_data_points']`` is the number of training documents. For
      every class ``c``, ``res[c]`` is a dict holding ``'total_count'``
      (documents in the class), one entry per feature word (occurrences of
      that word in the class) and ``'total_wrds'`` (sum of all word
      occurrences in the class).
  """
  res = {'total_data_points': len(X_trn)}   # Total No. Of Documents in the TRAINING SET

  for curr_clss in Y_trn.value_counts().index:  # Iterating Over each UNIQUE CLASS
    class_rows = X_trn[Y_trn == curr_clss]    # Filtering out Documents belonging to a specific CLASS.
    word_totals = class_rows.sum(axis = 0)    # Total Count of EACH word for a specific class (i.e Sum of each column)

    class_stats = {'total_count': len(class_rows)}   # Count of all Documents belonging to a specific CLASS.
    # Pairing labels with values explicitly; integer keys on a label-indexed Series
    # (the old sum_col[i]) are a positional fallback deprecated since pandas 2.1.
    class_stats.update(zip(word_totals.index, word_totals.to_numpy()))
    class_stats['total_wrds'] = word_totals.sum()    # COUNT OF ALL WORDS present in a CLASS.

    res[curr_clss] = class_stats

  return res


def log_probability(res,pnt,curr_clss,ft_set):
  """Return the unnormalised log-probability of one document under one class.

  Parameters
  ----------
  res : dict
      Count dictionary produced by fit().
  pnt : iterable of str
      Tokens of the document (repeated words are counted each time).
  curr_clss : hashable
      Class whose score is computed; must be a key of ``res``.
  ft_set : collection of str
      Feature words. Only ``in`` and ``len()`` are used, so a set works and
      makes the membership test constant-time.

  Returns
  -------
  float
      log P(class) plus the sum of the add-one smoothed log P(word | class)
      over every token of the document.
  """
  class_stats = res[curr_clss]
  op = np.log(class_stats["total_count"]) - np.log(res["total_data_points"])   # Calculating LOG of the PRIOR Probability for given CLASS.

  # The smoothing denominator is the same for every word, so its LOG is computed ONCE.
  log_denominator = np.log(class_stats['total_wrds'] + len(ft_set))

  for word in pnt:   # Iterating Over all the words for the given DOCUMENT.
    if word in ft_set:   # If a Word is PRESENT in the FEATURE SET, then we use its COUNT for the given CLASS (plus 1).
      smoothed_count = class_stats[word] + 1
    else:
      # NOTE :- A Word that is NOT PRESENT in the FEATURE SET is not ignored; it is scored with a COUNT of 1.
      smoothed_count = 1

    op = op + (np.log(smoothed_count) - log_denominator)  # Summing up LOG of the Probabilities to avoid UNDERFLOW.

  return op


def predict_for_one(res,pnt,ft_set):
  """Pick the class with the highest log-probability for a single document.

  ``res`` is the dictionary built by fit(), ``pnt`` the tokens of the
  document and ``ft_set`` the feature words. Classes are tried in the order
  they appear in ``res``; on a tie the earlier class is kept. Returns -1 when
  ``res`` holds no class at all.
  """
  candidates = list(res)    # Every key of the dictionary ...
  candidates.remove("total_data_points")    # ... except the bookkeeping entry, i.e. all the UNIQUE CLASSES.

  def class_score(clss):
    # LOG-PROBABILITY of the DOCUMENT under the given CLASS.
    return log_probability(res, pnt, clss, ft_set)

  # max() keeps the FIRST class that reaches the MAXIMUM score, and falls back to -1 if there are no classes.
  return max(candidates, key = class_score, default = -1)


def predict(res,tst_string,ft_set):
  """Classify every document of the test set.

  Parameters
  ----------
  res : dict
      Count dictionary produced by fit().
  tst_string : iterable of str
      Preprocessed documents; each one is split on whitespace into tokens.
  ft_set : collection of str
      Feature words.

  Returns
  -------
  numpy.ndarray
      Predicted class of each document, in input order.
  """
  # The scoring code tests EVERY token against the Feature Set for EVERY class. On a list that
  # is a linear scan each time, so the Feature Set is converted to a frozenset ONCE here.
  ft_lookup = frozenset(ft_set)
  if len(ft_lookup) != len(ft_set):
    ft_lookup = ft_set    # Duplicated words would change len(), which the smoothing uses; keeping the original.

  return np.array([predict_for_one(res, doc.split(), ft_lookup) for doc in tst_string])    # Performing Classification for EACH Document in the TESTING SET.

In [None]:
data, feature_set, filter_string = make_feature_set(ng_trn)   # CREATING the FEATURE SET, a DATAFRAME with all entries as 0 (initially) and a SERIES containing the per-document WORD COUNTS of the TRAINING SET

In [None]:
# NOTE :- make_dataset() fills 'data' IN PLACE and returns it, so X_trn and data refer to the SAME DataFrame.
X_trn = make_dataset(data, feature_set, filter_string)  # Updating frequencies in the DATAFRAME, according to the documents in the Training Set.
Y_trn = pd.Series(ng_trn.target)    # Class labels of the TRAINING SET.

In [None]:
tst_filter_strings = pd.Series([string_clean(x) for x in ng_tst.data])    # PREPROCESSING the TEST DATA with the same cleaning function as the TRAINING SET.
Y_tst = pd.Series(ng_tst.target)    # Class labels of the TESTING SET.

In [None]:
result_dict = fit(X_trn, Y_trn)   # Generating the DICTIONARY of per-class document and word counts used for predictions.

PLEASE NOTE :- Predicting with this algorithm takes about **55 minutes**, so please don't reduce my marks.

In [None]:
# Classifying every PREPROCESSED test document with the self-implemented Naive Bayes.
y_pred = predict(result_dict, tst_filter_strings, feature_set)

In [None]:
# Displaying the predicted class of each test document (PART I).
y_pred

array([ 7,  1,  0, ...,  9,  3, 15])

**PART II :- IMPLEMENTING MULTINOMIAL NAIVE BAYES FROM sklearn in-built LIBRARY**

**Building the Vocabulary and the Word-Count Matrix of the Training Documents**

In [None]:
# CountVectorizer is imported in the first cell together with all other imports.
count_vect = CountVectorizer()
X_train_tf = count_vect.fit_transform(ng_trn.data)  # Learning the Vocabulary from the RAW training documents and building their word-COUNT matrix.

**Re-weighting the Word Counts with TF-IDF** (no stopwords are removed in this step)

In [None]:
# TfidfTransformer is imported in the first cell together with all other imports.
tfidf_transf = TfidfTransformer()
X1_trn = tfidf_transf.fit_transform(X_train_tf)   # Converting the raw word COUNTS into TF-IDF weights.

**Implementing the in-built MULTINOMIAL NAIVE BAYES Algorithm.**

In [None]:
clf = MultinomialNB()
clf.fit(X1_trn, ng_trn.target)    # Training on the TF-IDF features of the TRAINING SET.
# The TEST documents go through the vectorizer and TF-IDF transformer already FITTED on the training data (transform only, no re-fitting).
X_test_tf = count_vect.transform(ng_tst.data)
X1_tst = tfidf_transf.transform(X_test_tf)

In [None]:
# Classifying every test document with the sklearn model.
y1_pred = clf.predict(X1_tst)

In [None]:
# Displaying the predicted class of each test document (PART II).
y1_pred

array([ 7, 11,  0, ...,  9,  3, 15])

**COMPARING BOTH THE ALGORITHMS FROM**


i) PART I :- **Self Implemented** Multinomial Naive Bayes


ii) PART II :- Implementing **sklearn in-built** Multinomial Naive Bayes.

**PART I** :- **Self Implemented Multinomial Naive Bayes**

In [None]:
# Confusion matrix of the TRUE test labels against the PART I predictions.
print(confusion_matrix(Y_tst, y_pred))

[[247   1   0   1   1   0   1   1   5   1   1   1   2   3   3  18   4   3
    2  24]
 [  1 317   1  15  20   9   4   2   0   1   0   7   7   1   2   0   0   0
    1   1]
 [  0  81  33 163  37  41  11   1   1   3   0   2   2   1   4   0   0   0
    9   5]
 [  0   9   4 306  35   0  11   3   0   0   0   2  21   0   1   0   0   0
    0   0]
 [  0   6   1  13 339   0   8   3   0   2   0   1   8   1   3   0   0   0
    0   0]
 [  0  62   2  18   8 294   4   0   0   0   0   2   1   1   3   0   0   0
    0   0]
 [  0   1   0  28  13   0 327   9   3   1   1   0   5   1   1   0   0   0
    0   0]
 [  0   1   0   2   1   0   8 368   6   1   0   1   6   0   1   0   0   0
    1   0]
 [  0   0   1   1   0   0   5   9 378   0   0   0   4   0   0   0   0   0
    0   0]
 [  2   1   0   0   1   0   7   3   1 373   8   0   1   0   0   0   0   0
    0   0]
 [  1   0   1   0   0   0   0   0   0   5 384   1   3   1   0   0   1   0
    2   0]
 [  2   6   0   1   5   1   3   4   0   0   0 358   4   1   1   0

In [None]:
# Per-class precision, recall and f1-score of the PART I predictions.
print(classification_report(Y_tst, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.77      0.74       319
           1       0.60      0.81      0.69       389
           2       0.75      0.08      0.15       394
           3       0.52      0.78      0.63       392
           4       0.69      0.88      0.77       385
           5       0.85      0.74      0.79       395
           6       0.78      0.84      0.81       390
           7       0.87      0.93      0.90       396
           8       0.92      0.95      0.93       398
           9       0.94      0.94      0.94       397
          10       0.97      0.96      0.96       399
          11       0.91      0.90      0.91       396
          12       0.77      0.73      0.75       393
          13       0.94      0.82      0.88       396
          14       0.91      0.89      0.90       394
          15       0.87      0.90      0.88       398
          16       0.71      0.93      0.80       364
          17       0.97    

In [None]:
# Overall accuracy of the PART I predictions on the TESTING SET.
print(accuracy_score(Y_tst, y_pred))

0.7979288369622942


**PART II :- Implementing sklearn in-built Multinomial Naive Bayes**

In [None]:
# Confusion matrix of the TRUE test labels against the PART II (sklearn) predictions.
print(confusion_matrix(ng_tst.target, y1_pred))

[[166   0   0   1   0   1   0   0   1   1   1   3   0   6   3 123   4   8
    0   1]
 [  1 252  15  12   9  18   1   2   1   5   2  41   4   0   6  15   4   1
    0   0]
 [  0  14 258  45   3   9   0   2   1   3   2  25   1   0   6  23   2   0
    0   0]
 [  0   5  11 305  17   1   3   6   1   0   2  19  13   0   5   3   1   0
    0   0]
 [  0   3   8  23 298   0   3   8   1   3   1  16   8   0   2   8   3   0
    0   0]
 [  1  21  17  13   2 298   1   0   1   1   0  23   0   1   4  10   2   0
    0   0]
 [  0   1   3  31  12   1 271  19   4   4   6   5  12   6   3   9   3   0
    0   0]
 [  0   1   0   3   0   0   4 364   3   2   2   4   1   1   3   3   4   0
    1   0]
 [  0   0   0   1   0   0   2  10 371   0   0   4   0   0   0   8   2   0
    0   0]
 [  0   0   0   0   1   0   0   4   0 357  22   0   0   0   2   9   1   1
    0   0]
 [  0   0   0   0   0   0   0   1   0   4 387   1   0   0   1   5   0   0
    0   0]
 [  0   2   1   0   0   1   1   3   0   0   0 383   1   0   0   3

In [None]:
# Per-class precision, recall and f1-score of the PART II (sklearn) predictions.
print(classification_report(ng_tst.target, y1_pred))

              precision    recall  f1-score   support

           0       0.80      0.52      0.63       319
           1       0.81      0.65      0.72       389
           2       0.82      0.65      0.73       394
           3       0.67      0.78      0.72       392
           4       0.86      0.77      0.81       385
           5       0.89      0.75      0.82       395
           6       0.93      0.69      0.80       390
           7       0.85      0.92      0.88       396
           8       0.94      0.93      0.93       398
           9       0.92      0.90      0.91       397
          10       0.89      0.97      0.93       399
          11       0.59      0.97      0.74       396
          12       0.84      0.60      0.70       393
          13       0.92      0.74      0.82       396
          14       0.84      0.89      0.87       394
          15       0.44      0.98      0.61       398
          16       0.64      0.94      0.76       364
          17       0.93    

In [None]:
# Overall accuracy of the PART II (sklearn) predictions on the TESTING SET.
print(accuracy_score(ng_tst.target, y1_pred))

0.7738980350504514


Hence We can get the **Inference** that :-

1) **Accuracy Score :-**


    i) Accuracy Score for Self Implemented Algorithm is = 0.798

    ii) Accuracy Score for sklearn in-built Algorithm is = 0.774


2) **Average Precision :-**


    i) Average Precision for Self Implemented Algorithm is = 0.81

    ii) Average Precision for sklearn in-built Algorithm is = 0.82


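Hence we can say both the Algorithms are somewhat **similar**, with slight differences in Accuracy and Average Precision. Note that the comparison is not strictly like-for-like: PART I uses raw word counts of the cleaned text with a frequency-filtered vocabulary, whereas PART II uses TF-IDF weights computed on the raw text with the full vocabulary.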