In [1]:
import numpy  as np
import pandas as pd

In [2]:
import os
# To work with files and folders

In [3]:
base = r".\20_newsgroups"
# The address of the base folder

In [4]:
folders = [f for f in os.listdir(base) if not f.startswith('.')]
folders
# These are the different folders in the 20_newsgroups directory

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [5]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [6]:
block_words = ['subject:','from:', 'date:', 'newsgroups:', 'message-id:', 'lines:', 'path:', 'organization:', 
            'would', 'writes:', 'references:', 'article', 'sender:', 'nntp-posting-host:', 'people', 
            'university', 'think', 'xref:', 'cantaloupe.srv.cs.cmu.edu', 'could', 'distribution:', 'first', 
            'anyone','world', 'really', 'since', 'right', 'believe', 'still', 
            "max>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'",'newsgroups', 'xref', 'path', 
            'from', 'subject', 'sender', 'organisation', 'apr','gmt', 'last','better','never','every','even','two',
            'good','used','first','need','going','must','really','might','well','without','made','give','look',
            'try','far','less','seem','new','make','many','way','since','using','take','help','thanks','send',
            'free','may','see','much','want','find','would','one','like','get','use','also','could','say','us',
            'go','please','said','set','got','sure','come','lot','seems','able','anything','put', '--', '|>', '>>',
            '93', 'xref', 'cantaloupe.srv.cs.cmu.edu', '20', '16', '21', '19', '10', '17', '24', 'reply-to:', 'thu',
            'nntp-posting-host:', 're:','25''18'"i'd"'>i''22''fri,''23''>the','references:','xref:','sender:',
            'writes:','1993']


In [7]:
import string
stop_words += block_words
stop_words += list(string.punctuation)
# We create a list of stop words and block words, i.e words that we won't use in our vocaubulary, 
# because they are very common, or do not help us
# We will also add punctuations in stop words

### Preparing the data

In [8]:
data = []
for folder in folders:
    files = os.listdir(os.path.join(base, folder))
    for file in files:
        file_path = os.path.join(base, folder, file)
        with open(file_path, 'r', errors='ignore') as fileobj:
            # We go through all the files
            data.append((' '.join([word.lower() for word in fileobj.read().strip().split() if
                                  not word.lower() in stop_words and len(word.lower()) > 1]), folder))
            # For every file, we remove the stop words, single words, and extra spaces
            # Then join the text back together and store it in an n x 2 array for all the data

In [9]:
data_df = pd.DataFrame(data, columns = ['text', 'category'])
# We convert it into a dataframe

In [10]:
data_df

Unnamed: 0,text,category
0,alt.atheism:49960 alt.atheism.moderated:713 ne...,alt.atheism
1,alt.atheism:51060 alt.atheism.moderated:727 ne...,alt.atheism
2,alt.atheism cantaloupe.srv.cs.cmu.edu!crabappl...,alt.atheism
3,alt.atheism:51120 alt.politics.usa.constitutio...,alt.atheism
4,alt.atheism:51121 soc.motss:139944 rec.scoutin...,alt.atheism
...,...,...
19992,alt.atheism:54482 talk.religion.misc:84566 alt...,talk.religion.misc
19993,alt.atheism:54485 talk.religion.misc:84567 tal...,talk.religion.misc
19994,talk.religion.misc:84568 talk.politics.misc:18...,talk.religion.misc
19995,talk.religion.misc:84569 talk.religion.newage:...,talk.religion.misc


### Splitting the data

In [11]:
from sklearn import model_selection
train_df, test_df = model_selection.train_test_split(data_df, random_state = 0)
# Splitting the data into two different dataframes, training and testing

In [12]:
print(train_df.shape,test_df.shape)

(14997, 2) (5000, 2)


In [13]:
test_df['category'].value_counts()
# We can see that the data is evenly divided, Which will be helpful later

rec.motorcycles             284
talk.politics.mideast       281
rec.autos                   269
misc.forsale                261
talk.politics.misc          259
sci.med                     256
comp.graphics               253
soc.religion.christian      252
comp.os.ms-windows.misc     249
talk.politics.guns          249
rec.sport.baseball          248
sci.space                   246
sci.electronics             244
comp.windows.x              240
comp.sys.ibm.pc.hardware    240
talk.religion.misc          236
comp.sys.mac.hardware       236
sci.crypt                   233
alt.atheism                 233
rec.sport.hockey            231
Name: category, dtype: int64

### Building the Vocabulary (From training data)

In [14]:
vocab_basic = {}
for row in train_df.values:
    for word in row[0].split():
        vocab_basic[word] = vocab_basic.get(word, 0) + 1
        # We go through our training data, and build a dictionary of all the words and their occurances

In [15]:
from heapq import nlargest 
num_features = 2500
most_common_words = nlargest(num_features, vocab_basic, key = vocab_basic.get) 
# Next we use a heap to get the top 2500 most used words,
# We will use these words as features for our data
vocab = { word:vocab_basic[word] for word in most_common_words} 
# We build the final vocabulary (as a dictionary named vocab)

In [16]:
print('50 of the most common words: \n\n')
for i in list(vocab.keys())[:50]:
    print(i, ' \t\t',vocab[i])

50 of the most common words: 


know  		 5602
i'm  		 4314
time  		 3261
it.  		 3127
computer  		 2375
something  		 2345
i've  		 2319
system  		 2245
god  		 2239
15  		 2177
news  		 2142
back  		 2112
can't  		 2098
state  		 2095
work  		 2009
someone  		 1978
>in  		 1940
23  		 1883
problem  		 1878
government  		 1870
another  		 1865
information  		 1865
read  		 1862
usa  		 1847
>the  		 1831
number  		 1812
things  		 1782
that's  		 1771
part  		 1753
22  		 1727
point  		 1714
>i  		 1708
tue,  		 1704
little  		 1701
fri,  		 1685
windows  		 1665
file  		 1607
data  		 1579
probably  		 1575
years  		 1575
space  		 1570
long  		 1550
question  		 1547
tell  		 1538
(usenet  		 1535
different  		 1530
around  		 1524
public  		 1523
available  		 1521
it,  		 1518


### Converting the data into 2D Array

In [17]:
# First we need to figure out the columns of the 2D array
cols = list(vocab.keys())
cols.sort()
# print(columns)
# Now we have the columns (in sorted order), Let us create the 2D matrix of x
# Our output classes will be from the folders array

In [18]:
def convert_data(text,category,features):
    # We get the text, the category and the features used
    X = []
    Y = []
    for i in range(len(text)):
        current_x = [0 for _i in range(len(features))]
        for word in text[i].split():
            # We check if the word is a column/feature in our 2D array, If it is then we increment is by 1 at that index
            if word in features:
                current_x[features.index(word)] += 1 # Go to the column where we have kept that word, and increment it
        X.append(current_x) # Add the current row (for current file) to the main 2D matrix
        Y.append(category[i]) # Append the class for current file
    np_x = np.array(X, dtype = int)
    np_y = np.array(Y, dtype = str)
    return np_x, np_y

In [19]:
x_train, y_train = convert_data(train_df['text'].values, train_df['category'].values, cols)

In [20]:
x_test,  y_test  = convert_data(test_df['text'].values,  test_df['category'].values,  cols)

In [21]:
# We have now converted the data into a x, a 2D array , and y, the categories
# The features used are top 2500 'most common words'
# Let us visualize our 2D Array
df_xtrain = pd.DataFrame(x_train, columns = cols)
df_xtrain
# Most of the data is zero,since this dataframe is made from a sparse matrix

Unnamed: 0,"""a","""i","""if","""in","""it","""the","""we","""what","""you",#1,...,"yet,",yet.,york,"york,","you,",you.,you?,young,yourself.,||
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14992,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,0
14993,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14994,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14995,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


### Implement Multinomial Naive Bayes from SKLearn

In [22]:
# Using the inbuild Multinomial Naive Bayes classifier
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(x_train, y_train)
y_pred_sklearn = clf.predict(x_test)
train_score = clf.score(x_train, y_train)
test_score = clf.score(x_test, y_test)
print("Train Score\t", train_score, "\nTest Score\t", test_score)

Train Score	 0.8979129159165167 
Test Score	 0.8432


### Implementing Multinomial Naive Bayes from scratch

In [23]:
def fit(x_train, y_train):
    # Dictionary that we will use to calculate the probabilities while predicting
    result = {}
    result['total_data'] = y_train.shape[0]
    for current_class in set(y_train):
        # Since we need to go over unique values only, we use a set
        result[current_class] = {}
        
        # Lets get the part of array which is useful
        x_train_current = x_train[y_train == current_class]
        y_train_current = y_train[y_train == current_class]
        
        # Lets keep a count of total_words, since we need to store that value as well
        total_words = 0
        for j in range(num_features):
            # Now we go over all the features, and keep a sum of the words
            result[current_class][j] = x_train_current[:,j].sum()
            total_words += result[current_class][j]
        # Total_count keeps the track of total words, for an entire category
        result[current_class]['total_count'] = total_words
        
    return result

In [24]:
def getProb(current_class, x, dictionary):
    # We initialize the ans with log probability of p(class = current_class) 
    ans = np.log(dictionary[current_class]['total_count']) - np.log(dictionary['total_data'])
    # We go over all the features
    for i in range(num_features):
        # We calculate the occurance of current word
        current_word = dictionary[current_class][i] + 1
        # And the occurance of total words
        total_words = dictionary[current_class]['total_count'] + len(x)
        # And calculate the log probability of the current word
        current_word_prob = np.log(current_word) - np.log(total_words)
        # We add the probability of current word, as many times as the word occurs
        for j in range(x[i]):
            ans += current_word_prob
    return ans

In [25]:
def predictSinglePoint(x, dictionary):
    best_class = None
    best_prob = None
    first = True
    all_classes = dictionary.keys()
    # We go over all the classes 
    for current_class in all_classes:
        if current_class == 'total_data':
            continue
        # We then calculate the probability of each class
        current_prob = getProb(current_class, x, dictionary)
        # We choose the class with the highest probability
        if first is True or current_prob > best_prob:
            best_prob = current_prob
            best_class = current_class
        first = False
    return best_class

In [26]:
def predict(x_test, dictionary):
    y_predicted = []
    for current_x in x_test:
        # We go over all the dataset one by one and predict the output for every single point
        y_predicted.append(predictSinglePoint(current_x, dictionary))
    return np.array(y_predicted)

In [27]:
# Training the model on the given data
dictionary = fit(x_train, y_train)

In [28]:
# Predicting using the model
y_pred_self = predict(x_test, dictionary)

### Comparision between Sklearn Multinomial Naive Bayes and Implementation from scratch

In [29]:
from sklearn.metrics import classification_report
print("Classification report for sklearn MultinomialNB()",classification_report(y_test, y_pred_sklearn))
print("Classification report for self-implemented Naive Bayes ",classification_report(y_test, y_pred_self))

Classification report for sklearn MultinomialNB()                           precision    recall  f1-score   support

             alt.atheism       0.72      0.81      0.76       233
           comp.graphics       0.79      0.76      0.77       253
 comp.os.ms-windows.misc       0.82      0.87      0.84       249
comp.sys.ibm.pc.hardware       0.82      0.85      0.84       240
   comp.sys.mac.hardware       0.86      0.90      0.88       236
          comp.windows.x       0.91      0.81      0.86       240
            misc.forsale       0.83      0.84      0.83       261
               rec.autos       0.86      0.91      0.89       269
         rec.motorcycles       0.88      0.94      0.91       284
      rec.sport.baseball       0.92      0.98      0.95       248
        rec.sport.hockey       0.96      0.94      0.95       231
               sci.crypt       0.94      0.90      0.92       233
         sci.electronics       0.81      0.86      0.83       244
                 sci.med 