In [1]:
import numpy as np
import pandas as pd
import re
import os
import operator
import random
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix


In [2]:
stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]


In [4]:
# making the list of columns or extracting features
path ='20_newsgroups'                # download dataset from https://archive.ics.uci.edu/ml/datasets/Twenty+Newsgroups
documents =[]       # an array of tuples of words and category
for foldername in os.listdir(path):
    for filename in os.listdir(path+'\\'+foldername):
        with open((path+'\\'+foldername+'\\'+filename),'r') as f:
            text = f.read()
        # splitting the text by non character using regression
        words = re.split(r'\W+',text)
        output_words = []   # an array of clean words
        for w in words:
            if len(w) <= 3:     # ignoring words having length less than 4
                continue
            if w.lower() not in stop_words:
                output_words.append(w.lower())
        documents.append((output_words,foldername))   # appending cleaned words of doc with category of doc
        
random.shuffle(documents)    # shuffling


In [5]:
# splitting the data
training_documents = documents[:14000]
testing_documents = documents[14000:]

# building feature list
all_words = []           # an array of all words in training_documents
for doc in training_documents:
    all_words += doc[0]        

dictionary = {}      # for storing frequency of all_words
for word in all_words:         
    if word in dictionary:
        dictionary[word] += 1
    else:
        dictionary[word] = 1
        
# sorting the dictionary in deceasing order so that top 2000 words can be chosen
sorted_d = sorted(dictionary.items(), key=operator.itemgetter(1),reverse=True)     

feature_list=[]   # list of words/ vocabulary that will be the column of x        
for d in sorted_d[:2000]:    # choosing top 2000 words 
    feature_list.append(d[0])      # and only inserting words/vocab not the count


In [6]:
def get_dataset(documents):
    x = pd.DataFrame(data=np.zeros((len(documents),2000),dtype=int),columns=feature_list)
    y = []      # for y dataset
    i=-1       # for row index
    for document,category in documents:     # for every document
        i = i+1
        y.append(category)       # appending category
        for word in document:
            if word in feature_list:  # if word is in feature_list 
                x.loc[i,word] += 1           # increase count for that doc in x dataset for respective document/ row                  
    y = np.array(y) # converting from list to np array
    y = y.reshape(-1,)    # reshaping
    x = x.iloc[:].values   # taking x values
    return x,y

In [7]:
# building dataset
x_train,y_train = get_dataset(training_documents)  
x_test,y_test = get_dataset(testing_documents)

In [8]:
# text classification using sklearn Multinomial naive bayes
clf = MultinomialNB()
clf.fit(x_train,y_train)
y_pred = clf.predict(x_test)

In [9]:
# implementing using own naive bayes for text classification
def fit(X_train, Y_train):
    result = {}
    class_values = set(Y_train)   # classes of Y_train
    
    result["total_data"] = len(Y_train)   # no. of times all classes in Y_train
    for current_class in class_values:
        result[current_class] = {}       # for each feature will hold count words in current_class
        result[current_class]["total_count"] = 0   # total count of words in current_class
        
        current_class_rows = (Y_train == current_class)
        X_train_current = X_train[current_class_rows] # X_train having current_class
        Y_train_current = Y_train[current_class_rows] # Y_train having current_class
        result[current_class]['total_time_class']= len(Y_train_current)  # no. of times current_class in Y_train
        num_features = X_train.shape[1]   # total columns/features
        
        for j in range(0, num_features ):      # for every word
            result[current_class][j] = X_train_current[:,j].sum()   # count of words in current_class
            result[current_class]["total_count"] += result[current_class][j]         
    #print(result)      
    return result

In [10]:
def probability(dictionary, x, current_class):
    output = np.log(dictionary[current_class]["total_time_class"]) - np.log(dictionary["total_data"])  # probability of y in current_class     
    num_features = len(dictionary[current_class].keys())-2   # -2 due to 'total_count' and 'total_time_class'  
    for j in range(0, num_features):  # for every feature/column
        if x[j] == 0:       # if x in testing data ... for current column is zero ..then skip that x column
            continue
        count_xj_within_current_class = dictionary[current_class][j] + 1         
        count_current_class = dictionary[current_class]["total_count"] + (len(dictionary[current_class].keys())-2)   # plus number of words in vocab         
        current_xj_probablity = np.log(count_xj_within_current_class) - np.log(count_current_class)            
        output = output + current_xj_probablity
    return output

In [11]:
def predictSinglePoint(dictionary, x):
    classes = dictionary.keys()    # list of all features  .. also have 'total_data' will ignore this      a
    best_p = -1000         # best probability 
    best_class = -1        # best y class having best probability 
    first_run = True        
    for current_class in classes:
        if (current_class == "total_data"):
            continue
        p_current_class = probability(dictionary, x, current_class)
        if (first_run or p_current_class > best_p):
            best_p = p_current_class
            best_class = current_class
        first_run = False
    return best_class

In [12]:
def predict(dictionary, X_test):
    y_pred = []
    for x in X_test:       # for every row in X_test
        x_class = predictSinglePoint(dictionary, x)    #return predicted class of y   
        y_pred.append(x_class)
    return y_pred

In [13]:
dictionary = fit(x_train,y_train)
y_predict = predict(dictionary,x_test)

In [14]:
# comparison of score

In [15]:
print('Scoreboard of text classification\n')
print('Score using inbuilt implemented naive bayes in skearn-> ',clf.score(x_test,y_test))
print('Score using own implemented naive bayes-> ',np.mean(y_predict==y_test))

Scoreboard of text classification



Score using inbuilt implemented naive bayes in skearn->  0.835917958979
Score using own implemented naive bayes->  0.832749708187


In [16]:
# for inbuilt naive bayes
print(confusion_matrix(y_test,y_pred))
print('-------------------------------')
print(classification_report(y_test,y_pred))

[[234   0   0   0   0   0   0   3   3   1   0   0   1   1   1   5   0   1
    0  55]
 [  0 187  33  11  16  18   6   2   2   2   0   2  11   2   4   0   0   0
    0   0]
 [  0   1 267  18   2  10   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  0   3  19 195  68   2   9   0   0   0   0   2  11   0   0   0   0   0
    0   0]
 [  0   1  10  40 224   1   7   0   2   0   0   0   6   0   2   0   0   0
    0   0]
 [  0  15  40   5   3 233   2   0   1   0   0   0   3   2   0   0   0   0
    0   0]
 [  0   0   1   3   1   1 263   6   1   0   1   0   2   0   1   0   0   0
    0   0]
 [  0   0   0   1   0   0   6 296  10   0   0   0  10   0   1   1   2   0
    3   0]
 [  0   0   0   0   0   0   8   9 275   1   0   0   2   0   0   1   0   0
    0   0]
 [  0   0   0   1   0   0   2   5   1 287  13   0   0   0   1   1   0   0
    0   0]
 [  0   0   0   0   0   0   1   2   0   6 281   0   0   1   1   0   0   0
    0   0]
 [  1   1   0   0   1   0   1   0   2   0   0 278   1   1   1   0

In [17]:
# for own implemented naive bayes
print(classification_report(y_test,y_predict))
print('------------------------------')
print(confusion_matrix(y_test,y_predict))

                          precision    recall  f1-score   support

             alt.atheism       0.75      0.81      0.78       305
           comp.graphics       0.84      0.60      0.70       296
 comp.os.ms-windows.misc       0.69      0.86      0.76       298
comp.sys.ibm.pc.hardware       0.72      0.60      0.66       309
   comp.sys.mac.hardware       0.65      0.82      0.72       293
          comp.windows.x       0.89      0.76      0.82       304
            misc.forsale       0.81      0.94      0.87       280
               rec.autos       0.90      0.90      0.90       330
         rec.motorcycles       0.88      0.95      0.92       296
      rec.sport.baseball       0.95      0.94      0.94       311
        rec.sport.hockey       0.94      0.95      0.94       292
               sci.crypt       0.98      0.95      0.96       293
         sci.electronics       0.76      0.89      0.82       299
                 sci.med       0.89      0.79      0.84       274
         

[[247   2   0   0   0   1   1   2   4   3   0   0   0   3   0   2   0   0
    0  40]
 [  0 179  42  12  22  13   6   1   0   0   0   2  14   4   1   0   0   0
    0   0]
 [  0   3 256  19   8  10   1   0   0   0   0   0   0   1   0   0   0   0
    0   0]
 [  0   3  16 186  84   0   6   1   0   0   0   1  12   0   0   0   0   0
    0   0]
 [  0   1   5  32 239   2   6   1   0   0   0   0   5   1   1   0   0   0
    0   0]
 [  0  13  47   2   5 231   1   0   0   0   0   1   1   2   1   0   0   0
    0   0]
 [  0   0   2   3   2   0 264   6   0   0   0   0   1   1   1   0   0   0
    0   0]
 [  0   0   0   0   0   0   8 297   8   0   0   0  11   1   0   0   4   0
    1   0]
 [  0   0   0   0   0   0   7   6 282   0   0   0   0   0   1   0   0   0
    0   0]
 [  0   0   0   0   0   0   1   1   1 293  15   0   0   0   0   0   0   0
    0   0]
 [  0   0   0   0   0   0   2   2   1  10 276   0   0   0   0   0   0   0
    1   0]
 [  0   0   0   0   0   0   1   0   2   0   0 277   5   0   1   0