In [1]:
# Author - Gowtham Ch
# https://www.linkedin.com/in/gauthamchowta/

In [2]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelBinarizer
import pandas as pd

C:\ProgramData\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.PYQHXLVVQ7VESDPUVUADXEVJOBGHJPAY.gfortran-win_amd64.dll
C:\ProgramData\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll
  stacklevel=1)


In [3]:
# Toy corpus: each row pairs a sentence with its label
# ('stmt' for a statement, 'question' for a question).
columns = ['sent', 'class']
rows = [
    ['This is my book', 'stmt'],
    ['They are novels', 'stmt'],
    ['have you read this book', 'question'],
    ['who is the author', 'question'],
    ['what are the characters', 'question'],
    ['This is how I bought the book', 'stmt'],
    ['I like fictions', 'stmt'],
    ['what is your favorite book', 'question'],
]

# One DataFrame row per sentence; the bare name below displays it in the notebook.
df = pd.DataFrame(data=rows, columns=columns)
df

Unnamed: 0,sent,class
0,This is my book,stmt
1,They are novels,stmt
2,have you read this book,question
3,who is the author,question
4,what are the characters,question
5,This is how I bought the book,stmt
6,I like fictions,stmt
7,what is your favorite book,question


In [4]:
def convert_to_BOW(corpus):
    """Return the bag-of-words count matrix of `corpus` and its vocabulary index.

    Parameters
    ----------
    corpus : iterable of str
        The documents passed to `CountVectorizer.fit_transform`.

    Returns
    -------
    tuple
        `(counts, name_index)` where `counts` is the dense array obtained from
        the fitted document-term matrix (one row per document) and
        `name_index` is a dict mapping each vocabulary term to its column
        index in `counts`.
    """
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(corpus)
    # `get_feature_names()` was removed in scikit-learn 1.2. The fitted
    # `vocabulary_` attribute already holds the term -> column mapping on
    # every version, so build the result from it. Sorting by column index
    # keeps the dict in the same order the feature-name list used to have.
    name_index = {
        name: int(index)
        for name, index in sorted(vectorizer.vocabulary_.items(),
                                  key=lambda item: item[1])
    }
    return X.toarray(),name_index

In [5]:
def count_based_on_class(X,y):
    """Aggregate feature counts per class label.

    Parameters
    ----------
    X : array-like
        Count matrix with one row per sample.
    y : array-like
        Class label of each sample, aligned with the rows of `X`.

    Returns
    -------
    tuple
        `(count_matrix, y, classes, class_count)`: the per-class sum of the
        rows of `X`, the indicator matrix built from the labels, the label
        values from the fitted binarizer (`classes_`), and the column sums of
        the indicator matrix (samples per class).
    """
    binarizer = LabelBinarizer()
    indicator = binarizer.fit_transform(np.array(y))

    # When the binarizer emits a single column, prepend its complement so
    # there is one indicator column per class.
    if indicator.shape[1] == 1:
        indicator = np.concatenate((1 - indicator, indicator), axis=1)

    # indicator.T selects, for every class, the rows of X belonging to it;
    # the matrix product therefore sums the feature counts class by class.
    per_class_counts = np.matmul(indicator.T, X)
    samples_per_class = indicator.sum(axis=0)
    return per_class_counts, indicator, binarizer.classes_, samples_per_class
    

In [6]:
def feature_log_probabilities(count_matrix,alpha=1):
    """Turn per-class feature counts into smoothed log-probabilities.

    Parameters
    ----------
    count_matrix : numpy.ndarray
        Feature counts with one row per class.
    alpha : number, default 1
        Value added to every count before normalising (additive smoothing).

    Returns
    -------
    numpy.ndarray
        `log(count + alpha) - log(row total of (count + alpha))`, same shape
        as `count_matrix`.

    The intermediate matrices are printed so the computation can be followed
    step by step in the notebook.
    """
    print('Count Matrix')
    print(count_matrix)

    # Additive smoothing: every feature count is shifted by alpha.
    smoothed = count_matrix + alpha
    print('Smoothed version')
    print(smoothed)

    # Row totals of the smoothed counts, kept as a column so they broadcast
    # across the features of each class.
    class_totals = smoothed.sum(axis=1).reshape(-1, 1)
    print('Denominator - total words present in a given class')
    print(class_totals)

    # log(num / den) computed as log(num) - log(den).
    return np.log(smoothed) - np.log(class_totals)

In [7]:
def calculate_prior_probs(class_count):
    """Return the log prior of each class.

    Parameters
    ----------
    class_count : numpy.ndarray
        Number of samples observed for each class.

    Returns
    -------
    numpy.ndarray
        `log(class_count) - log(class_count.sum())`, i.e. the log of each
        class's share of the samples.
    """
    total_samples = class_count.sum()
    return np.log(class_count) - np.log(total_samples)
    

In [8]:
def predict(query_point,log_probabilities,prior_probabilities,classes):
    """Return the most likely class label for a single query point.

    Parameters
    ----------
    query_point : array-like
        Feature counts of one sample, given either as a 1-D vector of length
        n_features or as a single-row 2-D array of shape (1, n_features).
    log_probabilities : array-like, shape (n_classes, n_features)
        Per-class feature log-probabilities.
    prior_probabilities : array-like, shape (n_classes,)
        Log prior of each class.
    classes : sequence
        Class labels, indexed in the same order as the rows of
        `log_probabilities`.

    Returns
    -------
    The element of `classes` with the highest joint log-score.

    Raises
    ------
    ValueError
        If the query does not contain exactly n_features values.
    """
    log_probabilities = np.asarray(log_probabilities)
    # Flatten so a (1, n_features) row behaves like a 1-D vector. Without
    # this, the matmul result has shape (n_classes, 1) and adding the
    # (n_classes,) priors broadcasts to an (n_classes, n_classes) grid, whose
    # flattened argmax is not a valid class index.
    query = np.asarray(query_point).ravel()
    if query.shape[0] != log_probabilities.shape[-1]:
        raise ValueError(
            'query_point must hold exactly %d feature counts, got %d'
            % (log_probabilities.shape[-1], query.shape[0])
        )

    # Joint log-score per class: sum_i count_i * log P(feature_i | class) + log P(class).
    output = np.matmul(log_probabilities, query) + prior_probabilities
    index = np.argmax(output)
    return classes[index]

In [9]:
# Vectorise the sentences and accumulate the word counts of each class.
X,name_index = convert_to_BOW(df['sent'])
count_matrix,y,classes,class_count = count_based_on_class(X, df['class'])

# Model parameters: smoothed per-class word log-probabilities and log priors.
log_probabilities = feature_log_probabilities(count_matrix, alpha=1)
prior_probabilities = calculate_prior_probs(class_count)

# Classify one training sentence and compare the result with its true label.
query_row = 2
output = predict(X[query_row], log_probabilities, prior_probabilities, classes)

print('Predicted class - ',output)
print('Actual class -',df.loc[query_row, 'class'])

Count Matrix
[[1 1 2 0 1 1 0 1 0 2 0 0 0 1 2 0 1 2 1 1 1]
 [1 0 2 1 0 0 1 0 1 2 1 1 1 0 1 1 2 0 0 0 0]]
Smoothed version
[[2 2 3 1 2 2 1 2 1 3 1 1 1 2 3 1 2 3 2 2 2]
 [2 1 3 2 1 1 2 1 2 3 2 2 2 1 2 2 3 1 1 1 1]]
Denominator - total words present in a given class
[[39]
 [36]]
Predicted class -  question
Actual class - question


In [10]:
# Cross-check the manual implementation against scikit-learn's MultinomialNB
# (already imported in the notebook's import cell), fitted on the same data.
clf = MultinomialNB()
clf.fit(X, df['class'])
print('Sklearn feature log-probabilities\n',clf.feature_log_prob_)
print('Manually implemented probabilities\n',log_probabilities)
print('Difference between actual and expected implementation\n'
      ,log_probabilities-clf.feature_log_prob_)
# Fail loudly if the two implementations ever diverge, instead of relying on
# a visual inspection of the printed difference.
np.testing.assert_allclose(log_probabilities, clf.feature_log_prob_)
print()
# sklearn expects a 2-D batch, hence the X[4:5] slice ...
print('Sklearn predict',clf.predict(X[4:5]))
# ... while the manual predict() scores one 1-D count vector: handing it the
# 2-D slice makes the class scores broadcast against the priors, so pass X[4].
print('Manual predict',predict(X[4],log_probabilities,prior_probabilities,classes))

Sklearn feature log-probabilities
 [[-2.97041447 -2.97041447 -2.56494936 -3.66356165 -2.97041447 -2.97041447
  -3.66356165 -2.97041447 -3.66356165 -2.56494936 -3.66356165 -3.66356165
  -3.66356165 -2.97041447 -2.56494936 -3.66356165 -2.97041447 -2.56494936
  -2.97041447 -2.97041447 -2.97041447]
 [-2.89037176 -3.58351894 -2.48490665 -2.89037176 -3.58351894 -3.58351894
  -2.89037176 -3.58351894 -2.89037176 -2.48490665 -2.89037176 -2.89037176
  -2.89037176 -3.58351894 -2.89037176 -2.89037176 -2.48490665 -3.58351894
  -3.58351894 -3.58351894 -3.58351894]]
Manually implemented probabilities
 [[-2.97041447 -2.97041447 -2.56494936 -3.66356165 -2.97041447 -2.97041447
  -3.66356165 -2.97041447 -3.66356165 -2.56494936 -3.66356165 -3.66356165
  -3.66356165 -2.97041447 -2.56494936 -3.66356165 -2.97041447 -2.56494936
  -2.97041447 -2.97041447 -2.97041447]
 [-2.89037176 -3.58351894 -2.48490665 -2.89037176 -3.58351894 -3.58351894
  -2.89037176 -3.58351894 -2.89037176 -2.48490665 -2.89037176 -2.890371

In [11]:
# Display the log class priors of the fitted sklearn model (`clf`).
clf.class_log_prior_

array([-0.69314718, -0.69314718])

In [12]:
# Display the manually computed log class priors (from calculate_prior_probs).
prior_probabilities

array([-0.69314718, -0.69314718])