# Multinomial Naive Bayes
The simple design of Naive Bayes classifiers makes them very attractive for text classification tasks. Moreover, they have been demonstrated to be fast, reliable and accurate in a number of NLP applications.

In [3]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

In [66]:
# Toy corpus: each row pairs a short sentence with its class label
# (1 for the first three sentences, 0 for the last one).
train_df = pd.DataFrame(
    [
        ('Chinese Beijing Chinese', 1),
        ('Chinese Chinese Shanghai', 1),
        ('Chinese Macao', 1),
        ('Tokyo Japan Chinese', 0),
    ],
    columns=['Word', 'Class'],
)

# The single unlabeled sentence to classify.
test_df = pd.DataFrame(
    ['Chinese Chinese Chinese Tokyo Japan'],
    columns=['Word'],
)

# Show the training frame followed by the test sentence column.
print(train_df, test_df['Word'], sep='\n')

                       Word  Class
0   Chinese Beijing Chinese      1
1  Chinese Chinese Shanghai      1
2             Chinese Macao      1
3       Tokyo Japan Chinese      0
0    Chinese Chinese Chinese Tokyo Japan
Name: Word, dtype: object


<img src="https://i0.wp.com/syncedreview.com/wp-content/uploads/2017/07/screenshot-from-2017-07-11-16-13-25.png?resize=325%2C53&ssl=1"/>

where the pseudocount α > 0 is the smoothing parameter, and d is the vocabulary size (the number of distinct words in the data).

N is the total number of words in the class.

In [74]:
def smoothed_likelihood(word_count, class_word_total, vocab_size, alpha=1):
    """Return the additively smoothed estimate of P(word | class).

    Computes (word_count + alpha) / (class_word_total + alpha * vocab_size).

    Parameters
    ----------
    word_count : number of occurrences of the word in the class.
    class_word_total : total number of words in the class (N).
    vocab_size : number of distinct words in the data (d).
    alpha : smoothing pseudocount, expected to be > 0.
    """
    # alpha must scale the vocabulary term in the denominator as well,
    # otherwise the estimates are only correct for alpha == 1.
    return (word_count + alpha) / (class_word_total + alpha * vocab_size)


# Class priors: 3 of the 4 training sentences are class 1, 1 of 4 is class 0.
P1 = 3/4
P0 = 1/4

# Smoothing pseudocount.
α = 1

VOCAB_SIZE = 6  # Chinese, Beijing, Shanghai, Macao, Tokyo, Japan
N_WORDS_1 = 8   # words in the class-1 sentences (3 + 3 + 2)
N_WORDS_0 = 3   # words in the class-0 sentence

# Per-class word likelihoods; the first argument is the word's count in that class.
PChinese_1 = smoothed_likelihood(5, N_WORDS_1, VOCAB_SIZE, α)
PTokyo_1 = smoothed_likelihood(0, N_WORDS_1, VOCAB_SIZE, α)
PJapan_1 = smoothed_likelihood(0, N_WORDS_1, VOCAB_SIZE, α)
PChinese_0 = smoothed_likelihood(1, N_WORDS_0, VOCAB_SIZE, α)
PTokyo_0 = smoothed_likelihood(1, N_WORDS_0, VOCAB_SIZE, α)
PJapan_0 = smoothed_likelihood(1, N_WORDS_0, VOCAB_SIZE, α)

# Unnormalized class scores for 'Chinese Chinese Chinese Tokyo Japan':
# prior times the likelihood of every word occurrence ("Chinese" appears 3 times).
P_1 = P1 * PChinese_1**3 * PTokyo_1 * PJapan_1
P_0 = P0 * PChinese_0**3 * PTokyo_0 * PJapan_0

# Pick the class with the larger score. A direct comparison avoids using
# floats as dict keys (which collapse into one entry when the scores are equal).
in_class = 1 if P_1 > P_0 else 0

print(f'''
this sentence "Chinese Chinese Chinese Tokyo Japan" 
look like:
    class 1 = {P_1:.4f}
    class 0 = {P_0:.4f}
    then in class = {in_class}
''')


this sentence "Chinese Chinese Chinese Tokyo Japan" 
look like:
    class 1 = 0.0003
    class 0 = 0.0001
    then in class = 1



### With scikit-learn

In [54]:
# Turn the training sentences into TF-IDF feature vectors and fit the model.
# Note: these are TF-IDF weights rather than raw word counts, and predict_proba
# returns normalized probabilities, so the numbers below are not expected to
# equal the unnormalized hand-computed scores.
tokenizer = TfidfVectorizer()
train_sentences = tokenizer.fit_transform(train_df['Word'])
model = MultinomialNB()
model.fit(train_sentences, train_df['Class'])

# Probabilities for the single test sentence, one entry per class.
in_class = model.predict_proba(tokenizer.transform(test_df['Word']))[0]

# The entries of predict_proba are ordered like model.classes_, so look the
# probabilities and the winning label up through it instead of assuming that
# array position i corresponds to class label i.
proba_by_class = dict(zip(model.classes_, in_class))
predicted_class = model.classes_[np.argmax(in_class)]

print(f'''
this sentence "Chinese Chinese Chinese Tokyo Japan" 
look like:
    class 1 = {proba_by_class[1]:.4f}
    class 0 = {proba_by_class[0]:.4f}
    then in class = {predicted_class}
''')


this sentence "Chinese Chinese Chinese Tokyo Japan" 
look like:
    class 1 = 0.6704
    class 0 = 0.3296
    then in class = 1

