In [1]:
# Reference: https://jakevdp.github.io/PythonDataScienceHandbook/05.05-naive-bayes.html

# Import all necessary libraries.
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np


## Multinomial & Bernoulli Classification

In [2]:
# Features to training the classificator
X_train = np.array(['Chinese Beijing Chinese', 'Chinese Chinese Shanghai', 'Chinese Macao', 'Tokyo Japan Chinese'])

# Labels to training the classificator
y_train = np.array(['China','China','China','Not China'])

# Test of the classificator
X_test = np.array(['Chinese Chinese Chinese Tokyo Japan'])

## BernoulliNB Distribution

In [3]:
# Convert a collection of text documents to a matrix of token counts.
vect = CountVectorizer(binary=True)

# Fit and transform X_train into x_train_dtm (document-term matrices - dtm).
X_train_dtm = vect.fit_transform(X_train)

In [4]:
# Counting Matrix
print(vect.get_feature_names())
print(X_train_dtm.toarray())

['beijing', 'chinese', 'japan', 'macao', 'shanghai', 'tokyo']
[[1 1 0 0 0 0]
 [0 1 0 0 1 0]
 [0 1 0 1 0 0]
 [0 1 1 0 0 1]]


In [5]:
# Instantiate a Bernoulli Naive Bayes model.
model = BernoulliNB(binarize=None)

In [6]:
# train the model using X_train_dtm
model.fit(X_train_dtm, y_train)

BernoulliNB(alpha=1.0, binarize=None, class_prior=None, fit_prior=True)

In [7]:
# Transform document into document-term matrix.
X_test_dtm = vect.transform(X_test)

# Perform classification on an array of test vectors X_test_dtm.
y_pred_class = model.predict(X_test_dtm)

# Multinomial Counter Vector
print(X_test_dtm.toarray())

[[0 1 1 0 0 1]]


In [8]:
# Classification Result
print(y_pred_class)

['Not China']


## Bernoulli - Predict Class of Mesage (%)

In [9]:
print(((model.predict_proba(X_test_dtm))*100),'%')

[[19.10667888 80.89332112]] %


## Results
Há um vetor de teste que é igual a um vetor de treinamento da classe 'Not China' o que faz com que a o resultado seja classificado desta forma.

## MultinomialNB Distribution

In [10]:
# Features to training the classificator
X_train = np.array(['Chinese Beijing Chinese',
                   'Chinese Chinese Shanghai',
                   'Chinese Macao',
                   'Tokyo Japan Chinese'])

# Labels to training the classificator
y_train = np.array(['China','China','China','Not China'])

# Test of the classificator
X_test = np.array(['Chinese Chinese Chinese Tokyo Japan'])

In [11]:
# Instantiate a CountVectorizer object.
vect = CountVectorizer()

In [12]:
# Fit and transform X_train into x_train_dtm (document-term matrices - dtm).
X_train_dtm = vect.fit_transform(X_train)

print(vect.get_feature_names())
print(X_train_dtm.toarray())

['beijing', 'chinese', 'japan', 'macao', 'shanghai', 'tokyo']
[[1 2 0 0 0 0]
 [0 2 0 0 1 0]
 [0 1 0 1 0 0]
 [0 1 1 0 0 1]]


In [13]:
# Instantiate a Multinomial Naive Bayes model.
model = MultinomialNB()

In [14]:
# train the model using X_train_dtm
model.fit(X_train_dtm, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [15]:
# Transform X_test into X_test_dtm.
X_test_dtm = vect.transform(X_test)

# Perform classification on an array of test vectors X_test_dtm.
y_pred_class = model.predict(X_test_dtm)

# Bernoulli Counter Vector
print(X_test_dtm.toarray())

[[0 3 1 0 0 1]]


In [16]:
# Classification Result
print(y_pred_class)

['China']


## Multinomial - Predict Class of Mesage (%)

In [17]:
print(((model.predict_proba(X_test_dtm))*100),'%')

[[68.97586118 31.02413882]] %


## Results
O vetor de teste para indicador positivo para a feature 'chinese' ocorre três vezes, sendo assim o peso é maior do que as ocorrências negativas das features 'japan' e 'tokyo'.

## Calculation of Probabilities - Bayes Theorem

Classes = 2 => China and Not China
Features = 6 => Beijing, Chinese, Japan, Macao, Shanhai, Tokyo

## Conditional Probabilities Calculation:

P(China|[Chinese, Chinese, Chinese, Tokyo, Japan]) => P(China|X)=PCHN_X

P(Chinese|China)=PCHN_C

P(Tokyo|China)=PTKY_C 

P(Japan|Chnia)=PJPN_C

P(China)=PC

P(Not China|[Chinese, Chinese, Chinese, Tokyo, Japan]) => P(Not China|X)=PNCHN_X

P(Chinese|Not China)=PCHN_NC 

P(Tokyo|Not China)=PTKY_NC

P(Japan|Not China)=PJPN_NC

P(Not China)=PNC


In [18]:
PCHN_C = (3+1)/(4+6)
PTKY_C = (0+1)/(4+6)
PJPN_C = (0+1)/(4+6)
PCHN = 3/5
PTKY = 1/5
PJPN = 1/5
PC = 4/6

print('PCHN_X:', (PCHN_C*PTKY_C*PJPN_C*PC)/(PCHN*PTKY*PJPN)*100,'%')

PCHN_X: 11.111111111111112 %


In [19]:
PCHN_NC = (0+1)/(2+6)
PTKY_NC = (1+1)/(2+6)
PJPN_NC = (1+1)/(2+6)
PNC = 2/6

print('PNCHN_X:', (PCHN_NC*PTKY_NC*PJPN_NC*PNC)/(PCHN*PTKY*PJPN)*100,'%')

PNCHN_X: 10.850694444444443 %
