In [47]:
# Reference: https://jakevdp.github.io/PythonDataScienceHandbook/05.05-naive-bayes.html

# Import all necessary libraries.
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np


## Multinomial & Bernoulli Classification

In [48]:
# Training corpus: each entry pairs a document with its class label.
labelled_docs = [
    ('Chinese Beijing Chinese', 'China'),
    ('Chinese Chinese Shanghai', 'China'),
    ('Chinese Macao', 'China'),
    ('Tokyo Japan Chinese', 'Not China'),
]

# Documents used to train the classifier.
X_train = np.array([doc for doc, _ in labelled_docs])

# Class labels, aligned one-to-one with X_train.
y_train = np.array([label for _, label in labelled_docs])

# Single unseen document used to test the classifier.
X_test = np.array(['Chinese Chinese Chinese Tokyo Japan'])

## BernoulliNB Distribution

In [49]:
# Binary bag-of-words encoder: with binary=True every non-zero token count is
# stored as 1, so the matrix records presence/absence rather than frequency.
vect = CountVectorizer(binary=True)

# Learn the vocabulary from X_train, then encode X_train as a
# document-term matrix (dtm).
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)

In [50]:
# Show the learned vocabulary and the binary document-term matrix.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2, so prefer get_feature_names_out() and fall back to the
# legacy method only on older releases that do not provide it.
# .tolist() converts the returned ndarray into a plain list of Python strings,
# so the printed form matches what the legacy method produced.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = vect.get_feature_names_out().tolist()
else:
    feature_names = vect.get_feature_names()
print(feature_names)
print(X_train_dtm.toarray())

['beijing', 'chinese', 'japan', 'macao', 'shanghai', 'tokyo']
[[1 1 0 0 0 0]
 [0 1 0 0 1 0]
 [0 1 0 1 0 0]
 [0 1 1 0 0 1]]


In [51]:
# Instantiate a Bernoulli Naive Bayes model.
# binarize=None disables the model's own thresholding step: the input is
# presumed to be binary already (here it comes from CountVectorizer(binary=True)).
model = BernoulliNB(binarize=None)

In [52]:
# Train the Bernoulli model on the binary document-term matrix and its labels.
model.fit(X_train_dtm, y_train)

BernoulliNB(alpha=1.0, binarize=None, class_prior=None, fit_prior=True)

In [53]:
# Encode the test document with the vocabulary learned from the training set.
X_test_dtm = vect.transform(X_test)

# Bernoulli (binary presence/absence) vector of the test document.
print(X_test_dtm.toarray())

# Predict the class of the encoded test document.
y_pred_class = model.predict(X_test_dtm)

[[0 1 1 0 0 1]]


In [54]:
# Class predicted by the Bernoulli model for the test document.
print(y_pred_class)

['Not China']


## Bernoulli - Predict Class of Message (%)

In [55]:
# Posterior probability of each class for the test document, in percent.
# Columns follow the order of model.classes_.
class_probabilities = model.predict_proba(X_test_dtm)
print(class_probabilities * 100, '%')

[[19.10667888 80.89332112]] %


## Explicação
O vetor de teste binarizado ([0 1 1 0 0 1]) é idêntico ao quarto vetor de treinamento, que pertence à classe 'Not China'; por isso, ele é classificado como 'Not China'.

## MultinomialNB Distribution

In [56]:
# Training documents for the classifier.
train_docs = (
    'Chinese Beijing Chinese',
    'Chinese Chinese Shanghai',
    'Chinese Macao',
    'Tokyo Japan Chinese',
)
X_train = np.array(train_docs)

# Class label of each training document, in the same order as train_docs.
train_labels = ('China', 'China', 'China', 'Not China')
y_train = np.array(train_labels)

# Single unseen document used to test the classifier.
X_test = np.array(('Chinese Chinese Chinese Tokyo Japan',))

In [57]:
# Instantiate a CountVectorizer with default settings: unlike the binary
# vectorizer used for the Bernoulli model, this one keeps raw term counts.
vect = CountVectorizer()

In [65]:
# Fit the vectorizer on X_train and encode it as a document-term matrix (dtm)
# of raw term counts.
X_train_dtm = vect.fit_transform(X_train)

# Show the learned vocabulary and the count matrix.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2, so prefer get_feature_names_out() and fall back to the
# legacy method only on older releases that do not provide it.
# .tolist() converts the returned ndarray into a plain list of Python strings,
# so the printed form matches what the legacy method produced.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = vect.get_feature_names_out().tolist()
else:
    feature_names = vect.get_feature_names()
print(feature_names)
print(X_train_dtm.toarray())

['beijing', 'chinese', 'japan', 'macao', 'shanghai', 'tokyo']
[[1 2 0 0 0 0]
 [0 2 0 0 1 0]
 [0 1 0 1 0 0]
 [0 1 1 0 0 1]]


In [66]:
# Instantiate a Multinomial Naive Bayes model with default settings
# (it models term counts rather than term presence).
model = MultinomialNB()

In [67]:
# Train the Multinomial model on the count-based document-term matrix and its labels.
model.fit(X_train_dtm, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [69]:
# Encode the test document with the vocabulary learned from the training set.
X_test_dtm = vect.transform(X_test)

# Multinomial (term-count) vector of the test document.
print(X_test_dtm.toarray())

# Predict the class of the encoded test document.
y_pred_class = model.predict(X_test_dtm)

[[0 3 1 0 0 1]]


In [70]:
# Class predicted by the Multinomial model for the test document.
print(y_pred_class)

['China']


## Multinomial - Predict Class of Message (%)

In [71]:
# Posterior probability of each class for the test document, in percent.
# Columns follow the order of model.classes_.
class_probabilities = model.predict_proba(X_test_dtm)
print(class_probabilities * 100, '%')

[[68.97586118 31.02413882]] %


## Explicação
O motivo dessa classificação é que as três ocorrências do indicador positivo 'chinese' no vetor de teste têm maior peso do que as ocorrências dos dois indicadores negativos 'japan' e 'tokyo'.

## Calculation of BernoulliNB probabilities

In [1]:
# Unnormalised Bernoulli NB score of the test document for class 'China'.
# Each term probability is the Laplace-smoothed estimate
#   (number of 'China' documents containing the term + 1) / (3 + 2),
# since 3 of the 4 training documents are labelled 'China'.
# The factors are multiplied in this exact order to keep the float result unchanged.
china_factors = [
    4/5,        # 'chinese' present
    1/5,        # 'japan' present
    1/5,        # 'tokyo' present
    1 - 2/5,    # 'beijing' absent
    1 - 2/5,    # 'macao' absent
    1 - 2/5,    # 'shanghai' absent
    3/4,        # prior P(China)
]
pct = 1.0
for factor in china_factors:
    pct *= factor
print(pct)

0.005184000000000001


In [2]:
# Unnormalised Bernoulli NB score of the test document for class 'Not China'.
# Only 1 training document has this label, so the Laplace-smoothed estimate is
#   (number of 'Not China' documents containing the term + 1) / (1 + 2).
p_term_present = 2/3       # 'chinese', 'japan', 'tokyo' each occur in that document
p_term_absent = 1 - 1/3    # 'beijing', 'macao', 'shanghai' do not occur in it
prior_not_china = 1/4      # 1 of the 4 training documents

# Same left-to-right multiplication order as the written-out product.
pnct = (p_term_present * p_term_present * p_term_present
        * p_term_absent * p_term_absent * p_term_absent
        * prior_not_china)
print(pnct)

0.02194787379972566


In [3]:
# Posterior P(China | test document): the 'China' score divided by the sum of
# the scores of both classes, printed as a percentage.
evidence = pct + pnct
pctn = pct / evidence
print(pctn * 100, '%')

19.106678876165265 %


In [4]:
# Posterior P(Not China | test document): the 'Not China' score divided by the
# sum of the scores of both classes, printed as a percentage.
evidence = pct + pnct
pnctn = pnct / evidence
print(pnctn * 100, '%')

80.89332112383474 %
