# Naive Bayes

### Construindo um Classificador Naive Bayes em Python

Estamos construindo um classificador Naive Bayes com BernoulliNB e MultinomialNB em Python. Não usaremos as funções do Scikit-learn.

In [1]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import fetch_20newsgroups
import logging
import sys
from time import time
from math import *
from matplotlib import pyplot as pl
from matplotlib.backends.backend_pdf import PdfPages

In [2]:
class MyBernClassifier():
  
    def __init__(self, smooth = 1):
        self._smooth = smooth 
        self._feat_prob = []
        self._class_prob = []
        self._Ncls = []
        self._Nfeat = []

    def train(self, X, y):
        print ("Treinando Bernoulli NB...")
        count_each_class = {}
        feature_count = {}
        alpha = self._smooth 
        temp = []
        temp.append(np.unique(y))
        self._Ncls.append(temp[0].size) # Número total de classes
        self._Nfeat.append(X[0].size)  # Número total de features
        
        for i in range(y.size):
            if y[i] in feature_count:
                continue
            else:
                feature_count[y[i]] = [0 for w in range (X[i].size)]
                

        # Conta os atributos para cada classe através do treinamento ou 
        # conta a ocorrência de cada classe através do treinamento
        for i in range (y.size):
            if y[i] in count_each_class:
                count_each_class[y[i]] +=1
            else:
                count_each_class[y[i]] = 1
            for j in  range(X[i].size):
                    feature_count[y[i]][j] += X[i][j]
                    
        # Calcula probabilidades de classe e atributos para cada classe      
        for cls in feature_count:
            
            num = (self._smooth+count_each_class[cls])
            din = (y.size+(self._Ncls[0]*self._smooth))
            self._class_prob.append((num/float(din)))
            ar = np.array([])
            for j in  range(X[i].size):
                
                num = (feature_count[cls][j] + self._smooth)
                din = (count_each_class[cls]+(2*self._smooth))
                ar = np.append(ar,(num/float(din)))
            self._feat_prob.append(ar)
    

    def predict(self, X):
        
        print ("Fazendo Previsões com Bernoulli NB...")
        
        Y_predict = np.array([])

        for i in X:
            neg_log_prob = 0
            minimum_neg_log_prob = 999999999999999
            category = 0  
                
            for cls in range(self._Ncls[0]):
                neg_log_prob = -log(self._class_prob[cls])
                for j in  range(self._Nfeat[0]):  
                    if (i[j])==0:
                        neg_log_prob -= log(1-self._feat_prob[cls][j])
                    else:
                        neg_log_prob -= log(self._feat_prob[cls][j])
                        
                if minimum_neg_log_prob>neg_log_prob:
                    category=cls
                    minimum_neg_log_prob=neg_log_prob
            
            Y_predict=np.append(Y_predict,category)
         
        return Y_predict

In [3]:
class MyMultinomialBayesClassifier():
    
    def __init__(self, smooth = 1):
        self._smooth = smooth 
        self._feat_prob = []
        self._class_prob = []
        self._class_neg_prob = []
        self._Ncls = []
        self._Nfeat = []

    def train(self, X, y):
        print ("Treinando Multinomial NB...")
        
        count_each_class = {}
        feature_count = {}
      
        for i in range(y.size):
            if y[i] in feature_count:
                continue
            else:
                feature_count[y[i]] = [0 for w in range (X[i].size)]
                
        for i in range (y.size):
            if y[i] in count_each_class:
                count_each_class[y[i]] +=1
            else:
                count_each_class[y[i]] = 1
            for j in  range(X[i].size):
                    feature_count[y[i]][j] += X[i][j]
                
        alpha = self._smooth 
        temp = []
        temp.append(np.unique(y))
        self._Ncls.append(temp[0].size)
        self._Nfeat.append(X[0].size)  
        self._class_prob.append(count_each_class)
        self._feat_prob.append(feature_count)
        
        
    
    def predict(self, X):
        
        print ("Fazendo Previsões com Multinomial NB...")
        
        Y_predict = np.array([])
        
        # Calcula o total de classes para os dados de treino
        total_train_count = 0
        for key in self._class_prob[0]:
            total_train_count += self._class_prob[0][key]
        
        for i in X:
            neg_log_prob = 0
            minimum_neg_log_prob=999999999999999
            category = 0
            
            for cls in self._feat_prob[0]:
                Ny = sum(self._feat_prob[0][cls])
                neg_log_prob = -log((self._class_prob[0][cls]+1)/float(total_train_count+(self._Ncls[0]*self._smooth)))
                for j in  range(self._Nfeat[0]):  
                    if (i[j])==0:
                        continue    
                    for itere in range (i[j]):
                        num = (self._smooth+self._feat_prob[0][cls][j])
                        din = (Ny+(self._Nfeat[0]*self._smooth))
                        neg_log_prob -= log(num/float(din))
                        
                if minimum_neg_log_prob>neg_log_prob:
                    category=cls
                    minimum_neg_log_prob=neg_log_prob
            
            Y_predict=np.append(Y_predict,category)
         
        return Y_predict

In [4]:
# Define as classes que serão usadas no processo de classificação
categories = [
        'alt.atheism',
        'talk.religion.misc',
        'comp.graphics',
        'sci.space',
    ]
remove = ('headers', 'footers', 'quotes')

In [5]:
# Carrega os dados
data_train = fetch_20newsgroups(subset = 'train', categories = categories, shuffle = True, random_state = 42, remove = remove)
data_test = fetch_20newsgroups(subset = 'test', categories = categories, shuffle = True, random_state = 42, remove = remove)
print('Dados Carregados!')

Dados Carregados!


In [6]:
# Treino e Teste
y_train, y_test = data_train.target, data_test.target

In [7]:
print("Extraindo as features do dataset de treino usando o count vectorizer")
t0 = time()

Extraindo as features do dataset de treino usando o count vectorizer


In [8]:
# Binary = true for Bernoulli NB
vectorizer = CountVectorizer(stop_words = 'english', binary = True)
X_train = vectorizer.fit_transform(data_train.data).toarray()
X_test = vectorizer.transform(data_test.data).toarray()
feature_names = vectorizer.get_feature_names()

In [9]:
# For Bernoulli NB, Binary = true, train for one default smooth value alpha = 1

print ('-------------------------------------------------------------')
print ('Tempo esperado para execução do modelo Bernoulli NB é 180 seg')
ta = time()
alpha = 1
clf = MyBernClassifier(alpha)
clf.train(X_train,y_train)
y_pred = clf.predict(X_test)
tb = time()
print ("Para o modelo Bernoulli NB:  " +'alpha = %f, accuracy = %f' %(alpha, np.mean((y_test - y_pred)==0)))
print ("Tempo total para treinar e prever com o modelo Bernoulli: " + str(tb-ta))
print ('-------------------------------------------------------------')


-------------------------------------------------------------
Tempo esperado para execução do modelo Bernoulli NB é 180 seg
Treinando Bernoulli NB...
Fazendo Previsões com Bernoulli NB...
Para o modelo Bernoulli NB:  alpha = 1.000000, accuracy = 0.310421
Tempo total para treinar e prever com o modelo Bernoulli: 150.44519805908203
-------------------------------------------------------------


### Esta célula pode levar horas para ser executada!

In [None]:
# Bernoulli Naive bayes: Alpha Vs accuracy
acc = []
alp = []

for alpha in [float(j) / 100 for j in range(1, 101, 1)]:
    print ('-----------------------------------------------------------------------------------------------------')
    ta = time()
    clf = MyBernClassifier(alpha)
    clf.train(X_train,y_train)
    y_pred = clf.predict(X_test)
    acc.append(np.mean((y_test-y_pred)==0))
    alp.append(alpha)
    tb = time()
    print ("Tempo de treinamento: " + str(tb-ta) + " acurácia, alpha is: " + str(np.mean((y_test-y_pred)==0)) +","+str(alpha))

# Plotting 
with PdfPages('Bernoulli.pdf') as pdf:
    pl.plot(alp,acc,marker='.', linestyle = '-', color = 'r')
    pl.ylabel('Acurácia',color='g')
    pl.xlabel('Alpha',color='g')
    pl.title('Plot Alpha Vs Acurácia para Bernoulli NB',color = 'r')
    pdf.savefig() 
    pl.close()

# Print
print ("Acurácia máxima do modelo Bernoulli NB is: " + str(max(acc)))
print ("com valor correspondente para alpha de:       " + str(alp[(acc.index(max(acc)))]))

In [None]:
# Binary = false para Multinomial NB

print ("Extraindo dados com Binary = False para Multinomial NB")
vectorizer = CountVectorizer(stop_words = 'english', binary = False)
X_train = vectorizer.fit_transform(data_train.data).toarray()
X_test = vectorizer.transform(data_test.data).toarray()
feature_names = vectorizer.get_feature_names()

print ('Tempo total esperado para execução do Multinomial NB é 90 seg')
ta = time()
alpha = 1
clf1 = MyMultinomialBayesClassifier(alpha)
clf1.train(X_train,y_train)
y_pred = clf1.predict(X_test)

print ("Para modelo Multinomial NB:  " +'alpha = %f accuracy = %f' %(alpha, np.mean((y_test-y_pred)==0)))
tb = time()
print ("Tempo total para treinar e prever o modelo Multinomial: " + str(tb-ta))
print ('--------------------------------------------------------------------------------------')


### Esta célula pode levar horas para ser executada!

In [None]:
# Multionomial Naive bayes: Alpha Vs accuracy 
acc = []
alp = []

for alpha in [float(j) / 100 for j in range(1, 101, 1)]:
    print ('--------------------------------------------------------------------------------------')
    ta = time()
    clf1 = MyMultinomialBayesClassifier(alpha)
    clf1.train(X_train,y_train)
    y_pred1 = clf1.predict(X_test)
    acc.append(np.mean((y_test-y_pred1)==0))
    alp.append(alpha)
    tb = time()
    print ("Tempo de Treinamento: " + str(tb-ta) + " acurácia, alpha é: " + str(np.mean((y_test-y_pred1)==0)) +","+str(alpha))
    #print ('alpha=%f accuracy = %f' %(alpha, np.mean((y_test-y_pred1)==0)))

# Plotting 
with PdfPages('multinomial.pdf') as pdf:
    pl.plot(alp,acc,marker = '.', linestyle = '-', color = 'r')
    pl.ylabel('Acurácia',color = 'g')
    pl.xlabel('Alpha',color = 'g')
    pl.title('Plot Alpha Vs Acurácia para Multinomial NB',color = 'r')
    pdf.savefig() 
    pl.close()

# Max Accuracy and Corresponding alpha.

print ("Acurácia máxima para o modelo Multinomial NB is: " + str(max(acc)))
print ("com valor correspondente alpha de:       " + str(alp[(acc.index(max(acc)))]))