# NAIVE BAYES
1. Crie um algoritmo “Naive Bayes” Multinomial em Python
2. Aplique-o sobre o dataset SMS Spam Collection
3. Compare o resultado com o gerado pelo sklearn
4. Considerando o que aprendeu até agora, aplique as técnicas cabíveis no
dataset escolhido.
5. Submeta no blackboard o link do notebook Google-Colab com a
solução do seu grupo

## Imports

In [97]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer

## Download Dataset

In [98]:
# Set random seed
np.random.seed(0)

# Load data
spam = pd.read_csv('spam.csv',encoding='iso8859_14')
spam = spam.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'],axis=1)
spam.columns=['target','text']

spam

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [99]:
# Vetorização
cv = CountVectorizer()
x = cv.fit_transform(spam['text']).toarray()
lb = LabelBinarizer()
y = lb.fit_transform(spam['target']).ravel()
print(x.shape,y.shape)


(5572, 8681) (5572,)


In [100]:
x_train, x_test, y_train, y_test = train_test_split(x,y)

## Problema 1
### Implementação Naive Bayes Multinomial

In [101]:
class MultiNB:
    def __init__(self,alpha=1):
        self.alpha = alpha
    
    def _prior(self): # CHECKED

        P = np.zeros(self.n_classes_)
        _, self.dist = np.unique(self.y,return_counts=True)
        for i in range(self.classes_.shape[0]):
            P[i] = self.dist[i] / self.n_samples
        return P
            
    def fit(self, X, y): # CHECKED, matches with sklearn

        self.y = y
        self.n_samples, self.n_features = X.shape
        self.classes_ = np.unique(y)
        self.n_classes_ = self.classes_.shape[0]
        self.class_priors_ = self._prior()
        
        # distinct values in each features
        self.uniques = []
        for i in range(self.n_features):
            tmp = np.unique(X[:,i])
            self.uniques.append( tmp )
            
        self.N_yi = np.zeros((self.n_classes_, self.n_features)) # feature count
        self.N_y = np.zeros((self.n_classes_)) # total count 
        for i in self.classes_: # x axis
            indices = np.argwhere(self.y==i).flatten()
            columnwise_sum = []
            for j in range(self.n_features): # y axis
                columnwise_sum.append(np.sum(X[indices,j]))
                
            self.N_yi[i] = columnwise_sum # 2d
            self.N_y[i] = np.sum(columnwise_sum) # 1d
            
    def _theta(self, x_i, i, h):
        
        Nyi = self.N_yi[h,i]
        Ny  = self.N_y[h]
        
        numerator = Nyi + self.alpha
        denominator = Ny + (self.alpha * self.n_features)
        
        return  (numerator / denominator)**x_i
    
    def _likelyhood(self, x, h):
        tmp = []
        for i in range(x.shape[0]):
            tmp.append(self._theta(x[i], i,h))
        
        return np.prod(tmp)
    
    def predict(self, X):
        samples, features = X.shape
        self.predict_proba = np.zeros((samples,self.n_classes_))
        
        for i in range(X.shape[0]):
            joint_likelyhood = np.zeros((self.n_classes_))
            
            for h in range(self.n_classes_):
                joint_likelyhood[h]  = self.class_priors_[h] * self._likelyhood(X[i],h) # P(y) P(X|y) 
                
            denominator = np.sum(joint_likelyhood)
            
            for h in range(self.n_classes_):
                numerator = joint_likelyhood[h]
                self.predict_proba[i,h] = (numerator / denominator)
            
        indices = np.argmax(self.predict_proba,axis=1)
        return self.classes_[indices]

In [102]:
def pipeline(X,y,X_test, y_test, alpha):

    # SKLEARN
    print("-"*20,'Sklearn',"-"*20)
    clf = MultinomialNB(alpha=alpha)
    clf.fit(X,y)
    sk_y = clf.predict(X_test)
    print("Feature Count \n",clf.feature_count_)
    print("Class Log Prior ",clf.class_log_prior_)
    print('Accuracy ',accuracy_score(y_test, sk_y),sk_y)
    print(clf.predict_proba(X_test))

    # CUSTOM
    print("-"*20,'Custom',"-"*20)
    nb = MultiNB(alpha=alpha)
    nb.fit(X,y)
    yhat = nb.predict(X_test)
    me_score = accuracy_score(y_test, yhat)
    print("Feature Count\n",nb.N_yi)
    print("Class Log Prior ",np.log(nb.class_priors_))
    print('Accuracy ',me_score,yhat)
    print(nb.predict_proba) # my predict proba is only for last test set

## Problema 2
### Aplicação sobre o dataset

In [103]:
pipeline(x,y,x,y, alpha=10)

-------------------- Sklearn --------------------
Feature Count 
 [[ 0.  0.  1. ...  1.  0.  1.]
 [10. 29.  0. ...  0.  1.  0.]]
Class Log Prior  [-0.14394332 -2.00944415]
Accuracy  0.9673366834170855 [0 0 1 ... 0 0 0]
[[9.99999578e-01 4.22333641e-07]
 [9.98808467e-01 1.19153267e-03]
 [2.17166195e-08 9.99999978e-01]
 ...
 [9.99996595e-01 3.40498406e-06]
 [1.00000000e+00 5.70994808e-12]
 [9.99539376e-01 4.60623978e-04]]
-------------------- Custom --------------------
Feature Count
 [[ 0.  0.  1. ...  1.  0.  1.]
 [10. 29.  0. ...  0.  1.  0.]]
Class Log Prior  [-0.14394332 -2.00944415]
Accuracy  0.9673366834170855 [0 0 1 ... 0 0 0]
[[9.99999578e-01 4.22333641e-07]
 [9.98808467e-01 1.19153267e-03]
 [2.17166195e-08 9.99999978e-01]
 ...
 [9.99996595e-01 3.40498406e-06]
 [1.00000000e+00 5.70994808e-12]
 [9.99539376e-01 4.60623978e-04]]


  self.predict_proba[i,h] = (numerator / denominator)


## Problema 3
### Comparação de resultados
Tal como podemos ver no problema 2 o nível de percisão atingido na implementação do Naive Bayes do sklearn é semelhante ao nível de percisão atingido com a implementação manual de Naive Bayes. 
- Sklearn: 0.967
- Manual : 0.967

## Problema 4
### Processamento do dataset escolhido