In [2]:
import numpy as np
import pandas as pd
from math import log
import nltk

In [1]:
class NaiveBayesClassificator:
    """Multinomial Naive Bayes text classifier with Laplace smoothing.

    Sentences are lower-cased, stripped of a fixed set of punctuation
    characters, optionally filtered against the NLTK Portuguese stop-word
    list, optionally stemmed with the NLTK RSLP stemmer, and finally turned
    into word n-grams. Those n-grams are the features ("tokens") counted per
    class.

    Parameters:
        n_gram: number of consecutive words joined into one token.
        stem: when true, every word is stemmed before n-grams are built.
        stop_words: when true, stop words are dropped before stemming.
        alpha: additive (Laplace) smoothing factor.
        class_prob: ``None`` to estimate class priors from the training
            data, or ``"equal"`` to give every class the same prior.
    """

    # Every character listed here is replaced by a single space before the
    # sentence is split into words.
    _PUNCTUATION_TABLE = str.maketrans(dict.fromkeys(":;,?()\n'.\"!@", " "))

    def __init__(self, n_gram=1, stem=True, stop_words=True, alpha=1, class_prob=None):
        self.n_gram = n_gram
        self.stem = stem
        self.stop_words = stop_words
        self.alpha = alpha
        self.class_prob = class_prob
        self.stemmer = nltk.stem.RSLPStemmer()
        self.stop_words_list = nltk.corpus.stopwords.words('portuguese')

    def _clean_sentence(self, sentence):
        """Normalise ``sentence`` and return its list of n-gram tokens.

        The input is converted with ``str()``, so non-string values are
        tokenised through their string representation.
        """
        text = str(sentence).translate(self._PUNCTUATION_TABLE).lower()
        words = text.split()

        # Stop words are matched against the raw (un-stemmed) word.
        if self.stop_words:
            words = [word for word in words if word not in self.stop_words_list]
        if self.stem:
            words = [self.stemmer.stem(word) for word in words]

        return self._create_gram(words)

    def _create_gram(self, words_list):
        """Join every run of ``self.n_gram`` consecutive words with a space.

        Returns an empty list when there are fewer words than ``n_gram``.
        """
        size = self.n_gram
        return [
            " ".join(words_list[start:start + size])
            for start in range(len(words_list) + 1 - size)
        ]

    def _create_dict(self, sentences_series):
        """Return a ``{token: occurrences}`` dict for the given sentences."""
        counts = {}
        for sentence in sentences_series:
            for token in self._clean_sentence(sentence):
                counts[token] = counts.get(token, 0) + 1

        # The most recent count table has always been exposed as
        # ``self.count``; keep that attribute for existing callers.
        self.count = counts
        return counts

    def _get_d(self, df, x_label):
        """Return the number of distinct tokens in column ``x_label``."""
        vocabulary = set()
        for sentence in df[x_label]:
            vocabulary.update(self._clean_sentence(sentence))
        return len(vocabulary)

    def _calc_prob(self, sentence, e):
        """Return the unnormalised log-probability of class ``e``.

        Computes ``log P(e) + sum(log P(token | e))`` where
        ``P(token | e) = (count(token, e) + alpha) / (N_e + alpha * d)``,
        ``N_e`` being the total number of tokens seen for class ``e`` and
        ``d`` the vocabulary size of the whole training set.
        """
        stats = self.classes_dicts[e]
        word_counts = stats["words"]
        log_prob = log(stats["class_prob"])

        # Denominator of the Laplace-smoothed likelihood.
        denominator = stats["n_words"] + self.alpha * self.d

        for token in self._clean_sentence(sentence):
            smoothed = word_counts.get(token, 0) + self.alpha
            log_prob += log(smoothed / denominator)

        return log_prob

    def _classify(self, sentence):
        """Return the most probable class for ``sentence``.

        Ties go to the class that appeared first in the training labels.
        Returns ``None`` when the model knows no classes.
        """
        return max(
            self.classes,
            key=lambda label: self._calc_prob(sentence, label),
            default=None,
        )

    def fit(self, df, x_label, y_label):
        """Train the classifier.

        Parameters:
            df: DataFrame holding the training data.
            x_label: name of the column with the sentences.
            y_label: name of the column with the class labels.

        Raises:
            ValueError: if ``self.class_prob`` is neither ``None`` nor
                ``"equal"``.
        """
        # Fail early: an unsupported value used to surface only later, as a
        # KeyError on "class_prob" during prediction.
        if self.class_prob is not None and self.class_prob != "equal":
            raise ValueError(
                'class_prob must be None or "equal", got {!r}'.format(self.class_prob)
            )

        self.df = df
        self.x_label = x_label
        self.y_label = y_label

        # Distinct labels, in order of first appearance.
        self.classes = list(dict.fromkeys(df[y_label]))

        self.classes_dicts = {}
        for label in self.classes:
            class_sentences = df[df[y_label] == label][x_label]
            word_counts = self._create_dict(class_sentences)

            if self.class_prob is None:
                prior = class_sentences.count() / df[x_label].count()
            else:
                prior = 1 / len(self.classes)

            self.classes_dicts[label] = {
                "words": word_counts,
                # Total number of token occurrences in the class (not the
                # number of distinct tokens): this is the N_e term of the
                # smoothed likelihood, so that the per-class token
                # probabilities sum to one over the vocabulary.
                "n_words": sum(word_counts.values()),
                "class_prob": prior,
            }

        self.d = self._get_d(df, x_label)

    def predict(self, sentence_series):
        """Classify every sentence; returns a Series with a fresh 0..n-1 index."""
        return pd.Series([self._classify(sentence) for sentence in sentence_series])

    @staticmethod
    def _paired_labels(y_test, y_pred):
        """Pair true and predicted labels by position, ignoring any index."""
        if len(y_test) != len(y_pred):
            raise ValueError(
                "y_test and y_pred must have the same length "
                "({} != {})".format(len(y_test), len(y_pred))
            )
        return zip(y_test, y_pred)

    def evaluate(self, y_test, y_pred):
        """Return ``(accuracy, number_of_correct_predictions)``.

        Labels are compared position by position, so ``y_test`` may keep the
        index of the DataFrame it was sliced from.

        Raises:
            ValueError: if the two series differ in length.
        """
        count = sum(
            1 for expected, predicted in self._paired_labels(y_test, y_pred)
            if expected == predicted
        )
        performance = count / (y_test.count())
        return (performance, count)

    def confusion_matrix(self, y_test, y_pred):
        """Return the confusion matrix as a square integer array.

        ``cm[i][j]`` counts samples whose true label is ``i`` and whose
        predicted label is ``j``. Labels are used directly as array indices,
        so they must be integers in ``0 .. n_classes - 1``. Samples are
        paired by position, not by index label.

        Raises:
            ValueError: if the two series differ in length.
        """
        n_classes = len(self.classes)
        cm = np.zeros((n_classes, n_classes), dtype=int)

        for cls, pred in self._paired_labels(y_test, y_pred):
            cm[cls, pred] += 1

        return cm
        
        