Baseado em 

* https://gist.github.com/billy-yuan/0fe85405cb24c61ada5516165206b9f3
* http://www.nltk.org/book/ch06.html

In [20]:
from collections import Counter
import pandas as pd
import nltk as nltk
import numpy as np
from sklearn.model_selection import train_test_split
import string

# Tratamento do arquivo

In [2]:
# Load the reviews dataset; the CSV file uses ';' as its field separator.
chennai = pd.read_csv('chennai.csv', sep=';')
# Display the first rows as a quick sanity check of the parsed columns.
chennai.head()

Unnamed: 0,Hotel_name,Review_Title,Review_Text,Sentiment,Rating_Percentage
0,Accord Metropolitan,Excellent comfortableness during stay,Its really nice place to stay especially for b...,3,100
1,Accord Metropolitan,Not too comfortable,It seems that hotel does not check the basic a...,1,20
2,Accord Metropolitan,,Worst hotel I have ever encountered. I will ne...,1,20
3,Accord Metropolitan,Best hotel,Had a good time in this hotel and the staff Ku...,3,100
4,Accord Metropolitan,,good hotel and staff Veg food good non veg bre...,3,100


## Classes

### Preparador para treino

In [48]:
class NaiveBayesPreparator(object):
    '''
    Pairs each description with its sentiment label, representing the
    description as a dict of per-token counts.

    After construction, ``features`` holds one ``(token_counts, sentiment)``
    tuple per description, in input order.
    '''

    def __init__(self, sentiments, descriptions):
        # Translation table that deletes every character listed in
        # string.punctuation. It must be created before count_words() is
        # called below, because decode() reads it.
        self._translator = str.maketrans('', '', string.punctuation)

        self.sentiments = sentiments
        self.descriptions = descriptions

        word_counts = self.count_words(descriptions)
        self.features = [pair for pair in zip(word_counts, sentiments)]

    def count_words(self, descriptions):
        '''
        Count the tokens of every description.

        :param descriptions: iterable of texts
        :return: list with one ``{token: occurrences}`` dict per description
        '''
        counted = []
        for description in descriptions:
            tokens = self.decode(description)
            counted.append(dict(Counter(tokens)))
        return counted

    def decode(self, text):
        '''
        Strip the punctuation from ``text`` and tokenize what is left.

        :param text: string to process
        :return: the tokens produced by ``nltk.word_tokenize``
        '''
        without_punctuation = text.translate(self._translator)
        tokens = nltk.word_tokenize(without_punctuation)
        return tokens

    def split(self, test_size=0.2, random_state=None):
        '''
        Divide ``features`` into train and test portions.

        Both arguments are forwarded unchanged to ``train_test_split``, and
        its result is returned as is.
        '''
        return train_test_split(
            self.features,
            test_size=test_size,
            random_state=random_state,
        )

### Utilitário para medir acurácia

In [40]:
from collections import defaultdict

class Accuracy(object):
    '''
    Helpers that print the accuracy of a classifier, overall and per class.
    '''

    @staticmethod
    def mensurate(classifier, training_set, test_set):
        '''
        Print overall and per-class accuracy for the train and test sets.

        :param classifier: classifier accepted by ``nltk.classify.accuracy``
        :param training_set: sequence of ``(features, label)`` pairs used for training
        :param test_set: sequence of ``(features, label)`` pairs held out for testing
        '''
        # Use the `training_set` argument rather than the notebook-level
        # `train_set` global, so the report always matches the data passed in.
        print("TRAIN SET")
        print('Train base', nltk.classify.accuracy(classifier, training_set))
        Accuracy.mensurate_by_class(classifier, training_set)

        print()

        print("TEST SET")
        print('Test base', nltk.classify.accuracy(classifier, test_set))
        Accuracy.mensurate_by_class(classifier, test_set)

    @staticmethod
    def mensurate_by_class(classifier, elements):
        '''
        Print the accuracy of ``classifier`` separately for each label.

        :param classifier: classifier accepted by ``nltk.classify.accuracy``
        :param elements: iterable of ``(features, label)`` pairs
        '''
        # Group the pairs by their label (second item of each pair).
        by_label = defaultdict(list)
        for element in elements:
            by_label[element[1]].append(element)

        for label, members in by_label.items():
            print(' - Class', label, ':', nltk.classify.accuracy(classifier, members))

## Execução

In [49]:
# Review text is the model input; the sentiment column is the label.
descriptions, sentiments = chennai['Review_Text'], chennai['Sentiment']

# Build the (token counts, sentiment) pairs and split them with a fixed seed.
train_set, test_set = (
    NaiveBayesPreparator(sentiments, descriptions)
    .split(random_state=128)
)

# Train on the training portion and list the most informative features.
classifier = nltk.NaiveBayesClassifier.train(train_set)
classifier.show_most_informative_features()

Most Informative Features
                   Worst = 1                   1 : 3      =     39.3 : 1.0
                  Shower = 1                   1 : 3      =     31.5 : 1.0
                    even = 2                   1 : 3      =     31.4 : 1.0
                      no = 3                   1 : 3      =     31.4 : 1.0
                   phone = 1                   1 : 3      =     28.5 : 1.0
                   worst = 1                   1 : 3      =     28.4 : 1.0
                   badly = 1                   1 : 3      =     27.6 : 1.0
                 wouldnt = 1                   1 : 3      =     26.7 : 1.0
                  linens = 1                   1 : 3      =     26.7 : 1.0
                 privacy = 1                   1 : 3      =     26.7 : 1.0


In [50]:
# Print overall and per-class accuracy for the train and test sets.
Accuracy.mensurate(classifier, train_set, test_set)

TRAIN SET
Train base 0.86392239119
 - Class 1 : 0.936842105263
 - Class 2 : 0.899096385542
 - Class 3 : 0.845487364621

TEST SET
Test base 0.759958071279
 - Class 1 : 0.728971962617
 - Class 2 : 0.627118644068
 - Class 3 : 0.8


## Outros métodos

### Bag of words

In [55]:
class NaiveBayesPreparatorBagOfWords(NaiveBayesPreparator):
    '''
    Variant of NaiveBayesPreparator whose feature dicts share one vocabulary.

    Every feature dict has a key for each token seen in any description;
    tokens absent from a given description get the count 0.
    '''

    def __init__(self, sentiments, descriptions):
        # The parent constructor calls count_words(), which needs the
        # vocabulary, and building the vocabulary calls decode(), which needs
        # the punctuation table. Both must therefore exist before
        # super().__init__ runs.
        self._translator = str.maketrans('', '', string.punctuation)
        self._all_words = self._calcule_all_words(descriptions)

        super(NaiveBayesPreparatorBagOfWords, self).__init__(sentiments, descriptions)

    def _calcule_all_words(self, descriptions):
        '''
        Collect the set of distinct tokens over all descriptions.

        :param descriptions: iterable of texts
        :return: set of every token returned by ``decode`` for any description
        '''
        all_words = set()

        for description in descriptions:
            all_words.update(self.decode(description))

        return all_words

    def count_words(self, descriptions):
        '''
        Count tokens per description over the full vocabulary.

        :param descriptions: iterable of texts
        :return: list with one ``{token: occurrences}`` dict per description,
                 each keyed by every token in the vocabulary
        '''
        def make(words):
            counter = Counter(words)
            # A Counter yields 0 for keys it has not seen, so no membership
            # test against `words` is needed.
            return {word: counter[word] for word in self._all_words}

        return [make(self.decode(description)) for description in descriptions]

# --------------------

# Rebuild the train/test split from bag-of-words features, using the same seed.
train_set, test_set = (
    NaiveBayesPreparatorBagOfWords(sentiments, descriptions)
    .split(random_state=128)
)

# Retrain on the new features and list the most informative ones.
classifier = nltk.NaiveBayesClassifier.train(train_set)
classifier.show_most_informative_features()

Most Informative Features
              unpleasant = 1                   1 : 3      =     26.7 : 1.0
                   dirty = 1                   1 : 3      =     25.1 : 1.0
                   worst = 1                   1 : 3      =     25.1 : 1.0
                 cramped = 1                   1 : 3      =     21.8 : 1.0
                    star = 1                   1 : 3      =     21.8 : 1.0
                terrible = 1                   1 : 3      =     21.8 : 1.0
            disappointed = 1                   1 : 3      =     21.8 : 1.0
                pathetic = 1                   1 : 3      =     21.8 : 1.0
                    torn = 1                   1 : 3      =     21.8 : 1.0
                  hectic = 1                   1 : 3      =     17.0 : 1.0


In [56]:
# Print overall and per-class accuracy for the classifier trained in the previous cell.
Accuracy.mensurate(classifier, train_set, test_set)

TRAIN SET
Train base 0.842160461458
 - Class 1 : 0.565789473684
 - Class 2 : 0.48343373494
 - Class 3 : 0.966064981949

TEST SET
Test base 0.759958071279
 - Class 1 : 0.336448598131
 - Class 2 : 0.310734463277
 - Class 3 : 0.946268656716
