In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from abc import ABC, abstractmethod
from nltk.corpus import words
import re
import math

In [2]:
class Unit(ABC):
    
    '''
    # Método para computar a unidade de delay, e este valor
    # é compartilhado com os outros neuronios e também para 
    # computar o y na camada de saída
    # param: 
    #   x_t = x na posição t da sequência
    #   a_t_minus_1 = valor da unidade de delay anterior
    #   weigths = vetor contendo os pesos de x, a e do bias
    # return:
    #   Valor da promixa unidade de delay
    '''
    @abstractmethod
    def __handle__(self, x_t, a_t_minus_1):
        return NotImplemented
    

class UnitSimple(Unit):
    
    __slots__ = ['u', 'output', 'type_of_layer']
    
    def __init__(self, type_of_layer):
        self.output = 0
        self.u = 0
        self.type_of_layer = type_of_layer
    
    def __handle__(self, x_t, a_t_minus_1, weigths, fuction):
        if self.type_of_layer in 'hidden':
            part_one = np.dot(x_t, weigths[0, : -3].T)
            part_two = np.dot(a_t_minus_1, weigths[0][-2])
            part_three = np.dot(-1, weigths[0][-3])
            self.u = part_one + part_two + part_three 
            self.output = self.function_activation(fuction, self.u)
        else:
            self.u = weigths[0][-1] * x_t
            self.output = 1 if self.function_activation(fuction, self.u) >= 0.5 else 0
    
    def function_activation(self, function, u):
        if function in 'h':
            return (math.exp(u)-math.exp(-u))/(math.exp(u) + math.exp(-u))
        elif function in 'l':
            return (1)/(1 + math.exp(-u))
        else:
            return 1
        
    def diff_function(self, function, u):
        if function in 'h':
            return (math.exp(u)-math.exp(-u))/(math.exp(u) + math.exp(-u))
        elif function in 'l':
            return u * (1 - u)
        else:
            return 1

In [3]:
class HiddenLayer:
    
    __slots__ = ['num_of_neurons', 'neurons', 'type_of_neurons', 'function_ativation', 'units_delay']
    
    def __init__(self, num_of_neurons, type_of_neurons='n', function_ativation='l'):
        self.num_of_neurons = num_of_neurons
        self.type_of_neurons = type_of_neurons
        self.function_ativation = function_ativation
        self.units_delay = np.zeros((num_of_neurons + 1, 1))
        
        if type_of_neurons in 'lstm':
            self.neurons = [LSTM() for i in range(num_of_neurons)]
        elif type_of_neurons in 'gru':
            self.neurons = [GRU() for i in range(num_of_neurons)]
        else:
            self.neurons = [UnitSimple('hidden') for i in range(num_of_neurons)]
    
    def activate_neurons(self, x_t, index, weigths):
        a_t_minus_1 = self.units_delay[index]
        self.neurons[index].__handle__(x_t, a_t_minus_1, weigths, self.function_ativation)
        self.units_delay[index+1] = self.neurons[index].output


In [4]:
class OutputLayer:
    
    __slots__ = ['num_of_neurons', 'neurons', 'function_activation']
    
    def __init__(self, num_of_neurons, function_activation='l'):
        self.num_of_neurons = num_of_neurons
        self.function_activation = function_activation
        self.neurons = [UnitSimple('output') for i in range(num_of_neurons)]
    
    def activate_neuron(self, unit_delay, index, weigths):
        self.neurons[index].__handle__(unit_delay, None, weigths, self.function_activation)
        return self.neurons[index].output

In [23]:
class RNNSimple:
    
    __slots__ = ['weigths']        
    
    def __init__(self):
        # São 3 pesos: um para a unidade de delay, um para a entrada e outro para o bias
        self.weigths = []       
    
    def train(self, input_set, label_set, epocks=5, rate_learning = 0.1, fun_activations=['h','l']):
        # Parte 1 - Inicialização da rede 
        dim_of_input = len(input_set[0][0])
        dim_of_weigth = dim_of_input + 3 # + 2 significa os pesos da unidade de delay, o bias e  do y
        self.weigths = np.random.random((1, dim_of_weigth ))
        
        for epock in range(epocks):
            ''' Implementar depois 
            random_index = np.random.choice(input_set.index, len(input_set), replace=False)
            x_set = input_set.loc[random_index] # Embaralhando o conjuto de treino
            y_set = label_set.loc[random_index] # Embaralhando o conjuto de rótulos
            '''
            label_set.index = range(len(input_set))
            
            for count in range(len(input_set)):
                # Parte 2 - Feedforward
                sequence_data = np.array(input_set[count]) # Obtendo a frase
                label = label_set.loc[count] # Obtendo a sainda
                length_sequence = len(sequence_data)
                hidden_layer = HiddenLayer(length_sequence)
                output_layer = OutputLayer(length_sequence)
                
                y_predicted = 0
                unit_delay = None
                diff_u_output = 0
                for i in range(0,length_sequence):
                    data = sequence_data[i]
                    hidden_layer.activate_neurons(data, i, self.weigths)
                    if i == length_sequence - 1:
                        unit_delay = hidden_layer.units_delay[i+1]
                        y_predicted = output_layer.activate_neuron(unit_delay, i, self.weigths)
                        u = output_layer.neurons[i].u
                        diff_u_output = output_layer.neurons[i].diff_function('l',u)
                    
                # Parte 3 - Backforward
                loss = label - y_predicted
                delta = loss * diff_u_output * unit_delay 
                self.weigths[0][-1] += rate_learning * delta 
                
                for i in range(length_sequence-1, -1, -1): # De tráz pra frente
                    hidden_neuron = hidden_layer.neurons[i] 
                    diff_u_hidden = hidden_neuron.diff_function('l',hidden_neuron.u)
                    delta_a = loss * diff_u_output * diff_u_hidden * hidden_layer.units_delay[i]
                    delta_x = loss * diff_u_output * diff_u_hidden * sequence_data[i]
                    delta_b = loss * diff_u_output * diff_u_hidden * np.array([-1])
                    self.weigths[0][-2] += rate_learning * delta_a
                    self.weigths[0][-3] += rate_learning * delta_b
                    self.weigths[0, :-3] += rate_learning * delta_x 
        
    def predict(self):
        pass

In [6]:
def train_test(data, class_d):
    train_x, test_x = _split_data_(data)
    train_d = class_d.iloc[train_x.index]
    test_d = class_d.drop(train_d.index)
    return train_x,test_x,train_d,test_d

def _split_data_(data):
    index_random = _random_index_(data,0.8)
    return data.loc[index_random],data.drop(index_random)

def _random_index_(data,count):
    random_count = int(len(data) * count)
    return np.random.choice(data.index,random_count,replace = False)


In [7]:
def class_encode(specie,column,values_y):
    if column in specie:
        return values_y[1]
    else:
        return values_y[0]

In [8]:
# Carregar dados 
dataset = pd.read_csv('chennai_reviews_edited.csv')
dataset.drop(['Unnamed: 0'], axis=1, inplace=True)

In [9]:
dataset

Unnamed: 0,Review_Text,Sentiment
0,Its really nice place to stay especially for b...,3
1,It seems that hotel does not check the basic a...,1
2,Worst hotel I have ever encountered. I will ne...,1
3,Had a good time in this hotel and the staff Ku...,3
4,good hotel and staff Veg food good non veg bre...,3
...,...,...
4762,My fifth stay at the hotel for business. Rooms...,3
4763,enjoyable,3
4764,Most impressive service by staff in all areas....,3
4765,"The linens were smelling bad, and the elevator...",1


In [10]:
dataset_class = dataset.Sentiment 
dataset_input = dataset.drop(['Sentiment'], axis=1)

In [11]:
NUM_OF_SAMPLES = 30
random_index = np.random.choice(dataset_input.index, NUM_OF_SAMPLES, replace=False)
label_set = dataset_class.loc[random_index]
input_set = dataset_input.loc[random_index]
input_set

Unnamed: 0,Review_Text
3731,"A++ friendly staff, direct access to the avenu..."
1325,"The room is very good, and the service is also..."
2179,A. Comfortable clean stay
3214,Ambience is good. Excellent location. Service ...
3634,ITC Grand Chola - A Luxury Collection Hotel. W...
2824,"This hotel was the worst I have ever been to, ..."
4074,Hotel staff was friendly and courteous.\nRoom ...
1469,It is on a good location and the staff are cou...
1362,Wasome experience and it was very good hotel ....
1996,Excellent location for my appointments in buil...


In [12]:
# Pré-processemanto dos dados
vocabulary = words.words()
vocabulary.append('<UNQ>')
vocabulary.append('<EOF>')

input_matrix = []
for index, row in input_set.iterrows():
    sentence = row[0].replace(',','').replace('.','') + ' <EOF>'
    words_in_sentences = sentence.split(' ')
    input_sequence = []
    for word in words_in_sentences:
        x = np.zeros(len(vocabulary) + 1)
        if word in vocabulary:
            x[vocabulary.index(word)] = 1
        elif '<EOF>':
            x[-1] = 1
        else:
            x[-2] = 1
        input_sequence.append(x)
    input_matrix.append(input_sequence)

In [13]:
input_matrix[0]

[array([0., 0., 0., ..., 0., 0., 1.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 1.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 1.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 1.]),
 array([0., 0., 0., ..., 0., 0., 1.]),
 array([0., 0., 0., ..., 0., 0., 1.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 1., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 1.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 

In [14]:
label_matrix = label_set.apply(class_encode, column='3',values_y=[0,1])
label_matrix

3731    1
1325    1
2179    1
3214    1
3634    1
2824    0
4074    1
1469    0
1362    1
1996    1
548     1
2456    0
4247    1
714     1
4705    1
1252    0
2846    0
1003    0
3291    1
3843    0
281     0
871     1
210     1
919     1
3074    1
1439    1
543     1
1677    0
330     1
3131    0
Name: Sentiment, dtype: int64

In [24]:
rnn = RNNSimple()
rnn.train(input_matrix, label_matrix)

1
1
[[0.85493007 0.45914261 0.93783683 ... 0.54413464 0.80275802 0.77342257]]

1
1
[[0.85493007 0.45914261 0.93783683 ... 0.54413464 0.80275802 0.77342257]]

1
1
[[0.85493007 0.45914261 0.93783683 ... 0.54413464 0.80275802 0.77342257]]

1
1
[[0.85493007 0.45914261 0.93783683 ... 0.54413464 0.80275802 0.77342257]]

1
1
[[0.85493007 0.45914261 0.93783683 ... 0.54413464 0.80275802 0.77342257]]

0
1
[[0.85493007 0.45345236 0.93783683 ... 0.74990038 0.67815014 0.75640683]]

1
1
[[0.85493007 0.45345236 0.93783683 ... 0.74990038 0.67815014 0.75640683]]

0
1
[[0.85493007 0.45277945 0.93783683 ... 0.83722051 0.62660823 0.74244497]]

1
1
[[0.85493007 0.45277945 0.93783683 ... 0.83722051 0.62660823 0.74244497]]

1
1
[[0.85493007 0.45277945 0.93783683 ... 0.83722051 0.62660823 0.74244497]]

1
1
[[0.85493007 0.45277945 0.93783683 ... 0.83722051 0.62660823 0.74244497]]

0
1
[[0.85493007 0.45277945 0.93783683 ... 0.79752777 0.64006651 0.72854219]]

1
1
[[0.85493007 0.45277945 0.93783683 ... 0.7975277

KeyboardInterrupt: 