## Classificação de Avaliações do IMDb com Keras

In [None]:
from keras.datasets import imdb
from keras import preprocessing
import numpy as np
import pandas as pd

### Leitura dos dados

In [None]:
# Load the dataset from the gzip-compressed CSV file, decoded as UTF-8.
df = pd.read_csv('movie_data.csv.gz', encoding='utf-8')

In [None]:
# Preview the first rows to check what was loaded.
df.head()

In [None]:
# Values of the "review" column; these texts are what the tokenizer is fitted on.
samples = df["review"].values

In [None]:
# Vocabulary size: passed as input_dim to the Embedding layers further down.
# NOTE(review): despite the name, this is not the size of the word vectors
# (that is new_dim); it is expected to match the Tokenizer's num_words.
dimensionality = 1000

### Constrói o índice de palavras

In [None]:
from keras.preprocessing.text import Tokenizer

In [None]:
# Cap the vocabulary with the same constant that is used as the Embedding
# layers' input_dim, so the indices produced here and the embedding table
# size cannot drift apart.
tokenizer = Tokenizer(num_words=dimensionality)
tokenizer.fit_on_texts(samples)  # builds the word index from the corpus

In [None]:
# Keep the fitted tokenizer's word index and report how many entries it holds.
word_index = tokenizer.word_index
print('Foram encontrados %s tokens.' % (len(word_index),))

### Transforma strings em lista de índices inteiros

In [None]:
# Convert every review text into a sequence of integer word indices.
sequences = tokenizer.texts_to_sequences(samples)

In [None]:
# First 10 indices of review 0.
sequences[0][:10]

### Pré-processa sequências para padronizar o tamanho

In [None]:
# Common length for every sequence; also passed as input_length to the
# Embedding layers further down.
maxlen = 200
# Standardise all index sequences to `maxlen` entries.
# NOTE(review): which side gets padded/truncated follows the pad_sequences
# defaults — confirm against the Keras documentation.
sequences_padding = preprocessing.sequence.pad_sequences(sequences, maxlen=maxlen)

In [None]:
# Length of review 10 before padding.
len(sequences[10])

In [None]:
# Length of the same review after padding (expected to equal maxlen).
len(sequences_padding[10])

### Usando a camada Embedding e classificando os dados do IMDb

### SimpleRNN

#### Construindo o modelo

In [None]:
from keras.models import Sequential
from keras.layers import SimpleRNN, Embedding, Dense, Input

In [None]:
# Size of the vector each word is embedded into; also reused as the number
# of units in the recurrent layers.
new_dim = 32
# Number of words to consider as features.
# NOTE(review): not referenced by the model code in this notebook, and it
# differs from the tokenizer's vocabulary cap — confirm whether it is needed.
original_dim = 10_000

In [None]:
# Embedding -> SimpleRNN -> single sigmoid unit (binary classifier).
model = Sequential()
# Maps each of the `dimensionality` word indices to a `new_dim`-sized vector,
# for input sequences of `maxlen` tokens.
model.add(Embedding(input_dim=dimensionality, input_length=maxlen, output_dim=new_dim))
# No input_shape here: the layer's input shape is inferred from the Embedding
# output. `input_shape=(new_dim)` is a plain int rather than a tuple and does
# not describe this layer's (timesteps, features) input, so it is not passed.
model.add(SimpleRNN(new_dim))
# One sigmoid unit producing a score in [0, 1] for the binary target.
model.add(Dense(1, activation='sigmoid'))

In [None]:
# Print the layer-by-layer summary of the model built above.
model.summary()

#### Compilando o modelo

In [None]:
# Configure training: RMSprop optimizer, binary cross-entropy loss, and the
# 'acc' metric (the plotting cell reads the 'acc' / 'val_acc' history keys).
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

#### Dividindo os dados em treino e teste

In [None]:
import random 

In [None]:
# Random ordering of the sample positions, later applied to both features and
# labels so they are shuffled in unison. NumPy's own permutation is used
# because the stdlib random.shuffle is not designed for ndarrays.
size = len(sequences_padding)
indices = np.random.permutation(size)

In [None]:
# Inspect the shuffled index order.
indices

In [None]:
# Apply the same shuffled order to the padded sequences and to the
# "sentiment" labels so every row keeps its own label.
x = sequences_padding[indices]
y = df["sentiment"].values[indices]

In [None]:
# Inspect the shuffled feature matrix.
x

In [None]:
# Inspect the shuffled labels.
y

In [None]:
# Fraction of the shuffled data used for training; the remainder is the test set.
treino = 0.8
# Row position where the training portion ends and the test portion begins.
split_at = int(treino * size)

x_treino, x_teste = x[:split_at], x[split_at:]
y_treino, y_teste = y[:split_at], y[split_at:]

In [None]:
# Check how many labels ended up in the test split.
y_teste.shape

#### Treinando o modelo

In [None]:
# Train for 10 epochs with batches of 256, using validation_split=0.2 so a
# fifth of the training data is held out for validation. The returned
# history is what the plotting cell reads.
history = model.fit(x_treino, y_treino, epochs=10, batch_size=256, validation_split=0.2)

#### Avaliando o modelo

In [None]:
# Score the trained model on the held-out test split; the result is shown in a later cell.
evaluation = model.evaluate(x_teste,y_teste)

#### Visualizando resultados

In [None]:
import  matplotlib.pyplot as plt

In [None]:
# Accuracy per epoch: training curve first, then validation curve, so the
# legend entries line up with the plotted series.
for metric_key in ('acc', 'val_acc'):
    plt.plot(history.history[metric_key])
plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['treino', 'validação'], loc='upper left')
plt.show()

In [None]:
# Show the test-set result returned by model.evaluate for the SimpleRNN model.
evaluation

### Modelo LSTM

In [None]:
from keras.layers import LSTM, Dense, Masking, Embedding

# Embedding -> LSTM -> dense ReLU -> single sigmoid unit, declared in one go.
model = Sequential([
    # Word indices -> new_dim-sized vectors for sequences of maxlen tokens.
    Embedding(input_dim=dimensionality, input_length=maxlen, output_dim=new_dim),
    # Recurrent layer that returns only its final output; dropout of 0.1 is
    # set on both the dropout and recurrent_dropout arguments.
    LSTM(new_dim, return_sequences=False, dropout=0.1, recurrent_dropout=0.1),
    # Fully connected layer.
    Dense(new_dim, activation='relu'),
    # Output layer: one sigmoid unit for the binary target.
    Dense(1, activation='sigmoid'),
])


In [None]:
# Same training configuration as the SimpleRNN model: RMSprop, binary
# cross-entropy, and the 'acc' metric read by the plotting cell.
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

In [None]:
# Train the LSTM model with the same settings as before (10 epochs, batches
# of 256, validation_split=0.2); this overwrites the previous `history`.
history = model.fit(x_treino, y_treino, epochs=10, batch_size=256, validation_split=0.2)

In [None]:
# Score the LSTM model on the held-out test split; this overwrites the previous `evaluation`.
evaluation = model.evaluate(x_teste,y_teste)

In [None]:
# Accuracy per epoch for the LSTM run: training curve first, then validation
# curve, matching the order of the legend labels.
for metric_key in ('acc', 'val_acc'):
    plt.plot(history.history[metric_key])
plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['treino', 'validação'], loc='upper left')
plt.show()

In [None]:
# Show the test-set result returned by model.evaluate for the LSTM model.
evaluation