In [0]:
import os
import re
import string
import codecs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
%matplotlib inline

import keras
import keras.backend as K
from keras import Input
from keras.layers import Concatenate
from keras.models import Sequential
from keras.layers import Activation
from keras.layers import Reshape
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import SimpleRNN
from keras.layers import LSTM
from keras.layers import GRU
from keras.layers import CuDNNLSTM
from keras.layers.embeddings import Embedding
from keras.layers import Bidirectional
from keras.preprocessing import sequence
from keras.layers import Dropout
from keras.optimizers import Adam
from keras.models import load_model
from keras import initializers, regularizers, optimizers
from keras.engine.topology import Layer
from keras import constraints
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

from nltk.tokenize import RegexpTokenizer

In [0]:
# Mount Google Drive (Colab only) so the files under "My Drive/data" are reachable.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# Copy the Airbnb reviews CSV from Drive into the working directory.
!cp "/content/drive/My Drive/data/reviews_rio_preprocessed.csv" "reviews_rio_preprocessed.csv"

In [0]:
# Copy the IMDB archive from Drive into the working directory.
!cp "/content/drive/My Drive/data/aclImdb_v1.tar.gz" "aclImdb_v1.tar.gz"

In [0]:
# Copy the Airbnb sample CSV from Drive into the working directory.
!cp "/content/drive/My Drive/data/sample_airbnb.csv" "sample_airbnb.csv"

In [0]:
# Unpack the IMDB archive (presumably creates the "aclImdb/" folder read by clean_imdb).
!tar -xf aclImdb_v1.tar.gz

In [0]:
# List the working directory to check the copied and extracted files.
!ls

In [0]:
def clean_imdb(directory):
    '''
    Returns cleaned dataframe of IMDB reviews with columns ['review', 'sentiment']

    Reads every file found under <directory>/{test,train}/{neg,pos}. The full
    text of each file becomes one row; 'neg' folders are labelled 0 and 'pos'
    folders are labelled 1.

    directory: root folder of the extracted dataset, with or without a
               trailing path separator.

    Files are visited in sorted name order so the row order is reproducible
    from run to run (os.listdir alone gives no ordering guarantee).
    '''
    sentiment = {'neg': 0, 'pos': 1}
    # Collect plain tuples and build the frame once: growing a DataFrame row
    # by row is quadratic, and DataFrame.append no longer exists in pandas 2.
    records = []
    for split_name in ('test', 'train'):
        for label in ('neg', 'pos'):
            folder = os.path.join(directory, split_name, label)
            for file_name in sorted(os.listdir(folder)):
                with open(os.path.join(folder, file_name), 'r',
                          encoding = 'utf-8') as text_file:
                    records.append((text_file.read(), sentiment[label]))
    return pd.DataFrame(records, columns = ['review', 'sentiment'])

cleaned_imdb = clean_imdb("aclImdb/")

In [0]:
cleaned_imdb.head()

In [0]:
len(cleaned_imdb)

In [0]:
def strip_punctuation_and_whitespace(reviews_df, verbose = True):
    '''
    Strips all punctuation and whitespace from reviews EXCEPT spaces (i.e. ' ')
    Removes "<br />"
    Returns dataframe of cleaned reviews with columns ['review', 'sentiment']

    reviews_df: either a DataFrame whose first two columns are the review text
                and its sentiment, or a Series of review texts (the sentiment
                is then NaN for every row).
    verbose:    when truthy, prints a progress line every 5000 reviews.

    Rows whose review is not a string (e.g. NaN for a missing comment) are
    skipped, so the result can be shorter than the input.
    Raises TypeError when reviews_df is neither a DataFrame nor a Series.
    '''
    trans_punc = str.maketrans(string.punctuation,
                               ' ' * len(string.punctuation))
    whitespace_except_space = string.whitespace.replace(' ', '')
    trans_white = str.maketrans(whitespace_except_space,
                                ' ' * len(whitespace_except_space))

    is_series = isinstance(reviews_df, pd.Series)
    if not is_series and not isinstance(reviews_df, pd.DataFrame):
        raise TypeError('reviews_df must be a pandas DataFrame or Series')

    total = len(reviews_df)
    # Accumulate rows in a list and build the frame once; concatenating inside
    # the loop copies the whole frame on every iteration.
    records = []
    for i, row in enumerate(reviews_df.values):
        if verbose and i % 5000 == 0:
            print('Stripping review: ' + str(i) + ' of ' + str(total))
        if is_series:
            review, sentiment = row, np.nan
        else:
            review, sentiment = row[0], row[1]
        if not isinstance(review, str):
            continue
        # str.replace returns a new string; the result has to be kept,
        # otherwise the tag's "br" survives as a word once '<', '/' and '>'
        # are turned into spaces below.
        review = review.replace('<br />', ' ')
        for trans in (trans_punc, trans_white):
            review = ' '.join(review.translate(trans).split())
        records.append((review, sentiment))
    return pd.DataFrame(records, columns = ['review', 'sentiment'])

stripped_imdb = strip_punctuation_and_whitespace(cleaned_imdb)

In [0]:
stripped_imdb.iloc[200].values

In [0]:
stripped_imdb.head()

In [0]:
def get_length_all_reviews(sentences):
    '''
    Counts the space-separated tokens of every review
    Returns the counts as a list, in input order (used for the histogram)
    '''
    word_counts = []
    for sentence in sentences:
        tokens = sentence.split(' ')
        word_counts.append(len(tokens))
    return word_counts

imdb_lengths = get_length_all_reviews(stripped_imdb['review'])

In [0]:
max(imdb_lengths)

In [0]:
def plot_histogram(sentence_lengths, x_dim):
    '''
    Draws a 50-bin histogram of review lengths restricted to [0, x_dim]
    and shows the figure
    '''
    plt.hist(sentence_lengths, bins = 50, range = [0, x_dim])
    plt.title('Review Lengths (Words per review)')
    plt.xlabel('Review length (words)')
    plt.ylabel('Frequency')
    plt.show()

# Full range first, then zoomed in on reviews of up to 1200 words.
for x_limit in (2525, 1200):
    plot_histogram(imdb_lengths, x_limit)

In [0]:
def create_tokenizer(max_words_to_keep, words_review_df):
    '''
    Fits a Tokenizer (lower-casing, split on spaces, limited to
    max_words_to_keep words) on the 'review' column of words_review_df
    Returns the fitted tokenizer and the same reviews encoded as integers
    '''
    review_texts = words_review_df['review'].values
    fitted = Tokenizer(num_words = max_words_to_keep, lower = True, split = ' ')
    fitted.fit_on_texts(review_texts)
    encoded_reviews = fitted.texts_to_sequences(review_texts)
    return fitted, encoded_reviews

imdb_sequence_length = 1000
vocabulary_length = 10000
tokenizer, integer_reviews = create_tokenizer(vocabulary_length, stripped_imdb)

In [0]:
print(len(integer_reviews[100]))
print(len(integer_reviews[200]))

In [0]:
def pad_zeros(encoded_reviews, padding_length, padding = 'pre'):
    '''
    Brings every integer-encoded review to padding_length via pad_sequences,
    padding on the left ('pre') or on the right ('post')
    '''
    padded = pad_sequences(encoded_reviews, maxlen = padding_length, padding = padding)
    return padded

padded_reviews = pad_zeros(integer_reviews, imdb_sequence_length, padding = 'pre')

In [0]:
print(len(padded_reviews[100]))
print(len(padded_reviews[200]))

In [0]:
# Hold out 20% of all reviews for testing, then 20% of the remainder for validation.
split = 0.2
X_train, X_test, y_train, y_test = train_test_split(
    padded_reviews,
    stripped_imdb['sentiment'],
    test_size=split,
    random_state=42,
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=split,
    random_state=1,
)

In [0]:
# Sizes of the train / validation / test partitions.
print("Treino: ", len(X_train))
# This line reports the validation set; it was previously labelled "Treino" too.
print("Validação: ", len(X_val))
print("Teste: ", len(X_test))

## RNN

In [0]:
def create_rnn_model(vocabulary_length):
  '''
  Builds and compiles a binary classifier made of an Embedding layer
  (vocabulary_length entries, 32 dimensions), a SimpleRNN with 128 units and
  a single sigmoid output unit. Compiled with binary cross-entropy, the Adam
  optimizer and the accuracy metric.

  Prints the layer summary and returns the compiled model.
  '''
  model = Sequential()
  model.add(Embedding(vocabulary_length, 32))
  model.add(SimpleRNN(128))
  model.add(Dense(1, activation='sigmoid'))
  model.compile(loss = 'binary_crossentropy', optimizer = 'Adam', metrics = ['accuracy'])

  # summary() prints the table itself and returns None; wrapping it in print()
  # only added a stray "None" line to the output.
  model.summary()
  return model

In [0]:
# Build the SimpleRNN classifier over the tokenizer vocabulary.
rnn_model = create_rnn_model(vocabulary_length = vocabulary_length)

In [0]:
def train_model(model, X_train, y_train, X_test, y_test, X_val, y_val, batch_size, epochs) :
    '''
    Fits model on the training data while monitoring the validation data,
    then evaluates it on the test data and prints the resulting score and
    accuracy.

    Returns (history, model): the object returned by model.fit and the same
    model instance that was passed in.
    '''
    validation_pair = (X_val, y_val)
    history = model.fit(X_train, y_train,
                        validation_data=validation_pair,
                        batch_size=batch_size,
                        epochs=epochs,
                        verbose=1)

    # evaluate() yields the loss first and the compiled metric second.
    test_loss, test_accuracy = model.evaluate(X_test, y_test, batch_size=batch_size)
    print('Test score:', test_loss)
    print('Test accuracy:', test_accuracy)
    return history, model

In [0]:
# train_model expects the validation pair before batch_size and epochs; without
# X_val / y_val this call raises a TypeError for missing arguments.
history, rnn_model = train_model(rnn_model, X_train, y_train, X_test, y_test, X_val, y_val, 512, 10)

In [0]:
def plot_accuracy(history):
  '''
  Plots the validation accuracy recorded for each epoch.

  history: object whose .history dict holds the per-epoch validation accuracy
           under 'val_acc' (older Keras) or 'val_accuracy' (newer Keras).

  The number of epochs is taken from the recorded values instead of being
  fixed at 10, so histories of any length can be plotted.
  Raises KeyError when neither key is present.
  '''
  metrics = history.history
  val_accuracy = metrics['val_acc'] if 'val_acc' in metrics else metrics['val_accuracy']
  n_epochs = len(val_accuracy)
  plt.plot(range(n_epochs), val_accuracy, '--o')
  plt.xlabel('Epochs')
  plt.ylabel('Accuracy')
  plt.title('Validation Accuracy After {} Epochs'.format(n_epochs))

In [0]:
# Validation accuracy per epoch for the RNN run.
plot_accuracy(history = history)

In [0]:
def save_model(model, name):
  '''Stores model under the file name `name` by calling the model's own save().'''
  target_path = name
  model.save(target_path)

In [0]:
def load_keras_model(path):
  '''Returns the model that load_model reads from `path`.'''
  return load_model(path)

In [0]:
save_model(rnn_model, 'rnn_model_final.h5')

In [0]:
rnn_model = load_keras_model('/content/drive/My Drive/data/modelos/rnn_model.h5')

#### Evaluation

In [0]:
# Predicted class labels of the RNN for the IMDB test split.
prediction_rnn_test = rnn_model.predict_classes(X_test)
# Flatten to 1-D with -1 so the cell does not depend on the test split holding
# exactly 10000 reviews.
prediction_rnn_test = prediction_rnn_test.reshape(-1)

In [0]:
# Ground-truth test labels as an integer NumPy array. The conversion is chained
# on .values; previously the second assignment started again from y_test and
# silently discarded the .values result.
y_test_v = y_test.values.astype(int)

In [0]:
# Accuracy

accuracy_rnn_test = accuracy_score(y_test_v, prediction_rnn_test)
print('Accuracy: %.3f' % accuracy_rnn_test)

In [0]:
# Precision

precision_rnn_test = precision_score(y_test_v, prediction_rnn_test, average='binary')
print('Precision: %.3f' % precision_rnn_test)

In [0]:
# Recall

recall_rnn_test = recall_score(y_test_v, prediction_rnn_test, average='binary')
print('Recall: %.3f' % recall_rnn_test)

In [0]:
# F1-scores

score_rnn_test = f1_score(y_test_v, prediction_rnn_test, average='binary')
print('F-Measure: %.3f' % score_rnn_test)

## LSTM

In [0]:
def create_lstm_model(vocabulary_length):
    '''
    Builds and compiles a binary classifier made of an Embedding layer
    (vocabulary_length entries, 32 dimensions), an LSTM with 128 units using
    0.2 input dropout and 0.2 recurrent dropout, and a single sigmoid output
    unit. Compiled with binary cross-entropy, the adam optimizer and the
    accuracy metric.

    Prints the layer summary and returns the compiled model.
    '''
    model = Sequential()
    model.add(Embedding(vocabulary_length, 32))
    model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(1, activation = 'sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summary() prints the table itself and returns None; wrapping it in
    # print() only added a stray "None" line to the output.
    model.summary()

    return model

In [0]:
lstm_model = create_lstm_model(vocabulary_length)

In [0]:
lstm_history, lstm_model = train_model(lstm_model, X_train, y_train, X_test, y_test, X_val, y_val, 1000, 10)

In [0]:
plot_accuracy(lstm_history)

In [0]:
save_model(lstm_model, 'lstm_model.h5')

In [0]:
lstm_model = load_keras_model('/content/drive/My Drive/data/modelos/lstm_model.h5')

#### Evaluation

In [0]:
# Predicted class labels of the LSTM for the IMDB test split.
prediction_lstm_test = lstm_model.predict_classes(X_test)
# Flatten to 1-D with -1 so the cell does not depend on the test split holding
# exactly 10000 reviews.
prediction_lstm_test = prediction_lstm_test.reshape(-1)

In [0]:
# Accuracy

accuracy_lstm_test = accuracy_score(y_test_v, prediction_lstm_test)
print('Accuracy: %.3f' % accuracy_lstm_test)

In [0]:
# Precision

precision_lstm_test = precision_score(y_test_v, prediction_lstm_test, average='binary')
print('Precision: %.3f' % precision_lstm_test)

In [0]:
# Recall

recall_lstm_test = recall_score(y_test_v, prediction_lstm_test, average='binary')
print('Recall: %.3f' % recall_lstm_test)

In [0]:
# F1-scores

score_lstm_test = f1_score(y_test_v, prediction_lstm_test, average='binary')
print('F-Measure: %.3f' % score_lstm_test)

## LSTM Attention

In [0]:
from google.colab import files

# Upload the custom-layer module through the Colab file picker and keep the
# bytes of the first uploaded file.
src = list(files.upload().values())[0]
# Write it as layers.py and close the handle before importing, so the module
# is fully flushed to disk when Python reads it.
with open('layers.py', 'wb') as layers_file:
    layers_file.write(src)
from layers import AttentionWithContext, Addition

In [0]:
def create_lstm_attention_model(vocabulary_length, hidden_units, num_layers, is_attention, is_bidirectional):
    '''
    Builds and compiles a stacked (optionally bidirectional) LSTM classifier
    with optional attention.

    vocabulary_length: number of entries of the 32-dimensional Embedding layer.
    hidden_units:      units of every LSTM layer.
    num_layers:        number of LSTM layers to stack.
    is_attention:      when truthy, every LSTM returns full sequences and is
                       followed by AttentionWithContext and Addition.
    is_bidirectional:  when truthy, every LSTM is wrapped in Bidirectional.

    Prints the layer summary and returns the compiled model.
    '''
    adam = optimizers.Adam(lr=0.0005, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.01)

    model = Sequential()
    model.add(Embedding(vocabulary_length, 32))

    for layer_index in range(num_layers):
        # Without attention only the last LSTM collapses the sequence.
        return_sequences = is_attention or (num_layers > 1 and layer_index < num_layers-1)

        recurrent = LSTM(hidden_units,
                         return_sequences=return_sequences,
                         dropout=0.2,
                         kernel_initializer=initializers.glorot_normal(seed=777),
                         bias_initializer='zeros')
        model.add(Bidirectional(recurrent) if is_bidirectional else recurrent)

        # NOTE(review): the attention pair is added after every LSTM layer; if
        # Addition collapses the time axis, stacking num_layers > 1 with
        # attention would feed a non-sequence into the next LSTM — confirm
        # against layers.py.
        if is_attention:
            model.add(AttentionWithContext())
            model.add(Addition())

    model.add(Dense(1, activation='sigmoid', kernel_initializer=initializers.glorot_normal(seed=777), bias_initializer='zeros'))
    model.compile(loss='binary_crossentropy', optimizer=adam, metrics=["accuracy"])
    model.summary()

    return model

In [0]:
lstm_attention_model = create_lstm_attention_model(vocabulary_length, 128, 1, True, True)

In [0]:
lstm_attention_history, lstm_attention_model = train_model(lstm_attention_model, X_train, y_train, X_test, y_test, X_val, y_val, 128, 10)

In [0]:
plot_accuracy(lstm_attention_history)

#### Evaluation

In [0]:
# Predicted class labels of the attention model for the IMDB test split.
prediction_lstm_attention_test = lstm_attention_model.predict_classes(X_test)
# Flatten to 1-D with -1 so the cell does not depend on the test split holding
# exactly 10000 reviews.
prediction_lstm_attention_test = prediction_lstm_attention_test.reshape(-1)

In [0]:
# Accuracy of the attention model on the IMDB test split
# (a stray trailing backslash after the print call was removed).
accuracy_lstm_attention_test = accuracy_score(y_test_v, prediction_lstm_attention_test)
print('Accuracy: %.3f' % accuracy_lstm_attention_test)

In [0]:
# Precision

precision_lstm_attention_test = precision_score(y_test_v, prediction_lstm_attention_test, average='binary')
print('Precision: %.3f' % precision_lstm_attention_test)

In [0]:
# Recall

recall_lstm_attention_test = recall_score(y_test_v, prediction_lstm_attention_test, average='binary')
print('Recall: %.3f' % recall_lstm_attention_test)

In [0]:
# F1-scores

score_lstm_attention_test = f1_score(y_test_v, prediction_lstm_attention_test, average='binary')
print('F-Measure: %.3f' % score_lstm_attention_test)

## Reviews Airbnb

In [0]:
def load_reviews_airbnb(path, sequence_length = 250, fitted_tokenizer = None):
  '''
  Loads the unlabelled Airbnb reviews and prepares them for prediction.

  path:             CSV file with a 'comments' column.
  sequence_length:  length every encoded review is left-padded to.
  fitted_tokenizer: tokenizer used to encode the reviews; defaults to the
                    notebook-level `tokenizer` fitted on the IMDB reviews.

  The reviews must be encoded with the tokenizer whose word indices the models
  were trained on. Fitting a fresh tokenizer on the Airbnb text (as this
  function used to do) assigns different integers to the same words, which
  makes the inputs meaningless to the trained embedding.

  Also plots a histogram of review lengths (x axis up to 400 words) and prints
  the encoded length of reviews 100 and 200 when they exist.

  Returns (stripped_airbnb, airbnb_padded_reviews): the cleaned reviews with
  columns ['review', 'sentiment'] (sentiment is None) and the padded integer
  sequences.
  '''
  if fitted_tokenizer is None:
    fitted_tokenizer = tokenizer

  # Build the two-column frame in one chain; assigning a new column on a
  # column slice triggers pandas' SettingWithCopyWarning.
  reviews_airbnb = (pd.read_csv(path)[['comments']]
                    .rename(columns = {'comments': 'review'})
                    .assign(sentiment = None))

  stripped_airbnb = strip_punctuation_and_whitespace(reviews_airbnb, verbose = False)
  airbnb_lengths = get_length_all_reviews(stripped_airbnb['review'])
  plot_histogram(airbnb_lengths, 400)

  airbnb_integer_reviews = fitted_tokenizer.texts_to_sequences(stripped_airbnb['review'].values)
  # Spot-check two encoded lengths without failing on small files.
  for probe in (100, 200):
    if probe < len(airbnb_integer_reviews):
      print(len(airbnb_integer_reviews[probe]))

  airbnb_padded_reviews = pad_zeros(airbnb_integer_reviews, sequence_length, padding = 'pre')
  return stripped_airbnb, airbnb_padded_reviews

In [0]:
def load_airbnb_sample(path, sequence_length = 150, fitted_tokenizer = None):
  '''
  Loads the labelled Airbnb sample and prepares it for prediction.

  path:             CSV file with 'comments' and 'sentiment' columns (any
                    other column, such as a saved index, is ignored).
  sequence_length:  length every encoded review is left-padded to.
  fitted_tokenizer: tokenizer used to encode the reviews; defaults to the
                    notebook-level `tokenizer` fitted on the IMDB reviews.

  The reviews must be encoded with the tokenizer whose word indices the models
  were trained on. Fitting a fresh tokenizer on the Airbnb text (as this
  function used to do) assigns different integers to the same words, which
  makes the inputs meaningless to the trained embedding.

  Also plots a histogram of review lengths (x axis up to 400 words) and prints
  the encoded length of reviews 100 and 200 when they exist.

  Returns (stripped_airbnb, airbnb_padded_reviews): the cleaned reviews with
  columns ['review', 'sentiment'] and the padded integer sequences.
  '''
  if fitted_tokenizer is None:
    fitted_tokenizer = tokenizer

  # Selecting the two needed columns already discards 'Unnamed: 0'; the former
  # explicit in-place drop raised KeyError on files without that column.
  airbnb_sample = (pd.read_csv(path)[['comments', 'sentiment']]
                   .rename(columns = {'comments': 'review'}))

  stripped_airbnb = strip_punctuation_and_whitespace(airbnb_sample, verbose = False)
  airbnb_lengths = get_length_all_reviews(stripped_airbnb['review'])
  plot_histogram(airbnb_lengths, 400)

  airbnb_integer_reviews = fitted_tokenizer.texts_to_sequences(stripped_airbnb['review'].values)
  # Spot-check two encoded lengths without failing on small files.
  for probe in (100, 200):
    if probe < len(airbnb_integer_reviews):
      print(len(airbnb_integer_reviews[probe]))

  airbnb_padded_reviews = pad_zeros(airbnb_integer_reviews, sequence_length, padding = 'pre')
  return stripped_airbnb, airbnb_padded_reviews

In [0]:
stripped_airbnb, airbnb_padded_reviews = load_reviews_airbnb('reviews_rio_preprocessed.csv')

In [0]:
stripped_sample_airbnb, airbnb_padded_sample = load_airbnb_sample('sample_airbnb.csv')
y_test_sample = stripped_sample_airbnb['sentiment'].values
y_test_sample = y_test_sample.astype(int)

## RNN Prediction

#### População

In [0]:
prediction_rnn = rnn_model.predict_classes(airbnb_padded_reviews)

In [0]:
unique, counts = np.unique(prediction_rnn, return_counts=True)
dict(zip(unique, counts))

#### Amostra

In [0]:
prediction_rnn_sample = rnn_model.predict_classes(airbnb_padded_sample)

In [0]:
unique, counts = np.unique(prediction_rnn_sample, return_counts=True)
dict(zip(unique, counts))

In [0]:
# Flatten the predictions to 1-D; -1 keeps the cell valid for any sample size
# instead of requiring exactly 382 reviews.
prediction_rnn_sample_r = prediction_rnn_sample.reshape(-1)
matrix_rnn = confusion_matrix(y_test_sample, prediction_rnn_sample_r)

In [0]:
matrix_rnn

In [0]:
df_matrix_rnn = pd.DataFrame(matrix_rnn, index = ['Negativo', 'Positivo'], columns= ['Negativo', 'Positivo'])
plt.figure(figsize=(8,7))
sn.set(font_scale=1.4)
sn.heatmap(df_matrix_rnn, annot=True,  fmt='g', annot_kws={"size": 16})

In [0]:
# Accuracy

accuracy_rnn = accuracy_score(y_test_sample, prediction_rnn_sample_r)
print('Accuracy: %.3f' % accuracy_rnn)

In [0]:
# Precision

precision_rnn = precision_score(y_test_sample, prediction_rnn_sample_r, average='binary')
print('Precision: %.3f' % precision_rnn)

In [0]:
# Recall

recall_rnn = recall_score(y_test_sample, prediction_rnn_sample_r, average='binary')
print('Recall: %.3f' % recall_rnn)

In [0]:
# F1-scores

score_rnn = f1_score(y_test_sample, prediction_rnn_sample_r, average='binary')
print('F-Measure: %.3f' % score_rnn)

## LSTM Prediction

#### População

In [0]:
prediction_lstm = lstm_model.predict_classes(airbnb_padded_reviews)

In [0]:
unique_lstm, counts_lstm = np.unique(prediction_lstm, return_counts=True)
dict(zip(unique_lstm, counts_lstm))

#### Amostra

In [0]:
prediction_lstm_sample = lstm_model.predict_classes(airbnb_padded_sample)

In [0]:
unique, counts = np.unique(prediction_lstm_sample, return_counts=True)
dict(zip(unique, counts))

In [0]:
# Flatten the predictions to 1-D; -1 keeps the cell valid for any sample size
# instead of requiring exactly 382 reviews.
prediction_lstm_sample_r = prediction_lstm_sample.reshape(-1)
matrix_lstm = confusion_matrix(y_test_sample, prediction_lstm_sample_r)

In [0]:
# Confusion matrix of the LSTM on the sample, as a labelled heatmap.
df_matrix_lstm = pd.DataFrame(
    data = matrix_lstm,
    index = ['Negativo', 'Positivo'],
    columns = ['Negativo', 'Positivo'],
)
plt.figure(figsize = (8, 7))
sn.set(font_scale = 1.4)
sn.heatmap(data = df_matrix_lstm, annot = True, fmt = 'g', annot_kws = {"size": 16})

In [0]:
# Accuracy

accuracy_lstm = accuracy_score(y_test_sample, prediction_lstm_sample_r)
print('Accuracy: %.3f' % accuracy_lstm)

In [0]:
# Precision

precision_lstm = precision_score(y_test_sample, prediction_lstm_sample_r, average='binary')
print('Precision: %.3f' % precision_lstm)

In [0]:
# Recall

recall_lstm = recall_score(y_test_sample, prediction_lstm_sample_r, average='binary')
print('Recall: %.3f' % recall_lstm)

In [0]:
# F1-scores

score_lstm = f1_score(y_test_sample, prediction_lstm_sample_r, average='binary')
print('F-Measure: %.3f' % score_lstm)

## LSTM Attention Prediction

#### População

In [0]:
prediction_lstm_attention_sample = lstm_attention_model.predict_classes(airbnb_padded_reviews)

In [0]:
unique, counts = np.unique(prediction_lstm_attention_sample, return_counts=True)
dict(zip(unique, counts))

#### Amostra

In [0]:
prediction_lstm_attention_sample = lstm_attention_model.predict_classes(airbnb_padded_sample)

In [0]:
unique, counts = np.unique(prediction_lstm_attention_sample, return_counts=True)
dict(zip(unique, counts))

In [0]:
# Flatten the predictions to 1-D; -1 keeps the cell valid for any sample size
# instead of requiring exactly 382 reviews.
prediction_lstm_attention_sample_r = prediction_lstm_attention_sample.reshape(-1)
matrix_lstm_attention = confusion_matrix(y_test_sample, prediction_lstm_attention_sample_r)

In [0]:
# Accuracy

accuracy_lstm_attention = accuracy_score(y_test_sample, prediction_lstm_attention_sample_r)
print('Accuracy: %.3f' % accuracy_lstm_attention)

In [0]:
# Precision

precision_lstm_attention = precision_score(y_test_sample, prediction_lstm_attention_sample_r, average='binary')
print('Precision: %.3f' % precision_lstm_attention)

In [0]:
# Recall

recall_lstm_attention = recall_score(y_test_sample, prediction_lstm_attention_sample_r, average='binary')
print('Recall: %.3f' % recall_lstm_attention)

In [0]:
# F1-scores

score_lstm_attention = f1_score(y_test_sample, prediction_lstm_attention_sample_r, average='binary')
print('F-Measure: %.3f' % score_lstm_attention)

In [0]:
# Heatmap of the attention model's confusion matrix. The frame is built from
# matrix_lstm_attention; it previously reused matrix_lstm and therefore showed
# the plain LSTM's results under the attention heading.
df_matrix_lstm_attention = pd.DataFrame(matrix_lstm_attention, index = ['Negativo', 'Positivo'], columns= ['Negativo', 'Positivo'])
plt.figure(figsize=(8,7))
sn.set(font_scale=1.4)
sn.heatmap(df_matrix_lstm_attention, annot=True,  fmt='g', annot_kws={"size": 16})