In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, GRUV2, SimpleRNN
from keras.models import load_model


In [None]:
# Load the preprocessed reviews dataset, report its row count and preview the first rows.
df = pd.read_csv(filepath_or_buffer='preprocessing_data.csv')
print(df.shape[0])
df.head(5)

39983


Unnamed: 0,overall,reviewText,overall_label,processedReview
0,1.0,Deepak Chopra would like to introduce you to J...,1,deepak chopra would like introduce jesus not j...
1,1.0,this games sucks. spend your time on somthing ...,1,game suck spend time somthing else dont buy do...
2,1.0,Don't waste your money. Nothing here you can't...,1,don waste money nothing get free listening rai...
3,1.0,How I wish Amazon would make their own content...,1,how i wish amazon would make content compatibl...
4,1.0,"Moderately interesting plot, but extremely poo...",1,moderately interesting plot extremely poor cop...


In [None]:
# Split the data into train (75%) and test (25%) partitions.
# The processed review text is the model input and overall_label is the target.
reviews = df['processedReview']
labels = df['overall_label']

X_train, X_test, y_train, y_test = train_test_split(
    reviews, labels,
    train_size=0.75, test_size=0.25,
    random_state=42,  # fixed seed so the shuffled split is reproducible
    shuffle=True,
)

In [None]:
# Preview the first ten training reviews together with their labels.
print(f'Muestras de train: \n {X_train.head(10)}\n')
print(f'Etiquetas de train: \n {y_train.head(10)}')

Muestras de train: 
 25265    play card game often like one getting advice g...
13743    this short story short it advertised seventy-o...
37137    this excellent story great plot excitement rom...
8220     stupid believe i read whole thing hoping would...
35829                                                great
37530                                            good book
11546    in short book complete disaster bit corny the ...
12607    i thought going really good variation went wro...
15601    this author writes style i care i even finish ...
19625    this book really great premise author failed e...
Name: processedReview, dtype: object

Etiquetas de train: 
 25265    0
13743    1
37137    0
8220     1
35829    0
37530    0
11546    1
12607    1
15601    1
19625    1
Name: overall_label, dtype: int64


In [None]:
# Remove reviews whose processed text is NaN, printing the partition sizes as we go.
X_train = X_train[X_train.notna()]
print(len(X_train))

X_test = X_test[X_test.notna()]
print(len(X_test))

# Re-select the labels by index so they stay aligned with the surviving reviews.
y_train = y_train.loc[X_train.index]
print(len(y_train))
y_test = y_test.loc[X_test.index]
print(len(y_test))

29985
9994
29985
9994


In [None]:
# Fit the tokenizer on the training reviews only, then convert every review
# (train and test) into a sequence of integer word indices.
#
# Two different limits are involved and must not be mixed up:
#   * max_words       - the sequence length; later cells use it as `maxlen` when padding
#                       and as `input_length` of the Embedding layers.
#   * vocabulary_size - how many distinct word indices the tokenizer may emit; it must
#                       match the input dimension of the Embedding layers (5000).
# Previously max_words was also passed as num_words, which capped the vocabulary at
# 500 entries while the Embedding layers were sized for 5000.
max_words = 500
vocabulary_size = 5000

tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [None]:
# Length statistics of the tokenized training reviews.
# The recurrent models need a fixed input length, so the mean and the maximum number
# of tokens per review are measured first. Reviews shorter than the chosen length are
# filled with zeros, and a lot of padding is undesirable.
mean = [len(example) for example in X_train]  # one token count per training review

# NOTE: `max` shadows the builtin max(); the name is kept because a later cell displays it.
max = 0
for n_tokens in mean:
    if n_tokens > max:
        max = n_tokens

In [None]:
# Average number of tokens per training review (`mean` holds one length per review).
n_reviews = len(mean)
sum(mean) / n_reviews

26.001767550441887

In [None]:
# Length, in tokens, of the longest tokenized training review (this variable shadows the builtin max()).
max

1099

In [None]:
# Pad or truncate every token sequence to exactly max_words entries.
# 'pre' is the pad_sequences default for both options: zeros are added at the start
# of short sequences and long sequences lose their leading tokens.
X_train = pad_sequences(X_train, maxlen=max_words, padding='pre', truncating='pre')
X_test = pad_sequences(X_test, maxlen=max_words, padding='pre', truncating='pre')

## Modelo LSTM

In [None]:
# Define the LSTM classifier: Embedding -> LSTM(100) -> single sigmoid unit.
vocabulary_size = 5000  # input dimension of the Embedding layer (number of word indices)

embedding_size = 32  # dimensionality of each word vector
model_lstm = Sequential()
model_lstm.add(Embedding(vocabulary_size, embedding_size, input_length=max_words))
model_lstm.add(LSTM(100))
model_lstm.add(Dense(1, activation='sigmoid'))  # binary output

# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
model_lstm.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 500, 32)           160000    
                                                                 
 lstm_1 (LSTM)               (None, 100)               53200     
                                                                 
 dense_3 (Dense)             (None, 1)                 101       
                                                                 
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# Configure training for binary classification and track accuracy as the reported metric.
model_lstm.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Training hyperparameters.
batch_size = 64
num_epochs = 1

# Hold out the first 10% of the training rows for validation.
# The hold-out size used to be `batch_size` (64 rows), which tied two unrelated
# settings together and made the validation metric very noisy.
validation_fraction = 0.1
num_valid = int(validation_fraction * len(X_train))

# X_train is a padded array (plain slicing); y_train is a pandas Series, so .iloc
# is used to make the positional slicing explicit.
X_valid, y_valid = X_train[:num_valid], y_train.iloc[:num_valid]
X_train2, y_train2 = X_train[num_valid:], y_train.iloc[num_valid:]

# Train the LSTM model, reporting loss/accuracy on the validation hold-out.
model_lstm.fit(X_train2, y_train2,
               validation_data=(X_valid, y_valid),
               batch_size=batch_size, epochs=num_epochs)



<keras.callbacks.History at 0x7efae017a680>

In [None]:
# Persist the trained LSTM model to disk.
model_lstm.save(filepath='lstm_model.h5')

In [None]:
# Reload the saved LSTM model to confirm it was written correctly, then measure
# its accuracy on the test partition.
loaded_model_lstm = load_model(filepath='lstm_model.h5')
scores = loaded_model_lstm.evaluate(x=X_test, y=y_test, verbose=0)
test_accuracy = scores[1]  # the value this notebook reports as accuracy
print("Test accuracy:", test_accuracy)

Test accuracy: 0.8233940601348877


## Modelo GRU

In [None]:
# Define the GRU classifier: Embedding -> GRU(100) -> single sigmoid unit.
# vocabulary_size and max_words come from earlier cells of the notebook.
embedding_size = 32  # dimensionality of each word vector
model_gru = Sequential()
model_gru.add(Embedding(vocabulary_size, embedding_size, input_length=max_words))
# NOTE(review): GRUV2 looks like a version-specific alias of GRU (the recorded summary
# lists this layer as "GRU") - consider importing the public GRU name instead.
model_gru.add(GRUV2(100))
model_gru.add(Dense(1, activation='sigmoid'))  # binary output

# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
model_gru.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 500, 32)           160000    
                                                                 
 gru (GRU)                   (None, 100)               40200     
                                                                 
 dense_1 (Dense)             (None, 1)                 101       
                                                                 
Total params: 200,301
Trainable params: 200,301
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# Configure training for binary classification and track accuracy as the reported metric.
model_gru.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Training hyperparameters.
batch_size = 64
num_epochs = 1

# Hold out the first 10% of the training rows for validation.
# The hold-out size used to be `batch_size` (64 rows), which tied two unrelated
# settings together and made the validation metric very noisy.
validation_fraction = 0.1
num_valid = int(validation_fraction * len(X_train))

# X_train is a padded array (plain slicing); y_train is a pandas Series, so .iloc
# is used to make the positional slicing explicit.
X_valid, y_valid = X_train[:num_valid], y_train.iloc[:num_valid]
X_train2, y_train2 = X_train[num_valid:], y_train.iloc[num_valid:]

# Train the GRU model, reporting loss/accuracy on the validation hold-out.
model_gru.fit(X_train2, y_train2,
              validation_data=(X_valid, y_valid),
              batch_size=batch_size, epochs=num_epochs)



<keras.callbacks.History at 0x7efb4c0c1bd0>

In [None]:
# Persist the trained GRU model to disk.
model_gru.save(filepath='gru_model.h5')

In [None]:
# Reload the GRU model from disk to confirm it was saved correctly.
loaded_model_gru = load_model(filepath='gru_model.h5')

In [None]:
# Measure the reloaded GRU model on the test partition.
scores = loaded_model_gru.evaluate(x=X_test, y=y_test, verbose=0)
test_accuracy = scores[1]  # the value this notebook reports as accuracy
print("Test accuracy:", test_accuracy)

Test accuracy: 0.8116869926452637


## Modelo RNN

In [None]:
# Define the SimpleRNN classifier: Embedding -> SimpleRNN(100) -> single sigmoid unit.
# vocabulary_size and max_words come from earlier cells of the notebook.
embedding_size = 32  # dimensionality of each word vector
model_rnn = Sequential()
model_rnn.add(Embedding(vocabulary_size, embedding_size, input_length=max_words))
model_rnn.add(SimpleRNN(100))
model_rnn.add(Dense(1, activation='sigmoid'))  # binary output

# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
model_rnn.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 500, 32)           160000    
                                                                 
 simple_rnn (SimpleRNN)      (None, 100)               13300     
                                                                 
 dense_2 (Dense)             (None, 1)                 101       
                                                                 
Total params: 173,401
Trainable params: 173,401
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# Configure training for binary classification and track accuracy as the reported metric.
model_rnn.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Training hyperparameters.
batch_size = 64
num_epochs = 1

# Hold out the first 10% of the training rows for validation.
# The hold-out size used to be `batch_size` (64 rows), which tied two unrelated
# settings together and made the validation metric very noisy.
validation_fraction = 0.1
num_valid = int(validation_fraction * len(X_train))

# X_train is a padded array (plain slicing); y_train is a pandas Series, so .iloc
# is used to make the positional slicing explicit.
X_valid, y_valid = X_train[:num_valid], y_train.iloc[:num_valid]
X_train2, y_train2 = X_train[num_valid:], y_train.iloc[num_valid:]

# Train the SimpleRNN model, reporting loss/accuracy on the validation hold-out.
model_rnn.fit(X_train2, y_train2,
              validation_data=(X_valid, y_valid),
              batch_size=batch_size, epochs=num_epochs)



<keras.callbacks.History at 0x7efb47f35180>

In [None]:
# Persist the trained SimpleRNN model to disk.
model_rnn.save(filepath='rnn_model.h5')

In [None]:
# Reload the SimpleRNN model from disk to confirm it was saved correctly.
loaded_model_rnn = load_model(filepath='rnn_model.h5')

In [None]:
# Measure the reloaded SimpleRNN model on the test partition.
scores = loaded_model_rnn.evaluate(x=X_test, y=y_test, verbose=0)
test_accuracy = scores[1]  # the value this notebook reports as accuracy
print("Test accuracy:", test_accuracy)

Test accuracy: 0.7942765951156616


## Conclusiones

Las métricas de precisión en el conjunto de prueba para los tres modelos:

Modelo LSTM:

Precisión en el conjunto de prueba: 0.8234

Modelo GRU:

Precisión en el conjunto de prueba: 0.8117

Modelo SimpleRNN:

Precisión en el conjunto de prueba: 0.7943

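En términos de precisión, comparando estos tres modelos, el que mejores resultados ha obtenido es el LSTM. Esto seguramente se deba a la naturaleza de los datos con los que se han realizado los entrenamientos. Conviene tener en cuenta que cada modelo se entrenó durante una sola época, por lo que las diferencias son pequeñas y podrían variar entre ejecuciones.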

En muchas ocasiones para datos más simples funcionan mejor los modelos menos complejos.