In [57]:
import os

# Set before the TensorFlow/Keras imports further down so the setting is in
# place when the native library loads.
# NOTE(review): '3' is expected to silence TensorFlow's C++ log output -
# confirm the level semantics against the TensorFlow docs.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [58]:
import pandas as pd

# Locations of the labelled training set and of the unlabelled test set.
train_path = "data/base/goodreads_train.csv"
result_path = "data/base/goodreads_test.csv"

# Read the training reviews and, in the same expression, discard the metadata
# columns that none of the cells visible here reads.
df_train = pd.read_csv(train_path).drop(
    columns=[
        'user_id', 'book_id', 'date_added', 'date_updated',
        'read_at', 'started_at', 'n_votes', 'n_comments',
    ]
)

In [59]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch every corpus this notebook reads *before* touching it: on a fresh
# environment neither 'stopwords' nor 'wordnet' is installed, and
# stopwords.words() raises LookupError when its corpus is missing.
nltk.download('stopwords')
nltk.download('wordnet')

# Stored as a frozenset because preprocess_review tests membership once per
# token; a list would make every test a linear scan.
stopwords_english = frozenset(stopwords.words('english'))

# Single shared lemmatizer instance for the whole notebook.
lemmatizer = WordNetLemmatizer()

def preprocess_review(text):
    """Normalise a raw review into a space-joined string of lemmatised tokens.

    Steps, in order: lowercase, strip URLs, strip digit runs, strip
    punctuation, strip the spoiler marker words, split on whitespace, drop
    English stopwords and one-character tokens, lemmatise what is left.

    Args:
        text: Raw review text. Any non-string value (for example the float
            NaN that pandas stores for an empty cell) is treated as an empty
            review instead of raising AttributeError on ``.lower()``.

    Returns:
        The cleaned review as a single string; ``''`` when nothing survives.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()

    text = re.sub(r'http\S+', '', text)

    text = re.sub(r'\d+', '', text)

    text = re.sub(r'[^\w\s]', '', text)

    # Applied after punctuation stripping, so only the bare marker words are
    # matched here.
    text = re.sub(r'(view spoiler|hide spoiler)', '', text)

    # Reuse the module-level `lemmatizer` instead of constructing a new
    # WordNetLemmatizer for every single token.
    clean_tokens = [lemmatizer.lemmatize(tok) for tok in text.split()
                    if tok not in stopwords_english and len(tok) > 1]

    return ' '.join(clean_tokens)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\enzol\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [60]:
# Attach the cleaned version of every review as a new `clean_text` column.
df_train = df_train.assign(
    clean_text=df_train["review_text"].apply(preprocess_review)
)

In [61]:
from sklearn.model_selection import train_test_split

# Features are the cleaned review strings; targets are the ratings.
x_train = df_train["clean_text"].values
y_train = df_train["rating"].values

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
x_tr, x_va, y_tr, y_va = train_test_split(
    x_train,
    y_train,
    random_state=0,
    test_size=0.2,
)

print("Training data:", x_tr.shape, y_tr.shape)
print("Validation data:", x_va.shape, y_va.shape)

Training data: (720000,) (720000,)
Validation data: (180000,) (180000,)


In [62]:
# Lowercase ASCII letters plus the space character.
caracteres = "abcdefghijklmnopqrstuvwxyz "

# Used both as the tokenizer's `num_words` cap and as the Embedding input size.
VOCAB_SIZE = 30
# Every encoded review is padded/truncated to this many characters; it is also
# the model's input length.
MAX_SEQ_LENGTH = 250
# Width of the character embedding; the LSTM layer sizes are derived from it.
EMBEDDING_DIMS = 128

In [63]:
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Character-level tokenizer fitted on the training split only, so no
# validation/test information leaks into the vocabulary.
# `filters` is deliberately not passed: it names characters to *strip*, and
# per the Keras Tokenizer implementation it is only consulted when splitting
# text into words, so handing it the alphabet was a misleading no-op in
# char-level mode.
tokenizer = Tokenizer(num_words=VOCAB_SIZE, char_level=True)
tokenizer.fit_on_texts(x_tr)

# Encode each review as character indices, then force a fixed length.
# 'pre' padding/truncating are the pad_sequences defaults, spelled out here:
# short reviews get leading zeros, long reviews keep their last
# MAX_SEQ_LENGTH characters.
x_tr_seq = tokenizer.texts_to_sequences(x_tr)
x_tr_seq = pad_sequences(x_tr_seq, maxlen=MAX_SEQ_LENGTH,
                         padding='pre', truncating='pre')

x_va_seq = tokenizer.texts_to_sequences(x_va)
x_va_seq = pad_sequences(x_va_seq, maxlen=MAX_SEQ_LENGTH,
                         padding='pre', truncating='pre')

In [64]:
# Peek at the first cleaned review of the training split.
x_tr[0]

'e review'

In [65]:
# Number of reviews in the training split.
x_tr.shape

(720000,)

In [66]:
# Encoded and padded form of the first training review.
x_tr_seq[0]

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0

In [67]:
# Expect (number of training reviews, MAX_SEQ_LENGTH) after padding.
x_tr_seq.shape

(720000, 250)

In [68]:
# Upper bound on the number of training epochs passed to fit().
epochs = 100

# Identifier embedded in the TensorBoard log directory name.
model_nb = 2
# Number of stacked sequence-returning BiLSTM layers placed before the final
# BiLSTM layer.
hidden_layer = 2
# Step size handed to the Adam optimizer.
learning_rate = 1e-3
# Mini-batch size passed to fit().
batch_size = 1024
# Dropout applied inside every LSTM layer.
dropout_rate = 0.4

In [69]:
from keras.models import Model
from keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Bidirectional, Dense, BatchNormalization, Dropout, Input
from keras.optimizers import Adam

# Input: one padded sequence of MAX_SEQ_LENGTH character indices per review.
input_text = Input(shape=(MAX_SEQ_LENGTH,))

# Learned character embedding. The sequence length is already fixed by the
# Input layer, so the deprecated and redundant `input_length` argument is not
# passed.
embedding_layer = Embedding(VOCAB_SIZE, output_dim=EMBEDDING_DIMS)(input_text)

x = embedding_layer

# Stack of sequence-to-sequence BiLSTM layers. Each direction has
# EMBEDDING_DIMS // 2 units, so the concatenated output keeps width
# EMBEDDING_DIMS.
for _ in range(hidden_layer):
    x = Bidirectional(LSTM(units=EMBEDDING_DIMS // 2, dropout=dropout_rate, return_sequences=True))(x)

# Final BiLSTM collapses the sequence into one vector per review.
last = Bidirectional(LSTM(units=EMBEDDING_DIMS, dropout=dropout_rate))(x)

dense = Dense(units=32, activation='relu')(last)

# Six-way softmax; trained below with sparse integer labels.
output = Dense(units=6, activation='softmax')(dense)

rnn_model = Model(input_text, output)

rnn_model.summary()

rnn_model.compile(optimizer=Adam(learning_rate=learning_rate),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy']
              )

Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 250)]             0         
                                                                 
 embedding_4 (Embedding)     (None, 250, 128)          3840      
                                                                 
 bidirectional_12 (Bidirecti  (None, 250, 128)         98816     
 onal)                                                           
                                                                 
 bidirectional_13 (Bidirecti  (None, 250, 128)         98816     
 onal)                                                           
                                                                 
 bidirectional_14 (Bidirecti  (None, 256)              263168    
 onal)                                                           
                                                           

In [70]:
from keras.callbacks import EarlyStopping
from keras.callbacks import TensorBoard

# Early-stopping callback: tolerate 6 epochs without improvement, then roll
# the model back to the best weights seen.
earlystopping_cb = EarlyStopping(
    restore_best_weights=True,
    patience=6,
)
# TensorBoard logger; the run directory encodes the hyper-parameters so
# different runs are kept apart.
tensorboard = TensorBoard(
    log_dir=f"logs/char/rnn_model_{model_nb}_hidden_layers_{hidden_layer}_lr_{learning_rate}_bs_{batch_size}_dr_{dropout_rate}"
)

In [None]:
# Train on the encoded training split and validate on the held-out split.
# `earlystopping_cb` is registered alongside TensorBoard: it was constructed
# earlier but never handed to fit(), so training always ran on regardless of
# validation progress and the best weights were never restored.
rnn_model.fit(x_tr_seq,
                    y_tr,
                    validation_data=(x_va_seq, y_va),
                    callbacks=[earlystopping_cb, tensorboard],
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100

In [None]:
# Load the test reviews from the path configured at the top of the notebook.
df_test = pd.read_csv(filepath_or_buffer=result_path, sep=",")

# Show a single randomly chosen row as a sanity check.
df_test.sample(n=1)

In [None]:
# Same preparation as the training frame, written as one chain: drop the
# metadata columns, derive `clean_text` from the raw review, then drop the
# raw review itself.
df_test = (
    df_test
    .drop(columns=['user_id', 'book_id', 'date_added', 'date_updated',
                   'read_at', 'started_at', 'n_votes', 'n_comments'])
    .assign(clean_text=lambda frame: frame["review_text"].apply(preprocess_review))
    .drop(columns=['review_text'])
)

df_test.head()

In [None]:
# Cleaned test reviews as an array.
x_te = df_test["clean_text"].values

# Encode with the tokenizer fitted on the training split and pad to the
# model's input length in one step.
x_te_seq = pad_sequences(
    tokenizer.texts_to_sequences(x_te),
    maxlen=MAX_SEQ_LENGTH,
)

In [None]:
import numpy as np

# predict() returns one row of class probabilities per review; take the
# arg-max along the class axis in a single vectorised call instead of looping
# over rows in Python.
predictions = np.argmax(rnn_model.predict(x_te_seq), axis=1)

# Create a new DataFrame to merge review ids and the model predictions
submission = pd.DataFrame({'review_id': df_test['review_id'], 'rating': predictions})

# Check few random entries
submission.sample(10)

In [None]:
# Make sure the output folder exists so the write cannot fail on a fresh
# checkout.
os.makedirs("res_files", exist_ok=True)

# index=False is the documented way to leave the row index out of the file.
submission.to_csv("res_files/submission.csv", index=False)

In [None]:
# Score the trained model on the held-out validation split. evaluate() yields
# the loss followed by the compiled metric (accuracy).
test_loss, test_acc = rnn_model.evaluate(x=x_va_seq, y=y_va)

print('Validation loss:', test_loss)
print('Validation accuracy:', test_acc)