In [30]:
import os

# Set TensorFlow's native log-level variable in the process environment.
# This cell runs before the Keras/TensorFlow imports further down the notebook.
os.environ.update({'TF_CPP_MIN_LOG_LEVEL': '3'})

In [31]:
import pandas as pd

# CSV locations: the training reviews, and the file that is scored later on
# to build the submission.
train_path, result_path = "data/base/goodreads_train.csv", "data/base/goodreads_test.csv"

df = pd.read_csv(train_path)

# Identifier, date and vote/comment-count columns are not used by the model;
# every other column is carried over into df_train.
unused_columns = [
    'user_id', 'book_id',
    'date_added', 'date_updated', 'read_at', 'started_at',
    'n_votes', 'n_comments',
]
df_train = df.drop(columns=unused_columns)

In [32]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch every NLTK resource this notebook reads before touching it:
# stopwords.words() raises LookupError when the 'stopwords' corpus has never
# been downloaded on the machine, and the lemmatizer needs 'wordnet'.
nltk.download('stopwords')
nltk.download('wordnet')

# Kept as a frozenset so the per-token `in` check made by preprocess_review is
# a hash lookup instead of a scan through a list.
stopwords_english = frozenset(stopwords.words('english'))

# One shared lemmatizer instance for the whole notebook.
lemmatizer = WordNetLemmatizer()

def preprocess_review(text):
    """Normalise one raw review into a space-separated string of lemmatised tokens.

    Steps, in order: lower-case, strip URLs, strip digits, strip punctuation,
    remove the "view spoiler" / "hide spoiler" markers, split on whitespace,
    drop one-character tokens and English stop words, lemmatise what is left.

    Args:
        text: Raw review text. Any non-string value (``None``, or the NaN that
            pandas produces for an empty CSV field) is treated as an empty
            review instead of raising ``AttributeError``.

    Returns:
        The cleaned text, or ``''`` when no token survives.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    # Applied after punctuation removal, so the markers are matched as bare words.
    text = re.sub(r'(view spoiler|hide spoiler)', '', text)

    # Reuse the module-level `lemmatizer` rather than constructing a new
    # WordNetLemmatizer for every token. The cheap length test runs first so
    # one-character tokens never reach the stop-word lookup.
    clean_tokens = [lemmatizer.lemmatize(tok) for tok in text.split()
                    if len(tok) > 1 and tok not in stopwords_english]

    return ' '.join(clean_tokens)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\enzol\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [33]:
%%time

# Run every raw review through preprocess_review and keep the result in a new
# column alongside the original text.
df_train["clean_text"] = df_train["review_text"].map(preprocess_review)

CPU times: total: 7min 50s
Wall time: 7min 50s


In [34]:
from sklearn.model_selection import train_test_split

# Inputs are the cleaned review strings, targets are the ratings.
x_train, y_train = (df_train[column].values for column in ("clean_text", "rating"))

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
x_tr, x_va, y_tr, y_va = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    random_state=0,
)

for label, features, targets in (("Training data:", x_tr, y_tr),
                                 ("Validation data:", x_va, y_va)):
    print(label, features.shape, targets.shape)

Training data: (720000,) (720000,)
Validation data: (180000,) (180000,)


In [35]:
%%time

from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_seq_length = 250


def _to_padded_sequences(fitted_tokenizer, texts, maxlen):
    """Encode `texts` with `fitted_tokenizer`, then run them through pad_sequences with `maxlen`."""
    return pad_sequences(fitted_tokenizer.texts_to_sequences(texts), maxlen=maxlen)


# The vocabulary is fitted on the training split only.
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(x_tr)

x_tr_seq = _to_padded_sequences(tokenizer, x_tr, max_seq_length)
x_va_seq = _to_padded_sequences(tokenizer, x_va, max_seq_length)

CPU times: total: 1min 5s
Wall time: 1min 5s


In [36]:
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Bidirectional, Dense, BatchNormalization, Dropout
from keras.optimizers import Adam

# Text classifier: embedding -> 1-D convolution + pooling -> two stacked
# bidirectional LSTMs -> small dense head -> 6-way softmax.
#
# The embedding is sized from the objects created in the tokenisation cell
# instead of repeating the literals 20000 / 250, so changing the vocabulary
# size or the padded length there cannot leave the model out of sync.
vocab_size = tokenizer.num_words
n_classes = 6  # assumes `rating` holds integer labels 0..5 -- TODO confirm against the data

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_seq_length),

    Conv1D(filters=64, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),

    # return_sequences=True keeps the full sequence for the second LSTM.
    Bidirectional(LSTM(units=64, return_sequences=True)),
    BatchNormalization(),
    Dropout(0.2),

    Bidirectional(LSTM(units=128)),
    BatchNormalization(),
    Dropout(0.2),

    Dense(units=32, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),

    Dense(units=n_classes, activation='softmax'),
])

model.summary()

# `metrics` is passed as a list, which is the documented form; the loss works
# directly on the integer labels.
model.compile(optimizer=Adam(learning_rate=1e-4),
              loss='sparse_categorical_crossentropy',
              metrics=['sparse_categorical_accuracy'],
              )

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 250, 128)          2560000   
                                                                 
 conv1d_2 (Conv1D)           (None, 248, 64)           24640     
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 124, 64)          0         
 1D)                                                             
                                                                 
 bidirectional_4 (Bidirectio  (None, 124, 128)         66048     
 nal)                                                            
                                                                 
 batch_normalization_6 (Batc  (None, 124, 128)         512       
 hNormalization)                                                 
                                                      

In [37]:
from keras.callbacks import EarlyStopping
from keras.callbacks import TensorBoard

# Stop once 6 epochs pass without improvement (monitored quantity left at its
# default) and roll the model back to the best weights seen.
earlystopping_cb = EarlyStopping(restore_best_weights=True, patience=6)

# TensorBoard logs for this run are written under logs/tests/kaggle.
tensorboard = TensorBoard(log_dir="logs/tests/kaggle")

In [38]:
%%time

# Training settings gathered in one place; early stopping may end the run
# before the 50-epoch ceiling is reached.
fit_options = dict(
    validation_data=(x_va_seq, y_va),
    callbacks=[earlystopping_cb, tensorboard],
    batch_size=512,
    epochs=50,
    verbose=1,
)

history = model.fit(x=x_tr_seq, y=y_tr, **fit_options)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
CPU times: total: 17min 46s
Wall time: 14min 24s


In [39]:
# Load the reviews that will be scored for the submission.
df_test = pd.read_csv(filepath_or_buffer=result_path, sep=",")

# Show one randomly chosen row as a quick look at the columns.
df_test.sample(n=1)

Unnamed: 0,user_id,book_id,review_id,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
413805,cbe851ae20560fc12501dc98af876fe3,12384972,80f6dbe3b5cc8d048e7ed492cd768328,"oh man, i am so glad i read this. i was workin...",Mon Jun 17 16:04:36 -0700 2013,Sat Jun 29 18:35:26 -0700 2013,Fri Jun 21 00:00:00 -0700 2013,Mon Jun 17 00:00:00 -0700 2013,1,1


In [40]:
%%time

# Same treatment as the training frame, written as a single chain:
# drop the unused columns, add the cleaned text, then discard the raw text.
df_test = (
    df_test
    .drop(columns=['user_id', 'book_id', 'date_added', 'date_updated',
                   'read_at', 'started_at', 'n_votes', 'n_comments'])
    .assign(clean_text=lambda frame: frame["review_text"].apply(preprocess_review))
    .drop(columns=['review_text'])
)

df_test.head()

CPU times: total: 4min 16s
Wall time: 4min 16s


Unnamed: 0,review_id,clean_text
0,5c4df7e70e9b438c761f07a4620ccb7c,spoiler alert definitely one favorite among fo...
1,8eaeaf13213eeb16ad879a2a2591bbe5,spoiler alert drink im huge fan coffee dont bu...
2,dce649b733c153ba5363a0413cac988f,roar one favorite character never sky im happy...
3,8a46df0bb997269d6834f9437a4b0a77,spoiler alert feel like travelling europe dont...
4,d11d3091e22f1cf3cb865598de197599,star read enjoyed first two novel series say b...


In [41]:
# Encode the cleaned test reviews with the tokenizer fitted on the training
# split, padded with the same max_seq_length as the training sequences.
x_te = df_test["clean_text"].values
x_te_seq = pad_sequences(tokenizer.texts_to_sequences(x_te), maxlen=max_seq_length)

In [42]:
import numpy as np

# The softmax head yields one row of class scores per review. Take the arg-max
# along the class axis in a single vectorised call instead of looping over the
# rows in Python and calling np.argmax once per review.
class_scores = model.predict(x_te_seq)
predictions = np.argmax(class_scores, axis=1)

# Pair each review id with its predicted rating.
submission = pd.DataFrame({'review_id': df_test['review_id'], 'rating': predictions})

# Check a few random entries
submission.sample(10)



Unnamed: 0,review_id,rating
14318,825139461ff6c8e419fa7390a108a902,3
340129,596f74a9fb82e284cd9415eacf16a53e,5
61225,b9a74113fda274f7756a4e2c58d30594,5
90668,b203d43a92adb62e840fe775375c4ce5,4
90210,a4c943be8f9ca7d570ffded07f69893f,4
58839,a3ed506ebe9aef44a6823526da44ec8e,5
240846,ffa00db0748f3d3b66e262755bb7dce0,4
462700,9839e90bf1914a385479f9854bdfb4fb,4
325108,3cb084527e36990413af5d1672ea678a,4
136329,6d8e5a790569b562a0ba9476b449a9cf,4


In [43]:
# Create the output folder if it is missing so the export also works on a
# fresh checkout, and omit the index column with the documented boolean flag.
os.makedirs("res_files", exist_ok=True)
submission.to_csv("res_files/submission.csv", index=False)

In [44]:
# Score the trained model on the held-out validation split. The variables keep
# their `test_` prefix, but the values are validation metrics, as the printed
# labels say.
test_loss, test_acc = model.evaluate(x=x_va_seq, y=y_va)

for label, value in (('Validation loss:', test_loss),
                     ('Validation accuracy:', test_acc)):
    print(label, value)

Validation loss: 1.079634428024292
Validation accuracy: 0.5434277653694153
