In [None]:
import os

# Quiet TensorFlow's native (C++) logging. The variable is read when
# TensorFlow is loaded, so this cell has to run before any TensorFlow/Keras import.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

In [None]:
import pandas as pd

# CSV locations, relative to the notebook's working directory.
train_path = "data/base/goodreads_train.csv"
result_path = "data/base/goodreads_test.csv"

# Full training table exactly as stored on disk.
df = pd.read_csv(train_path)

# Working copy without the identifier, timestamp and engagement columns;
# every other column of `df` is carried over unchanged.
df_train = df.drop(
    columns=[
        'user_id',
        'book_id',
        'date_added',
        'date_updated',
        'read_at',
        'started_at',
        'n_votes',
        'n_comments',
    ]
)

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK corpora this notebook needs *before* the first access:
# `stopwords.words(...)` raises LookupError when the `stopwords` corpus has
# never been downloaded, and the lemmatizer needs `wordnet`.
nltk.download('stopwords')
nltk.download('wordnet')

# English stop-word list used by `preprocess_review` to filter tokens.
stopwords_english = stopwords.words('english')

# Shared lemmatizer instance (WordNet-based).
lemmatizer = WordNetLemmatizer()

# Set form of the stop-word list so each membership test is a hash lookup
# instead of a scan over the whole list.
_STOPWORD_SET = frozenset(stopwords_english)


def preprocess_review(text):
    """Normalize one raw review into a space-separated string of lemmas.

    Steps, in order: lowercase; remove URLs (``http`` followed by
    non-whitespace); remove digits; remove every character that is neither a
    word character nor whitespace; remove the ``view spoiler`` /
    ``hide spoiler`` markers; split on whitespace; drop stop words and
    one-character tokens; lemmatize what is left.

    Args:
        text: Raw review text. Any non-string value (for example the NaN
            pandas produces for an empty CSV cell) is treated as an empty
            review instead of raising ``AttributeError``.

    Returns:
        str: The cleaned tokens joined by single spaces; ``''`` when nothing
        survives the filtering or when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()

    text = re.sub(r'http\S+', '', text)

    text = re.sub(r'\d+', '', text)

    # Punctuation is stripped before the stop-word check, so apostrophised
    # stop words (e.g. "don't" -> "dont") are not filtered out below.
    text = re.sub(r'[^\w\s]', '', text)

    # Runs after punctuation removal, so the bracketed markers have already
    # been reduced to these bare phrases.
    text = re.sub(r'(view spoiler|hide spoiler)', '', text)

    # Reuse the module-level lemmatizer rather than constructing a new
    # WordNetLemmatizer for every single token.
    clean_tokens = [
        lemmatizer.lemmatize(tok)
        for tok in text.split()
        if len(tok) > 1 and tok not in _STOPWORD_SET
    ]

    return ' '.join(clean_tokens)

In [None]:
%%time

# Run `preprocess_review` on every entry of `review_text` and store the
# result in a new `clean_text` column; `review_text` itself is left as is.
df_train["clean_text"] = df_train["review_text"].apply(preprocess_review)

In [None]:
from sklearn.model_selection import train_test_split

# Model inputs are the cleaned review strings; targets are the `rating` values.
x_train, y_train = df_train["clean_text"].values, df_train["rating"].values

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# identical between runs.
x_tr, x_va, y_tr, y_va = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    random_state=0,
)

print("Training data:", x_tr.shape, y_tr.shape)
print("Validation data:", x_va.shape, y_va.shape)

In [None]:
# Text-encoding / model-size settings used by the tokenizer and the model.
VOCAB_SIZE = 20_000    # `num_words` cap handed to the Tokenizer
MAX_SEQ_LENGTH = 250   # `maxlen` used when padding the token sequences
EMBEDDING_DIMS = 64    # output width of the Embedding layer

In [None]:
%%time

from keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences

# The word index is fitted on the training split only, so the validation
# texts do not influence the vocabulary.
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(x_tr)

# Encode each split as integer id sequences, then bring every sequence to
# MAX_SEQ_LENGTH with pad_sequences' default padding/truncation settings.
x_tr_seq = pad_sequences(tokenizer.texts_to_sequences(x_tr), maxlen=MAX_SEQ_LENGTH)
x_va_seq = pad_sequences(tokenizer.texts_to_sequences(x_va), maxlen=MAX_SEQ_LENGTH)

In [None]:
from keras.models import Sequential, Model
from keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Bidirectional, Dense, BatchNormalization, Dropout, MultiHeadAttention, LayerNormalization,Input, GlobalAveragePooling1D
from keras.losses import SparseCategoricalCrossentropy
from keras.optimizers import Adam

def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0):
    """Apply one pre-norm transformer encoder block to ``inputs``.

    The block is a self-attention sub-layer followed by a position-wise
    feed-forward sub-layer, each wrapped in a residual connection.

    Args:
        inputs: Tensor to transform; its last dimension sets the width of the
            block's output.
        head_size: ``key_dim`` passed to ``MultiHeadAttention``.
        num_heads: Number of attention heads.
        ff_dim: Number of filters in the hidden feed-forward convolution.
        dropout: Rate used inside the attention layer and in both ``Dropout``
            layers.

    Returns:
        A tensor whose last dimension equals ``inputs.shape[-1]``.
    """
    # --- Self-attention sub-layer (normalise first, then attend) ---
    normed = LayerNormalization(epsilon=1e-6)(inputs)
    attended = MultiHeadAttention(
        key_dim=head_size,
        num_heads=num_heads,
        dropout=dropout,
    )(normed, normed)
    attended = Dropout(dropout)(attended)
    attention_out = attended + inputs  # first residual connection

    # --- Feed-forward sub-layer; kernel_size=1 acts on each position alone ---
    hidden = LayerNormalization(epsilon=1e-6)(attention_out)
    hidden = Conv1D(filters=ff_dim, kernel_size=1, activation="relu")(hidden)
    hidden = Dropout(dropout)(hidden)
    # Project back to the input width so the second residual add lines up.
    projected = Conv1D(filters=inputs.shape[-1], kernel_size=1)(hidden)
    return projected + attention_out

def transformer_test() -> Model:
    """Build and compile the transformer-based rating classifier.

    Architecture: token-id input of length ``MAX_SEQ_LENGTH`` -> embedding
    of width ``EMBEDDING_DIMS`` -> two ``transformer_encoder`` blocks ->
    average pooling over the sequence -> a small dense head -> 6-way softmax.

    Returns:
        Model: A compiled functional Keras model (not a ``Sequential``) using
        Adam with learning rate 1e-2, sparse categorical cross-entropy and an
        accuracy metric.
    """
    inputs = Input(shape=(MAX_SEQ_LENGTH,))
    x = Embedding(VOCAB_SIZE + 1, EMBEDDING_DIMS)(inputs)
    for _ in range(2):
        x = transformer_encoder(x, 32, 2, 32, 0)
    # The embedding/encoder output is laid out (batch, steps, features), i.e.
    # "channels_last". Pooling with "channels_first" would average over the
    # embedding axis instead of the sequence axis, collapsing every token
    # vector to a single scalar.
    x = GlobalAveragePooling1D(data_format="channels_last")(x)
    for dim in [32]:
        x = Dense(dim, activation="relu")(x)
        x = Dropout(0.1)(x)
    # Six output classes; presumably integer ratings 0-5 (confirm against the data).
    outputs = Dense(6, activation="softmax")(x)
    model = Model(inputs, outputs)

    model.compile(
        optimizer=Adam(1e-2),
        loss=SparseCategoricalCrossentropy(),
        metrics=["accuracy"])
    return model

In [None]:
from keras.callbacks import EarlyStopping
from keras.callbacks import TensorBoard

# Halt training after 6 epochs without improvement of EarlyStopping's default
# monitored quantity, then roll the model back to its best weights.
earlystopping_cb = EarlyStopping(
    patience=6,
    restore_best_weights=True,
)

# Write TensorBoard event files for this run.
tensorboard = TensorBoard(log_dir="logs/tests/kaggle_1")

In [None]:
%%time

# Build a freshly initialised, compiled model.
transformer_model = transformer_test()

# Train for at most 50 epochs; the early-stopping callback may end the run
# sooner. The held-out split is evaluated after every epoch.
transformer_model.fit(
    x_tr_seq,
    y_tr,
    batch_size=800,
    epochs=50,
    verbose=1,
    validation_data=(x_va_seq, y_va),
    callbacks=[earlystopping_cb, tensorboard],
)

In [None]:
# Read the test CSV; a comma is already read_csv's default separator.
df_test = pd.read_csv(result_path)

# Show one randomly chosen row as a quick sanity check of the schema.
df_test.sample()

In [None]:
%%time

# `df_test` is rebound in place, so without a guard a second execution of this
# cell would raise KeyError on the already-dropped columns. Once `clean_text`
# exists the work below has been done and is skipped.
if "clean_text" not in df_test.columns:
    # Same identifier / timestamp / engagement columns that are dropped from
    # the training frame.
    df_test = df_test.drop(columns=['user_id', 'book_id', 'date_added', 'date_updated',
                                    'read_at', 'started_at', 'n_votes', 'n_comments'])

    # Clean the reviews with the same function used for the training data.
    df_test["clean_text"] = df_test["review_text"].apply(preprocess_review)

    # The raw text is no longer needed after cleaning.
    df_test = df_test.drop(columns=['review_text'])

df_test.head()

In [None]:
# Cleaned test reviews as a plain array of strings.
x_te = df_test["clean_text"].values

# Encode with the tokenizer fitted on the training split, then pad/truncate
# to the same length the model was trained on.
x_te_seq = pad_sequences(tokenizer.texts_to_sequences(x_te), maxlen=MAX_SEQ_LENGTH)

In [None]:
import numpy as np

# `predict` returns one row of class probabilities per review; take the index
# of the largest probability in each row. A single vectorised argmax along
# axis 1 replaces a Python-level loop over every row.
predictions = np.argmax(transformer_model.predict(x_te_seq), axis=1)

# Create a new DataFrame to merge review ids and the model predictions
submission = pd.DataFrame({'review_id': df_test.review_id, 'rating': predictions})

# Check few random entries
submission.sample(10)

In [None]:
# Create the output directory on first use; `to_csv` does not create missing
# parent directories and would fail otherwise.
os.makedirs("res_files", exist_ok=True)

# `index=False` (the documented boolean) keeps the row index out of the file.
submission.to_csv("res_files/submission.csv", index=False)

In [None]:
# Score the trained model on the held-out validation split. Despite the
# `test_` prefix, these numbers are validation metrics: the test CSV is only
# used for predictions above.
test_loss, test_acc = transformer_model.evaluate(
    x_va_seq,
    y_va,
)

print('Validation loss:', test_loss)
print('Validation accuracy:', test_acc)

Validation loss: 1.0374062061309814
Validation accuracy: 0.5549389123916626