In [28]:
import matplotlib.pyplot as plt
import os
import re
import string
import tensorflow as tf
import pandas as pd

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from utils import predicted_test_data_to_result_csv
from keras import layers, losses, Input, Model
from keras.layers import Dense, Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, BatchNormalization, Activation, \
    Flatten, LSTM, SpatialDropout1D, Bidirectional, MultiHeadAttention, LayerNormalization, Lambda, \
    GlobalAveragePooling1D, Dropout
from keras.losses import sparse_categorical_crossentropy
from keras.metrics import sparse_categorical_accuracy
from keras.optimizers import Adam, SGD
from keras.models import Sequential

In [29]:
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [30]:
print(tf.__version__)
print(tf.config.list_physical_devices('GPU'))

2.10.1
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [31]:
train_path = "data/base/goodreads_train.csv"
result_path = "data/base/goodreads_test.csv"
frac_ratio = 0.2

In [32]:
max_features = 10000  # Maximum vocab size.
sequence_length = 200

In [33]:
df = pd.read_csv(train_path, sep=",")

In [34]:
index = df[(df['rating'] == 0)].index
df.drop(index, inplace=True)
df.reset_index(inplace=True, drop=True)

In [35]:
x_train = df.sample(frac=frac_ratio)
x_val = df.drop(x_train.index)

In [36]:
y_train = x_train.pop('rating')
y_train = y_train - 1

y_val = x_val.pop('rating')
y_val = y_val - 1

In [37]:
x_train = x_train["review_text"]
x_val = x_val["review_text"]

In [38]:
raw_train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)).shuffle(10, reshuffle_each_iteration=False)
raw_val_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val)).shuffle(10, reshuffle_each_iteration=False)

In [39]:
nltk.download('stopwords')
stopwords = stopwords.words('english')
# stopwords = stopwords.extend(['d', 'll', 're', 's', 've'])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\enzol\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [40]:
def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    stripped_spoilers = tf.strings.regex_replace(lowercase, '\*\* spoiler alert \*\*', ' ')
    stripped_ponctuation = tf.strings.regex_replace(stripped_spoilers, "[%s]" % re.escape(string.punctuation), "")
    data = []
    for i in stopwords:
        data = tf.strings.regex_replace(stripped_ponctuation, f' {i} ', " ")
    return data

In [41]:
vectorize_layer = tf.keras.layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    pad_to_max_tokens=True,
    output_sequence_length=sequence_length)

In [42]:
epochs = 100
model_nb = 1

embedding_dim = 50 # Embedding size for each token
num_heads = 8 # Number of attention heads
ff_dim = 64  # Hidden layer size in feed forward network inside transformer
learning_rate = 0.007
batch_size = 800
dropout_rate = 0.4

In [43]:
raw_train_dataset = raw_train_dataset.batch(batch_size=batch_size)
raw_val_dataset = raw_val_dataset.batch(batch_size=batch_size)

In [44]:
# Make a text-only dataset (without labels), then call adapt
train_text = raw_train_dataset.map(lambda x, y: x)
vectorize_layer.adapt(train_text)

val_text = raw_val_dataset.map(lambda x, y: x)
vectorize_layer.adapt(val_text)

In [45]:
def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label

In [46]:
train_ds = raw_train_dataset.map(vectorize_text)
val_ds = raw_val_dataset.map(vectorize_text)

In [47]:
AUTOTUNE = tf.data.AUTOTUNE

train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

input_text = Input(shape=(sequence_length,))

embedding_layer = Embedding(max_features + 1, embedding_dim, input_length=sequence_length)(input_text)

x = embedding_layer
for _ in range(6):
    old = x
    x = LayerNormalization()(x)
    x = MultiHeadAttention(num_heads, embedding_dim)(x, x)
    x = x + old
    old = x
    x = LayerNormalization()(x)
    x = Dense(embedding_dim * 2, activation="relu")(x)
    x = Dense(embedding_dim, activation="relu")(x)
    x = x + old

mlp_head_input = Lambda(lambda x: x[:, 0])(x)

dense = Dense(512, activation='relu')(mlp_head_input)

output = Dense(5, activation='softmax')(dense)

transformer_model = Model(input_text, output)

transformer_model.summary()

In [48]:
class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, embedding_dim, num_heads, ff_dim, rate=0):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadAttention(num_heads=num_heads,
                                      key_dim=embedding_dim)
        self.ffn = Sequential(
            [Dense(ff_dim, activation='relu'),
             Dense(embedding_dim), ]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)

        return self.layernorm2(out1 + ffn_output)

In [49]:
class TokenAndPositionEmbedding(tf.keras.layers.Layer):
    def __init__(self, maxlen, vocab_size, embedding_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = Embedding(input_dim=vocab_size, output_dim=embedding_dim)
        self.pos_emb = Embedding(input_dim=maxlen, output_dim=embedding_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)

        return x + positions

In [50]:
inputs = Input(shape=(sequence_length,))
embedding_layer = TokenAndPositionEmbedding(maxlen=sequence_length, vocab_size=max_features, embedding_dim=embedding_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embedding_dim=embedding_dim, num_heads=num_heads, ff_dim=ff_dim)
x = transformer_block(x)
x = GlobalAveragePooling1D()(x)
x = Dropout(dropout_rate)(x)
x = Dense(30, activation='relu')(x)
x = Dropout(dropout_rate)(x)
outputs = Dense(5, activation='softmax')(x)

transformer_model = Model(inputs=inputs, outputs=outputs)

In [51]:
transformer_model.compile(loss=sparse_categorical_crossentropy,
                          optimizer=Adam(learning_rate=learning_rate),
                          metrics=sparse_categorical_accuracy)

In [52]:
exp_name = f'transformer_model_with_stopwords_{model_nb}_num_heads_{num_heads}_emb_dim_{embedding_dim}_ff_dim_{ff_dim}_lr_{learning_rate}_bs_{batch_size}_dr_{dropout_rate}'

In [None]:
transformer_model.fit(train_ds,
                      validation_data=val_ds,
                      epochs=epochs,
                      callbacks=[
                          tf.keras.callbacks.TensorBoard("logs/transformer/" + exp_name),
                          tf.keras.callbacks.EarlyStopping(monitor='loss', patience=10)
                                 ])

Layer TokenAndPositionEmbedding has arguments ['maxlen', 'vocab_size', 'embedding_dim']
in `__init__` and therefore must override `get_config()`.

Example:

class CustomLayer(keras.layers.Layer):
    def __init__(self, arg1, arg2):
        super().__init__()
        self.arg1 = arg1
        self.arg2 = arg2

    def get_config(self):
        config = super().get_config()
        config.update({
            "arg1": self.arg1,
            "arg2": self.arg2,
        })
        return config
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100

In [None]:
df_test = pd.read_csv(result_path, sep=",")

df_test_modified = df_test.drop(columns=[
    'user_id',
    'book_id',
    'review_id',
    'date_added',
    'date_updated',
    'read_at',
    'started_at',
    'n_votes',
    'n_comments'
], inplace=False)

In [None]:
export_model = tf.keras.Sequential([
    vectorize_layer,
    transformer_model
])

In [None]:
predicted_test_data = export_model.predict(df_test_modified)

In [None]:
df_test.head()

In [None]:
predicted_test_data_to_result_csv(df_test, predicted_test_data, exp_name)