In [141]:
import matplotlib.pyplot as plt
import os
import re
import string
import tensorflow as tf
import pandas as pd
import nltk

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from utils import predicted_test_data_to_result_csv
from keras import layers, losses, Input, Model
from keras.layers import Dense, Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, BatchNormalization, Activation, Flatten
from keras.losses import sparse_categorical_crossentropy
from keras.metrics import sparse_categorical_accuracy
from keras.optimizers import Adam, SGD

In [142]:
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [143]:
print(tf.__version__)
print(tf.config.list_physical_devices('GPU'))

2.10.1
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [144]:
train_path = "data/base/goodreads_train.csv"
result_path = "data/base/goodreads_test.csv"

In [145]:
df = pd.read_csv(train_path, sep=",")

In [147]:
targets = df.pop('rating')
# targets = tf.keras.utils.to_categorical(targets)

In [148]:
features_names = ['review_text']
features = df[features_names]
tf.convert_to_tensor(features)

<tf.Tensor: shape=(869012, 1), dtype=string, numpy=
array([[b'This is a special book. It started slow for about the first third, then in the middle third it started to get interesting, then the last third blew my mind. This is what I love about good science fiction - it pushes your thinking about where things can go. \n It is a 2015 Hugo winner, and translated from its original Chinese, which made it interesting in just a different way from most things I\'ve read. For instance the intermixing of Chinese revolutionary history - how they kept accusing people of being "reactionaries", etc. \n It is a book about science, and aliens. The science described in the book is impressive - its a book grounded in physics and pretty accurate as far as I could tell. (view spoiler)[Though when it got to folding protons into 8 dimensions I think he was just making stuff up - interesting to think about though. \n But what would happen if our SETI stations received a message - if we found someone was out

In [None]:
nltk.download('stopwords')
stopwords = stopwords.words('english')

In [149]:
def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    stripped_spoilers = tf.strings.regex_replace(lowercase, '\*\* spoiler alert \*\*', ' ')
    stripped_ponctuation = tf.strings.regex_replace(stripped_spoilers, "[%s]" % re.escape(string.punctuation), "")
    data = []
    for i in stopwords:
        data = tf.strings.regex_replace(stripped_ponctuation, f' {i} ', " ")
    return data

In [150]:
max_features = 5000  # Maximum vocab size.
sequence_length = 100

In [151]:
vectorized_layer = tf.keras.layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

In [152]:
vectorized_layer.adapt(features)

In [153]:
epochs = 50
model_nb = 1

embedding_dim = 50
learning_rate = 0.001
batch_size = 8000
dropout_rate = 0.0

In [154]:
input_text = Input(shape=(1,), dtype=tf.string)

vectorized_text = vectorized_layer(input_text)

embedding_layer = Embedding(max_features + 1, embedding_dim, input_length=sequence_length)(vectorized_text)

x_shortcut = embedding_layer

#### Main path ####
# First
x = Conv1D(64, 3, activation='relu', padding = 'valid')(embedding_layer)
x = BatchNormalization()(x)
x = Activation('relu')(x)

# Second
x = Conv1D(128, 5, activation='relu', padding = 'valid')(x)
x = BatchNormalization()(x)
x = Activation('relu')(x)

# Third
x = Conv1D(256, 5, activation='relu', padding = 'valid')(x)
x = BatchNormalization()(x)

#### Shortcut path ####
x_shortcut = Conv1D(256, 11, activation='relu', padding = 'valid')(x_shortcut)
x_shortcut = BatchNormalization()(x_shortcut)

# x and x_shortcut addition
x = x + x_shortcut

global_max_pooling = GlobalMaxPooling1D()(x)

relu = Dense(32, activation='relu')(global_max_pooling)

output = Dense(6, activation='softmax')(relu)

resnet_model = Model(input_text, output)

resnet_model.summary()

Model: "model_8"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_12 (InputLayer)          [(None, 1)]          0           []                               
                                                                                                  
 text_vectorization_10 (TextVec  (None, 100)         0           ['input_12[0][0]']               
 torization)                                                                                      
                                                                                                  
 embedding_8 (Embedding)        (None, 100, 50)      250050      ['text_vectorization_10[0][0]']  
                                                                                                  
 conv1d_32 (Conv1D)             (None, 98, 64)       9664        ['embedding_8[0][0]']      

In [155]:
resnet_model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate=learning_rate),
                  metrics=sparse_categorical_accuracy)

In [156]:
exp_name = f'resnet_model_{model_nb}_lr_{learning_rate}_bs_{batch_size}_dr_{dropout_rate}'

In [157]:
resnet_model.fit(features,
              targets,
              validation_split=0.25,
              batch_size=batch_size,
              epochs=epochs,
              callbacks=[tf.keras.callbacks.TensorBoard("logs/resnets/" + exp_name)])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50

KeyboardInterrupt: 

In [None]:
df_test = pd.read_csv(result_path, sep=",")

df_test_modified = df_test.drop(columns=[
    'user_id',
    'book_id',
    'review_id',
    'date_added',
    'date_updated',
    'read_at',
    'started_at',
    'n_votes',
    'n_comments'
], inplace=False)

In [None]:
predicted_test_data = resnet_model.predict(df_test_modified)

In [None]:
df_test.head()

In [None]:
predicted_test_data_to_result_csv(df_test, predicted_test_data, exp_name)