In [61]:
import keras.metrics
import matplotlib.pyplot as plt
import os
import re
import string
import tensorflow as tf
import pandas as pd

from utils import predicted_test_data_to_result_csv
from keras import layers
from keras import losses

In [62]:
# Ask TensorFlow's native logging to stay quiet (level '3').
# NOTE(review): this runs after `import tensorflow` in the import cell; the
# variable is presumably read when TensorFlow is first loaded, so confirm
# whether it needs to be set before that import to have any effect.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

In [63]:
# Show the TensorFlow version and the GPU devices it detects, one per line.
print(tf.__version__, tf.config.list_physical_devices('GPU'), sep='\n')

2.10.1
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [64]:
# --- Input files -----------------------------------------------------------
train_path = "data/base/goodreads_train.csv"
test_path = "data/base/goodreads_test.csv"

# --- Run identification / schedule ------------------------------------------
epochs = 10      # number of passes requested from model.fit
model_nb = 1     # written into the experiment name

# --- Text vectorization -----------------------------------------------------
max_features = 5_000     # max_tokens of the TextVectorization layer
sequence_length = 100    # output_sequence_length of the TextVectorization layer

# --- Model / optimisation ---------------------------------------------------
embedding_dim = 50       # width of the Embedding layer
learning_rate = 1e-4     # written into the experiment name
batch_size = 10_000      # examples per tf.data batch
dropout_rate = 0.0       # written into the experiment name

In [65]:
# Load the training reviews; a comma is already read_csv's default separator.
df_train = pd.read_csv(train_path)

In [66]:
# Keep only the rows whose rating is not 0, then renumber the index from 0.
nonzero_rating = df_train['rating'] != 0
df_train = df_train.loc[nonzero_rating].reset_index(drop=True)

In [67]:
# Remove the label column from the frame and shift every label down by one.
# Presumably this maps ratings 1..5 onto class ids 0..4 for the 5-unit
# softmax output — verify against the data.
target = df_train.pop('rating').sub(1)

In [68]:
# The review text is the only column handed to the model as input.
features = df_train.loc[:, "review_text"]

In [69]:
# Pair every review with its label, then apply a fixed (non-reshuffling)
# shuffle so that repeated passes over `all_data` see the same order.
# NOTE(review): a buffer of 10 elements only perturbs the order locally.
labelled_reviews = tf.data.Dataset.from_tensor_slices((features, target))
all_data = labelled_reviews.shuffle(buffer_size=10, reshuffle_each_iteration=False)

In [70]:
def is_test(x, y):
    """Select validation elements: True when the position `x` is a multiple of 4.

    `x` is the index produced by `Dataset.enumerate()`; `y` (the element
    itself) is ignored.
    """
    remainder = x % 4
    return remainder == 0


def is_train(x, y):
    """Select training elements: True exactly when `is_test` would be False."""
    return x % 4 != 0

def recover(x, y):
    """Discard the enumerate index `x` and hand back the original element `y`."""
    return y


# Every 4th element (by position) goes to validation, the rest to training.
raw_validation_dataset = (
    all_data.enumerate()
    .filter(is_test)
    .map(recover)
)

raw_train_dataset = (
    all_data.enumerate()
    .filter(is_train)
    .map(recover)
)

In [71]:
# Group both splits into batches of `batch_size` examples.
# The names are rebound to the batched datasets, so re-running this cell
# without re-running the split cell would batch the data a second time.
raw_train_dataset = raw_train_dataset.batch(batch_size)
raw_validation_dataset = raw_validation_dataset.batch(batch_size)

In [72]:
def custom_standardization(input_data):
    """Normalise raw review text before it is tokenised.

    Steps, in order:
      1. lowercase the text;
      2. replace the literal marker ``** spoiler alert **`` with one space;
      3. delete every character listed in ``string.punctuation``.

    Args:
        input_data: string tensor of review text (this function is passed as
            the ``standardize`` callable of the TextVectorization layer).

    Returns:
        The string tensor produced by the final ``tf.strings.regex_replace``.
    """
    lowercase = tf.strings.lower(input_data)
    # Raw string literal: `\*` is not a valid Python escape sequence, so the
    # non-raw form only worked by accident and emits a warning on modern
    # Python versions. The pattern handed to the regex engine is unchanged.
    stripped_spoilers = tf.strings.regex_replace(
        lowercase, r'\*\* spoiler alert \*\*', ' ')
    # Character class matching any single ASCII punctuation character.
    punctuation_class = '[%s]' % re.escape(string.punctuation)
    return tf.strings.regex_replace(stripped_spoilers, punctuation_class, '')

In [73]:
# Text -> integer token ids, using the custom standardisation above the
# vocabulary cap and the fixed output length from the config cell.
vectorizer_options = dict(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length,
)
vectorize_layer = tf.keras.layers.TextVectorization(**vectorizer_options)

In [74]:
# Build the vocabulary from the TRAINING text only.
# Make a text-only dataset (without labels), then call adapt.
#
# adapt() is called exactly once: each call recomputes the layer's vocabulary
# from the data it is given, so a second adapt() on the validation text would
# throw away the training vocabulary and replace it with one learned from the
# validation split alone (and would leak validation data into preprocessing).
train_text = raw_train_dataset.map(lambda text, label: text)
vectorize_layer.adapt(train_text)

In [75]:
def vectorize_text(text, label):
    """Turn a batch of raw review strings into token ids, keeping the label.

    A trailing axis is appended to `text` before it is passed through
    `vectorize_layer`; `label` is returned untouched.
    """
    text_column = tf.expand_dims(text, axis=-1)
    token_ids = vectorize_layer(text_column)
    return token_ids, label

In [76]:
# Apply the text vectorisation to both batched splits.
train_ds, validation_ds = (
    split.map(vectorize_text)
    for split in (raw_train_dataset, raw_validation_dataset)
)

In [77]:
def _conv_stage(filters, kernel_size):
    """Return one stage: two same-padded ReLU Conv1D layers, then MaxPooling1D(3)."""
    return [
        layers.Conv1D(filters, kernel_size, activation='relu', padding='same'),
        layers.Conv1D(filters, kernel_size, activation='relu', padding='same'),
        layers.MaxPooling1D(3),
    ]


# Embedding -> four conv stages (64, 128, 256, 512 filters) -> global max pool
# -> small dense head ending in a 5-way softmax.
# No Dropout layers are part of the network.
model = tf.keras.Sequential([
    layers.Embedding(max_features + 1, embedding_dim, input_length=sequence_length),
    *_conv_stage(64, 3),
    *_conv_stage(128, 5),
    *_conv_stage(256, 5),
    *_conv_stage(512, 5),
    layers.GlobalMaxPooling1D(),
    layers.Dense(32, activation='relu'),
    layers.Dense(5, activation='softmax'),
])

model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 100, 50)           250050    
                                                                 
 conv1d_16 (Conv1D)          (None, 100, 64)           9664      
                                                                 
 conv1d_17 (Conv1D)          (None, 100, 64)           12352     
                                                                 
 max_pooling1d_8 (MaxPooling  (None, 33, 64)           0         
 1D)                                                             
                                                                 
 conv1d_18 (Conv1D)          (None, 33, 128)           41088     
                                                                 
 conv1d_19 (Conv1D)          (None, 33, 128)           82048     
                                                      

In [78]:
# Integer class labels + softmax outputs -> sparse categorical cross-entropy.
# The optimizer is given the configured `learning_rate` explicitly: a bare
# Adam() silently uses the optimizer's built-in default step size, which would
# not match the learning rate recorded in the experiment name / log directory.
model.compile(
    loss=losses.SparseCategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
    metrics=[keras.metrics.sparse_categorical_accuracy],
)

In [79]:
# Experiment identifier: encodes the model number and the main hyperparameters.
# It names the TensorBoard log directory and is passed to the result-CSV helper.
exp_name = (
    f'conv_net_model_{model_nb}'
    f'_lr_{learning_rate}'
    f'_bs_{batch_size}'
    f'_emb_{embedding_dim}'
    f'_dr_{dropout_rate}'
)

In [80]:
# Train on the vectorised training split, evaluating on the validation split
# after each epoch; metrics are also written to logs/<exp_name> for TensorBoard.
tensorboard_logger = tf.keras.callbacks.TensorBoard("logs/" + exp_name)

history = model.fit(
    train_ds,
    validation_data=validation_ds,
    epochs=epochs,
    callbacks=[tensorboard_logger],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [81]:
# Wrap the vectoriser and the trained network into one model that accepts
# raw strings directly.
export_model = tf.keras.Sequential()
export_model.add(vectorize_layer)
export_model.add(model)

export_model.compile(
    loss=losses.SparseCategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=keras.metrics.sparse_categorical_accuracy,
)

In [82]:
# Load the test reviews; a comma is already read_csv's default separator.
df_test = pd.read_csv(test_path)

# Build a reduced copy without the id, date and vote/comment-count columns.
# `df_test` itself is left untouched (drop returns a new frame by default).
df_test_modified = df_test.drop(
    columns=[
        'user_id', 'book_id', 'review_id',
        'date_added', 'date_updated', 'read_at', 'started_at',
        'n_votes', 'n_comments',
    ]
)


In [83]:
# Run the string-in model over the reduced test frame.
# NOTE(review): this assumes `df_test_modified` holds only the review text
# column — confirm against the CSV header.
predicted_test_data = export_model.predict(x=df_test_modified)



In [84]:
# Preview the first five rows of the unmodified test frame.
df_test.head(n=5)

Unnamed: 0,user_id,book_id,review_id,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
0,b9450d1c1f97f891c392b1105959b56e,7092507,5c4df7e70e9b438c761f07a4620ccb7c,** spoiler alert ** \n This is definitely one ...,Sat Nov 10 06:06:13 -0800 2012,Sun Nov 11 05:38:36 -0800 2012,Sun Nov 11 05:38:36 -0800 2012,Sat Nov 10 00:00:00 -0800 2012,1,0
1,b9450d1c1f97f891c392b1105959b56e,5576654,8eaeaf13213eeb16ad879a2a2591bbe5,"** spoiler alert ** \n ""You are what you drink...",Fri Nov 09 21:55:16 -0800 2012,Sat Nov 10 05:41:49 -0800 2012,Sat Nov 10 05:41:49 -0800 2012,Fri Nov 09 00:00:00 -0800 2012,1,0
2,b9450d1c1f97f891c392b1105959b56e,15754052,dce649b733c153ba5363a0413cac988f,Roar is one of my favorite characters in Under...,Fri Nov 09 00:25:50 -0800 2012,Sat Nov 10 06:14:10 -0800 2012,Sat Nov 10 06:14:10 -0800 2012,Fri Nov 09 00:00:00 -0800 2012,0,0
3,b9450d1c1f97f891c392b1105959b56e,17020,8a46df0bb997269d6834f9437a4b0a77,** spoiler alert ** \n If you feel like travel...,Thu Nov 01 00:28:39 -0700 2012,Sat Nov 03 11:35:22 -0700 2012,Sat Nov 03 11:35:22 -0700 2012,Thu Nov 01 00:00:00 -0700 2012,0,0
4,b9450d1c1f97f891c392b1105959b56e,12551082,d11d3091e22f1cf3cb865598de197599,3.5 stars \n I read and enjoyed the first two ...,Thu Oct 18 00:57:00 -0700 2012,Mon Apr 01 23:00:51 -0700 2013,Sat Mar 30 00:00:00 -0700 2013,Fri Mar 29 00:00:00 -0700 2013,0,0


In [85]:
# Hand the predictions to the project helper together with the original test
# frame and the experiment name.
predicted_test_data_to_result_csv(df_test, predicted_test_data, exp_name)

# Persist the string-in model (vectoriser + network) to disk.
export_model.save(filepath="saved_model/embedding_model_1")