In [3]:
import matplotlib.pyplot as plt
import os
import re
import string
import tensorflow as tf
import pandas as pd

from utils import predicted_test_data_to_result_csv
from tensorflow.keras import layers
from tensorflow.keras import losses

In [4]:
# Set TensorFlow's native log-level environment variable to '3'
# (presumably the quietest setting — TODO confirm against the TF docs).
# NOTE(review): tensorflow is already imported in the first cell; this variable is
# usually expected to be set before that import to have any effect — verify, and
# move it ahead of `import tensorflow` if the log noise persists.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [5]:
# Environment check: report the loaded TensorFlow version and the GPUs it can see.
tf_version = tf.__version__
print(tf_version)

visible_gpus = tf.config.list_physical_devices('GPU')
print(visible_gpus)

2.10.0
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [6]:
# Load the labelled training reviews and preview the first rows.
train_csv_path = "data/base/goodreads_train.csv"
df_train = pd.read_csv(train_csv_path, sep=",")
df_train.head()

Unnamed: 0,user_id,book_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
0,8842281e1d1347389f2ab93d60773d4d,18245960,dfdbb7b0eb5a7e4c26d59a937e2e5feb,5,This is a special book. It started slow for ab...,Sun Jul 30 07:44:10 -0700 2017,Wed Aug 30 00:00:26 -0700 2017,Sat Aug 26 12:05:52 -0700 2017,Tue Aug 15 13:23:18 -0700 2017,28,1
1,8842281e1d1347389f2ab93d60773d4d,16981,a5d2c3628987712d0e05c4f90798eb67,3,Recommended by Don Katz. Avail for free in Dec...,Mon Dec 05 10:46:44 -0800 2016,Wed Mar 22 11:37:04 -0700 2017,,,1,0
2,8842281e1d1347389f2ab93d60773d4d,28684704,2ede853b14dc4583f96cf5d120af636f,3,"A fun, fast paced science fiction thriller. I ...",Tue Nov 15 11:29:22 -0800 2016,Mon Mar 20 23:40:27 -0700 2017,Sat Mar 18 23:22:42 -0700 2017,Fri Mar 17 23:45:40 -0700 2017,22,0
3,8842281e1d1347389f2ab93d60773d4d,27161156,ced5675e55cd9d38a524743f5c40996e,0,Recommended reading to understand what is goin...,Wed Nov 09 17:37:04 -0800 2016,Wed Nov 09 17:38:20 -0800 2016,,,5,1
4,8842281e1d1347389f2ab93d60773d4d,25884323,332732725863131279a8e345b63ac33e,4,"I really enjoyed this book, and there is a lot...",Mon Apr 25 09:31:23 -0700 2016,Mon Apr 25 09:31:23 -0700 2016,Sun Jun 26 00:00:00 -0700 2016,Sat May 28 00:00:00 -0700 2016,9,1


In [7]:
# Remove every review whose rating is 0, then renumber the remaining rows from 0.
# `index` (the labels of the removed rows) is kept as a notebook-level name.
index = df_train.index[(df_train['rating'] == 0).to_numpy()]
df_train = df_train.drop(index=index).reset_index(drop=True)
df_train.head()

Unnamed: 0,user_id,book_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
0,8842281e1d1347389f2ab93d60773d4d,18245960,dfdbb7b0eb5a7e4c26d59a937e2e5feb,5,This is a special book. It started slow for ab...,Sun Jul 30 07:44:10 -0700 2017,Wed Aug 30 00:00:26 -0700 2017,Sat Aug 26 12:05:52 -0700 2017,Tue Aug 15 13:23:18 -0700 2017,28,1
1,8842281e1d1347389f2ab93d60773d4d,16981,a5d2c3628987712d0e05c4f90798eb67,3,Recommended by Don Katz. Avail for free in Dec...,Mon Dec 05 10:46:44 -0800 2016,Wed Mar 22 11:37:04 -0700 2017,,,1,0
2,8842281e1d1347389f2ab93d60773d4d,28684704,2ede853b14dc4583f96cf5d120af636f,3,"A fun, fast paced science fiction thriller. I ...",Tue Nov 15 11:29:22 -0800 2016,Mon Mar 20 23:40:27 -0700 2017,Sat Mar 18 23:22:42 -0700 2017,Fri Mar 17 23:45:40 -0700 2017,22,0
3,8842281e1d1347389f2ab93d60773d4d,25884323,332732725863131279a8e345b63ac33e,4,"I really enjoyed this book, and there is a lot...",Mon Apr 25 09:31:23 -0700 2016,Mon Apr 25 09:31:23 -0700 2016,Sun Jun 26 00:00:00 -0700 2016,Sat May 28 00:00:00 -0700 2016,9,1
4,8842281e1d1347389f2ab93d60773d4d,19398490,ea4a220b10e6b5c796dae0e3b970aff1,4,A beautiful story. It is rare to encounter a b...,Sun Jan 03 21:20:46 -0800 2016,Tue Sep 20 23:30:15 -0700 2016,Tue Sep 13 11:51:51 -0700 2016,Sat Aug 20 07:03:03 -0700 2016,35,5


In [8]:
# Split the label column off the frame and shift every rating down by one so the
# class ids start at 0 (assuming ratings run 1-5 once the zeros are dropped,
# which matches the 5-unit output layer of the model).
# NOTE: pop() removes 'rating' from df_train, so running this cell a second time
# on the same frame raises KeyError.
target = df_train.pop('rating').sub(1)

target.head()

0    4
1    2
2    2
3    3
4    3
Name: rating, dtype: int64

In [9]:
# The review body is the single input column fed to the model.
features = df_train.loc[:, "review_text"]

features.head()

0    This is a special book. It started slow for ab...
1    Recommended by Don Katz. Avail for free in Dec...
2    A fun, fast paced science fiction thriller. I ...
3    I really enjoyed this book, and there is a lot...
4    A beautiful story. It is rare to encounter a b...
Name: review_text, dtype: object

In [10]:
# Pair each review string with its label and group the pairs into batches of 32.
# NOTE(review): no shuffle is applied; the preview rows shown earlier all share one
# user_id, so the file may be grouped by user — consider shuffling before batching.
batch_size = 32
raw_train_ds = tf.data.Dataset.from_tensor_slices((features, target)).batch(batch_size)

In [11]:
def custom_standardization(input_data):
    """Normalize raw review text before tokenization.

    Lower-cases the text, replaces the literal marker ``** spoiler alert **``
    with a single space, and deletes every character listed in
    ``string.punctuation``.

    Args:
        input_data: String tensor (or tensor-like value) accepted by the
            ``tf.strings`` ops.

    Returns:
        String tensor with the three transformations applied.
    """
    lowercase = tf.strings.lower(input_data)
    # Raw string literal: '\*' is not a valid Python string escape, so the
    # non-raw form triggers an invalid-escape warning on recent interpreters.
    # The regex handed to TensorFlow is unchanged.
    stripped_spoilers = tf.strings.regex_replace(
        lowercase, r'\*\* spoiler alert \*\*', ' ')
    # Character class matching any single ASCII punctuation character.
    punctuation_class = '[%s]' % re.escape(string.punctuation)
    return tf.strings.regex_replace(stripped_spoilers, punctuation_class, '')

In [12]:
# Vocabulary cap and fixed number of token ids produced per review.
max_features = 10000
sequence_length = 100

# Text -> integer-id layer: runs custom_standardization, keeps at most
# `max_features` vocabulary entries and emits `sequence_length` ids per review.
vectorize_layer = layers.TextVectorization(
    max_tokens=max_features,
    standardize=custom_standardization,
    output_mode='int',
    output_sequence_length=sequence_length,
)

In [13]:
# adapt() only needs the review strings, so strip the labels from each batch
# before building the vocabulary.
train_text = raw_train_ds.map(lambda review_batch, _rating_batch: review_batch)
vectorize_layer.adapt(train_text)

In [14]:
def vectorize_text(text, label):
    """Turn a batch of review strings into token ids, passing the label through.

    A trailing axis is appended to ``text`` before it is handed to
    ``vectorize_layer``; ``label`` is returned untouched.
    """
    text_column = tf.expand_dims(text, axis=-1)
    token_ids = vectorize_layer(text_column)
    return token_ids, label

In [15]:
# Convert every (text, label) batch into (token ids, label).
train_ds = raw_train_ds.map(map_func=vectorize_text)

In [16]:
# Display the dataset repr (its element_spec) to check batch shapes and dtypes.
train_ds

<MapDataset element_spec=(TensorSpec(shape=(None, 100), dtype=tf.int64, name=None), TensorSpec(shape=(None,), dtype=tf.int64, name=None))>

In [17]:
# NOTE(review): this value is not referenced by the model cell that follows, which
# passes a literal 50 as the Embedding output size — confirm which width is intended.
embedding_dim = 200

In [18]:
# 1-D convolutional text classifier: embedding -> three conv stages ->
# global max pooling -> small dense head with a 5-way softmax.
# NOTE(review): the embedding width is the literal 50 here, not the
# `embedding_dim` constant defined in the previous cell.
model = tf.keras.Sequential()

model.add(layers.Embedding(max_features + 1, 50, input_length=sequence_length))

# Each stage is (filters, kernel size, pool size): two identical convolutions,
# one max-pool, then dropout.
conv_stages = [
    (64, 3, 2),
    (128, 5, 3),
    (256, 5, 3),
]
for n_filters, kernel_width, pool_width in conv_stages:
    for _ in range(2):
        model.add(layers.Conv1D(n_filters, kernel_width, activation='relu', padding='same'))
    model.add(layers.MaxPooling1D(pool_width))
    model.add(layers.Dropout(0.2))

# Collapse the sequence axis, then classify.
for head_layer in (
    layers.GlobalMaxPooling1D(),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(5, activation='softmax'),
):
    model.add(head_layer)

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 50)           500050    
                                                                 
 conv1d (Conv1D)             (None, 100, 64)           9664      
                                                                 
 conv1d_1 (Conv1D)           (None, 100, 64)           12352     
                                                                 
 max_pooling1d (MaxPooling1D  (None, 50, 64)           0         
 )                                                               
                                                                 
 dropout (Dropout)           (None, 50, 64)            0         
                                                                 
 conv1d_2 (Conv1D)           (None, 50, 128)           41088     
                                                        

In [19]:
# Integer class labels with softmax outputs -> sparse categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss=losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)

In [None]:
# Train on the vectorized batches for a fixed number of epochs and keep the
# returned History object for later inspection.
epochs = 20
history = model.fit(x=train_ds, epochs=epochs)

Epoch 1/20
 2586/27157 [=>............................] - ETA: 29:44 - loss: 1.3859 - accuracy: 0.3608

In [None]:
# Inference pipeline that accepts raw review strings: text vectorization followed
# by the trained classifier.
# The classifier already ends in a softmax layer, so its output is the per-class
# probability vector and no further activation is applied on top. (A sigmoid here
# would squash those probabilities into roughly [0.5, 0.73] and they would no
# longer sum to one; the ranking of the classes would be the same either way.)
# NOTE(review): confirm predicted_test_data_to_result_csv only relies on the class
# ranking / probabilities and not on the previous squashed value range.
export_model = tf.keras.Sequential([
    vectorize_layer,
    model,
])

export_model.compile(
    loss=losses.SparseCategoricalCrossentropy(from_logits=False), optimizer="adam", metrics=['accuracy']
)

In [None]:
# Load the test reviews.
df_test = pd.read_csv("data/base/goodreads_test.csv", sep=",")

# Columns the text model does not consume. drop() returns a new frame, so
# df_test itself keeps every column.
unused_columns = [
    'user_id', 'book_id', 'review_id',
    'date_added', 'date_updated', 'read_at', 'started_at',
    'n_votes', 'n_comments',
]
df_test_modified = df_test.drop(columns=unused_columns)


In [None]:
# Run the exported pipeline (vectorizer + classifier) on the reduced test frame.
predicted_test_data = export_model.predict(x=df_test_modified)

In [None]:
# Preview the full (unreduced) test frame, which is what the CSV helper receives.
df_test.head()

In [None]:
# Hand the predictions to the project helper (presumably writes the result CSV —
# see utils), then persist the exported pipeline to disk.
model_dir = "saved_model/embedding_model_1"
predicted_test_data_to_result_csv(df_test, predicted_test_data)

export_model.save(model_dir)