In [None]:
import matplotlib.pyplot as plt
import os
import re
import string
import tensorflow as tf
import pandas as pd

from utils import predicted_test_data_to_result_csv
from tensorflow.keras import layers
from tensorflow.keras import losses

In [None]:
# Set TensorFlow's native log-level variable to '3' for this process.
# NOTE(review): this cell runs after `import tensorflow` in the first cell; the
# variable is usually picked up when the native runtime initialises its logger,
# so it may have to be set before that import to take effect - confirm.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

In [None]:
# Read the training reviews from the project's data folder (path is relative to
# the notebook's working directory) and preview the first rows.
df_train = pd.read_csv(
    "data/base/goodreads_train.csv",
    sep=",",
)
df_train.head()

In [None]:
# Keep only the rows whose rating is not 0 and renumber the index from 0,
# discarding the old index instead of storing it as a column.
df_train = df_train.loc[df_train['rating'] != 0].reset_index(drop=True)
df_train.head()

In [None]:
# Pull the label column out of the frame (pop also removes it from df_train)
# and shift every label down by one.
target = df_train.pop('rating').sub(1)

target.head()

In [None]:
# The review text is the only model input.
features = df_train.loc[:, "review_text"]

features.head()
# Disabled preview of raw_train_ds (that dataset is only created in a later cell):
# for text_batch, label_batch in raw_train_ds.take(1):
#   for i in range(3):
#     print("Review", text_batch.numpy()[i])
#     print("Label", label_batch.numpy()[i])

In [None]:
# Pair each review text with its label; every dataset element is one (text, label) tuple.
raw_train_ds = tf.data.Dataset.from_tensor_slices(
    tensors=(features, target),
)

In [None]:
def custom_standardization(input_data):
    """Clean raw review text before it is tokenised.

    The input is lower-cased, the spoiler markers are each replaced by a
    single space, and finally every character in ``string.punctuation`` is
    deleted.

    Args:
        input_data: String tensor (anything ``tf.strings.lower`` accepts).

    Returns:
        String tensor with the clean-up applied.
    """
    text = tf.strings.lower(input_data)

    # Raw string literals keep the regex escapes (`\*`) intact without relying
    # on Python tolerating invalid string escape sequences.
    # Parentheses or brackets around the hide/view markers need no special
    # handling here: they are deleted by the punctuation pass below.
    spoiler_markers = (
        r'\*\* spoiler alert \*\*',
        r'hide spoiler',
        r'view spoiler',
    )
    for marker in spoiler_markers:
        text = tf.strings.regex_replace(text, marker, ' ')

    punctuation_class = '[%s]' % re.escape(string.punctuation)
    return tf.strings.regex_replace(text, punctuation_class, '')

In [None]:
# Vocabulary cap and the fixed number of token ids emitted per review.
max_features = 10000
sequence_length = 250

# Maps raw strings to integer token ids, cleaning the text with
# custom_standardization first.
vectorize_layer = layers.TextVectorization(
    max_tokens=max_features,
    standardize=custom_standardization,
    output_sequence_length=sequence_length,
    output_mode='int',
)

In [11]:
# Build the vocabulary from the review texts only (labels are dropped).
# The texts are grouped into batches first: adapt() is meant to receive a
# batched dataset, and feeding it one scalar string per step is very slow.
adapt_batch_size = 1024
train_text = raw_train_ds.batch(adapt_batch_size).map(lambda text, label: text)
vectorize_layer.adapt(train_text)

KeyboardInterrupt: 

In [None]:
def vectorize_text(text, label):
    """Convert a ``(text, label)`` pair into ``(token ids, label)``.

    A trailing axis is appended to ``text`` before it is handed to
    ``vectorize_layer``; ``label`` is passed through unchanged.
    """
    text_column = tf.expand_dims(text, axis=-1)
    token_ids = vectorize_layer(text_column)
    return token_ids, label

In [None]:
# Group the (text, label) pairs into mini-batches before vectorising them.
# Without batching, every dataset element is a single review and fit() would
# run one optimisation step per review.
batch_size = 32
train_ds = (
    raw_train_ds
    .batch(batch_size)
    .map(vectorize_text)
    .prefetch(tf.data.AUTOTUNE)  # overlap input preparation with training
)

In [None]:
# Size of each learned token vector in the Embedding layer.
embedding_dim: int = 16

In [None]:
input = tf.keras.layers.Input(shape=(250,))
embedding = layers.Embedding(input_dim=max_features + 1,
                             output_dim=embedding_dim,
                             input_length=max_features,
                             input_shape=(max_features,))(input)

conv = layers.Conv1D(filters=100, kernel_size=3, activation='relu')(embedding)
pool = layers.MaxPool1D(pool_size=2, strides=2)(conv)
flat = layers.Flatten()(pool)
out = layers.Dense(1, activation='sigmoid')(flat)

model = tf.keras.Model(inputs=input, outputs=out)

model.summary()

In [None]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [None]:
# Train for a fixed number of passes over train_ds; the returned History
# object is kept in `history`.
epochs = 20
history = model.fit(x=train_ds, epochs=epochs)

# End-to-end inference model: raw strings -> token ids -> trained network.
# `model` already applies an activation in its output layer, so no further
# activation layer is stacked on top (a second one would squash the scores again).
export_model = tf.keras.Sequential([
    vectorize_layer,
    model,
])

export_model.compile(
    loss=losses.SparseCategoricalCrossentropy(from_logits=False),
    optimizer="adam",
    metrics=['accuracy'],
)

# Load the test reviews (path is relative to the notebook's working directory).
df_test = pd.read_csv(
    "data/base/goodreads_test.csv",
    sep=",",
)

# Build a reduced copy without the metadata columns listed below;
# df_test itself is left unchanged.
df_test_modified = df_test.drop(
    labels=[
        'user_id',
        'book_id',
        'review_id',
        'date_added',
        'date_updated',
        'read_at',
        'started_at',
        'n_votes',
        'n_comments',
    ],
    axis="columns",
)


# Score the reduced test frame with the end-to-end model.
# NOTE(review): presumably only the review text column is left in
# df_test_modified at this point - confirm against the CSV schema.
predicted_test_data = export_model.predict(x=df_test_modified)

df_test.head()

# Hand the full test frame and the predictions to the project helper that
# writes the result CSV.
predicted_test_data_to_result_csv(df_test, predicted_test_data)