In [1]:
import matplotlib.pyplot as plt
import os
import re
import string
import tensorflow as tf
import pandas as pd

from utils import predicted_test_data_to_result_csv
from keras import layers, losses, Input, Model
from keras.layers import Dense, Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from keras.losses import sparse_categorical_crossentropy
from keras.metrics import sparse_categorical_accuracy
from keras.optimizers import Adam, SGD

In [2]:
# Set TensorFlow's native (C++) logging threshold via its environment variable;
# presumably meant to silence TensorFlow's startup/log noise.
# NOTE(review): tensorflow is already imported in the first cell, before this
# assignment runs. This variable is normally expected to be set before the import,
# so confirm it still has the intended effect here (or move it above the imports).
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [3]:
# Environment report: the TensorFlow version, then the GPU devices TensorFlow lists,
# one per line.
print(tf.__version__, tf.config.list_physical_devices('GPU'), sep='\n')

2.10.1
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [4]:
# CSV read into `df` by the loading cell below.
train_path = "data/base/goodreads_train.csv"
# NOTE(review): not referenced in the visible cells; presumably the test-set CSV
# used when producing the result file — confirm against the later cells.
result_path = "data/base/goodreads_test.csv"

In [5]:
# Load the training CSV; a comma is already read_csv's default separator.
df = pd.read_csv(train_path)

Drop all rows whose `rating` is 0

In [6]:
# Labels of every row whose rating equals 0.
index = df.index[df['rating'] == 0]
# Remove those rows from `df` itself, then renumber the remaining rows from 0
# without keeping the old index as a column.
df.drop(index=index, inplace=True)
df.reset_index(drop=True, inplace=True)

Target labels (the `rating` column, shifted to start at 0)

In [7]:
# Take the 'rating' column out of `df` and shift every value down by one to form
# the label vector. The labels stay as integer class ids (no one-hot encoding);
# the model below is compiled with a sparse categorical loss.
# NOTE: `pop` removes the column from `df`, so re-running this cell without
# reloading `df` raises KeyError.
targets = df.pop('rating') - 1

Feature DataFrame (the `review_text` column)

In [8]:
# Columns used as model input.
features_names = ['review_text']
features = df.loc[:, features_names]
# The conversion result is not stored: as the last expression of the cell it is
# only echoed to the output.
tf.convert_to_tensor(features)

<tf.Tensor: shape=(869012, 1), dtype=string, numpy=
array([[b'This is a special book. It started slow for about the first third, then in the middle third it started to get interesting, then the last third blew my mind. This is what I love about good science fiction - it pushes your thinking about where things can go. \n It is a 2015 Hugo winner, and translated from its original Chinese, which made it interesting in just a different way from most things I\'ve read. For instance the intermixing of Chinese revolutionary history - how they kept accusing people of being "reactionaries", etc. \n It is a book about science, and aliens. The science described in the book is impressive - its a book grounded in physics and pretty accurate as far as I could tell. (view spoiler)[Though when it got to folding protons into 8 dimensions I think he was just making stuff up - interesting to think about though. \n But what would happen if our SETI stations received a message - if we found someone was out

In [9]:
def custom_standardization(input_data):
    """Normalize raw review text before it is tokenized.

    Lower-cases the text, replaces every literal ``** spoiler alert **`` marker
    with a single space, then deletes every character listed in
    ``string.punctuation``.

    Args:
        input_data: The text to normalize; passed directly to
            ``tf.strings.lower``.

    Returns:
        The result of the final ``tf.strings.regex_replace`` call, i.e. the
        lower-cased text without spoiler markers and punctuation.
    """
    lowercase = tf.strings.lower(input_data)
    # Raw string: the asterisks must reach the regex engine backslash-escaped, and
    # "\*" inside a normal literal is an invalid Python escape sequence that
    # triggers a warning on current interpreters. The pattern value is unchanged.
    stripped_spoilers = tf.strings.regex_replace(lowercase, r'\*\* spoiler alert \*\*', ' ')
    # Character class matching any single ASCII punctuation character.
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)
    return tf.strings.regex_replace(stripped_spoilers, punctuation_pattern, '')

In [10]:
# Settings for the text vectorizer defined in the next cell.
max_features = 5_000   # Maximum vocab size (passed as `max_tokens`).
sequence_length = 100  # Passed as `output_sequence_length`.

In [11]:
# Layer that turns raw strings into integer token ids: text is normalized with
# `custom_standardization`, the vocabulary is capped at `max_features`, and the
# output length is set by `sequence_length`.
vectorized_layer = tf.keras.layers.TextVectorization(
    max_tokens=max_features,
    standardize=custom_standardization,
    output_sequence_length=sequence_length,
    output_mode='int',
)

In [12]:
# Fit the vectorizer on the review texts so it can build its vocabulary.
# NOTE(review): `features` is the full training frame, including the rows that
# `fit(validation_split=0.25)` later holds out for validation — confirm that
# building the vocabulary on them is acceptable.
vectorized_layer.adapt(features)

In [13]:
# --- Experiment bookkeeping ---
model_nb = 1            # model number, embedded in the experiment name

# --- Training schedule ---
epochs = 20
batch_size = 20_000
learning_rate = 1e-4    # handed to the Adam optimizer at compile time

# --- Architecture ---
embedding_dim = 50      # width of each token embedding vector
# NOTE(review): `dropout_rate` only appears in the experiment name; the model
# defined below contains no dropout layer.
dropout_rate = 0.0

In [14]:
# Raw review string in, integer token ids out, then a learned embedding per token.
input_text = Input(shape=(1,), dtype=tf.string)
vectorized_text = vectorized_layer(input_text)
embedding_layer = Embedding(max_features + 1, embedding_dim, input_length=sequence_length)(vectorized_text)

# Convolutional trunk: each stage is two identical same-padded ReLU convolutions
# followed by one max-pooling layer. Entries are (filters, kernel size, pool size).
conv_stages = (
    (64, 3, 2),
    (128, 5, 3),
    (256, 5, 3),
    (512, 5, 3),
)
x = embedding_layer
for stage_filters, stage_kernel, stage_pool in conv_stages:
    for _ in range(2):
        x = Conv1D(stage_filters, stage_kernel, activation='relu', padding='same')(x)
    x = MaxPooling1D(stage_pool)(x)

# Collapse the remaining sequence axis, then classify into 5 classes.
global_max_pooling = GlobalMaxPooling1D()(x)
dense = Dense(32, activation='relu')(global_max_pooling)
output = Dense(5, activation='softmax')(dense)

cnn_model = Model(input_text, output)
cnn_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVec  (None, 100)              0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 100, 50)           250050    
                                                                 
 conv1d (Conv1D)             (None, 100, 64)           9664      
                                                                 
 conv1d_1 (Conv1D)           (None, 100, 64)           12352     
                                                                 
 max_pooling1d (MaxPooling1D  (None, 50, 64)           0         
 )                                                           

In [15]:
# Integer class-id labels, so the sparse variants of the loss and accuracy are used.
cnn_model.compile(
    optimizer=Adam(learning_rate=learning_rate),
    loss=sparse_categorical_crossentropy,
    metrics=[sparse_categorical_accuracy],
)

In [16]:
# Run identifier built from the model number, learning rate, batch size and
# dropout rate; used below as the TensorBoard log sub-directory name.
exp_name = f'cnn_model_{model_nb}_lr_{learning_rate}_bs_{batch_size}_dr_{dropout_rate}'

In [17]:
# Write training logs to a per-experiment TensorBoard directory.
tensorboard_logger = tf.keras.callbacks.TensorBoard("logs/cnn/" + exp_name)

# Train on the review texts, holding out 25% of the rows for validation.
cnn_model.fit(
    x=features,
    y=targets,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.25,
    callbacks=[tensorboard_logger],
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x244bf2900d0>