In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import regularizers
import pandas as pd
import numpy as np
import os
from tqdm.notebook import tqdm
from Model import *
from sklearn.metrics import classification_report
import tensorflow_addons as tfa
from keras.utils import io_utils
from pandarallel import pandarallel
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
# Start the pandarallel worker pool (16 processes) with its progress bar enabled.
pandarallel.initialize(nb_workers=16, progress_bar=True)
# Hook tqdm into pandas as well.
tqdm.pandas()

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


In [2]:
# Load the Goodreads train/test CSVs and the pre-built vocabulary array.
train = pd.read_csv("../../dataset/goodreads_train.csv")
test = pd.read_csv("../../dataset/goodreads_test.csv")
# SECURITY: allow_pickle=True lets np.load unpickle arbitrary Python objects;
# only load vocabulary files that come from a trusted source.
vocabulary = np.load('../../vocabulaires/voc_without_std_word_count_5.npy', allow_pickle=True)
# Disabled experiment: strip punctuation from the review text.
#train['review_text'] = train['review_text'].str.replace('[^\w\s]','')
# Pin the shuffle so the row order is identical after every kernel restart;
# the Keras seed is only set later, inside the training cell.
train = shuffle(train, random_state=42)

In [3]:
# Build the model once with dropout and all regularisers disabled, just to
# inspect its architecture.
model = transf6.model(vocabulary, dropout_rate=0.0, kernel_regularizer=None, bias_regularizer=None, activity_regularizer=None)
# Model.summary() prints the table itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the cell output.
model.summary()

Model: "transformer1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 text_vectorization (TextVector  (None, 352)         0           ['input_1[0][0]']                
 ization)                                                                                         
                                                                                                  
 token_and_position_embedding (  (None, 352, 220)    24167220    ['text_vectorization[0][0]']     
 TokenAndPositionEmbedding)                                                                       
                                                                                       

In [3]:
def convert_timestamp(x):
    """Convert a timestamp-like value into a small float feature.

    Args:
        x: A value accepted by ``pd.Timestamp`` (e.g. a date string), or a
            missing value.

    Returns:
        float: ``pd.Timestamp(x).value / 10**18`` (the nanosecond epoch value
        scaled down by 1e18), or ``0.0`` when ``x`` is missing or cannot be
        parsed.
    """
    # Kept function-local, as in the original cell (which tags this with
    # "parallel_apply") — presumably so pandarallel worker processes have `pd`
    # in scope; confirm before hoisting it to the import cell.
    import pandas as pd
    if pd.isna(x):
        return 0.0
    try:
        return float(pd.Timestamp(x).value / 10**18)
    except Exception:
        # Best-effort: unparseable or out-of-range dates become 0.0. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate, and returning 0.0 keeps the column dtype float throughout.
        return 0.0
# Replace the four date columns with their float encoding, cell by cell, using
# the pandarallel worker pool.
timestamp_columns = ['read_at', 'date_added', 'date_updated', 'started_at']
train[timestamp_columns] = train[timestamp_columns].parallel_applymap(convert_timestamp)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=225000), Label(value='0 / 225000')…

In [4]:
# Model inputs: the review text plus six metadata columns.
inputs_data = train[['review_text','n_comments', 'n_votes','read_at','date_added','date_updated','started_at']]
# One-hot encode the rating over 6 classes.
outputs_data = keras.utils.to_categorical(train['rating'], num_classes=6)
# Hold out 20% of the rows for validation. random_state is pinned so every run
# evaluates on the same validation rows; without it the split changes on each
# execution and validation scores are not comparable between runs.
train_in, validation_in, train_out, validation_out = train_test_split(
    inputs_data, outputs_data, test_size=0.2, random_state=42
)

In [5]:
# The model is fed a list with one Series per column, in this fixed order.
feature_columns = ['review_text', 'n_comments', 'n_votes', 'read_at', 'date_added', 'date_updated', 'started_at']
train_in = [train_in[column] for column in feature_columns]
validation_in = [validation_in[column] for column in feature_columns]

In [6]:
# Model factories to train; the training loop calls `.model(...)` on each one.
model_list = [transf6]

# Module-level regulariser instances. The training loop rebinds these three
# names from each entry of `params` before building a model.
kernel_regularizer = regularizers.L1L2(l1=1e-5, l2=1e-4)
bias_regularizer = regularizers.L2(1e-4)
activity_regularizer = regularizers.L2(1e-5)

# Hyper-parameter grid: one dict per training run. Only the dropout-only
# configuration is currently enabled; the alternatives are kept below, disabled.
params = [
    # dict(dropout_rate=.0, kernel_regularizer=None, bias_regularizer=None, activity_regularizer=None),
    dict(dropout_rate=.2, kernel_regularizer=None, bias_regularizer=None, activity_regularizer=None),
    # dict(dropout_rate=.0, kernel_regularizer=regularizers.L1L2(l1=1e-5, l2=1e-4),
    #      bias_regularizer=regularizers.L2(1e-4),
    #      activity_regularizer=regularizers.L2(1e-5)),
    # dict(dropout_rate=.3, kernel_regularizer=regularizers.L1L2(l1=1e-5, l2=1e-4),
    #      bias_regularizer=regularizers.L2(1e-4),
    #      activity_regularizer=regularizers.L2(1e-5)),
]

def scheduler(epoch, lr):
    """Learning-rate schedule: hold during the first epoch, then decay.

    Args:
        epoch: Index of the epoch about to start.
        lr: Learning rate currently in use.

    Returns:
        ``lr`` unchanged while ``epoch < 1``; afterwards ``lr`` multiplied by
        ``exp(-0.22)``, computed with ``tf.math.exp``.
    """
    # Guard clause: no decay before epoch 1.
    if epoch < 1:
        return lr
    decay_factor = tf.math.exp(-0.22)
    return lr * decay_factor
# Train every (seed, model, hyper-parameter set) combination, writing
# checkpoints and TensorBoard logs under
# {checkpoint,logs}/<model name>/<seed>/<dropout label>/<regulariser label>/.
seeds = [42]
for seed in seeds:
    # Re-seed before each run so model initialisation is repeatable per seed.
    keras.utils.set_random_seed(seed)
    for model_obj in model_list:
        for param in params:

            dropout_rate = param['dropout_rate']
            kernel_regularizer = param['kernel_regularizer']
            bias_regularizer = param['bias_regularizer']
            activity_regularizer = param['activity_regularizer']

            # Directory labels identifying this configuration.
            regularizers_ = "None" if kernel_regularizer is None else "L1L2"
            # Label is the rate in tenths (.0 -> "0", .2 -> "2", .3 -> "3").
            # Previously every non-zero rate was labelled "3", so a 0.2 run was
            # written into (and could overwrite) the 0.3 directory.
            dropout = f"{dropout_rate * 10:g}"

            model = model_obj.model(vocabulary, dropout_rate, kernel_regularizer, bias_regularizer, activity_regularizer)
            model.compile(optimizer=keras.optimizers.Adamax(learning_rate=0.001),
                           loss=keras.losses.categorical_crossentropy,
                           metrics=[keras.metrics.categorical_accuracy, tfa.metrics.F1Score(num_classes=6, average='weighted')]
                           )
            print(model.name)
            # summary() prints the table itself and returns None; do not wrap
            # it in print(), which would emit a stray "None".
            model.summary()

            run_subdir = f"{model.name}/{seed}/{dropout}/{regularizers_}"
            checkpoint_dir = f"checkpoint/{run_subdir}"
            log_dir = f"logs/{run_subdir}"
            # makedirs creates every missing level (including the top-level
            # "checkpoint"/"logs" folders, which a chain of os.mkdir calls
            # could not) and is a no-op when the path already exists.
            os.makedirs(checkpoint_dir, exist_ok=True)
            os.makedirs(log_dir, exist_ok=True)

            # Keep only the weights with the best validation F1 score.
            chekpoint = keras.callbacks.ModelCheckpoint(f'{checkpoint_dir}/model.h5',
            monitor='val_f1_score', mode='max', save_best_only=True)

            sheduler = keras.callbacks.LearningRateScheduler(scheduler, verbose=0)
            tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=f"{log_dir}/")
            # NOTE(review): the training call below is commented out, so this
            # cell only builds and compiles the model; nothing is fitted.
            #model.fit(x=train_in, y=train_out, validation_data=(validation_in, validation_out), batch_size=128, epochs=9, callbacks=[sheduler, chekpoint, tensorboard_callback])

transformer1
Model: "transformer1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 text_vectorization (TextVector  (None, 352)         0           ['input_1[0][0]']                
 ization)                                                                                         
                                                                                                  
 token_and_position_embedding (  (None, 352, 220)    24167220    ['text_vectorization[0][0]']     
 TokenAndPositionEmbedding)                                                                       
                                                                          

In [None]:
# Persist whatever model object is currently bound to `model` under 'transf5'.
# NOTE(review): the model in this notebook is built from `transf6`, and the
# `model.fit` call in the training cell is commented out — confirm that the
# saved weights and the 'transf5' name are what is intended.
model.save('transf5')