In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import regularizers
import pandas as pd
import numpy as np
import os
from tqdm.notebook import tqdm
from Model import *
from sklearn.metrics import classification_report
import tensorflow_addons as tfa
from keras.utils import io_utils
from pandarallel import pandarallel
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
# Start pandarallel with 16 worker processes and progress bars enabled; the
# date-encoding cells rely on it for DataFrame.parallel_applymap.
pandarallel.initialize(nb_workers=16, progress_bar=True)
# Register tqdm's pandas integration.
tqdm.pandas()

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


In [2]:
# Load the Goodreads train/test CSVs and the precomputed vocabulary array.
train = pd.read_csv("../../dataset/goodreads_train.csv")
test = pd.read_csv("../../dataset/goodreads_test.csv")
# NOTE(review): allow_pickle=True makes np.load unpickle objects, which can run
# arbitrary code; keep it only while this .npy file comes from a trusted source.
vocabulary = np.load('../../vocabulaires/voc_without_std_word_count_5.npy', allow_pickle=True)
# Disabled experiment: strip punctuation from the review text.
#train['review_text'] = train['review_text'].str.replace('[^\w\s]','')
# Shuffle with a fixed seed so the row order is identical on every run; the
# Keras/NumPy seed is only set later, inside the training cell.
train = shuffle(train, random_state=42)

In [3]:
def convert_timestamp(x):
    """Encode one raw date cell as a small float feature.

    Args:
        x: A value accepted by ``pd.Timestamp`` (e.g. a date string), or a
            missing value.

    Returns:
        float: ``pd.Timestamp(x).value`` (integer nanoseconds since the Unix
        epoch) divided by 10**18, or ``0.0`` when ``x`` is missing or cannot
        be converted.
    """
    # Local import kept from the original; presumably so the function is
    # self-contained when pandarallel ships it to worker processes.
    import pandas as pd

    if pd.isna(x):
        return 0.0
    try:
        return float(pd.Timestamp(x).value / 10**18)
    except Exception:
        # Best-effort fallback for unparseable or out-of-range dates. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # through, and returning 0.0 keeps every code path a float.
        return 0.0
# Replace the four date columns of `train` with their numeric encoding,
# applying convert_timestamp to every cell through pandarallel.
date_columns = ['read_at', 'date_added', 'date_updated', 'started_at']
train[date_columns] = train[date_columns].parallel_applymap(convert_timestamp)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=225000), Label(value='0 / 225000')…

In [4]:
# Model inputs: the review text plus six numeric side features.
inputs_data = train[['review_text','n_comments', 'n_votes','read_at','date_added','date_updated','started_at']]
# One-hot encode the rating column into 6 classes.
outputs_data = keras.utils.to_categorical(train['rating'], num_classes=6)
# Hold out 20% of the rows for validation. The split is seeded so the same row
# positions are selected on every run instead of a fresh random draw each time.
train_in, validation_in, train_out, validation_out = train_test_split(
    inputs_data, outputs_data, test_size=0.2, random_state=42
)

In [5]:
# Turn each input frame into a list with one Series per column, in a fixed
# order (presumably matching the order of the model's inputs).
# NOTE: this rebinds `train_in` / `validation_in` from DataFrames to lists, so
# the cell cannot be re-run without re-running the split cell first.
input_order = ['review_text', 'n_comments', 'n_votes', 'read_at', 'date_added', 'date_updated', 'started_at']
train_in = [train_in[column] for column in input_order]
validation_in = [validation_in[column] for column in input_order]

In [None]:
# Model factories to train; the training loop calls
# `.model(vocabulary, dropout_rate, kernel_regularizer, bias_regularizer, activity_regularizer)`
# on each entry.
model_list = [unet10]

# Hyper-parameter sets, one training run per entry. The regularizers are read
# from each entry inside the training loop, so no module-level defaults are
# defined here. Commented entries are earlier configurations kept for reference.
params = [
    # {"dropout_rate": .15, "kernel_regularizer": None, "bias_regularizer": None, "activity_regularizer": None},
    # {"dropout_rate": .3, "kernel_regularizer": None, "bias_regularizer": None, "activity_regularizer": None},
    # {"dropout_rate": .0, "kernel_regularizer": regularizers.L1L2(l1=1e-5, l2=1e-4),
    #  "bias_regularizer": regularizers.L2(1e-4),
    #  "activity_regularizer": regularizers.L2(1e-5)},
    {
        "dropout_rate": .15,
        "kernel_regularizer": regularizers.L1L2(l1=1e-6, l2=1e-5),
        "bias_regularizer": regularizers.L2(1e-5),
        "activity_regularizer": regularizers.L2(1e-6),
    },
]

def scheduler(epoch, lr):
    """Learning-rate schedule passed to ``keras.callbacks.LearningRateScheduler``.

    Leaves the rate unchanged for epoch 0 and multiplies it by ``exp(-0.1)``
    on every later epoch.

    Args:
        epoch: Epoch index supplied by the callback.
        lr: Current learning rate.

    Returns:
        The learning rate to use, as a plain Python number.
    """
    import math  # only needed here; avoids building a TensorFlow op for a constant

    if epoch < 1:
        return lr
    # math.exp keeps the result a Python float rather than a tf.Tensor.
    return lr * math.exp(-0.1)
# Train every (seed, model, hyper-parameter set) combination.
seeds = [42]
for seed in seeds:
    # Seeds the Python, NumPy and TensorFlow random generators in one call.
    keras.utils.set_random_seed(seed)
    for model_obj in model_list:
        for param in params:

            dropout_rate = param['dropout_rate']
            kernel_regularizer = param['kernel_regularizer']
            bias_regularizer = param['bias_regularizer']
            activity_regularizer = param['activity_regularizer']

            # Short labels used as path components for this run's artefacts.
            regularizers_ = "None" if kernel_regularizer is None else "L1L2"
            dropout = "0" if dropout_rate == .0 else f"{dropout_rate}"

            model = model_obj.model(vocabulary, dropout_rate, kernel_regularizer, bias_regularizer, activity_regularizer)
            model.compile(
                optimizer=keras.optimizers.Adamax(learning_rate=0.01),
                loss=keras.losses.categorical_crossentropy,
                metrics=[keras.metrics.categorical_accuracy, tfa.metrics.F1Score(num_classes=6, average='weighted')],
            )
            print(model.name)
            # summary() prints by itself; wrapping it in print() only added a stray "None".
            model.summary()

            # makedirs creates the whole path, including the root "checkpoint"
            # and "logs" folders that the previous mkdir ladder never created.
            run_subdir = f"{model.name}/{seed}/{dropout}/{regularizers_}"
            checkpoint_dir = f"checkpoint/{run_subdir}"
            log_dir = f"logs/{run_subdir}"
            os.makedirs(checkpoint_dir, exist_ok=True)
            os.makedirs(log_dir, exist_ok=True)

            # Keep only the model with the best validation F1 score.
            checkpoint = keras.callbacks.ModelCheckpoint(
                f'{checkpoint_dir}/model.h5',
                monitor='val_f1_score', mode='max', save_best_only=True)
            lr_scheduler = keras.callbacks.LearningRateScheduler(scheduler, 0)
            stop_nan = keras.callbacks.TerminateOnNaN()
            tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=f"{log_dir}/")

            # NOTE(review): the checkpoint and TensorBoard callbacks used to be
            # constructed but never passed to fit(), so nothing was saved or
            # logged; they are now part of the callback list.
            model.fit(
                x=train_in,
                y=train_out,
                validation_data=(validation_in, validation_out),
                batch_size=128,
                epochs=15,
                callbacks=[lr_scheduler, stop_nan, checkpoint, tensorboard_callback],
            )

unet10
Epoch 1/15
Epoch 2/15

In [None]:
# Build the unet5 wrapper from the vocabulary and restore weights from disk.
# NOTE(review): the training cell fits unet10 through `model_obj.model(...)`
# and also binds `model`; this cell rebinds `model` to a unet5 wrapper whose
# Keras model lives in `.model`. Confirm this is the intended model and that
# 'checkpoint/unet5/' holds matching weights.
model = unet5.Model(vocabulary)
model.model.load_weights('checkpoint/unet5/')

In [None]:
# Run the loaded model on the training inputs; `res` is reduced to class
# indices in a later cell.
res = model.model.predict(train_in)

In [None]:
# Run the loaded model on the validation inputs; `val_res` is reduced to class
# indices in a later cell.
val_res = model.model.predict(validation_in)

In [None]:
# Collapse the one-hot targets back to integer class labels so they can be
# compared with the predicted class indices.
# NOTE: `train_out` is overwritten here, so this cell cannot be re-run (and the
# training cell cannot be re-run afterwards) without redoing the split.
val_out = validation_out.argmax(axis=1)
train_out = train_out.argmax(axis=1)


In [None]:
# Encode the test set's date columns the same way as the training set.
test_date_columns = ['read_at', 'date_added', 'date_updated', 'started_at']
test[test_date_columns] = test[test_date_columns].parallel_applymap(convert_timestamp)

# One Series per model input, in the same column order used for training.
test_input_order = ['review_text', 'n_comments', 'n_votes'] + test_date_columns
test_data = [test[column] for column in test_input_order]
restest = model.model.predict(test_data)

In [None]:
# Reduce each matrix of per-class scores to one predicted class index per row.
# np.argmax(axis=1) returns the first index of each row's maximum, which is
# what the three hand-written loops over range(6) computed, but vectorised and
# without hard-coding the number of classes. The scratch list `ff` is no
# longer needed.
val_data = np.argmax(val_res, axis=1)
test_data = np.argmax(restest, axis=1)
train_data = np.argmax(res, axis=1)

In [None]:
# Per-class metrics on the training split (true labels first, predictions second).
train_report = classification_report(train_out, train_data)
print(train_report)

In [None]:
# Per-class metrics on the validation split (true labels first, predictions second).
validation_report = classification_report(val_out, val_data)
print(validation_report)

In [None]:
# Attach the predicted class to the test frame, then build the two-column
# submission table. Built in one step so no global named `id` shadows the
# Python builtin.
test['rating'] = test_data

df = pd.DataFrame({
    'review_id': test['review_id'].to_numpy(),
    'rating': test['rating'].to_numpy(),
})

In [None]:
# Write the submission table to CSV without the index column.
submission_path = 'submission_unet5_embedding_class_weights_model.csv'
df.to_csv(submission_path, index=False)

In [None]:
# Save the wrapped Keras model under the path 'unet5'.
model.model.save('unet5')