In [1]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
import os
from tqdm.notebook import tqdm
from Model import *
from sklearn.metrics import classification_report
import tensorflow_addons as tfa
from keras.utils import io_utils
from pandarallel import pandarallel
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import keras_nlp
# Start pandarallel with a progress bar and a fixed pool of 16 workers; the
# `parallel_applymap` call used later in this notebook relies on this set-up.
pandarallel.initialize(progress_bar=True, nb_workers=16)
# Register tqdm's progress-bar integration with pandas.
tqdm.pandas()

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


In [2]:
# Load the train and test CSV files and the saved vocabulary array.
train = pd.read_csv("../../dataset/goodreads_train.csv")
test = pd.read_csv("../../dataset/goodreads_test.csv")
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects, which can
# execute code — only load this file if it comes from a trusted source.
vocabulary = np.load('../../vocabulaires/voc_without_std_word_count_5.npy', allow_pickle=True)
# train['review_text'] = train['review_text'].str.replace('[^\w\s]','')
# Shuffle with a fixed seed so that a kernel restart reproduces the same row order
# (the Keras seed is only set later, inside the training cell).
train = shuffle(train, random_state=42)

In [3]:
# Review texts as a tensor, then as a tf.data.Dataset with one review per element.
# The only visible consumer is the disabled vocabulary-building cell below.
# NOTE(review): the name `re` would shadow the standard-library `re` module if it
# were ever imported in this notebook.
r = tf.convert_to_tensor(train['review_text'].to_numpy())
re = tf.data.Dataset.from_tensor_slices(r)

In [4]:
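# Disabled: one-off WordPiece vocabulary build (pinned to the CPU); it writes the file "word_piece_vocabulary".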
# with tf.device('/cpu:0'):
#     voc = keras_nlp.tokenizers.compute_word_piece_vocabulary(
#         re,
#         50000,
#         vocabulary_output_file="word_piece_vocabulary",
#         lowercase=False,
#         strip_accents=False,
#         split=True,
#         split_on_cjk=True,
#         suffix_indicator="##",
#         reserved_tokens=["[PAD]", "[CLS]", "[SEP]", "[UNK]", "[MASK]"],
#     )

In [5]:
def convert_timestamp(x):
    """Convert one date-like cell to a small float, or 0.0 if missing or unparseable.

    Args:
        x: A single cell value: anything `pd.Timestamp` accepts, or a missing value.

    Returns:
        float: `pd.Timestamp(x).value / 10**18`, i.e. the Timestamp's integer
        `.value` scaled down by 1e18. Returns 0.0 when `x` is NA or when the
        conversion raises.
    """
    # Imported inside the function, presumably so that it stays self-contained
    # when pandarallel ships it to its worker processes.
    import pandas as pd

    if pd.isna(x):
        return 0.0
    try:
        return float(pd.Timestamp(x).value / 10**18)
    except Exception:
        # Best-effort conversion: a value that cannot be parsed maps to the same
        # 0.0 as a missing one. `Exception` (rather than a bare `except`) lets
        # KeyboardInterrupt/SystemExit propagate, and the float literal keeps the
        # return type identical on every path.
        return 0.0
# Overwrite the four date columns of `train` with their numeric form, applying
# `convert_timestamp` to every cell through pandarallel.
date_columns = ['read_at', 'date_added', 'date_updated', 'started_at']
train[date_columns] = train[date_columns].parallel_applymap(convert_timestamp)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=225000), Label(value='0 / 225000')…

In [6]:
# Model inputs: the review text plus six further columns of `train`.
inputs_data = train[['review_text','n_comments', 'n_votes','read_at','date_added','date_updated','started_at']]
# One-hot encode the rating into 6 classes.
outputs_data = keras.utils.to_categorical(train['rating'], num_classes=6)
# Hold out 20% of the rows for validation. The seed is fixed so that re-running the
# notebook yields the same split; without it every run validates on different rows.
train_in, validation_in, train_out, validation_out = train_test_split(
    inputs_data, outputs_data, test_size=0.2, random_state=42
)

In [7]:
# Split each feature frame into a list holding one Series per column, in this
# fixed order; these lists are what `run_experiment` receives further down.
feature_order = ['review_text', 'n_comments', 'n_votes', 'read_at', 'date_added', 'date_updated', 'started_at']
train_in = [train_in[column] for column in feature_order]
validation_in = [validation_in[column] for column in feature_order]

In [29]:
# Architectures to train; the loop below calls `.Model(vocabulary)` on each entry.
# `transformer_classifier1` is not defined in this notebook — presumably it is
# provided by `from Model import *`.
model_list = [transformer_classifier1]

def scheduler(epoch, lr):
    """Learning-rate schedule handed to `LearningRateScheduler`.

    Args:
        epoch: Epoch index supplied by the callback.
        lr: Learning rate currently in use.

    Returns:
        `lr` unchanged while `epoch < 1`; otherwise `lr * tf.math.exp(-0.15)`,
        so each later call shrinks the rate by the same factor.
    """
    keep_current_rate = epoch < 1
    return lr if keep_current_rate else lr * tf.math.exp(-0.15)

# Train every architecture in `model_list` once per seed. After the loop, `model`
# refers to the last model trained; later cells rely on that name.
seeds = [42]
for seed in seeds:
    keras.utils.set_random_seed(seed)
    for model_obj in model_list:
        model = model_obj.Model(vocabulary)

        model.model.compile(
            optimizer=keras.optimizers.Adamax(learning_rate=0.0001),
            loss=keras.losses.categorical_crossentropy,
            metrics=[
                keras.metrics.categorical_accuracy,
                tfa.metrics.F1Score(num_classes=6, average='weighted'),
            ],
        )
        print(model.name)
        # summary() prints by itself and returns None, so it must not be wrapped in print().
        model.model.summary()

        # makedirs also creates the missing parent ("logs" / "checkpoint"), which
        # os.mkdir would fail on in a fresh working directory.
        log_dir = f"logs/{model.name}"
        checkpoint_dir = f"checkpoint/{model.name}"
        os.makedirs(log_dir, exist_ok=True)
        os.makedirs(checkpoint_dir, exist_ok=True)

        # Keep only the weights of the epoch with the highest validation F1 score.
        # The trailing slash is part of the checkpoint prefix and has to be
        # repeated verbatim when the weights are loaded again.
        checkpoint_cb = keras.callbacks.ModelCheckpoint(
            f'{checkpoint_dir}/',
            save_weights_only=True,
            monitor='val_f1_score',
            mode='max',
            save_best_only=True,
        )
        lr_scheduler_cb = keras.callbacks.LearningRateScheduler(scheduler, verbose=0)
        terminate_on_nan_cb = keras.callbacks.TerminateOnNaN()
        # This callback used to be created without ever being passed to training,
        # so the log directory prepared for it stayed empty.
        tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir)

        model.run_experiment(
            train_in,
            train_out,
            validation_in,
            validation_out,
            epochs=10,
            callbacks=[checkpoint_cb, lr_scheduler_cb, terminate_on_nan_cb, tensorboard_callback],
            batch_size=64,
        )

Model: "resnet50"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_23 (InputLayer)          [(None, None, None,  0           []                               
                                 3)]                                                              
                                                                                                  
 conv1_pad (ZeroPadding2D)      (None, None, None,   0           ['input_23[0][0]']               
                                3)                                                                
                                                                                                  
 conv1_conv (Conv2D)            (None, None, None,   9472        ['conv1_pad[0][0]']              
                                64)                                                        

In [33]:
# Restore the best weights saved by the ModelCheckpoint callback during training.
# That callback writes to the prefix `checkpoint/<model.name>/` (trailing slash
# included), so the same prefix is used here instead of the hard-coded
# 'checkpoint/unet1', which does not match what the training cell writes.
model.model.load_weights(f'checkpoint/{model.name}/')

<keras.engine.input_layer.InputLayer at 0x25bd9266e90>

In [None]:
# Predict on the whole training frame. The model is trained on seven inputs, so
# the same seven columns are passed here in the same order (previously only the
# first three were). The date columns of `train` were already converted to numbers
# in an earlier cell.
res = model.model.predict([
    train[column]
    for column in ['review_text', 'n_comments', 'n_votes', 'read_at', 'date_added', 'date_updated', 'started_at']
])

In [None]:
# Predict on the test frame with the same seven inputs, in the same order, that the
# model is trained on (previously only the first three were passed).
# The date columns of `test` are still raw values at this point, so they are
# converted on a copy exactly as was done for `train`; `test` itself is left untouched.
# NOTE(review): assumes the test CSV has the same date columns as the train CSV — confirm.
test_date_columns = ['read_at', 'date_added', 'date_updated', 'started_at']
test_features = test[['review_text', 'n_comments', 'n_votes'] + test_date_columns].copy()
test_features[test_date_columns] = test_features[test_date_columns].parallel_applymap(convert_timestamp)
restest = model.model.predict([test_features[column] for column in test_features.columns])

In [None]:
# Reduce every row of predicted class scores to the index of its largest entry.
# np.argmax returns the first index on ties, which is the same choice the earlier
# hand-written `>` scan made, and it replaces two copy-pasted Python loops with
# one vectorised call each.
test_data = np.argmax(np.asarray(restest), axis=1)
train_data = np.argmax(np.asarray(res), axis=1)

In [None]:
# Per-class report of the predicted classes against the true ratings.
# NOTE(review): `train_data` comes from predicting on the whole `train` frame, which
# includes the rows the model was fitted on (only 20% were held out for validation),
# so this is not a held-out score.
print(classification_report(train['rating'], train_data))

In [None]:
# Attach the predicted class to every test row.
test['rating'] = test_data

# Two-column submission table. It is built from plain arrays so that it gets a fresh
# 0..n-1 index, and without the intermediate variable `id`, which shadowed the
# Python builtin of the same name.
df = pd.DataFrame({
    'review_id': test['review_id'].to_numpy(),
    'rating': test['rating'].to_numpy(),
})

In [None]:
# Write the submission table to CSV without the index column.
# NOTE(review): the file name mentions "unet2", "embedding" and "class_weights", none of
# which appear in the training cell above — confirm the name is still accurate.
df.to_csv('submission_unet2_embedding_class_weights_model.csv',index=False )


In [None]:
# Persist the trained model under the path 'unet2'.
# NOTE(review): the name "unet2" does not match the `transformer_classifier1`
# architecture listed in `model_list` — confirm it is intended.
model.model.save('unet2')