In [1]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
import os
from tqdm.notebook import tqdm
from Model import *
from sklearn.metrics import classification_report
import tensorflow_addons as tfa
from keras.utils import io_utils
from pandarallel import pandarallel
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
# Start pandarallel with 16 worker processes and progress bars enabled; this is
# what provides the `parallel_applymap` method used on the date columns below.
pandarallel.initialize(progress_bar=True, nb_workers=16)
# Register tqdm's progress-aware pandas methods (e.g. `progress_apply`).
tqdm.pandas()

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


In [2]:
# Load the raw train/test splits and the pre-computed vocabulary.
train = pd.read_csv("../../dataset/goodreads_train.csv")
test = pd.read_csv("../../dataset/goodreads_test.csv")
# NOTE(review): allow_pickle=True unpickles arbitrary objects -- only load this
# file from a trusted source.
vocabulary = np.load('../../vocabulaires/voc_without_std_word_count_5.npy', allow_pickle=True)

# Strip everything that is neither a word character nor whitespace.
# `regex=True` is passed explicitly: pandas changed the default to a literal
# (non-regex) replacement, which would silently turn this line into a no-op.
# The raw string avoids invalid-escape warnings for `\w` / `\s`.
train['review_text'] = train['review_text'].str.replace(r'[^\w\s]', '', regex=True)
# Apply the same cleaning to the test reviews so that inference sees text
# preprocessed exactly like the training text.
test['review_text'] = test['review_text'].str.replace(r'[^\w\s]', '', regex=True)

# Fixed seed so the row order (and everything derived from it) is reproducible.
train = shuffle(train, random_state=42)

  train['review_text'] = train['review_text'].str.replace('[^\w\s]','')


In [3]:
def convert_timestamp(x):
    """Convert a date-like value into a scaled float timestamp.

    Args:
        x: A scalar accepted by ``pd.Timestamp`` (e.g. a date string), or a
            missing value.

    Returns:
        float: ``pd.Timestamp(x).value / 10**18`` (the integer timestamp value
        scaled down by 10**18), or ``0.0`` when ``x`` is missing, cannot be
        parsed, or parses to ``NaT``.
    """
    # Imported inside the function so it is self-contained when shipped to
    # worker processes -- presumably required by parallel_applymap; confirm
    # before hoisting it to module level.
    import pandas as pd

    if pd.isna(x):
        return 0.0
    try:
        parsed = pd.Timestamp(x)
    except Exception:
        # Best-effort: unparsable values map to the same sentinel as missing
        # ones. `Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        return 0.0
    if pd.isna(parsed):
        # e.g. the string "NaT": its raw .value is a huge negative sentinel.
        return 0.0
    return float(parsed.value / 10**18)
# Replace the four date columns of the training frame, cell by cell, with the
# float produced by `convert_timestamp` (run in parallel through pandarallel).
train_date_columns = ['read_at', 'date_added', 'date_updated', 'started_at']
train[train_date_columns] = train[train_date_columns].parallel_applymap(convert_timestamp)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=225000), Label(value='0 / 225000')…

In [4]:
# Model inputs: the review text plus the numeric / converted-date columns.
inputs_data = train[['review_text','n_comments', 'n_votes','read_at','date_added','date_updated','started_at']]
# One-hot encode the rating into 6 classes.
outputs_data = keras.utils.to_categorical(train['rating'], num_classes=6)
# Hold out 20% of the rows for validation. A fixed `random_state` makes the
# split reproducible across kernel restarts (the Keras seed is only set later,
# inside the training loop, so it does not cover this call).
train_in, validation_in, train_out, validation_out = train_test_split(
    inputs_data, outputs_data, test_size=0.2, random_state=42
)

In [5]:
# Turn each split from a single DataFrame into a list of per-column Series,
# in the order the model receives them in `fit` / `predict`.
# Note: this rebinds `train_in` / `validation_in`, so the cell is not re-runnable
# without first re-running the split cell.
input_column_order = [
    'review_text',
    'n_comments',
    'n_votes',
    'read_at',
    'date_added',
    'date_updated',
    'started_at',
]
train_in = [train_in[column] for column in input_column_order]
validation_in = [validation_in[column] for column in input_column_order]

In [7]:

def scheduler(epoch, lr):
    """Learning-rate schedule: hold for the first epoch, then decay exponentially.

    Args:
        epoch: Zero-based epoch index.
        lr: Current learning rate.

    Returns:
        ``lr`` unchanged while ``epoch < 1``; afterwards ``lr * exp(-0.22)`` as
        a plain Python float.
    """
    if epoch < 1:
        return lr
    # Return a Python float rather than a tf.Tensor: the Keras
    # LearningRateScheduler contract is that the schedule returns a float, and
    # newer Keras versions reject anything else.
    return float(lr * np.exp(-0.22))
# Callback that asks `scheduler` for the learning rate at the start of every epoch.
sheduler = keras.callbacks.LearningRateScheduler(scheduler, verbose=0)

seeds = [42]
for seed in seeds:
    # Seed the Python, NumPy and TensorFlow RNGs for this run.
    keras.utils.set_random_seed(seed)
    model = resnet1.ResNet50(vocabulary)

    # Keep only the weights of the epoch with the best validation F1 score.
    # NOTE(review): the path is hard-coded to 'checkpoint/resnet1/' while the
    # directories created below are derived from `model.name`; every seed also
    # writes to the same location -- confirm this is intended.
    chekpoint = keras.callbacks.ModelCheckpoint(
        'checkpoint/resnet1/',
        save_weights_only=True,
        monitor='val_f1_score',
        mode='max',
        save_best_only=True,
    )

    # makedirs(exist_ok=True) also creates the missing parent directory
    # ("logs" / "checkpoint"), which os.mkdir would fail on, and avoids the
    # check-then-create race.
    os.makedirs(f"logs/{model.name}", exist_ok=True)
    os.makedirs(f"checkpoint/{model.name}", exist_ok=True)

    model.compile(
        optimizer=keras.optimizers.Adamax(learning_rate=0.0001),
        loss=keras.losses.categorical_crossentropy,
        metrics=[
            keras.metrics.categorical_accuracy,
            tfa.metrics.F1Score(num_classes=6, average='weighted'),
        ],
    )

    model.fit(
        train_in,
        train_out,
        validation_data=(validation_in, validation_out),
        epochs=10,
        batch_size=64,
        callbacks=[chekpoint, sheduler],
    )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2529e1d9690>

In [9]:
# Model outputs for the training inputs; a later cell reduces each row to a class index.
# NOTE(review): this calls `model.predict`, whereas the test-set cell calls
# `model.model.predict` -- confirm the two are equivalent for this wrapper.
res = model.predict(train_in)



In [10]:
# Model outputs for the held-out validation inputs; a later cell reduces each
# row to a class index.
val_res = model.predict(validation_in)



In [13]:
# Collapse the one-hot label matrices back to integer class indices so they can
# be compared against predicted classes.
# Note: `train_out` is rebound from one-hot to indices here, so this cell cannot
# be re-run without first re-running the split cell.
val_out = validation_out.argmax(axis=1)
train_out = train_out.argmax(axis=1)


In [14]:
# Convert the test-set date columns with the same function used for training.
test_date_columns = ['read_at', 'date_added', 'date_updated', 'started_at']
test[test_date_columns] = test[test_date_columns].parallel_applymap(convert_timestamp)

# Assemble the per-column inputs in the same order as the training inputs,
# then run the model on them.
test_input_columns = [
    'review_text',
    'n_comments',
    'n_votes',
    'read_at',
    'date_added',
    'date_updated',
    'started_at',
]
test_data = [test[column] for column in test_input_columns]
restest = model.model.predict(test_data)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=119512), Label(value='0 / 119512')…



In [15]:
def _predicted_classes(scores, num_classes=6):
    """Return, for each row of `scores`, the index of its largest value.

    Only the first `num_classes` columns are considered. Ties resolve to the
    lowest index, matching the previous hand-written loop.

    Args:
        scores: 2-D array-like of per-class scores, one row per sample.
        num_classes: Number of leading columns to compare.

    Returns:
        1-D integer NumPy array of class indices, one per row.
    """
    # Vectorized argmax replaces a Python-level loop over every row and class.
    return np.argmax(np.asarray(scores)[:, :num_classes], axis=1)


# Predicted class per sample for the validation, test and training sets.
# Note: `test_data` is rebound here from the list of model inputs to the
# array of predicted classes.
val_data = _predicted_classes(val_res)
test_data = _predicted_classes(restest)
train_data = _predicted_classes(res)

  0%|          | 0/180000 [00:00<?, ?it/s]

  0%|          | 0/478033 [00:00<?, ?it/s]

  0%|          | 0/720000 [00:00<?, ?it/s]

In [16]:
# Per-class precision / recall / F1 on the training split.
train_report = classification_report(train_out, train_data)
print(train_report)

              precision    recall  f1-score   support

           0       0.73      0.44      0.55     24800
           1       0.55      0.34      0.42     22993
           2       0.55      0.52      0.53     58041
           3       0.64      0.65      0.64    151064
           4       0.63      0.72      0.67    250900
           5       0.76      0.70      0.73    212202

    accuracy                           0.66    720000
   macro avg       0.64      0.56      0.59    720000
weighted avg       0.66      0.66      0.66    720000



In [17]:
# Per-class precision / recall / F1 on the held-out validation split.
validation_report = classification_report(val_out, val_data)
print(validation_report)

              precision    recall  f1-score   support

           0       0.66      0.40      0.49      6188
           1       0.51      0.31      0.38      5725
           2       0.50      0.47      0.48     14586
           3       0.58      0.59      0.58     37908
           4       0.58      0.68      0.63     62788
           5       0.72      0.66      0.69     52805

    accuracy                           0.61    180000
   macro avg       0.59      0.51      0.54    180000
weighted avg       0.62      0.61      0.61    180000



In [18]:
# Attach the predicted class to every test row.
test['rating'] = test_data

# Build the two-column submission frame in one step. The intermediate
# variables were dropped: one of them was named `id`, which shadowed the
# Python builtin for the rest of the session.
df = pd.DataFrame({
    'review_id': test['review_id'].to_numpy(),
    'rating': test['rating'].to_numpy(),
})

In [19]:
# Write the submission (review_id, rating) without the index column.
# NOTE(review): the file name mentions "unet5" and "class_weights", while the
# model trained above is built from `resnet1.ResNet50` and no class weights are
# passed to `fit` -- confirm the name is still accurate.
df.to_csv('submission_unet5_embedding_class_weights_model.csv',index=False)

In [16]:
# Save the inner Keras model to the 'unet5' directory.
# NOTE(review): the target is named 'unet5' although the model was created with
# `resnet1.ResNet50`; this cell's execution count is also out of sequence with
# the cells above -- verify on a fresh Restart & Run All.
model.model.save('unet5')



INFO:tensorflow:Assets written to: unet5\assets


INFO:tensorflow:Assets written to: unet5\assets
