In [1]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
import os
from tqdm.notebook import tqdm
from Model import *
from sklearn.metrics import classification_report
import tensorflow_addons as tfa
from keras.utils import io_utils

In [3]:
# Load the Goodreads train/test splits and the pre-built vocabulary array.
train = pd.read_csv("../../dataset/goodreads_train.csv")
test = pd.read_csv("../../dataset/goodreads_test.csv")
# SECURITY: allow_pickle=True unpickles arbitrary objects and can execute code;
# only load vocabulary files that come from a trusted source.
vocabulary = np.load('../../vocabulaires/voc_without_std_word_count_5.npy', allow_pickle=True)

# Strip every character that is neither a word character nor whitespace.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently turn this into a no-op; the raw
# string avoids invalid-escape warnings for \w and \s.
# The same cleaning is applied to `test` so the model sees identically
# preprocessed text at training and prediction time.
PUNCTUATION_PATTERN = r'[^\w\s]'
train['review_text'] = train['review_text'].str.replace(PUNCTUATION_PATTERN, '', regex=True)
test['review_text'] = test['review_text'].str.replace(PUNCTUATION_PATTERN, '', regex=True)

In [None]:
# Model wrapper modules to train; the training loop instantiates each entry via
# `<entry>.Model(vocabulary)`. `mlp1` is presumably exported by
# `from Model import *` — TODO confirm.
# NOTE(review): the saved output below this cell is for a model named "unet3",
# so it was produced with a different `model_list` than the one shown here.
model_list = [mlp1]

def scheduler(epoch, lr):
    """Learning-rate schedule: hold the rate, then decay it exponentially.

    Args:
        epoch: Epoch index passed in by the caller.
        lr: Current learning rate.

    Returns:
        ``lr`` unchanged while ``epoch < 4``; otherwise ``lr`` multiplied by
        ``tf.math.exp(-0.1)``.
    """
    in_hold_phase = epoch < 4
    return lr if in_hold_phase else lr * tf.math.exp(-0.1)


# Every model in `model_list` is trained once per seed.
seeds = [42,52]

for seed in seeds:
    keras.utils.set_random_seed(seed)
    for model_obj in model_list:

        model = model_obj.Model(vocabulary)
        model.model.compile(
            optimizer=keras.optimizers.Adamax(learning_rate=0.001),
            loss=keras.losses.categorical_crossentropy,
            metrics=[
                keras.metrics.categorical_accuracy,
                tfa.metrics.F1Score(num_classes=6, average='weighted'),
            ],
        )
        print(model.name)
        # summary() prints the table itself and returns None, so wrapping it
        # in print() only appended a stray "None" line.
        model.model.summary()

        # Create the output directories before any callback touches them.
        # makedirs(exist_ok=True) also creates the missing "logs"/"checkpoint"
        # parents, which the previous exists()+mkdir() pair could not do.
        # NOTE(review): the paths depend only on model.name, so a later seed
        # overwrites the logs/checkpoint of an earlier seed — confirm intended.
        log_dir = f"logs/{model.name}"
        checkpoint_dir = f"checkpoint/{model.name}"
        os.makedirs(log_dir, exist_ok=True)
        os.makedirs(checkpoint_dir, exist_ok=True)

        # Keep only the weights of the epoch with the highest validation F1.
        chekpoint = keras.callbacks.ModelCheckpoint(
            f'checkpoint/{model.name}/',
            save_weights_only=True,
            monitor='val_f1_score',
            mode='max',
            save_best_only=True,
        )

        tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=f"logs/{model.name}")

        lr_callback = keras.callbacks.LearningRateScheduler(scheduler, verbose=0)

        model.run_experiment(
            [train['review_text'], train['n_comments'], train['n_votes']],
            train['rating'],
            epochs=8,
            callbacks=[lr_callback, chekpoint, tensorboard_callback],
            batch_size=60,
            validation_split=0.2,
        )

unet3
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 text_vectorization (TextVector  (None, 512)         0           ['input_1[0][0]']                
 ization)                                                                                         
                                                                                                  
 embedding (Embedding)          (None, 512, 300)     32849700    ['text_vectorization[0][0]']     
                                                                                                  
 conv1d (Conv1D)                (None, 512, 64)      57664       ['embedding[0][0]']    

In [9]:
# Restore saved weights into `model`, which in the visible cells is the object
# left over from the last iteration of the training loop.
# NOTE(review): the training cell checkpoints to f'checkpoint/{model.name}/'
# (trailing slash) while this loads the prefix 'checkpoint/unet1' — confirm this
# is the checkpoint that matches the current model architecture.
model.model.load_weights('checkpoint/unet1')

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x1c3d44bac50>

In [10]:
# Model outputs for every row of the training frame; the inputs are passed in
# the same column order that was used when calling run_experiment.
res = model.model.predict(
    [train[column] for column in ('review_text', 'n_comments', 'n_votes')]
)



In [11]:
# Model outputs for every row of the test frame, using the same three input
# columns as for the training frame.
restest = model.model.predict(
    [test[column] for column in ('review_text', 'n_comments', 'n_votes')]
)



In [12]:
# Convert each row of per-class scores into the index of its highest score.
# np.argmax returns the first index on ties, matching the strict ">" comparison
# of the previous hand-written loops, but runs vectorised instead of iterating
# over every row in Python and no longer hard-codes 6 classes or assumes that
# scores are greater than -2.
test_data = np.argmax(restest, axis=1)
train_data = np.argmax(res, axis=1)

  0%|          | 0/478033 [00:00<?, ?it/s]

  0%|          | 0/900000 [00:00<?, ?it/s]

In [13]:
# Per-class precision / recall / F1 of the predicted classes against the
# ratings in the training frame.
# NOTE(review): `train` is the same frame that was passed to run_experiment,
# so these scores are not a held-out estimate.
train_report = classification_report(y_true=train['rating'], y_pred=train_data)
print(train_report)

              precision    recall  f1-score   support

           0       0.67      0.31      0.42     30988
           1       0.53      0.43      0.48     28718
           2       0.57      0.50      0.53     72627
           3       0.65      0.65      0.65    188972
           4       0.64      0.71      0.67    313688
           5       0.75      0.73      0.74    265007

    accuracy                           0.67    900000
   macro avg       0.63      0.56      0.58    900000
weighted avg       0.67      0.67      0.66    900000



In [14]:
# Attach the predicted classes to the test frame (kept for any later cell that
# reads test['rating']).
test['rating'] = test_data

# Build the two-column submission frame in a single constructor call. Going
# through .to_numpy() drops the original index so the result gets a fresh
# 0..n-1 index, and no global named `id` is created to shadow the builtin.
df = pd.DataFrame({
    'review_id': test['review_id'].to_numpy(),
    'rating': test['rating'].to_numpy(),
})

In [15]:
# Write the submission frame to CSV without the index column.
# NOTE(review): the file name mentions "unet2" while the weights above were
# loaded from 'checkpoint/unet1' — confirm the name matches the model used.
df.to_csv('submission_unet2_embedding_class_weights_model.csv',index=False )


In [16]:
# Save the model under the path 'unet2'; the log output below reports the
# assets being written to unet2\assets.
model.model.save('unet2')



INFO:tensorflow:Assets written to: unet2\assets


INFO:tensorflow:Assets written to: unet2\assets
