In [32]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.python.keras.callbacks import TensorBoard
#import tensorflow_addons as tfa
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from time import time
import os
from sklearn.utils.class_weight import compute_class_weight
#os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
#os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [33]:
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [34]:
#read train dataset
tf.random.set_seed(5)
print("open dataset")
train = pd.read_csv("../dataset/goodreads_train.csv")

open dataset


In [35]:
# Load a np_archive of review_text col of train dataset preprocess in main.py
train_prepro = pd.DataFrame(data=np.load(file="../vocabulaires/prepro_train_archive_PN_less.npy", allow_pickle=True), columns=['review_text'])['review_text']


In [36]:
# add review text col to train dataset
train['review_text'] = train_prepro

In [37]:
l2_rate = 0.00001
dropout_rate = 0.01
rating = keras.utils.to_categorical(train['rating'], num_classes=6)
#create model layers


inputs = keras.Input(shape=(1,), dtype=tf.string) # text
inputs2 = keras.Input(shape=1, dtype=tf.float32) # n_comment
inputs3 = keras.Input(shape=1, dtype=tf.float32) # n_votes
#create vectorize layer, to transform words in integer
vectorize_layer = keras.layers.TextVectorization(
    standardize='lower_and_strip_punctuation',
    split='whitespace',
    output_mode='int',
    output_sequence_length=1400,
    vocabulary=np.load('../vocabulaires/voc_lemm_without_NP.npy')
    )(inputs)


embedding = keras.layers.Embedding(453118, 64, batch_size=1000, embeddings_regularizer=keras.regularizers.l2(l2_rate))(vectorize_layer)


conv1 = keras.layers.Conv1D(32, 3, activation=keras.activations.relu, padding='same')(embedding)
"""pooling1 = keras.layers.MaxPooling1D(pool_size=5,padding="valid")(conv1)

conv2 = keras.layers.Conv1D(64, 9, activation=keras.activations.relu, padding='same')(pooling1)
pooling2 = keras.layers.MaxPooling1D(pool_size=10,padding="valid")(conv2)"""


flatten = keras.layers.Flatten()(conv1)


layer1 = keras.layers.Dense(128, activation=keras.activations.relu)(flatten)

layer2 = keras.layers.Dense(64, activation=keras.activations.relu)(layer1)

layer3 = keras.layers.Dense(32, activation=keras.activations.relu)(layer2)

layer4 = keras.layers.Dense(16, activation=keras.activations.relu)(layer3)

#layer3 = keras.layers.Dense(300, activation=tf.keras.activations.tanh,kernel_regularizer=tf.keras.regularizers.l2(l2_rate),bias_regularizer=tf.keras.regularizers.l2(l2_rate))(drop2)
#drop3 = keras.layers.Dropout(dropout_rate)(layer3)
conc = keras.layers.concatenate([layer4, inputs2,inputs3])
outputs = keras.layers.Dense(6, activation=keras.activations.sigmoid)(conc)

In [38]:
model = keras.Model(inputs=[inputs, inputs2, inputs3], outputs=outputs, name="mnist_model")
tensorboard = TensorBoard(log_dir="../logs/pmc8_3".format(time()))

In [42]:
#for learning_rate in learning_rates:
model.compile(optimizer=keras.optimizers.SGD(learning_rate=0.0001, momentum=0.5),
              loss=keras.losses.binary_crossentropy,
              metrics=[
                  keras.metrics.categorical_accuracy
                       ]
              )
model.summary()

Model: "mnist_model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_10 (InputLayer)          [(None, 1)]          0           []                               
                                                                                                  
 text_vectorization_3 (TextVect  (None, 1400)        0           ['input_10[0][0]']               
 orization)                                                                                       
                                                                                                  
 embedding_3 (Embedding)        (None, 1400, 64)     28999552    ['text_vectorization_3[0][0]']   
                                                                                                  
 conv1d_1 (Conv1D)              (None, 1400, 32)     6176        ['embedding_3[0][0]']  

In [43]:
class_weights = compute_class_weight(class_weight='balanced',classes= np.unique(train['rating']), y = train['rating'])
di = {}
for i in range(len(class_weights)):
    di[i] = class_weights[i]
#model.save("../models_trained/pmc8_3")

In [44]:
model.fit([train['review_text'], train['n_comments'], train['n_votes']], rating, epochs=2,
                  callbacks=[
                      tf.keras.callbacks.TensorBoard(log_dir="../logs/pmc8_3"),
                  ],
                  batch_size=1000, shuffle=True, validation_split=0.1, class_weight=di
                  )

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x243914c3070>

In [None]:
#model.save("../models_trained/pmc8_3_train")

In [51]:
#read test dataset
test = pd.read_csv("../dataset/goodreads_test.csv")

In [52]:
# Load a np_archive of review_text col of train dataset preprocess in main.py
test_prepro = pd.DataFrame(data=np.load(file="../vocabulaires/prepro_test_archive_NEG.csv.npy", allow_pickle=True), columns=['review_text'])['review_text']
test['review_text'] = test_prepro

In [53]:
# test the model with test dataset

In [54]:
restest = model.predict([test['review_text'], test['n_comments'], test['n_votes']])



In [18]:
# reverse keras.utils.to_categorical for kaggle submission
ff = []
for line in tqdm(restest):
    tmp = -2
    category = None
    for i in (range(6)):
        if line[i] > tmp:
            category = i
            tmp = line[i]
    ff.append(category)
data = np.array(ff)

  0%|          | 0/478033 [00:00<?, ?it/s]

In [19]:
test['rating'] = data

In [20]:
data

array([4, 5, 3, ..., 4, 4, 5])

In [21]:
# create a dataframe for kaggle
id = test['review_id'].to_numpy()
rating = test['rating'].to_numpy()
df = pd.DataFrame( columns=['review_id', 'rating'])

In [22]:
df['review_id'] = id
df['rating'] = rating

In [23]:
# create a csv for submission
df.to_csv('pmc9_model.csv',index=False)