---
# Modèle permettant de classifier des reviews de films, avec k-fold
---

# IMDB dataset importing

In [17]:
# Libraries importing
from keras.datasets import imdb
import tensorflow as tf

In [18]:
# num_words = dictionary length: only the 4000 most frequent words keep their own index
train_split, test_split = imdb.load_data(num_words = 4000)
x_train, y_train = train_split
x_test, y_test = test_split

In [19]:
# Reserved indices in the encoded reviews:
# 0 = padding token
# 1 = start-of-review marker ("start_char" parameter)
# 2 = words that aren't in the dictionary ("oov_char" parameter)

# Dataset processing

In [20]:
# Give every review the same length: 200 tokens (word indices), padded or truncated as needed
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
x_train_padded, x_test_padded = (
    pad_sequences(reviews, maxlen = 200) for reviews in (x_train, x_test)
)

In [21]:
# Sanity check: after padding, both arrays should have 200 columns (one per token position)
x_train_padded.shape, x_test_padded.shape

((25000, 200), (25000, 200))

In [22]:
# Cut the padded test set at its midpoint: first half -> validation data, second half -> final test data
half = int(len(x_test_padded)/2)
x_data_val, x_data_test = x_test_padded[:half], x_test_padded[half:]
y_data_val, y_data_test = y_test[:half], y_test[half:]

In [23]:
# Sanity check: the validation and test halves should have the same shape
x_data_val.shape, x_data_test.shape

((12500, 200), (12500, 200))

# Definition of the model

In [24]:
# Libraries importing
# Keras layers
from keras.layers import Input, Embedding, MultiHeadAttention, LayerNormalization, Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Concatenate

# Keras model
from keras.models import Model, load_model

# To export a PNG image of the model
from keras.utils import plot_model

# Optimizers
from keras.optimizers import Adam

# Loss functions
from keras.losses import BinaryCrossentropy

# Metrics
from keras.metrics import BinaryAccuracy

# Kfold
from sklearn.model_selection import StratifiedKFold

# Callbacks
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Numpy
import numpy as np

In [None]:
# Dimension of the space into which the embedding layer projects each token
token_projection_dim = 32


# Builds, trains and checkpoints one classifier (called once per cross-validation fold)

def create_model(X, y, X_v, y_v, count, vocab_size = 4000, sequence_length = 200, epochs = 3, batch_size = 32) :
    """Build the hybrid Transformer / LSTM / CNN binary classifier and train it.

    The three branches share a single embedding layer; their outputs are
    concatenated and fed to one sigmoid unit.

    Args:
        X: training inputs, integer token indices of shape (n_samples, sequence_length).
        y: binary training labels.
        X_v: validation inputs, same layout as X.
        y_v: binary validation labels.
        count: identifier appended to the checkpoint file name
            ("model/my_best_model<count>.keras"), e.g. the fold number.
        vocab_size: size of the embedding vocabulary; every token index in the
            data must be smaller than this value.
        sequence_length: number of tokens per (padded) review.
        epochs: maximum number of training epochs.
        batch_size: mini-batch size used during training.

    Returns:
        The compiled Keras model in its state after the last epoch that was run.
        The weights of the best epoch (highest validation binary accuracy) are
        the ones written to the checkpoint file, not necessarily the ones returned.
    """

    callbacks = [
        # Stop training when validation binary accuracy has not improved for 2 epochs
        # (with the default epochs = 3 this can only trigger at the very last epoch)
        EarlyStopping(monitor = "val_binary_accuracy", patience = 2, mode = 'max'),
        # Save the model to disk each time validation binary accuracy reaches a new best
        ModelCheckpoint(
            filepath = "model/my_best_model" + str(count) + ".keras",
            monitor = "val_binary_accuracy",
            mode = "max",
            save_best_only = True,
            verbose = 1
        )
    ]

    input_layer = Input(shape = (sequence_length,), dtype = 'int64')

    # The sequence length is already fixed by the Input layer, so the deprecated
    # `input_length` argument is not passed to Embedding
    embedding_layer = Embedding(input_dim = vocab_size, output_dim = token_projection_dim)(input_layer)


    # Transformer branch

    # Self-attention: the embedded sequence is used as query, value and key
    MHA_layer_T = MultiHeadAttention(num_heads = 2, dropout = 0.1, key_dim = token_projection_dim)(embedding_layer, embedding_layer, embedding_layer)

    # Residual connection + normalization around the attention layer
    normalization_layer1_T = LayerNormalization(epsilon = 1e-6)(embedding_layer + MHA_layer_T)

    # Position-wise feed-forward block (two dense layers)
    dense_layer1_T = Dense(token_projection_dim, activation = "tanh")(normalization_layer1_T)

    dense_layer2_T = Dense(token_projection_dim, activation = "tanh")(dense_layer1_T)

    # Residual connection + normalization around the feed-forward block
    normalization_layer2_T = LayerNormalization(epsilon = 1e-6)(normalization_layer1_T + dense_layer2_T)

    flatten_layer_T = Flatten()(normalization_layer2_T)


    # LSTM branch

    LSTM_layer_LSTM = LSTM(5)(embedding_layer)

    dropout_layer_LSTM = Dropout(0.5)(LSTM_layer_LSTM)


    # CNN branch

    # No `input_shape` here: in the functional API the shape comes from the tensor
    # the layer is called on (the previous hint of (200, 64) did not even match the
    # embedding width and only produced a Keras warning)
    conv_layer_CNN = Conv1D(16, 3)(embedding_layer)

    dropout_layer_CNN = Dropout(0.5)(conv_layer_CNN)

    pooling_layer_CNN = MaxPooling1D(2)(dropout_layer_CNN)

    flatten_layer_CNN = Flatten()(pooling_layer_CNN)


    # Merge the three branches

    concatenate_layer = Concatenate()([flatten_layer_T, dropout_layer_LSTM, flatten_layer_CNN])


    # Output: one sigmoid unit for the binary label

    output_layer = Dense(1, activation = "sigmoid")(concatenate_layer)

    model = Model(inputs = input_layer, outputs = output_layer)

    model.compile(optimizer = Adam(learning_rate = 0.001), loss = BinaryCrossentropy(), metrics = [BinaryAccuracy()])

    # Model fitting; the callbacks above handle early stopping and checkpointing
    model.fit(X, y, batch_size = batch_size, epochs = epochs, validation_data = (X_v, y_v), callbacks = callbacks)

    return model

# Kfold validation

In [34]:
# Split the training data into 3 stratified folds (class balance is preserved in each fold)
n_folds = 3
kfold = StratifiedKFold(n_splits = n_folds, shuffle = True, random_state = 0)

# Training histories of the fold models: one dict of per-epoch metric values per fold
history_list = []

for fold_index, (train_index, val_index) in enumerate(kfold.split(x_train_padded, y_train)) :
    # Train one model per fold; the held-out fold is used as validation data and
    # fold_index is passed on so that each fold gets its own checkpoint file
    model = create_model(
        x_train_padded[train_index], y_train[train_index],
        x_train_padded[val_index], y_train[val_index],
        fold_index
    )
    # create_model does not return the result of fit(); Keras also attaches the
    # History callback to the model, so the metric curves are collected from there
    history_list.append(model.history.history)

Epoch 1/3


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m520/521[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 53ms/step - binary_accuracy: 0.5767 - loss: 0.7537
Epoch 1: val_binary_accuracy improved from -inf to 0.82217, saving model to model/my_best_model0.keras
[1m521/521[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 60ms/step - binary_accuracy: 0.5770 - loss: 0.7531 - val_binary_accuracy: 0.8222 - val_loss: 0.3809
Epoch 2/3
[1m520/521[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 48ms/step - binary_accuracy: 0.9163 - loss: 0.2070
Epoch 2: val_binary_accuracy improved from 0.82217 to 0.83093, saving model to model/my_best_model0.keras
[1m521/521[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 54ms/step - binary_accuracy: 0.9163 - loss: 0.2070 - val_binary_accuracy: 0.8309 - val_loss: 0.4050
Epoch 3/3
[1m520/521[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 48ms/step - binary_accuracy: 0.9801 - loss: 0.0676
Epoch 3: val_binary_accuracy improved from 0.83093 to 0.84833, saving model to model/m

On récupère l'ensemble des meilleurs modèles dans une liste

In [35]:
# Reload the best checkpoint saved for each fold, in fold order
all_model = [
    load_model("model/my_best_model" + str(fold_number) + ".keras")
    for fold_number in range(n_folds)
]

# Prédiction sur les données de tests avec les modèles chargés

In [36]:
# Predicted probabilities of every fold model on the test data
fold_predictions = [all_model[k].predict(x_data_test) for k in range(n_folds)]

# Add the predictions up in fold order, then average and round to a 0/1 label
summed_predictions = sum(fold_predictions[1:], fold_predictions[0])
model_predict_moyenne = np.round(summed_predictions / n_folds)

# Each wrong label contributes |prediction - truth| = 1, so the sum counts the errors
wrong_count = np.sum(np.abs(model_predict_moyenne[:, 0] - y_data_test))
error_rate = wrong_count / len(y_data_test)

print("accuracy : ", 1 - error_rate)

[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 16ms/step
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 15ms/step
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 15ms/step
accuracy :  0.87016


On obtient une accuracy de 87 % sur les données de test, ce qui est supérieur à l'accuracy obtenue sans k-fold. On peut donc en déduire que moyenner les prédictions des modèles entraînés sur les différents folds permet d'obtenir un meilleur modèle.