---
# Modèle permettant de classifier des reviews de films, sans k-fold
---

# IMDB dataset importing

In [1]:
# Libraries importing
from keras.datasets import imdb
import tensorflow as tf

In [2]:
# Load the IMDB reviews already encoded as sequences of word indices.
# num_words = dictionary size: only the 4000 most frequent words are kept,
# every rarer word is replaced by the out-of-vocabulary index.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words = 4000)

In [3]:
# Reserved indices in the encoded reviews:
# 0 = padding token
# 1 = start-of-review marker ("start_char" parameter)
# 2 = word that is not in the dictionary ("oov_char" parameter)

# Dataset processing

In [4]:
# Force every review to exactly 200 tokens (word indices, not characters).
# With the defaults of pad_sequences, shorter reviews are zero-padded at the
# front and longer ones are truncated at the front (the last 200 tokens are kept).
x_train_padded = tf.keras.preprocessing.sequence.pad_sequences(x_train, maxlen = 200)
x_test_padded = tf.keras.preprocessing.sequence.pad_sequences(x_test, maxlen = 200)

In [5]:
# Sanity check: after padding, both splits must have 200 tokens per review
x_train_padded.shape, x_test_padded.shape

((25000, 200), (25000, 200))

In [6]:
# Split the padded test set into two halves: the first half is used as
# validation data during training, the second half is held out for the final
# evaluation of the model.
# The midpoint is computed once, with integer division, so that it can be used
# directly as a slice index.
n_validation = len(x_test_padded) // 2
x_data_val, y_data_val = x_test_padded[:n_validation], y_test[:n_validation]
x_data_test, y_data_test = x_test_padded[n_validation:], y_test[n_validation:]

In [7]:
# Sanity check: the validation and test halves should have the same shape
x_data_val.shape, x_data_test.shape

((12500, 200), (12500, 200))

# Definition of the model

In [8]:
# Libraries importing
# Keras layers
from keras.layers import Input, Embedding, MultiHeadAttention, LayerNormalization, Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Concatenate

# Keras model
from keras.models import Model

# To export a PNG image of the model
from keras.utils import plot_model

# Optimizers
from keras.optimizers import Adam

# Loss functions
from keras.losses import BinaryCrossentropy

# Metrics
from keras.metrics import BinaryAccuracy

In [9]:
# Hyper-parameters of the model

# Dimension of the space the tokens are projected into by the embedding layer
token_projection_dim = 32
# Number of tokens per review; must match the `maxlen` used for padding
sequence_length = 200
# Size of the dictionary; must match the `num_words` used to load the dataset
vocabulary_size = 4000


input_layer = Input(shape = (sequence_length,), dtype = 'int64')

# The sequence length is already carried by `input_layer`, so the deprecated
# `input_length` argument is not passed.
embedding_layer = Embedding(input_dim = vocabulary_size, output_dim = token_projection_dim)(input_layer)


# Transformer branch

# Self-attention layer: query, value and key are all the embedded review
MHA_layer_T = MultiHeadAttention(num_heads = 2, dropout = 0.1, key_dim = token_projection_dim)(embedding_layer, embedding_layer, embedding_layer)

# Residual connection around the attention layer, followed by a normalization
normalization_layer1_T = LayerNormalization(epsilon = 1e-6)(embedding_layer + MHA_layer_T)

# Two dense layers applied on the last axis (token_projection_dim)
dense_layer1_T = Dense(token_projection_dim, activation = "tanh")(normalization_layer1_T)

dense_layer2_T = Dense(token_projection_dim, activation = "tanh")(dense_layer1_T)

# Residual connection around the dense layers, followed by a normalization
normalization_layer2_T = LayerNormalization(epsilon = 1e-6)(normalization_layer1_T + dense_layer2_T)

flatten_layer_T = Flatten()(normalization_layer2_T)


# LSTM branch

# Only the last output of the LSTM (5 units) is kept
LSTM_layer_LSTM = LSTM(5)(embedding_layer)

dropout_layer_LSTM = Dropout(0.5)(LSTM_layer_LSTM)


# CNN branch

# No `input_shape` here: in the functional API the shape is inferred from
# `embedding_layer`, and passing it only triggers a Keras warning.
# NOTE(review): no activation is set, so this convolution is linear - confirm
# that this is intended.
conv_layer_CNN = Conv1D(filters = 16, kernel_size = 3)(embedding_layer)

dropout_layer_CNN = Dropout(0.5)(conv_layer_CNN)

pooling_layer_CNN = MaxPooling1D(pool_size = 2)(dropout_layer_CNN)

flatten_layer_CNN = Flatten()(pooling_layer_CNN)


# Concatenation of the three branches into a single feature vector

concatenate_layer = Concatenate()([flatten_layer_T, dropout_layer_LSTM, flatten_layer_CNN])


# Output: a single sigmoid unit (binary label)

output_layer = Dense(1, activation = "sigmoid")(concatenate_layer)

model = Model(inputs = input_layer, outputs = output_layer)

model.compile(optimizer = Adam(learning_rate = 0.001), loss = BinaryCrossentropy(), metrics = [BinaryAccuracy()])

# Model fitting
# The validation data is only used to monitor the loss and accuracy after each epoch.
history = model.fit(x_train_padded, y_train, batch_size = 32, epochs = 3, validation_data = (x_data_val, y_data_val))



Epoch 1/3


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 77ms/step - binary_accuracy: 0.6360 - loss: 0.6727 - val_binary_accuracy: 0.8561 - val_loss: 0.3320
Epoch 2/3
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 86ms/step - binary_accuracy: 0.9211 - loss: 0.2006 - val_binary_accuracy: 0.8301 - val_loss: 0.4144
Epoch 3/3
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 87ms/step - binary_accuracy: 0.9795 - loss: 0.0705 - val_binary_accuracy: 0.8435 - val_loss: 0.4600


On remarque qu'au bout d'une epoch, l'accuracy sur les données de validation était déjà de 85 %, et qu'elle n'a pas augmenté au cours de l'apprentissage. En revanche, l'accuracy sur les données d'entraînement augmente logiquement tout au long de l'apprentissage.
De plus, on remarque que la loss sur les données de validation augmente au cours de l'apprentissage : on fait face à du sur-apprentissage, il est donc nécessaire de ne pas augmenter le nombre d'epochs.

Je n'ai pas tracé de graphique pour afficher les courbes d'apprentissage en raison du faible nombre d'epochs : avec seulement trois points, ce ne serait pas très pertinent.

In [10]:
# Export a diagram of the model, with the shapes of each layer, to "model.png".
# The trailing ";" hides the return value in the notebook output.
# NOTE(review): plot_model presumably needs pydot and Graphviz installed - confirm.
plot_model(model, to_file = "model.png", show_shapes = True);

# Testing the model with test data

In [11]:
# Final evaluation on the held-out half of the test set (never seen during training).
# Returns the values in the order [loss, binary_accuracy].
model.evaluate(x_data_test, y_data_test)

[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 21ms/step - binary_accuracy: 0.8512 - loss: 0.4346


[0.43928125500679016, 0.849839985370636]