In [1]:
# Import des librairies
import numpy as np
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
# Charger les données
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=10000)

In [3]:
# Obtenir le mapping inverse des index vers les mots
word_index = imdb.get_word_index()
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

In [4]:
# Convertir les séquences d'indices en textes
x_train_texts = [' '.join([reverse_word_index.get(i - 3, '?') for i in sequence]) for sequence in x_train]
x_test_texts = [' '.join([reverse_word_index.get(i - 3, '?') for i in sequence]) for sequence in x_test]

In [5]:
# Appliquer la tokenization
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(x_train_texts)
x_train_sequences = tokenizer.texts_to_sequences(x_train_texts)
x_test_sequences = tokenizer.texts_to_sequences(x_test_texts)

In [6]:
# Assurer que toutes vos séquences ont la même longueur
max_len = 100
x_train_padded = pad_sequences(x_train_sequences, maxlen=max_len)
x_test_padded = pad_sequences(x_test_sequences, maxlen=max_len)

In [7]:
# Charger les embeddings GloVe
embeddings_index = {}
glove_path = 'glove.6B.50d.txt'  # Assurez-vous de spécifier le bon chemin
with open(glove_path, encoding='utf-8') as file:
    for line in file:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [8]:
# Créer la matrice d'embedding
num_words = min(10000, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, 50))

for word, i in word_index.items():
    if i < num_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [9]:
# Construire le modèle avec une couche d'embedding
model = Sequential()
model.add(Embedding(num_words, 50, input_length=max_len, weights=[embedding_matrix], trainable=True))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [10]:
# Compiler le modèle
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [11]:
# Entraîner le modèle
model.fit(x_train_padded, np.array(y_train), epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x243dbcd1190>

In [16]:
# Évaluer le modèle sur les données de test
y_pred_prob = model.predict(x_test_padded)
y_pred = np.argmax(y_pred_prob, axis=1)



In [18]:
# Évaluer le modèle sur les données de test
y_pred_prob = model.predict(x_test_padded)
y_pred = (y_pred_prob > 0.5).astype(int)  # Convertir les probabilités en classes binaires (0 ou 1)

# Afficher la matrice de confusion
conf_matrix = confusion_matrix(y_test, y_pred)
print("Matrice de confusion:")
print(conf_matrix)

# Calculer et afficher le score de précision
acc_score = accuracy_score(y_test, y_pred)
print(f"Accuracy Score: {acc_score}")

Matrice de confusion:
[[10078  2422]
 [ 2350 10150]]
Accuracy Score: 0.80912
