In [7]:
!pip install tensorflow kaggle numpy


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [8]:
# Import des bibliothèques nécessaires
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np
import pandas as pd

In [9]:
# Chargement des datasets
train_data = pd.read_csv("/Users/nathansornet/Documents/Cours/INGE_3/ESME/NLP/TP1_LSTM/cnn_dailymail/train.csv")
val_data = pd.read_csv("/Users/nathansornet/Documents/Cours/INGE_3/ESME/NLP/TP1_LSTM/cnn_dailymail/validation.csv")
test_data = pd.read_csv("/Users/nathansornet/Documents/Cours/INGE_3/ESME/NLP/TP1_LSTM/cnn_dailymail/test.csv")

In [10]:
# Préparer les données pour chaque ensemble
def prepare_data(data, tokenizer=None, fit_tokenizer=False):
    articles = data['article'].values
    summaries = data['highlights'].values
    
    if fit_tokenizer:
        tokenizer.fit_on_texts(articles)

    # Convertir les articles et les résumés en séquences
    articles_sequences = tokenizer.texts_to_sequences(articles)
    summaries_sequences = tokenizer.texts_to_sequences(summaries)
    
    # Compléter les séquences pour les rendre de longueur égale
    articles_padded = pad_sequences(articles_sequences, maxlen=max_len, padding='post')
    summaries_padded = pad_sequences(summaries_sequences, maxlen=max_len, padding='post')
    
    # Créer les étiquettes pour chaque mot (moment fort / non-moment fort)
    labels = np.zeros_like(articles_padded)
    for i, summary in enumerate(summaries_padded):
        for word in summary:
            if word != 0:
                labels[i, np.where(articles_padded[i] == word)] = 1
    
    return articles_padded, labels

In [11]:
# Définir les paramètres du modèle
vocab_size = 20000  # Nombre maximum de mots à garder
max_len = 100  # Longueur maximale des séquences
embedding_dim = 64

In [12]:

# Créer le tokenizer et préparer les données
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
X_train, y_train = prepare_data(train_data, tokenizer=tokenizer, fit_tokenizer=True)
X_val, y_val = prepare_data(val_data, tokenizer=tokenizer)
X_test, y_test = prepare_data(test_data, tokenizer=tokenizer)

In [13]:
# Construction du modèle LSTM
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_len),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])




In [14]:
# Entraînement du modèle avec validation
batch_size = 32
epochs = 5
early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

# Entraînement du modèle avec validation et early stopping
model.fit(X_train[:5000], y_train[:5000],  # Utilisez un sous-échantillon des données (par exemple, 5000 échantillons)
          epochs=epochs,
          batch_size=batch_size,
          validation_data=(X_val, y_val),
          callbacks=[early_stopping])

Epoch 1/5
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 65ms/step - accuracy: 0.7532 - loss: 0.5472 - val_accuracy: 0.8135 - val_loss: 0.4360
Epoch 2/5
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 75ms/step - accuracy: 0.8188 - loss: 0.4183 - val_accuracy: 0.8135 - val_loss: 0.4324
Epoch 3/5
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 75ms/step - accuracy: 0.8206 - loss: 0.4014 - val_accuracy: 0.8112 - val_loss: 0.4382
Epoch 4/5
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 77ms/step - accuracy: 0.8241 - loss: 0.3904 - val_accuracy: 0.8105 - val_loss: 0.4442


<keras.src.callbacks.history.History at 0x3626a5ee0>

In [15]:

# Évaluation du modèle sur l'ensemble de test
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Loss: {loss}, Accuracy: {accuracy}")

[1m360/360[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.8155 - loss: 0.4285
Loss: 0.42816779017448425, Accuracy: 0.8156216144561768


In [16]:
# Exemple d'utilisation du modèle
test_article = "Texte d'exemple pour identifier les moments forts."
test_sequence = tokenizer.texts_to_sequences([test_article])
test_padded = pad_sequences(test_sequence, maxlen=max_len, padding='post')

In [17]:
predictions = model.predict(test_padded)
predicted_moments = (predictions > 0.5).astype(int)

print("Moments forts prédits:", predicted_moments)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 108ms/step
Moments forts prédits: [[[1]
  [1]
  [0]
  [1]
  [0]
  [0]
  [1]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]
  [0]]]


In [18]:
# Exemple d'article à tester
test_article = "Example text to identify highlights. This project uses machine learning to extract key information from texts. Highlights represent the most important sentences or parts."

# Convertir le texte d'exemple en séquence
test_sequence = tokenizer.texts_to_sequences([test_article])
test_padded = pad_sequences(test_sequence, maxlen=max_len, padding='post')

# Prédire les moments forts
predictions = model.predict(test_padded)[0]
predicted_moments = (predictions > 0.5).astype(int)  # Moments forts là où la probabilité est > 0.5

# Extraire et afficher les moments forts
tokenizer.index_word = {index: word for word, index in tokenizer.word_index.items() if index < vocab_size}

# Reconstruire les mots de l'article et identifier les moments forts
highlighted_text = []
for i, pred in enumerate(predicted_moments):
    if pred == 1 and test_padded[0][i] != 0:
        highlighted_text.append(tokenizer.index_word[test_padded[0][i]])
    elif test_padded[0][i] != 0:
        highlighted_text.append(f"({tokenizer.index_word[test_padded[0][i]]})")

highlighted_text = ' '.join(highlighted_text)
print("Texte avec moments forts :", highlighted_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
Texte avec moments forts : (example) (text) to (identify) (highlights) (this) (project) (uses) (machine) (learning) to (extract) (key) (information) (from) (texts) (highlights) (represent) the (most) (important) (sentences) (or) (parts)
