<a href="https://colab.research.google.com/github/JulesLscx/M2_DeepLearning/blob/main/seq2seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense
from sklearn.model_selection import train_test_split
import numpy as np
import os
import zipfile

### Configuration and Parameters

In [27]:
BATCH_SIZE = 128
EPOCHS = 30
LATENT_DIM = 256
DATA_PATH = '/content/fra.txt'
NUM_SAMPLES = 20000 # Limit dataset size for faster training

### Load and Preprocess Data

In [28]:
input_texts = []
target_texts = []

with open(DATA_PATH, "r", encoding="utf-8") as f:
    lines = f.read().split("\n")

for line in lines[:-1]:  # [:-1] pour ignorer la dernière ligne (souvent vide)
    parts = line.split("\t")
    if len(parts) >= 2:
        input_text = "\t" + parts[0] + "\n"
        target_text = "\t" + parts[1] + "\n"
        input_texts.append(input_text)
        target_texts.append(target_text)

# Limit the number of samples for faster training
input_texts = input_texts[:NUM_SAMPLES]
target_texts = target_texts[:NUM_SAMPLES]

print(f"Nombre total d'échantillons : {len(input_texts)}")
print(f"Exemple d'entrée: {input_texts[0]}")
print(f"Exemple de cible: {target_texts[0].strip()}")

Nombre total d'échantillons : 20000
Exemple d'entrée: 	Go.

Exemple de cible: Va !


### Data Generator

In [29]:
input_characters = set()
target_characters = set()

for text in input_texts:
    for char in text:
        if char not in input_characters:
            input_characters.add(char)

for text in target_texts:
    for char in text:
        if char not in target_characters:
            target_characters.add(char)

input_characters = sorted(list(input_characters))
target_characters = sorted(list(target_characters))

num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)

max_encoder_seq_length = max([len(txt) for txt in input_texts])
max_decoder_seq_length = max([len(txt) for txt in target_texts])

print(f"Nombre de tokens d'entrée uniques : {num_encoder_tokens}")
print(f"Nombre de tokens de sortie uniques : {num_decoder_tokens}")
print(f"Longueur max de séquence d'entrée : {max_encoder_seq_length}")
print(f"Longueur max de séquence de sortie : {max_decoder_seq_length}")

# Dictionnaires de tokenisation
input_token_index = dict([(char, i) for i, char in enumerate(input_characters)])
target_token_index = dict([(char, i) for i, char in enumerate(target_characters)])

# Dictionnaires inversés pour l'inférence
reverse_input_char_index = dict((i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict((i, char) for char, i in target_token_index.items())

Nombre de tokens d'entrée uniques : 76
Nombre de tokens de sortie uniques : 102
Longueur max de séquence d'entrée : 21
Longueur max de séquence de sortie : 59


### Instantiate and Test Generator

In [30]:
input_train, input_val, target_train, target_val = train_test_split(
    input_texts, target_texts, test_size=0.2, random_state=42
)

print(f"Taille du set d'entraînement: {len(input_train)}")
print(f"Taille du set de validation: {len(input_val)}")

Taille du set d'entraînement: 16000
Taille du set de validation: 4000


In [31]:
def data_generator(input_texts_set, target_texts_set, batch_size):
    num_samples = len(input_texts_set)
    indices = np.arange(num_samples)

    while True:
        np.random.shuffle(indices)

        for start_idx in range(0, num_samples, batch_size):
            end_idx = min(start_idx + batch_size, num_samples)
            batch_indices = indices[start_idx:end_idx]
            current_batch_size = len(batch_indices)

            encoder_input_batch = np.zeros(
                (current_batch_size, max_encoder_seq_length, num_encoder_tokens), dtype="float32"
            )
            decoder_input_batch = np.zeros(
                (current_batch_size, max_decoder_seq_length, num_decoder_tokens), dtype="float32"
            )
            decoder_target_batch = np.zeros(
                (current_batch_size, max_decoder_seq_length, num_decoder_tokens), dtype="float32"
            )

            for i, data_idx in enumerate(batch_indices):
                input_text = input_texts_set[data_idx]
                target_text = target_texts_set[data_idx]

                for t, char in enumerate(input_text):
                    encoder_input_batch[i, t, input_token_index[char]] = 1.0

                for t, char in enumerate(target_text):
                    decoder_input_batch[i, t, target_token_index[char]] = 1.0
                    if t > 0:
                        decoder_target_batch[i, t - 1, target_token_index[char]] = 1.0

            yield ((encoder_input_batch, decoder_input_batch), decoder_target_batch)

In [32]:
# Définition de la signature de sortie du générateur
output_signature = (
    (
        tf.TensorSpec(shape=(None, max_encoder_seq_length, num_encoder_tokens), dtype=tf.float32),
        tf.TensorSpec(shape=(None, max_decoder_seq_length, num_decoder_tokens), dtype=tf.float32)
    ),
    tf.TensorSpec(shape=(None, max_decoder_seq_length, num_decoder_tokens), dtype=tf.float32)
)

# Création des datasets
train_ds = tf.data.Dataset.from_generator(
    lambda: data_generator(input_train, target_train, BATCH_SIZE),
    output_signature=output_signature
).prefetch(tf.data.AUTOTUNE)

val_ds = tf.data.Dataset.from_generator(
    lambda: data_generator(input_val, target_val, BATCH_SIZE),
    output_signature=output_signature
).prefetch(tf.data.AUTOTUNE)

# Calcul des étapes (steps)
steps_per_epoch = len(input_train) // BATCH_SIZE
validation_steps = len(input_val) // BATCH_SIZE

print(f"Steps per epoch: {steps_per_epoch}")
print(f"Validation steps: {validation_steps}")

Steps per epoch: 125
Validation steps: 31


In [33]:
# Encodeur
encoder_inputs = Input(shape=(None, num_encoder_tokens), name="encoder_input")
encoder_lstm = LSTM(LATENT_DIM, return_state=True, name="encoder_lstm")
encoder_outputs, state_h, state_c = encoder_lstm(encoder_inputs)
encoder_states = [state_h, state_c]

# Décodeur
decoder_inputs = Input(shape=(None, num_decoder_tokens), name="decoder_input")
decoder_lstm = LSTM(LATENT_DIM, return_sequences=True, return_state=True, name="decoder_lstm")
decoder_outputs, _, _ = decoder_lstm(
    decoder_inputs, initial_state=encoder_states
)
decoder_dense = Dense(num_decoder_tokens, activation="softmax", name="decoder_dense")
decoder_outputs = decoder_dense(decoder_outputs)

# Modèle d'entraînement
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.summary()

In [34]:
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"]
)

# Lancer l'entraînement
print("Début de l'entraînement...")
history = model.fit(
    train_ds,
    epochs=EPOCHS,
    steps_per_epoch=steps_per_epoch,
    validation_data=val_ds,
    validation_steps=validation_steps,
    verbose=1
)

print("Entraînement terminé.")

Début de l'entraînement...
Epoch 1/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 27ms/step - accuracy: 0.0579 - loss: 1.3064 - val_accuracy: 0.0592 - val_loss: 1.1547
Epoch 2/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 22ms/step - accuracy: 0.0616 - loss: 1.1461 - val_accuracy: 0.0686 - val_loss: 1.1344
Epoch 3/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 22ms/step - accuracy: 0.0723 - loss: 1.1215 - val_accuracy: 0.0799 - val_loss: 1.1082
Epoch 4/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 24ms/step - accuracy: 0.0857 - loss: 1.0928 - val_accuracy: 0.0958 - val_loss: 1.0581
Epoch 5/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 24ms/step - accuracy: 0.0888 - loss: 1.0878 - val_accuracy: 0.0997 - val_loss: 1.0379
Epoch 6/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 42ms/step - accuracy: 0.1028 - loss: 1.0214 - val_accuracy: 0.1150 - val_loss: 0.

In [35]:
# Modèle d'encodeur (inférence)
encoder_model = Model(encoder_inputs, encoder_states)
print("Modèle d'encodeur (inférence) créé.")

# Modèle de décodeur (inférence)
decoder_state_input_h = Input(shape=(LATENT_DIM,), name="decoder_state_h")
decoder_state_input_c = Input(shape=(LATENT_DIM,), name="decoder_state_c")
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs_inf, state_h_inf, state_c_inf = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs
)
decoder_states_inf = [state_h_inf, state_c_inf]
decoder_outputs_inf = decoder_dense(decoder_outputs_inf)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs_inf] + decoder_states_inf
)
print("Modèle de décodeur (inférence) créé.")

Modèle d'encodeur (inférence) créé.
Modèle de décodeur (inférence) créé.


In [36]:
def encode_input_string(input_text):
    """Encode un string d'entrée en one-hot pour l'inférence."""
    seq = np.zeros(
        (1, max_encoder_seq_length, num_encoder_tokens), dtype="float32"
    )
    for t, char in enumerate(input_text):
        if t >= max_encoder_seq_length:
            break
        if char in input_token_index:
            seq[0, t, input_token_index[char]] = 1.0
    return seq

def decode_sequence(input_seq_one_hot):
    states_value = encoder_model.predict(input_seq_one_hot, verbose=0)

    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, target_token_index["\t"]] = 1.0

    stop_condition = False
    decoded_sentence = ""

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value, verbose=0
        )

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]

        decoded_sentence += sampled_char

        if sampled_char == "\n" or len(decoded_sentence) > max_decoder_seq_length:
            stop_condition = True

        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.0

        states_value = [h, c]

    return decoded_sentence.strip()

In [37]:
print("\n--- Test de la fonction de traduction ---")

# Utilisation du set de validation pour les tests
for _ in range(10):
    i = np.random.randint(0, len(input_val))

    input_text_test = input_val[i]
    target_text_test = target_val[i].strip()

    input_seq_one_hot = encode_input_string(input_text_test)
    translated_sentence = decode_sequence(input_seq_one_hot)

    print("-" * 30)
    print(f"Input:     {input_text_test}")
    print(f"Target:    {target_text_test}")
    print(f"Predicted: {translated_sentence}")


--- Test de la fonction de traduction ---
------------------------------
Input:     	I study English.

Target:    J'étudie l'anglais.
Predicted: J'ai de le parte.
------------------------------
Input:     	You have to go.

Target:    Tu dois y aller.
Predicted: Tu es trai de parte.
------------------------------
Input:     	I have returned.

Target:    Je suis revenu.
Predicted: J'ai de prentent.
------------------------------
Input:     	I do not feel sad.

Target:    Je ne suis pas triste.
Predicted: Je ne suis pas partente.
------------------------------
Input:     	I love my home.

Target:    J'adore mon chez-moi.
Predicted: J'ai de parte de parte.
------------------------------
Input:     	Can he do it?

Target:    Peut-il le faire ?
Predicted: Puis-te le parte.
------------------------------
Input:     	No one was there.

Target:    Personne n'était là.
Predicted: Personne ne te parte.
------------------------------
Input:     	Make your move.

Target:    Faites votre déplacemen

In [41]:
text_to_translate = "My name is Jules"

In [42]:
def translate(text_to_translate):
    input_seq_one_hot = encode_input_string(text_to_translate)
    translated_sentence = decode_sequence(input_seq_one_hot)
    return translated_sentence

In [43]:
translate(text_to_translate)

'Tu es le lais.'