In [19]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [20]:
# STEP 1: Extract ZIP
import zipfile

with zipfile.ZipFile("fra-eng.zip", 'r') as zip_ref:
    zip_ref.extractall()

print("Extraction complete.")

Extraction complete.


In [21]:
# STEP 3: Load and Clean Data
def load_data(path='fra.txt', num_examples = 50000):
    with open(path, encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    
    pairs = [line.split('\t')[:2] for line in lines[:num_examples]]
    
    input_texts = []
    target_texts = []
    for eng, fra in pairs:
        input_texts.append(eng)
        target_texts.append(f"<start> {fra} <end>")
    
    return input_texts, target_texts

input_texts, target_texts = load_data()

In [22]:
# STEP 4: Tokenize
input_tokenizer = Tokenizer(filters='')
target_tokenizer = Tokenizer(filters='')

input_tokenizer.fit_on_texts(input_texts)
target_tokenizer.fit_on_texts(target_texts)

input_seqs = input_tokenizer.texts_to_sequences(input_texts)
target_seqs = target_tokenizer.texts_to_sequences(target_texts)


In [23]:
# STEP 5: Pad sequences
input_tensor = pad_sequences(input_seqs, padding='post')
target_tensor = pad_sequences(target_seqs, padding='post')

print("Input tensor shape:", input_tensor.shape)
print("Target tensor shape:", target_tensor.shape)


Input tensor shape: (50000, 7)
Target tensor shape: (50000, 14)


In [24]:
# STEP 7: Vocabulary Sizes
input_vocab_size = len(input_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1

print("Input vocab size:", input_vocab_size)
print("Target vocab size:", target_vocab_size)


Input vocab size: 9130
Target vocab size: 17458


task 4 

In [25]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense


In [26]:
embedding_dim = 256
units = 512

input_vocab_size = len(input_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1


In [27]:
# Encoder
encoder_inputs = Input(shape=(None,))
enc_emb = Embedding(input_vocab_size, embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)

# Save encoder states to initialize decoder
encoder_states = [state_h, state_c]


In [28]:
# Decoder
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(target_vocab_size, embedding_dim)
dec_emb = dec_emb_layer(decoder_inputs)

decoder_lstm = LSTM(units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)

decoder_dense = Dense(target_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)


In [29]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.summary()


Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_5 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_6 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, None, 256)    2337280     ['input_5[0][0]']                
                                                                                                  
 embedding_3 (Embedding)        (None, None, 256)    4469248     ['input_6[0][0]']                
                                                                                            

In [30]:
# Remove the first <start> token from target and prepare as labels
decoder_target_data = target_tensor[:, 1:]


In [None]:
batch_size = 64
epochs = 10

model.fit(
    [input_tensor, target_tensor[:, :-1]],
    tf.expand_dims(decoder_target_data, -1),
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2
)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1df9aab0d30>

task 5


In [44]:
# Encoder model for inference (takes input sentence, returns hidden states)
encoder_model = Model(encoder_inputs, encoder_states)


In [45]:
# Decoder inputs
decoder_state_input_h = Input(shape=(units,))
decoder_state_input_c = Input(shape=(units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Decoder embedding
dec_emb_inf = dec_emb_layer(decoder_inputs)

# LSTM with previous states
decoder_outputs_inf, state_h_inf, state_c_inf = decoder_lstm(
    dec_emb_inf, initial_state=decoder_states_inputs)

decoder_states_inf = [state_h_inf, state_c_inf]
decoder_outputs_inf = decoder_dense(decoder_outputs_inf)

# Full decoder inference model
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs_inf] + decoder_states_inf)

In [46]:
target_idx_word = {i: w for w, i in target_tokenizer.word_index.items()}

def decode_sequence(input_seq):
    # Encode the input
    states_value = encoder_model.predict(input_seq)

    # Start with <start> token
    target_seq = np.array([[target_tokenizer.word_index['<start>']]])

    decoded_sentence = ''
    stop_condition = False
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample the token with highest probability (greedy search)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = target_idx_word.get(sampled_token_index, '')

        if sampled_word == '<end>' or len(decoded_sentence.split()) > 50:
            stop_condition = True
        else:
            decoded_sentence += ' ' + sampled_word

        # Update the target sequence and states
        target_seq = np.array([[sampled_token_index]])
        states_value = [h, c]

    return decoded_sentence.strip()


In [48]:
# Pick 5 random sentences to test
for i in range(20):
    input_seq = input_tensor[i:i+1]
    decoded = decode_sequence(input_seq)
    
    print(f"Input    : {input_texts[i]}")
    print(f"Predicted: {decoded}")
    print(f"Target   : {target_texts[i].replace('<start>', '').replace('<end>', '').strip()}")
    print("-" * 50)


Input    : Go.
Predicted: marche.
Target   : Va !
--------------------------------------------------
Input    : Go.
Predicted: marche.
Target   : Marche.
--------------------------------------------------
Input    : Go.
Predicted: marche.
Target   : En route !
--------------------------------------------------
Input    : Go.
Predicted: marche.
Target   : Bouge !
--------------------------------------------------
Input    : Hi.
Predicted: salut !
Target   : Salut !
--------------------------------------------------
Input    : Hi.
Predicted: salut !
Target   : Salut.
--------------------------------------------------
Input    : Run!
Predicted: cours !
Target   : Cours !
--------------------------------------------------
Input    : Run!
Predicted: cours !
Target   : Courez !
--------------------------------------------------
Input    : Run!
Predicted: cours !
Target   : Prenez vos jambes à vos cous !
--------------------------------------------------
Input    : Run!
Predicted: cours !
Tar