In [1]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Input, Dense
import numpy as np

# Path of the tab-separated sentence-pair file opened by the data-loading cell.
file_input_path = 'fra.txt'
# Mini-batch size passed to model.fit.
batch_size = 64
# Number of training epochs passed to model.fit.
epochs = 100
# Number of units in the encoder and decoder LSTM layers (size of the latent state).
latent_dim = 256
# Upper bound on how many lines of the file are turned into samples.
num_samples = 10000


2024-09-21 10:07:48.039776: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-21 10:07:48.505013: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-21 10:07:48.946247: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-21 10:07:49.261501: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-21 10:07:49.352738: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-21 10:07:49.734648: I tensorflow/core/platform/cpu_feature_gu

In [2]:
# Vectorising the data: read the tab-separated sentence pairs and collect the
# set of characters that occurs on each side.

input_texts = []
target_texts = []
input_characters = set()
output_characters = set()

with open(file_input_path, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')

# The last element of `lines` is excluded (it is the empty string when the
# file ends with a newline), and at most `num_samples` lines are considered.
for line in lines[: min(num_samples, len(lines) - 1)]:
    # Only the first two tab-separated fields are used. Taking them by index
    # (instead of unpacking exactly three values) keeps the loop working for
    # files with or without a trailing attribution column.
    fields = line.split('\t')
    if len(fields) < 2:
        # Blank or malformed line: there is no sentence pair to use.
        continue
    input_text = fields[0]
    # We use "tab" as the "start sequence" character
    # for targets, and "\n" as the "end sequence" character.
    target_text = '\t' + fields[1] + '\n'
    input_texts.append(input_text)
    target_texts.append(target_text)

    # A set ignores duplicates, so no membership test is needed before adding.
    input_characters.update(input_text)
    output_characters.update(target_text)



In [3]:
# Turn each character set into a sorted list so every character has a fixed position.
input_characters = sorted(input_characters)
output_characters = sorted(output_characters)

# Number of unique tokens on the input (English) and target (French) side.
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(output_characters)

# Length of the longest input sentence and of the longest target sentence
# (target sentences already include their start/end marker characters).
max_encoder_seq_length = max(map(len, input_texts))
max_decoder_seq_length = max(map(len, target_texts))


In [4]:
# Map every character to its position in the corresponding sorted vocabulary list.
input_token_index = {char: i for i, char in enumerate(input_characters)}
target_token_index = {char: i for i, char in enumerate(output_characters)}

# Each comprehension pairs a character (key) with its enumerate() index (value).

In [5]:
# Zero-filled float32 tensors indexed as (sample, timestep, token).
encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, num_encoder_tokens),
    dtype='float32',
)
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32',
)

# The target tensor has the same shape and dtype as the decoder input tensor.
decoder_target_data = np.zeros_like(decoder_input_data)

In [6]:
# The three values used as the shape of encoder_input_data.
len(input_texts), max_encoder_seq_length, num_encoder_tokens

(10000, 14, 70)

In [7]:
# The three values used as the shape of decoder_input_data and decoder_target_data.
(len(input_texts), max_decoder_seq_length, num_decoder_tokens)

(10000, 59, 91)

In [8]:
# Shape of the slice belonging to the first sample: (timesteps, tokens).
encoder_input_data[0].shape

(14, 70)

In [9]:
# Fill the one-hot tensors, one sample (row i) at a time.
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.
    # Pad every timestep after the last real character with the space token.
    # The offset comes from the text length rather than from the loop variable
    # `t`, which would be stale (left over from the previous sample) or
    # undefined if `input_text` were empty.
    encoder_input_data[i, len(input_text):, input_token_index[' ']] = 1.

    for t, char in enumerate(target_text):
        decoder_input_data[i, t, target_token_index[char]] = 1.
        if t > 0:
            # The decoder target is one timestep ahead of the decoder input
            # and therefore does not contain the start character.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.
    # Same space padding for both decoder tensors; the target is one step
    # shorter, so its padding starts one position earlier.
    decoder_input_data[i, len(target_text):, target_token_index[' ']] = 1.
    decoder_target_data[i, max(len(target_text) - 1, 0):, target_token_index[' ']] = 1.



In [10]:
# Encoder: takes a variable-length sequence of one-hot input vectors.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(latent_dim, return_state=True)

# The LSTM returns its output followed by the two state tensors. The output is
# not used further; the states are kept as a list for the decoder.
encoder_outputs, *encoder_states = encoder(encoder_inputs)
state_h, state_c = encoder_states

In [11]:
# Decoder: takes a variable-length sequence of one-hot target vectors and
# starts from the encoder states.
decoder_inputs = Input(shape=(None, num_decoder_tokens))

# The decoder LSTM returns the full output sequence plus its internal states.
# The states are dropped here during training; they matter for inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)

# Softmax over the target vocabulary at every timestep.
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)



In [12]:
# Model that turns encoder_input_data & decoder_input_data into decoder_target_data.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Run training.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

# Keep the object returned by fit() instead of letting it become the cell's
# repr output, so the per-epoch metrics stay available after training.
# The last 20% of the samples are held out for validation.
history = model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
)

Epoch 1/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 206ms/step - accuracy: 0.7053 - loss: 1.5324 - val_accuracy: 0.7053 - val_loss: 1.0773
Epoch 2/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 155ms/step - accuracy: 0.7467 - loss: 0.9547 - val_accuracy: 0.7174 - val_loss: 1.0125
Epoch 3/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 187ms/step - accuracy: 0.7629 - loss: 0.8581 - val_accuracy: 0.7574 - val_loss: 0.8654
Epoch 4/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 181ms/step - accuracy: 0.7861 - loss: 0.7583 - val_accuracy: 0.7761 - val_loss: 0.7737
Epoch 5/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 166ms/step - accuracy: 0.8053 - loss: 0.6757 - val_accuracy: 0.7963 - val_loss: 0.7159
Epoch 6/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 181ms/step - accuracy: 0.8172 - loss: 0.6286 - val_accuracy: 0.8056 - val_loss: 0.6760
Epoc

In [61]:
# Display the model object's repr as the cell output.
model

<Functional name=functional_1, built=True>