<a href="https://colab.research.google.com/github/Geetika2282/NLP-AI/blob/main/29_Seq2Seq__Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import numpy as np
import tensorflow
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping

### Sample data - English to French translation

In [9]:
english_sentences = ['hello', 'how are you', 'good morning', 'good night', 'thank you']
spanish_sentences = ['hola', 'cómo estás', 'buenos días', 'buenas noches', 'gracias']

### add START and END tokens to the French Sentences

In [10]:
spanish_sentences = ['starttoken '+sentence+' endtoken' for sentence in spanish_sentences]

# Hyperparameters
batch_size=2
epochs=100
latent_dim = 256  # Dimentionality of encoding space

In [11]:
spanish_sentences

['starttoken hola endtoken',
 'starttoken cómo estás endtoken',
 'starttoken buenos días endtoken',
 'starttoken buenas noches endtoken',
 'starttoken gracias endtoken']

### Initialize the Tokenizer for both source and target languages

In [15]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
eng_tokenizer = Tokenizer(char_level=False)
spa_tokenizer = Tokenizer(char_level=False)

### Fit the tokenizer on the sentences

In [16]:
# pass the tokenizer on sentences
eng_tokenizer.fit_on_texts(english_sentences)
spa_tokenizer.fit_on_texts(spanish_sentences)

In [17]:
eng_tokenizer.word_index

{'you': 1,
 'good': 2,
 'hello': 3,
 'how': 4,
 'are': 5,
 'morning': 6,
 'night': 7,
 'thank': 8}

In [18]:
spa_tokenizer.word_index

{'starttoken': 1,
 'endtoken': 2,
 'hola': 3,
 'cómo': 4,
 'estás': 5,
 'buenos': 6,
 'días': 7,
 'buenas': 8,
 'noches': 9,
 'gracias': 10}

### Convert the sentences into sequences of integers

In [20]:
encoder_input_data = eng_tokenizer.texts_to_sequences(english_sentences)
decoder_input_data = spa_tokenizer.texts_to_sequences(spanish_sentences)

### Pad the sequences to ensure uniform length

In [21]:
max_encoder_seq_length = max([len(seq) for seq in encoder_input_data])
max_decoder_seq_length = max([len(seq) for seq in decoder_input_data])

In [22]:
max_encoder_seq_length

3

In [23]:
max_decoder_seq_length

4

In [24]:
encoder_input_data = pad_sequences(encoder_input_data, maxlen=max_encoder_seq_length, padding='post')
decoder_input_data = pad_sequences(decoder_input_data,maxlen=max_decoder_seq_length, padding='post')

In [25]:
encoder_input_data

array([[3, 0, 0],
       [4, 5, 1],
       [2, 6, 0],
       [2, 7, 0],
       [8, 1, 0]], dtype=int32)

In [26]:
decoder_input_data

array([[ 1,  3,  2,  0],
       [ 1,  4,  5,  2],
       [ 1,  6,  7,  2],
       [ 1,  8,  9,  2],
       [ 1, 10,  2,  0]], dtype=int32)

### Prepare the target data (decoder output, shifted by one time step)

In [27]:
decoder_output_data = np.zeros_like(decoder_input_data)
decoder_output_data[:,:-1] = decoder_input_data[:,1:]

In [28]:
decoder_output_data

array([[ 3,  2,  0,  0],
       [ 4,  5,  2,  0],
       [ 6,  7,  2,  0],
       [ 8,  9,  2,  0],
       [10,  2,  0,  0]], dtype=int32)

In [29]:
decoder_input_data

array([[ 1,  3,  2,  0],
       [ 1,  4,  5,  2],
       [ 1,  6,  7,  2],
       [ 1,  8,  9,  2],
       [ 1, 10,  2,  0]], dtype=int32)

### Define the vocabulary size (total unique words)

In [30]:
num_encoder_tokens = len(eng_tokenizer.word_index) + 1
num_decoder_tokens = len(spa_tokenizer.word_index) + 1

In [31]:
num_encoder_tokens

9

In [32]:
num_decoder_tokens

11

### Define the input sequence and output sequence for the Seq2Seq model

In [33]:
encoder_inputs = Input(shape=(None,))
decoder_inputs = Input(shape=(None,))

In [34]:
encoder_inputs

<KerasTensor shape=(None, None), dtype=float32, sparse=False, name=keras_tensor>

### Encoder

In [35]:
encoder_embedding = Embedding(input_dim=num_encoder_tokens, output_dim=latent_dim, input_length=max_encoder_seq_length)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)



### Decoder

In [36]:
decoder_embedding = Embedding(input_dim=num_decoder_tokens, output_dim=latent_dim, input_length=max_decoder_seq_length)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])

### Dense layer for generating predictions

In [37]:
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

### Define the model

In [38]:
model = Model([encoder_inputs,decoder_inputs],decoder_outputs)

In [39]:
model.summary()

### Compile the model

In [40]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

### Train the model

In [41]:
model.fit([encoder_input_data,decoder_input_data],
          np.expand_dims(decoder_output_data,-1),
          batch_size=batch_size,epochs=epochs)

Epoch 1/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 58ms/step - accuracy: 0.2031 - loss: 2.3891
Epoch 2/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 0.3469 - loss: 2.3004
Epoch 3/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.3156 - loss: 2.1811
Epoch 4/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.3938 - loss: 1.9213
Epoch 5/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.3625 - loss: 1.6352
Epoch 6/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.3313 - loss: 1.5586
Epoch 7/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.3938 - loss: 1.2691
Epoch 8/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 0.6125 - loss: 1.3060
Epoch 9/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

<keras.src.callbacks.history.History at 0x7f18f73acbb0>

### Create inference models for the encoder and decoder for prediction
### Encoder Model

In [42]:
encoder_model = Model(encoder_inputs, [encoder_outputs,state_h, state_c])

### Decoder Model (for inference)

In [43]:
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_lstm_inf = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs_inf, state_h_inf, state_c_inf = decoder_lstm_inf(decoder_embedding, initial_state=[decoder_state_input_h, decoder_state_input_c])
decoder_output_inf = decoder_dense(decoder_outputs_inf)
decoder_model = Model([decoder_inputs, decoder_state_input_h, decoder_state_input_c], [decoder_output_inf, state_h_inf, state_c_inf])

### Function to decode the sequence

In [49]:
def decode_sequence(input_seq):
  # Get the encoder states
  states_value = encoder_model.predict(input_seq)

  # Generate an initial target sequence (the start token)
  target_seq = np.zeros((1,1))
  target_seq[0,0] = spa_tokenizer.word_index['starttoken']  # start token index

  # sample output tokens
  stop_condition = False
  decoded_sentence = ''
  while not stop_condition:
    # Correct the prediction input to match expected inputs
    # Unpack the states_value list into separate arguments
    output_tokens, h, c = decoder_model.predict([target_seq, states_value[1], states_value[2]])

    # Sample the next token
    sampled_token_index = np.argmax(output_tokens[0,-1,:])
    sampled_token = spa_tokenizer.index_word[sampled_token_index]

    decoded_sentence += ' ' + sampled_token

    # Stop if we hit the end token or reach max length
    if sampled_token == 'endtoken' or len(decoded_sentence) > max_decoder_seq_length:
      stop_condition = True

    # update the target sequence
    target_seq = np.zeros((1,1))
    target_seq[0,0] = sampled_token_index

    # update the states
    states_value = [h,c]

  return decoded_sentence

### Test the decoder with the sample sentence

In [50]:
input_seq = encoder_input_data[2:3] # sample input sequence

In [51]:
input_seq

array([[2, 6, 0]], dtype=int32)

In [52]:
encoder_input_data

array([[3, 0, 0],
       [4, 5, 1],
       [2, 6, 0],
       [2, 7, 0],
       [8, 1, 0]], dtype=int32)

In [53]:
decoded_sentence = decode_sequence(input_seq)
print('Decoded Sentence: ',decoded_sentence)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 160ms/step
Decoded Sentence:   buenos


In [54]:
new = ['good night']
new_input_seq = eng_tokenizer.texts_to_sequences(new)
new_input_seq = pad_sequences(new_input_seq, maxlen=max_encoder_seq_length, padding='post')
decoded_sentence = decode_sequence(new_input_seq)
print('Decoded Sentence: ', decoded_sentence)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
Decoded Sentence:   buenas
