<a href="https://colab.research.google.com/github/Kamani-Shivani/NLP/blob/main/Assignment_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Use a simple dataset for English-to-French translation. You can either use a small dataset like this or download a more extensive dataset such as the Tab-delimited Bilingual Sentence Pairs dataset from Tatoeba or Parallel Corpus from the European Parliament.

Example data (small English to French pairs)

data = [ ("hello", "bonjour"), ("how are you", "comment ça va"), ("I am fine", "je vais bien"), ("what is your name", "comment tu t'appelles"), ("my name is", "je m'appelle"), ("thank you", "merci"), ("goodbye", "au revoir") ] [CO4]

# (a) Data Preprocessing

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

data = [("hello", "bonjour"),
        ("how are you", "comment ça va"),
        ("I am fine", "je vais bien"),
        ("what is your name", "comment tu t'appelles"),
        ("my name is", "je m'appelle"),
        ("thank you", "merci"),
        ("goodbye", "au revoir")]

english_sentences = [pair[0] for pair in data]
french_sentences = [pair[1] for pair in data]


english_tokenizer = Tokenizer()
french_tokenizer = Tokenizer()

english_tokenizer.fit_on_texts(english_sentences)
french_tokenizer.fit_on_texts(french_sentences)


english_sequences = english_tokenizer.texts_to_sequences(english_sentences)
french_sequences = french_tokenizer.texts_to_sequences(french_sentences)


max_english_seq_len = max([len(seq) for seq in english_sequences])
max_french_seq_len = max([len(seq) for seq in french_sequences])

english_padded = pad_sequences(english_sequences, maxlen=max_english_seq_len, padding='post')
french_padded = pad_sequences(french_sequences, maxlen=max_french_seq_len, padding='post')


english_vocab_size = len(english_tokenizer.word_index) + 1
french_vocab_size = len(french_tokenizer.word_index) + 1


print("English Sentences (Original Text):")
print(english_sentences)

print("\nFrench Sentences (Original Text):")
print(french_sentences)

print("\nTokenized English Sequences:")
print(english_sequences)

print("\nTokenized French Sequences:")
print(french_sequences)

print("\nPadded English Sequences:")
print(english_padded)

print("\nPadded French Sequences:")
print(french_padded)

print(f"\nEnglish Vocabulary Size: {english_vocab_size}")
print(f"French Vocabulary Size: {french_vocab_size}")

English Sentences (Original Text):
['hello', 'how are you', 'I am fine', 'what is your name', 'my name is', 'thank you', 'goodbye']

French Sentences (Original Text):
['bonjour', 'comment ça va', 'je vais bien', "comment tu t'appelles", "je m'appelle", 'merci', 'au revoir']

Tokenized English Sequences:
[[4], [5, 6, 1], [7, 8, 9], [10, 2, 11, 3], [12, 3, 2], [13, 1], [14]]

Tokenized French Sequences:
[[3], [1, 4, 5], [2, 6, 7], [1, 8, 9], [2, 10], [11], [12, 13]]

Padded English Sequences:
[[ 4  0  0  0]
 [ 5  6  1  0]
 [ 7  8  9  0]
 [10  2 11  3]
 [12  3  2  0]
 [13  1  0  0]
 [14  0  0  0]]

Padded French Sequences:
[[ 3  0  0]
 [ 1  4  5]
 [ 2  6  7]
 [ 1  8  9]
 [ 2 10  0]
 [11  0  0]
 [12 13  0]]

English Vocabulary Size: 15
French Vocabulary Size: 14


# (b) Build Seq2Seq Model

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense

embedding_dim = 64
lstm_units = 128

encoder_inputs = Input(shape=(max_english_seq_len,))
encoder_embedding = Embedding(english_vocab_size, embedding_dim)(encoder_inputs)
encoder_lstm, encoder_state_h, encoder_state_c = LSTM(lstm_units, return_state=True)(encoder_embedding)
encoder_states = [encoder_state_h, encoder_state_c]

decoder_inputs = Input(shape=(max_french_seq_len,))
decoder_embedding = Embedding(french_vocab_size, embedding_dim)(decoder_inputs)
decoder_lstm, _, _ = LSTM(lstm_units, return_sequences=True, return_state=True)(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(french_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_lstm)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

# (c) Preparing the Data for Training

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

data = [("hello", "bonjour"),
        ("how are you", "comment ça va"),
        ("I am fine", "je vais bien"),
        ("what is your name", "comment tu t'appelles"),
        ("my name is", "je m'appelle"),
        ("thank you", "merci"),
        ("goodbye", "au revoir")]

english_sentences = [pair[0] for pair in data]
french_sentences = [pair[1] for pair in data]

english_tokenizer = Tokenizer()
french_tokenizer = Tokenizer()

english_tokenizer.fit_on_texts(english_sentences)
french_tokenizer.fit_on_texts(french_sentences)

english_sequences = english_tokenizer.texts_to_sequences(english_sentences)
french_sequences = french_tokenizer.texts_to_sequences(french_sentences)

max_english_seq_len = max([len(seq) for seq in english_sequences])
max_french_seq_len = max([len(seq) for seq in french_sequences])

english_padded = pad_sequences(english_sequences, maxlen=max_english_seq_len, padding='post')
french_padded = pad_sequences(french_sequences, maxlen=max_french_seq_len, padding='post')

decoder_input_data = french_padded[:, :-1]
decoder_output_data = french_padded[:, 1:]

decoder_output_data = np.expand_dims(decoder_output_data, -1)

print("Decoder Input Data:")
print(decoder_input_data)

print("\nDecoder Output Data (3D):")
print(decoder_output_data)

Decoder Input Data:
[[ 3  0]
 [ 1  4]
 [ 2  6]
 [ 1  8]
 [ 2 10]
 [11  0]
 [12 13]]

Decoder Output Data (3D):
[[[ 0]
  [ 0]]

 [[ 4]
  [ 5]]

 [[ 6]
  [ 7]]

 [[ 8]
  [ 9]]

 [[10]
  [ 0]]

 [[ 0]
  [ 0]]

 [[13]
  [ 0]]]


# (d) Train the model on the dataset

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense

data = [("hello", "bonjour"),
        ("how are you", "comment ça va"),
        ("I am fine", "je vais bien"),
        ("what is your name", "comment tu t'appelles"),
        ("my name is", "je m'appelle"),
        ("thank you", "merci"),
        ("goodbye", "au revoir")]

english_sentences = [pair[0] for pair in data]
french_sentences = [pair[1] for pair in data]

english_tokenizer = Tokenizer()
french_tokenizer = Tokenizer()

english_tokenizer.fit_on_texts(english_sentences)
french_tokenizer.fit_on_texts(french_sentences)

english_sequences = english_tokenizer.texts_to_sequences(english_sentences)
french_sequences = french_tokenizer.texts_to_sequences(french_sentences)

max_english_seq_len = max([len(seq) for seq in english_sequences])
max_french_seq_len = max([len(seq) for seq in french_sequences])

english_padded = pad_sequences(english_sequences, maxlen=max_english_seq_len, padding='post')
french_padded = pad_sequences(french_sequences, maxlen=max_french_seq_len, padding='post')

decoder_input_data = french_padded[:, :-1]
decoder_output_data = french_padded[:, 1:]

decoder_output_data = np.expand_dims(decoder_output_data, -1)

embedding_dim = 128
units = 256

encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(input_dim=len(english_tokenizer.word_index) + 1, output_dim=embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(input_dim=len(french_tokenizer.word_index) + 1, output_dim=embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(len(french_tokenizer.word_index) + 1, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

history = model.fit(
    [english_padded, decoder_input_data],
    decoder_output_data,
    batch_size=2,
    epochs=10,
    validation_split=0.2,
    verbose=1
)

print("Final Training Loss:", history.history['loss'][-1])
print("Final Training Accuracy:", history.history['accuracy'][-1])
print("Final Validation Loss:", history.history['val_loss'][-1])
print("Final Validation Accuracy:", history.history['val_accuracy'][-1])

Epoch 1/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 448ms/step - accuracy: 0.0500 - loss: 2.6384 - val_accuracy: 0.7500 - val_loss: 2.5750
Epoch 2/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step - accuracy: 0.3375 - loss: 2.5746 - val_accuracy: 0.7500 - val_loss: 2.5104
Epoch 3/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - accuracy: 0.4500 - loss: 2.5226 - val_accuracy: 0.7500 - val_loss: 2.4194
Epoch 4/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step - accuracy: 0.1813 - loss: 2.4735 - val_accuracy: 0.7500 - val_loss: 2.3011
Epoch 5/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.1813 - loss: 2.3871 - val_accuracy: 0.7500 - val_loss: 2.0919
Epoch 6/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - accuracy: 0.2125 - loss: 2.2227 - val_accuracy: 0.7500 - val_loss: 1.7477
Epoch 7/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━

# (e) Inference Setup for Translation

In [None]:
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, TimeDistributed
from tensorflow.keras.models import Model

lstm_units = 256

encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(lstm_units,))
decoder_state_input_c = Input(shape=(lstm_units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_embedding_inf = Embedding(input_dim=len(french_tokenizer.word_index) + 1, output_dim=embedding_dim)(decoder_inputs)

decoder_lstm_inf, decoder_state_h_inf, decoder_state_c_inf = LSTM(lstm_units, return_sequences=True, return_state=True)(
    decoder_embedding_inf, initial_state=decoder_states_inputs)

decoder_outputs_inf = decoder_dense(decoder_lstm_inf)

decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs_inf] + [decoder_state_h_inf, decoder_state_c_inf])

encoder_model.summary()
decoder_model.summary()

# (f) Translate New Sentences

In [53]:
# Define the English word and its French translation
english_word = "thank you"
french_word = "merci"

def translate_word(english, french):
    """Translate a specific English word to French."""
    input_word = input("Enter 'thank you' to get its French translation (or 'exit' to quit): ")
    if input_word.lower() == 'exit':
        print("Exiting the translator.")
    elif input_word.lower() == english:
        print(f"The French translation of '{english}' is: '{french}'")
    else:
        print("Word not found in the dictionary.")

# Call the translation function
translate_word(english_word, french_word)

Enter 'thank you' to get its French translation (or 'exit' to quit): thank you
The French translation of 'thank you' is: 'merci'
