In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Loading Dataset

In [96]:
import pandas as pd
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/NLP/Assignment 2/e2f.csv")
df.head()

Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [97]:
df.shape

(175621, 2)

In [98]:
df = df.drop(df.index[10000:175621])
df.shape

(10000, 2)

In [99]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   English words/sentences  10000 non-null  object
 1   French words/sentences   10000 non-null  object
dtypes: object(2)
memory usage: 156.4+ KB


In [100]:
input_texts = df["English words/sentences"].tolist()
target_texts = df["French words/sentences"].tolist()

In [101]:
target_texts[:5]

['Salut!', 'Cours\u202f!', 'Courez\u202f!', 'Qui ?', 'Ça alors\u202f!']

# Text Preprocessing

In [102]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def preprocess_data(input_texts, target_texts):
    # Initialize input and target tokenizers
    input_tokenizer = Tokenizer()
    target_tokenizer = Tokenizer()

    # Fit tokenizers on input and target texts
    input_tokenizer.fit_on_texts(input_texts)
    target_tokenizer.fit_on_texts(target_texts)

    # Add '<start>' and '<end>' tokens to the target tokenizer vocabulary
    target_tokenizer.word_index['<start>'] = len(target_tokenizer.word_index) + 1
    target_tokenizer.word_index['<end>'] = len(target_tokenizer.word_index) + 1

    # Update the reverse index as well
    target_tokenizer.index_word[len(target_tokenizer.word_index) - 1] = '<start>'
    target_tokenizer.index_word[len(target_tokenizer.word_index)] = '<end>'

    # Convert texts to sequences
    input_sequences = input_tokenizer (input_texts)
    target_sequences = target_tokenizer.texts_to_sequences(target_texts)

    # Pad sequences
    input_sequences = pad_sequences(input_sequences, padding='post')
    target_sequences = pad_sequences(target_sequences, padding='post')

    return input_sequences, target_sequences, input_tokenizer, target_tokenizer

In [103]:
input_sequences, target_sequences, input_tokenizer, target_tokenizer = preprocess_data(input_texts, target_texts)

TypeError: 'Tokenizer' object is not callable

# Building Model

In [104]:
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Attention
from tensorflow.keras.models import Model

In [105]:
def build_model(input_vocab_size, target_vocab_size, units):
    # Encoder
    encoder_inputs = Input(shape=(None,))
    encoder_embedding = Embedding(input_vocab_size, units)(encoder_inputs)
    encoder_outputs, state_h, state_c = LSTM(units, return_state=True)(encoder_embedding)
    encoder_states = [state_h, state_c]

    # Decoder
    decoder_inputs = Input(shape=(None,))
    decoder_embedding = Embedding(target_vocab_size, units)(decoder_inputs)
    decoder_lstm = LSTM(units, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

    # Attention Layer
    attention = Attention()
    context_vector = attention([decoder_outputs, encoder_outputs])
    decoder_combined_context = tf.concat([context_vector, decoder_outputs], axis=-1)

    decoder_dense = Dense(target_vocab_size, activation='softmax')
    decoder_outputs = decoder_dense(decoder_combined_context)

    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    return model

In [106]:
model = build_model(input_vocab_size=len(input_tokenizer.word_index) + 1,
                    target_vocab_size=len(target_tokenizer.word_index) + 1,
                    units=256)

In [107]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

After training the model for 20 epochs, the accuracy turned out to be 95%.

In [None]:
model.fit([input_sequences, target_sequences[:, :-1]], target_sequences[:, 1:], batch_size=64, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x7abba3047430>

# Translation Section

Translating English to French

In [None]:
import numpy as np

def translate_sentence(input_sentence, input_tokenizer, target_tokenizer, max_target_length, model):
    # Encode the input sentence
    input_sequence = input_tokenizer.texts_to_sequences([input_sentence])
    input_sequence = pad_sequences(input_sequence, padding='post')

    # Initialize the decoder input with a start token
    target_sequence = np.zeros((1, 1))
    target_sequence[0, 0] = target_tokenizer.word_index['<start>']

    # Initialize the translated sentence
    translated_sentence = ''

    # Iteratively decode
    while len(translated_sentence.split()) < max_target_length:
        output_tokens = model.predict([input_sequence, target_sequence])
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = target_tokenizer.index_word.get(sampled_token_index, None)

        # Exit condition: if the predicted token is unknown or if the maximum length is reached
        if sampled_char is None:
            break

        # Append the predicted token to the translated sentence
        translated_sentence += sampled_char + ' '

        # Update the target sequence for the next iteration
        target_sequence = np.zeros((1, 1))
        target_sequence[0, 0] = sampled_token_index

    return translated_sentence.strip()

In [95]:
output = translate_sentence("Hi", input_tokenizer, target_tokenizer, 1, model)
output



'heureux'

# BLEU Score

In [108]:
from nltk.translate.bleu_score import sentence_bleu

reference = [['Salut']]  # List of reference translations
candidate = output.split()  # Candidate translation

bleu_score = sentence_bleu(reference, candidate)
print("BLEU Score:", bleu_score)

BLEU Score: 0
