<a href="https://colab.research.google.com/github/Kavyapm1960/project/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the corpus files read later in this
# notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Attention
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load and preprocess data
def load_data(filename):
    """Read a UTF-8 text file and return its lines as a list of sentences.

    Args:
        filename: Path of the text file; one sentence per line.

    Returns:
        list[str]: The file content split on '\n'. Blank lines inside the
        file are preserved (so line numbers keep matching between parallel
        files); only the empty string produced by a final trailing newline
        is dropped. An empty file yields an empty list.
    """
    with open(filename, encoding='utf-8') as file:
        text = file.read()
    sentences = text.split('\n')
    # A file ending with '\n' would otherwise contribute a spurious empty
    # "sentence" as its last element.
    if sentences and sentences[-1] == '':
        sentences.pop()
    return sentences

def tokenize(sentences, add_start_end_tokens=True):
    """Fit a word-level Keras ``Tokenizer`` and encode the sentences as id lists.

    Args:
        sentences: Iterable of raw sentence strings.
        add_start_end_tokens: When True, register '<start>' and '<end>' in
            ``tokenizer.word_index`` using the two ids directly after the
            fitted vocabulary.

    Returns:
        tuple: ``(tokenizer, sequences)`` where ``sequences`` is the result of
        ``tokenizer.texts_to_sequences(sentences)``.

    Note:
        The markers are only added to ``word_index``; the returned sequences
        are built from the raw sentences and are not wrapped with them.
    """
    tokenizer = Tokenizer(char_level=False)
    tokenizer.fit_on_texts(sentences)
    if add_start_end_tokens:
        # Compute the first free id once so the two markers get consecutive
        # ids. Taking len(word_index) again after inserting '<start>' would
        # skip an id and push '<end>' past len(word_index) + 1.
        next_index = len(tokenizer.word_index) + 1
        tokenizer.word_index['<start>'] = next_index
        tokenizer.word_index['<end>'] = next_index + 1
    return tokenizer, tokenizer.texts_to_sequences(sentences)


def pad(sequences, maxlen=None):
    """Pad id sequences with Keras ``pad_sequences`` using ``padding='post'``.

    Args:
        sequences: List of integer id lists.
        maxlen: Target length forwarded to ``pad_sequences``; ``None`` lets
            ``pad_sequences`` choose it.

    Returns:
        Whatever ``pad_sequences`` returns for these arguments.
    """
    options = {'padding': 'post', 'maxlen': maxlen}
    return pad_sequences(sequences, **options)

# Read the parallel corpus (one sentence per line) from Google Drive.
english_sentences = load_data('/content/drive/MyDrive/en.txt')
french_sentences = load_data('/content/drive/MyDrive/fr.txt')

# Fit one tokenizer per language and turn every sentence into a list of ids.
english_tokenizer, english_tokenized = tokenize(english_sentences)
french_tokenizer, french_tokenized = tokenize(french_sentences, add_start_end_tokens=True)

# The longest sequence of each language fixes the padded width.
max_english_length = max(map(len, english_tokenized))
max_french_length = max(map(len, french_tokenized))

english_padded = pad(english_tokenized, maxlen=max_english_length)
french_padded = pad(french_tokenized, maxlen=max_french_length)

encoder_input_data = np.array(english_padded)
decoder_input_data = np.array(french_padded)

# Decoder targets are the decoder inputs shifted one step to the left,
# with a zero filling the last position.
decoder_output_data = np.concatenate(
    [decoder_input_data[:, 1:], np.zeros_like(decoder_input_data[:, :1])],
    axis=1,
)






# Define model architecture
embedding_size = 512
lstm_units = 1024
# One extra slot on top of the vocabulary size: padded positions use id 0.
num_encoder_tokens = len(english_tokenizer.word_index) + 1
num_decoder_tokens = len(french_tokenizer.word_index) + 1

# NOTE: no dropout is applied to the LSTM layers below. Pass dropout= /
# recurrent_dropout= to the two LSTM constructors if regularisation is wanted.

# Encoder: embeds the English ids and runs them through an LSTM.
# return_sequences=True exposes one output vector per source time step, which
# is what the attention layer below attends over; return_state=True exposes
# the final (h, c) used to initialise the decoder.
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(num_encoder_tokens, embedding_size)(encoder_inputs)
encoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder: embeds the French ids and runs an LSTM seeded with the encoder state.
decoder_inputs = Input(shape=(None,))
decoder_embedding_layer = Embedding(num_decoder_tokens, embedding_size)
decoder_embedding = decoder_embedding_layer(decoder_inputs)
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

# Apply attention mechanism: decoder outputs are the query, encoder outputs
# are the value (and key).
attention = Attention()
context = attention([decoder_outputs, encoder_outputs])
# Use a Keras layer (not a raw TensorFlow op) to join symbolic Keras tensors.
decoder_combined_context = tf.keras.layers.Concatenate(axis=-1)([context, decoder_outputs])

decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_combined_context)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Prepare decoder output data
# Same integer targets with a trailing axis of size 1; the fit call below
# passes the 2-D `decoder_output_data` directly.
decoder_output_one_hot = np.expand_dims(decoder_output_data, -1)

# Add EarlyStopping callback
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
# Keep the History object: the plotting cell reads history.history[...].
history = model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_output_data,
    batch_size=32,
    epochs=30,
    validation_split=0.2,
    callbacks=[early_stopping]
)

# Inference setup
# Encoder model: English ids -> final LSTM state [h, c].
encoder_model = Model(encoder_inputs, encoder_states)

# The decoder is stepped manually at inference time, so its LSTM state is
# supplied through explicit inputs instead of coming from the encoder graph.
decoder_state_input_h = Input(shape=(lstm_units,))
decoder_state_input_c = Input(shape=(lstm_units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_embedding = decoder_embedding_layer(decoder_inputs)

decoder_outputs, state_h, state_c = decoder_lstm(decoder_embedding, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]

# Apply attention during inference
# The encoder outputs are recomputed from `encoder_inputs`, which is why the
# decoder model below also takes the source sequence as an input.
encoder_outputs_inf, state_h_inf, state_c_inf = encoder_lstm(encoder_embedding)
encoder_states_inf = [state_h_inf, state_c_inf]
# Reuse the attention layer from the training graph rather than creating a
# new one, so inference always runs the exact layer that was trained.
attention_inf = attention
context_inf = attention_inf([decoder_outputs, encoder_outputs_inf])
# Use a Keras layer (not a raw TensorFlow op) to join symbolic Keras tensors.
decoder_combined_context_inf = tf.keras.layers.Concatenate(axis=-1)([context_inf, decoder_outputs])

decoder_outputs_inf = decoder_dense(decoder_combined_context_inf)
# Inputs: [target ids, h, c, source ids]; outputs: [token probabilities, h, c].
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs + [encoder_inputs],
    [decoder_outputs_inf] + decoder_states
)

def decode_sequence(input_seq):
    """Greedily translate one encoded English sentence into a French string.

    Uses the module-level ``encoder_model``, ``decoder_model``,
    ``french_tokenizer``, ``reverse_french_tokenizer`` and
    ``max_french_length``.

    Args:
        input_seq: Array of token ids for a single sentence; the callers in
            this notebook pass it with shape (1, sequence_length). It is fed
            to both the encoder model and, at every step, the decoder model.

    Returns:
        str: The decoded words, each followed by one space (a non-empty
        result therefore ends with a trailing space). Decoding stops when the
        model emits '<end>' or the padding id 0, or after
        ``max_french_length`` words.
    """
    # Encode the input as state vectors.
    states_value = list(encoder_model.predict(input_seq, verbose=0))

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1))

    # Populate the first position of the target sequence with the start token.
    target_seq[0, 0] = french_tokenizer.word_index['<start>']

    decoded_words = []
    # The limit is counted in words, the same unit as max_french_length.
    while len(decoded_words) < max_french_length:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value + [input_seq], verbose=0
        )

        # Greedy choice: most probable token at the last time step.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_word = reverse_french_tokenizer.get(sampled_token_index, '')

        # Id 0 is the padding value used for the training targets, so it marks
        # the end of the sentence just like the explicit '<end>' marker.
        if sampled_token_index == 0 or sampled_word == '<end>':
            break
        decoded_words.append(sampled_word)

        # Feed the sampled token back in as the next decoder input.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Carry the LSTM state over to the next step.
        states_value = [h, c]

    return ''.join(word + ' ' for word in decoded_words)

# Inverse French vocabulary (id -> word); decode_sequence looks predicted ids up here.
reverse_french_tokenizer = dict(
    zip(french_tokenizer.word_index.values(), french_tokenizer.word_index.keys())
)
# Persist the trained model to Google Drive.
model.save('/content/drive/MyDrive/project/new-model.h5')






Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30


  saving_api.save_model(


In [9]:
from tensorflow.keras.models import load_model

# Load the model
# NOTE(review): `loaded_model` is not referenced by the loop below;
# decode_sequence runs the in-memory `encoder_model` / `decoder_model`.
loaded_model = load_model('/content/drive/MyDrive/project/new-model.h5')
# Perform translation
while True:
    # Strip surrounding whitespace so inputs such as " quit " still exit.
    input_sentence = input("Enter English sentence to translate (or type 'quit' to exit): ").strip()
    if input_sentence.lower() == 'quit':
        break
    if not input_sentence:
        # Nothing to translate; ask again instead of decoding an all-padding input.
        continue
    input_seq = english_tokenizer.texts_to_sequences([input_sentence])
    padded_input_seq = pad(input_seq, maxlen=max_english_length)
    translation = decode_sequence(padded_input_seq)
    print("French translation:", translation)


Enter English sentence to translate (or type 'quit' to exit): the red car
French translation: la voiture rouge 
Enter English sentence to translate (or type 'quit' to exit): quit


In [13]:
# %pip installs into the running kernel's environment; pinned to the version
# this notebook was last run with.
%pip install -q rouge-score==0.1.2



Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=b82fd26e03421d22b50cdd25511ed99f257f0c6190e541b3cca4dfb84a6780f9
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [19]:
# %pip installs into the running kernel's environment; pinned to the version
# this notebook was last run with.
%pip install -q jiwer==3.0.3


Collecting jiwer
  Downloading jiwer-3.0.3-py3-none-any.whl (21 kB)
Collecting rapidfuzz<4,>=3 (from jiwer)
  Downloading rapidfuzz-3.6.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.0.3 rapidfuzz-3.6.2


In [None]:
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.meteor_score import meteor_score
from jiwer import wer
import matplotlib.pyplot as plt
from rouge_score import rouge_scorer

# Compute BLEU Score
def compute_bleu_score(reference_sentences, predicted_sentences):
    """Corpus-level BLEU of the predictions against one reference each.

    Args:
        reference_sentences: Reference translations as plain strings.
        predicted_sentences: Model translations as plain strings, in the
            same order.

    Returns:
        The value returned by ``corpus_bleu`` for the whitespace-tokenized
        sentences.
    """
    list_of_references = []
    for reference in reference_sentences:
        # corpus_bleu takes a list of references per hypothesis; here exactly one.
        list_of_references.append([reference.split()])
    hypotheses = []
    for prediction in predicted_sentences:
        hypotheses.append(prediction.split())
    return corpus_bleu(list_of_references, hypotheses)

# Compute METEOR Score
def compute_meteor_score(reference_sentences, predicted_sentences):
    """Average sentence-level METEOR over aligned reference/prediction pairs.

    ``meteor_score`` scores a single tokenized hypothesis against a list of
    tokenized references, so each pair is whitespace-tokenized and scored on
    its own and the per-sentence scores are averaged.

    Args:
        reference_sentences: Reference translations as plain strings.
        predicted_sentences: Model translations as plain strings, in the
            same order.

    Returns:
        float: Mean METEOR score, or 0.0 when there are no pairs.
    """
    scores = [
        meteor_score([reference.split()], prediction.split())
        for reference, prediction in zip(reference_sentences, predicted_sentences)
    ]
    if not scores:
        return 0.0
    return sum(scores) / len(scores)

# Compute WER
def compute_wer(reference_sentences, predicted_sentences):
    """Word error rate of the predictions, as computed by jiwer's ``wer``.

    Args:
        reference_sentences: Reference translations.
        predicted_sentences: Model translations, in the same order.

    Returns:
        The value returned by ``wer(reference_sentences, predicted_sentences)``.
    """
    error_rate = wer(reference_sentences, predicted_sentences)
    return error_rate

# Compute ROUGE Score
def compute_rouge_score(reference_sentences, predicted_sentences):
    """Average ROUGE-1 and ROUGE-L F-measures over aligned sentence pairs.

    Args:
        reference_sentences: Reference translations as plain strings.
        predicted_sentences: Model translations as plain strings, in the
            same order.

    Returns:
        tuple[float, float]: ``(avg_rouge1, avg_rougeL)``; ``(0.0, 0.0)`` when
        there are no pairs.
    """
    # The local must not be called `rouge_scorer`: assigning to that name
    # inside the function would hide the imported module and raise
    # UnboundLocalError on this very line.
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    rouge1_scores = []
    rougeL_scores = []
    for ref, pred in zip(reference_sentences, predicted_sentences):
        scores = scorer.score(ref, pred)
        rouge1_scores.append(scores['rouge1'].fmeasure)
        rougeL_scores.append(scores['rougeL'].fmeasure)
    if not rouge1_scores:
        # Avoid dividing by zero when nothing was scored.
        return 0.0, 0.0
    avg_rouge1 = sum(rouge1_scores) / len(rouge1_scores)
    avg_rougeL = sum(rougeL_scores) / len(rougeL_scores)
    return avg_rouge1, avg_rougeL

# Evaluate the model
# Decode every encoder input one sentence at a time.
predicted_sentences = []
for input_seq in encoder_input_data:
    decoded_sentence = decode_sequence(input_seq.reshape(1, -1))
    predicted_sentences.append(decoded_sentence)

# Evaluation metrics
bleu_score = compute_bleu_score(french_sentences, predicted_sentences)
# Stored under a name that differs from the imported `meteor_score` function;
# rebinding that name would break compute_meteor_score on the next call.
meteor_value = compute_meteor_score(french_sentences, predicted_sentences)
wer_score = compute_wer(french_sentences, predicted_sentences)
rouge1_score, rougeL_score = compute_rouge_score(french_sentences, predicted_sentences)

print("BLEU Score:", bleu_score)
print("METEOR Score:", meteor_value)
print("WER:", wer_score)
print("ROUGE-1 Score:", rouge1_score)
print("ROUGE-L Score:", rougeL_score)

# Plot loss and accuracy
# `history` must be the object returned by model.fit(...). One figure is
# drawn per row: (train key, validation key, train label, validation label,
# y-axis label, title).
curve_specs = (
    ('loss', 'val_loss', 'Training Loss', 'Validation Loss',
     'Loss', 'Training and Validation Loss'),
    ('accuracy', 'val_accuracy', 'Training Accuracy', 'Validation Accuracy',
     'Accuracy', 'Training and Validation Accuracy'),
)
for train_key, val_key, train_label, val_label, y_label, title in curve_specs:
    plt.plot(history.history[train_key], label=train_label)
    plt.plot(history.history[val_key], label=val_label)
    plt.xlabel('Epochs')
    plt.ylabel(y_label)
    plt.title(title)
    plt.legend()
    plt.show()




In [11]:
from nltk.translate.bleu_score import corpus_bleu

# Function to calculate BLEU score
def calculate_bleu_score(actual, predicted):
    """Corpus BLEU of ``predicted`` against ``actual`` (one reference each).

    Both arguments are iterables of plain strings; every sentence is
    tokenized with ``str.split()`` before being handed to ``corpus_bleu``.
    """
    list_of_references = [[sentence.split()] for sentence in actual]
    hypotheses = [sentence.split() for sentence in predicted]
    return corpus_bleu(list_of_references, hypotheses)

# Function to evaluate model performance using BLEU score
def evaluate_model(encoder_input_data, actual_sentences):
    """Translate every encoder input and score the result with BLEU.

    Args:
        encoder_input_data: Iterable of per-sentence id arrays; each one gets
            a leading batch axis before being passed to ``decode_sequence``.
        actual_sentences: Reference translations, in the same order.

    Returns:
        The BLEU score from ``calculate_bleu_score``.
    """
    # Decode one sentence at a time and trim surrounding whitespace.
    translations = [
        decode_sequence(np.expand_dims(row, axis=0)).strip()
        for row in encoder_input_data
    ]
    return calculate_bleu_score(actual_sentences, translations)

# Score the translations of all encoder inputs against the French sentences
# and report the corpus BLEU.
bleu_score = evaluate_model(
    encoder_input_data=encoder_input_data,
    actual_sentences=french_sentences,
)
print("BLEU Score:", bleu_score)




KeyboardInterrupt: 