<a href="https://colab.research.google.com/github/Kavyapm1960/project/blob/main/trance_jo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
pip install --upgrade tensorflow



In [7]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.callbacks import ModelCheckpoint
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.meteor_score import meteor_score
from sklearn.model_selection import train_test_split

# Load English dataset
with open('/content/drive/MyDrive/project-trans/small_vocab_en.txt', 'r', encoding='utf-8') as f:
    english_sentences = f.read().splitlines()

# Load French dataset
with open('/content/drive/MyDrive/project-trans/small_vocab_fr.txt', 'r', encoding='utf-8') as f:
    french_sentences = f.read().splitlines()

# Tokenize sentences
english_tokenized = [sentence.split() for sentence in english_sentences]
french_tokenized = [sentence.split() for sentence in french_sentences]

# Build vocabulary
english_vocab = set(word for sentence in english_tokenized for word in sentence)
french_vocab = set(word for sentence in french_tokenized for word in sentence)

# Create mappings from words to indices
english_word_to_idx = {word: idx for idx, word in enumerate(english_vocab)}
french_word_to_idx = {word: idx for idx, word in enumerate(french_vocab)}

# Convert sentences to numerical representations
english_numerical = [[english_word_to_idx[word] for word in sentence] for sentence in english_tokenized]
french_numerical = [[french_word_to_idx[word] for word in sentence] for sentence in french_tokenized]

# Pad sequences to ensure uniform length
max_seq_length = max(max(len(seq) for seq in english_numerical), max(len(seq) for seq in french_numerical))
english_padded = tf.keras.preprocessing.sequence.pad_sequences(english_numerical, maxlen=max_seq_length, padding='post')
french_padded = tf.keras.preprocessing.sequence.pad_sequences(french_numerical, maxlen=max_seq_length, padding='post')

# Split the data into training and validation sets
english_train, english_validation, french_train, french_validation = train_test_split(english_padded, french_padded, test_size=0.2)

# Define your model architecture
latent_dim = 256  # Dimensionality of the latent space

# Encoder
encoder_inputs = Input(shape=(None,))
encoder_embedding = tf.keras.layers.Embedding(len(english_vocab), latent_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = tf.keras.layers.Embedding(len(french_vocab), latent_dim, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(len(french_vocab), activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Define callbacks
checkpoint_filepath = 'translation_model.weights.h5'  # Adjusted filepath
model_checkpoint_callback = ModelCheckpoint(filepath=checkpoint_filepath, save_weights_only=True, monitor='val_accuracy', mode='max', save_best_only=True)

# Train the model
history = model.fit(x=[english_train, french_train],
                    y=french_train,
                    batch_size=64,
                    epochs=50,
                    validation_data=([english_validation, french_validation], french_validation),
                    callbacks=[model_checkpoint_callback])

# Evaluate the model
evaluation_loss, evaluation_accuracy = model.evaluate(x=[english_validation, french_validation],
                                                      y=french_validation)
print("Evaluation Loss:", evaluation_loss)
print("Evaluation Accuracy:", evaluation_accuracy)

# Generate predictions
predictions = model.predict([english_validation, french_validation])

# Convert numerical predictions back to text
def numerical_to_text(predictions, idx_to_word):
    text_predictions = []
    for seq in predictions:
        text_seq = [idx_to_word[np.argmax(word)] for word in seq]
        text_predictions.append(' '.join(text_seq))
    return text_predictions

# Convert numerical French predictions and validation data back to text
french_idx_to_word = {idx: word for word, idx in french_word_to_idx.items()}
text_predictions = numerical_to_text(predictions, french_idx_to_word)
text_validation = numerical_to_text(french_validation, french_idx_to_word)

# Calculate BLEU score
bleu_score = corpus_bleu([[ref.split()] for ref in text_validation], text_predictions)
print("BLEU Score:", bleu_score)




# Save the model
model.save('final_translation_model.h5')


Epoch 1/50
[1m1724/1724[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 68ms/step - accuracy: 0.5338 - loss: 0.7934 - val_accuracy: 0.6182 - val_loss: 0.0015
Epoch 2/50
[1m1724/1724[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 69ms/step - accuracy: 0.6173 - loss: 9.5831e-04 - val_accuracy: 0.6182 - val_loss: 3.1113e-04
Epoch 3/50
[1m1724/1724[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 69ms/step - accuracy: 0.6179 - loss: 2.3482e-04 - val_accuracy: 0.6182 - val_loss: 9.8014e-05
Epoch 4/50
[1m1724/1724[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 70ms/step - accuracy: 0.6176 - loss: 6.5488e-05 - val_accuracy: 0.6182 - val_loss: 2.9030e-05
Epoch 5/50
[1m1724/1724[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 68ms/step - accuracy: 0.6177 - loss: 2.3943e-05 - val_accuracy: 0.6182 - val_loss: 1.2772e-05
Epoch 6/50
[1m1724/1724[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m117s[0m 68ms/step - accuracy: 0.6171 - loss: 9.3397e-06 - va



BLEU Score: 0


In [8]:
import numpy as np

def translate_sentence(model, sentence, english_word_to_idx, french_idx_to_word, max_seq_length):
    # Tokenize input sentence
    sentence_tokens = sentence.split()
    # Convert tokens to numerical representation
    numerical_input = [english_word_to_idx.get(word, 0) for word in sentence_tokens]
    # Pad numerical representation to match max sequence length
    padded_input = tf.keras.preprocessing.sequence.pad_sequences([numerical_input], maxlen=max_seq_length, padding='post')
    # Generate prediction
    prediction = model.predict([padded_input, np.zeros((1, max_seq_length))])
    # Convert numerical prediction to text
    text_prediction = [french_idx_to_word[np.argmax(word)] for word in prediction[0]]
    # Join words into a sentence
    translated_sentence = ' '.join(text_prediction)
    return translated_sentence

# Prompt user to enter English sentence for translation
user_input = input("Enter the English sentence to translate: ")

# Translate user input from English to French
translated_sentence = translate_sentence(model, user_input, english_word_to_idx, french_idx_to_word, max_seq_length)
print("Translated Sentence:", translated_sentence)


Enter the English sentence to translate: happy
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 249ms/step
Translated Sentence: la la son aimez aimez aimez aimez aimez aimez amusant amusant amusant amusant amusant amusant amusant amusant amusant amusant amusant amusant amusant amusant
