<a href="https://colab.research.google.com/github/Kavyapm1960/project/blob/main/Untitled3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Attention
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load and preprocess data
def load_data(filename):
    """Read a UTF-8 text file and return its lines as a list of sentences.

    Args:
        filename: Path of the text file; one sentence per line.

    Returns:
        list[str]: The lines of the file without newline characters. Blank
        lines inside the file are preserved (so two parallel corpora stay
        aligned line-by-line), but the empty string produced by a final
        trailing newline is dropped. An empty file yields an empty list.
    """
    with open(filename, encoding='utf-8') as file:
        text = file.read()
    lines = text.split('\n')
    # "a\nb\n".split('\n') ends with '' -- that is the line terminator of the
    # last sentence, not an extra (empty) sentence, so remove it.
    if lines[-1] == '':
        lines.pop()
    return lines

def tokenize(sentences):
    """Fit a word-level Keras ``Tokenizer`` on ``sentences`` and encode them.

    Args:
        sentences: Iterable of text strings.

    Returns:
        tuple: ``(tokenizer, sequences)`` where ``tokenizer`` is the fitted
        ``Tokenizer`` and ``sequences`` is the result of
        ``tokenizer.texts_to_sequences(sentences)``.
    """
    word_tokenizer = Tokenizer(char_level=False)
    word_tokenizer.fit_on_texts(sentences)
    encoded = word_tokenizer.texts_to_sequences(sentences)
    return word_tokenizer, encoded

def pad(sequences, maxlen=None):
    """Pad integer sequences at the end to a common length.

    Args:
        sequences: List of integer sequences.
        maxlen: Target length forwarded to ``pad_sequences``; ``None`` lets
            ``pad_sequences`` choose the length.

    Returns:
        The value returned by ``pad_sequences(..., padding='post')``.
    """
    padded = pad_sequences(sequences, maxlen=maxlen, padding='post')
    return padded

# Parallel corpora: line i of the French file is the translation of line i of
# the English file.
english_sentences = load_data('/content/drive/MyDrive/small_vocab_en.txt')
french_sentences = load_data('/content/drive/MyDrive/small_vocab_fr.txt')
if len(english_sentences) != len(french_sentences):
    raise ValueError(
        "Parallel corpora are misaligned: %d English vs %d French sentences"
        % (len(english_sentences), len(french_sentences))
    )

# The decoder is primed with '<start>' at inference time and stops on '<end>',
# so every target sentence must carry both markers during training.
french_sentences_marked = ['<start> ' + sentence + ' <end>' for sentence in french_sentences]

english_tokenizer, english_tokenized = tokenize(english_sentences)

# Tokenizer's default `filters` strip '<' and '>', which would turn the markers
# into the ordinary words 'start'/'end'. Build the French tokenizer with the
# default filter set minus those two characters so the markers survive intact.
french_tokenizer = Tokenizer(char_level=False, filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')
french_tokenizer.fit_on_texts(french_sentences_marked)
french_tokenized = french_tokenizer.texts_to_sequences(french_sentences_marked)

max_english_length = max(len(sentence) for sentence in english_tokenized)
max_french_length = max(len(sentence) for sentence in french_tokenized)

english_padded = pad(english_tokenized, maxlen=max_english_length)
french_padded = pad(french_tokenized, maxlen=max_french_length)

encoder_input_data = np.array(english_padded)
decoder_input_data = np.array(french_padded)
# Teacher forcing: the target at step t is the decoder input at step t + 1.
# The last target position stays 0 (the padding index).
decoder_output_data = np.zeros_like(decoder_input_data)
decoder_output_data[:, :-1] = decoder_input_data[:, 1:]

# Define model architecture
embedding_size = 256
lstm_units = 512
# +1 because Tokenizer word indices start at 1; index 0 is the padding value.
num_encoder_tokens = len(english_tokenizer.word_index) + 1
num_decoder_tokens = len(french_tokenizer.word_index) + 1

# Encoder: embeds the source tokens and runs an LSTM over them.
# return_sequences=True is required so the attention layer below receives one
# vector per source timestep (batch, T_src, units) rather than only the final
# output (batch, units); return_state=True still exposes the final h/c states
# that initialise the decoder.
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(num_encoder_tokens, embedding_size)(encoder_inputs)
encoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder: embeds the (teacher-forced) target tokens and runs an LSTM that
# starts from the encoder's final states. The layers are kept in variables so
# the inference graph can reuse the trained weights.
decoder_inputs = Input(shape=(None,))
decoder_embedding_layer = Embedding(num_decoder_tokens, embedding_size)
decoder_embedding = decoder_embedding_layer(decoder_inputs)
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

# Apply attention mechanism
# query = decoder outputs (batch, T_tgt, units), value = encoder outputs
# (batch, T_src, units); the result has one context vector per target step.
attention = Attention()
context = attention([decoder_outputs, encoder_outputs])
# Use a Keras layer (not tf.concat) so the op is a proper node of the
# functional model.
decoder_combined_context = tf.keras.layers.Concatenate(axis=-1)([context, decoder_outputs])

# Per-timestep distribution over the French vocabulary.
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_combined_context)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
# Sparse loss: targets are integer token ids, not one-hot vectors.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Prepare decoder output data
# Despite its name this is not a one-hot encoding: it is the integer target
# array with an extra trailing axis of size 1.
decoder_output_one_hot = decoder_output_data[..., np.newaxis]

# Train the model
# Inputs are [source tokens, teacher-forced target tokens]; labels are the
# integer targets shifted one step ahead. The last 20% of samples are held out
# for validation.
training_inputs = [encoder_input_data, decoder_input_data]
model.fit(
    x=training_inputs,
    y=decoder_output_data,
    epochs=30,
    batch_size=64,
    validation_split=0.2,
)

# Inference setup
# Encoder-only model: maps a source sequence to the LSTM's final [h, c] states,
# which seed the decoder for the first generated token.
encoder_model = Model(encoder_inputs, encoder_states)

# The decoder is run one step at a time, so its recurrent state is fed in
# explicitly and returned after every step.
decoder_state_input_h = Input(shape=(lstm_units,))
decoder_state_input_c = Input(shape=(lstm_units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_embedding = decoder_embedding_layer(decoder_inputs)

decoder_outputs, state_h, state_c = decoder_lstm(decoder_embedding, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]

# Apply attention during inference
# The encoder is re-applied to `encoder_embedding`, which is why
# `encoder_inputs` is also an input of `decoder_model` below.
# NOTE(review): Attention expects a per-timestep (3-D) value tensor; confirm
# that `encoder_lstm` was created with return_sequences=True.
encoder_outputs_inf, state_h_inf, state_c_inf = encoder_lstm(encoder_embedding)
encoder_states_inf = [state_h_inf, state_c_inf]
# Reuse the attention layer from the training graph so training and inference
# are guaranteed to share the same layer (and any weights it may hold).
attention_inf = attention
context_inf = attention_inf([decoder_outputs, encoder_outputs_inf])
# Use a Keras layer (not tf.concat) so the op is a proper node of the
# functional model.
decoder_combined_context_inf = tf.keras.layers.Concatenate(axis=-1)([context_inf, decoder_outputs])

decoder_outputs_inf = decoder_dense(decoder_combined_context_inf)
# Input order matters for callers of decoder_model.predict:
#   [target token(s), state h, state c, source token sequence]
# Outputs: [token probabilities, new state h, new state c]
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs + [encoder_inputs],
    [decoder_outputs_inf] + decoder_states
)

def decode_sequence(input_seq):
    """Greedily translate one encoded source sentence into French text.

    The encoder states seed the decoder, which is then run one token at a
    time: the most probable token of each step is appended to the result and
    fed back as the next decoder input.

    Args:
        input_seq: Padded, tokenised source sentence, passed to both
            ``encoder_model`` and ``decoder_model`` (the target buffer used
            here has batch size 1, so a single sentence is expected).

    Returns:
        str: The generated words joined by single spaces (no trailing space).
        Generation stops at the ``'<end>'`` token, at a token index with no
        vocabulary entry (e.g. the padding index), or after
        ``max_french_length`` words.

    Raises:
        KeyError: If ``'<start>'`` is not in the French tokenizer vocabulary.
    """
    start_index = french_tokenizer.word_index.get('<start>')
    if start_index is None:
        raise KeyError(
            "'<start>' is not in the French vocabulary; the target sentences "
            "must be wrapped in <start>/<end> markers before tokenizing"
        )

    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq, verbose=0)

    # Target sequence of length 1, primed with the start token.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = start_index

    decoded_words = []
    # Bound the loop by the number of generated *words*; the limit is a token
    # count, so comparing it with the character length of the output string
    # would cut translations short.
    while len(decoded_words) < max_french_length:
        # verbose=0: avoid printing a progress bar for every generated token.
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value + [input_seq], verbose=0
        )

        # Greedy choice: most probable token at the last time step.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_word = reverse_french_tokenizer.get(sampled_token_index, '')

        # Stop on the end marker or on an index without a word (padding).
        if sampled_word == '<end>' or not sampled_word:
            break
        decoded_words.append(sampled_word)

        # Feed the sampled token and the updated states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ' '.join(decoded_words)

# Index -> word lookup (inverse of the French tokenizer's word_index).
reverse_french_tokenizer = dict(
    (index, word) for word, index in french_tokenizer.word_index.items()
)

# Perform translation
# Read sentences until the user types 'quit' (case-insensitive); each one is
# tokenised, padded to the training length and decoded.
prompt = "Enter English sentence to translate (or type 'quit' to exit): "
input_sentence = input(prompt)
while input_sentence.lower() != 'quit':
    input_seq = english_tokenizer.texts_to_sequences([input_sentence])
    padded_input_seq = pad(input_seq, maxlen=max_english_length)
    translation = decode_sequence(padded_input_seq)
    print("French translation:", translation)
    input_sentence = input(prompt)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30

In [None]:
# Interactive translation with the decoding loop written out inline
# (same procedure as decode_sequence).
while True:
    input_sentence = input("Enter English sentence to translate (or type 'quit' to exit): ")
    if input_sentence.lower() == 'quit':
        break

    # Tokenize the input sentence
    input_seq = english_tokenizer.texts_to_sequences([input_sentence])
    # Pad the input sequence
    padded_input_seq = pad(input_seq, maxlen=max_english_length)

    # Encode the input sequence
    states_value = encoder_model.predict(padded_input_seq)

    # Initialize the target sequence with a start token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = french_tokenizer.word_index['<start>']

    # Translate sentence one token at a time
    translated_sentence = ''
    while True:
        # decoder_model takes four inputs: [target token, h, c, source
        # sequence]. The source sequence is needed for the attention context.
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value + [padded_input_seq]
        )

        # Get the index of the predicted word
        sampled_token_index = np.argmax(output_tokens[0, -1, :])

        # Map the index to the corresponding word; indices without a
        # vocabulary entry (e.g. the padding index 0) map to ''.
        sampled_word = reverse_french_tokenizer.get(sampled_token_index, '')

        # Break the loop if the end token (or a wordless index) is
        # encountered, or once the maximum number of words was produced.
        if (
            not sampled_word
            or sampled_word == '<end>'
            or len(translated_sentence.split()) >= max_french_length
        ):
            break

        # Append the predicted word to the translated sentence
        translated_sentence += sampled_word + ' '

        # Update the target sequence
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    print("French translation:", translated_sentence)


Enter English sentence to translate (or type 'quit' to exit): hi
French translation: aime les en en en en en en en en en en en en en en en en en en en 
Enter English sentence to translate (or type 'quit' to exit): happy
French translation: aime les en en en en en en en en en en en en en en en en en en en 
Enter English sentence to translate (or type 'quit' to exit): you are so sweet
French translation: aime les en en en en en en en en en en en en en en en en en en en 
Enter English sentence to translate (or type 'quit' to exit): like them
French translation: aime les en en en en en en en en en en en en en en en en en en en 
Enter English sentence to translate (or type 'quit' to exit): quit


In [None]:
from transformers import TFAutoModelForSeq2SeqLM, AutoTokenizer

# Load a pretrained seq2seq checkpoint and its matching tokenizer by name.
# NOTE(review): `model` rebinds the name used earlier in this notebook for the
# Keras encoder-decoder model; after this cell runs, that earlier model is no
# longer reachable as `model`.
model_name = "t5-small"  # Example model; checkpoint name passed to from_pretrained
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_name)

def translate(text, model, tokenizer):
    """Translate English ``text`` to French with a seq2seq model.

    Args:
        text: English text to translate.
        model: Object exposing ``generate(inputs, ...)``.
        tokenizer: Object exposing ``encode(text, return_tensors=...)`` and
            ``decode(ids, skip_special_tokens=...)``.

    Returns:
        str: The decoded first generated sequence, or ``""`` when ``text`` is
        empty or only whitespace (there is nothing to translate, and running
        generation on the bare task prefix would return made-up text).
    """
    if not text or not text.strip():
        return ""
    # Task prefix that selects English -> French translation.
    inputs = tokenizer.encode("translate English to French: " + text, return_tensors="tf")
    # Beam search with 4 beams, at most 40 output tokens.
    outputs = model.generate(inputs, max_length=40, num_beams=4, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example translation
# Left as the last expression of the cell so the notebook displays the
# returned string.
translate("Hello, how are you?", model, tokenizer)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


'Bonjour, comment êtes-vous?'

In [None]:
# Edge case: call translate with an empty input string.
translate("", model, tokenizer)

'hi'