In [None]:
import collections
import numpy as np
import json

from keras.utils import pad_sequences
from keras.models import Model, Sequential
from keras.layers import Input, Dense, Embedding, GRU, LSTM, Bidirectional, Dropout, Activation, TimeDistributed, RepeatVector
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
def load_data(path, num_sentences=None):
    input_file = path
    with open(input_file, "r") as f:
        data = f.read()
    sentences = data.split('\n')
    if num_sentences:
        return sentences[:num_sentences]
    return sentences

english_sentences = load_data('IITB.en-hi.en', 50000)
hindi_sentences = load_data('IITB.en-hi.hi', 50000)

In [None]:
english_sentences[:5]

['Give your application an accessibility workout',
 'Accerciser Accessibility Explorer',
 'The default plugin layout for the bottom panel',
 'The default plugin layout for the top panel',
 'A list of plugins that are disabled by default']

In [None]:
import re
hindi_sentences = [re.sub(r'[a-zA-Z]','',hi) for hi in hindi_sentences]

In [None]:
english_sentences[:5]

['Give your application an accessibility workout',
 'Accerciser Accessibility Explorer',
 'The default plugin layout for the bottom panel',
 'The default plugin layout for the top panel',
 'A list of plugins that are disabled by default']

In [None]:
def tokenize(x):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(x)
    return tokenizer.texts_to_sequences(x), tokenizer

def pad(x, length=None):
    if length is None:
        length = max([len(sentence) for sentence in x])
    return pad_sequences(x, maxlen=length, padding='post')

In [None]:
def preprocess(x,y):
    preprocess_x, x_tk = tokenize(x)
    preprocess_y, y_tk = tokenize(y)

    preprocess_x = pad(preprocess_x)
    preprocess_y = pad(preprocess_y)

    preprocess_y = preprocess_y.reshape(*preprocess_y.shape, 1)

    return preprocess_x, preprocess_y, x_tk, y_tk

preproc_english_sentences, preproc_hindi_sentences, english_tokenizer, hindi_tokenizer = preprocess(english_sentences, hindi_sentences)

max_english_sequence_length = preproc_english_sentences.shape[1]
max_hindi_sequence_length = preproc_hindi_sentences.shape[1]
english_vocab_size = len(english_tokenizer.word_index)
hindi_vocab_size = len(hindi_tokenizer.word_index)

print('Data Preprocessed')
print("Max English sentence length:", max_english_sequence_length)
print("Max Hindi sentence length:", max_hindi_sequence_length)
print("English vocabulary size:", english_vocab_size)
print("Hindi vocabulary size:", hindi_vocab_size)

Data Preprocessed
Max English sentence length: 115
Max Hindi sentence length: 115
English vocabulary size: 3736
Hindi vocabulary size: 3925


In [None]:
english_words_counter = collections.Counter([word for sentence in english_sentences for word in sentence.split()])
hindi_words_counter = collections.Counter([word for sentence in hindi_sentences for word in sentence.split()])

print('{} English words.'.format(len([word for sentence in english_sentences for word in sentence.split()])))
print('{} unique English words.'.format(len(english_words_counter)))
print('10 Most common words in the English dataset:')
print('"' + '" "'.join(list(zip(*english_words_counter.most_common(10)))[0]) + '"')

print()
print('{} Hindi words.'.format(len([word for sentence in hindi_sentences for word in sentence.split()])))
print('{} unique Hindi words.'.format(len(hindi_words_counter)))
print('10 Most common words in the Hindi dataset:')
print('"' + '" "'.join(list(zip(*hindi_words_counter.most_common(10)))[0]) + '"')

213917 English words.
4908 unique English words.
10 Most common words in the English dataset:
"the" "to" "of" "s" "a" "The" "for" "in" "file" "not"

197745 Hindi words.
3890 unique Hindi words.
10 Most common words in the Hindi dataset:
"के" "को" "है" "करें" "नहीं" "का" "में" "से" "लिए" "फ़ाइल"


In [None]:
import re

# Function to clean unwanted symbols from English
def clean_text(text):
    cleaned = re.sub(r'[^a-zA-Z\s]', '', text)  # Keep only English letters and spaces
    cleaned = re.sub(r'\s+', ' ', cleaned)      # Replace multiple spaces with single space
    return cleaned.strip()

# Function to clean unwanted symbols from Hindi
def clean_hindi_text(text):
    cleaned = re.sub(r'[^\u0900-\u097F\s]', '', text)  # Keep only Devanagari letters and spaces
    cleaned = re.sub(r'\s+', ' ', cleaned)
    return cleaned.strip()

# Assuming english_sentences and hindi_sentences are lists of strings:
english_sentences = [clean_text(sentence) for sentence in english_sentences]
hindi_sentences = [clean_hindi_text(sentence) for sentence in hindi_sentences]


In [None]:
import re

# Example: replace this with your full dataset as a string or list of strings
english_text = "the to _ of a The for in be not"
hindi_text = "के को ) करें है (_ का नहीं में लिए"

# Function to clean unwanted symbols
def clean_text(text):
    # Remove all symbols except letters and space
    cleaned = re.sub(r'[^a-zA-Z\s]', '', text)  # for English
    cleaned = re.sub(r'\s+', ' ', cleaned)  # remove extra spaces
    return cleaned.strip()

def clean_hindi_text(text):
    # Keep only Devanagari letters, Hindi numerals, and spaces
    cleaned = re.sub(r'[^\u0900-\u097F\s]', '', text)
    cleaned = re.sub(r'\s+', ' ', cleaned)
    return cleaned.strip()

# Clean the text
cleaned_english = clean_text(english_text)
cleaned_hindi = clean_hindi_text(hindi_text)

print("Cleaned English:", cleaned_english)
print("Cleaned Hindi:", cleaned_hindi)


Cleaned English: the to of a The for in be not
Cleaned Hindi: के को करें है का नहीं में लिए


In [None]:
def logits_to_text(logits, tokenizer):
    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
    index_to_words[0] = '<PAD>'

    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

In [None]:
def simple_model(input_shape, output_sequence_length, english_vocab_size, hindi_vocab_size):

    learning_rate = 0.005

    # Build the layers
    model = Sequential()
    model.add(GRU(256, input_shape=input_shape[1:], return_sequences=True))
    model.add(TimeDistributed(Dense(1024, activation='relu')))
    model.add(Dropout(0.5))
    model.add(TimeDistributed(Dense(hindi_vocab_size, activation='softmax')))

    # Compile model
    model.compile(loss = sparse_categorical_crossentropy,
                  optimizer = Adam(learning_rate),
                  metrics = ['accuracy'])

    return model

tmp_x = pad(preproc_english_sentences, max_hindi_sequence_length)
tmp_x = tmp_x.reshape((-1, preproc_hindi_sentences.shape[-2], 1))

#Train the neural network
simple_rnn_model = simple_model(
    tmp_x.shape,
    max_hindi_sequence_length,
    english_vocab_size,
    hindi_vocab_size)

simple_rnn_model.fit(tmp_x, preproc_hindi_sentences, batch_size=128, epochs=10, validation_split=0.2,verbose=1)

  super().__init__(**kwargs)


Epoch 1/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 309ms/step - accuracy: 0.9601 - loss: 0.6647 - val_accuracy: 0.9600 - val_loss: nan
Epoch 2/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 303ms/step - accuracy: 0.9668 - loss: 0.2238 - val_accuracy: 0.9597 - val_loss: nan
Epoch 3/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 303ms/step - accuracy: 0.9666 - loss: 0.2136 - val_accuracy: 0.9601 - val_loss: nan
Epoch 4/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 303ms/step - accuracy: 0.9664 - loss: 0.2075 - val_accuracy: 0.9598 - val_loss: nan
Epoch 5/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 317ms/step - accuracy: 0.9670 - loss: 0.1970 - val_accuracy: 0.9597 - val_loss: nan
Epoch 6/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 317ms/step - accuracy: 0.9673 - loss: 0.1893 - val_accuracy: 0.9597 - val_loss: nan
Epoch 7/10
[1m313/313[0

<keras.src.callbacks.history.History at 0x7e1f46c17550>

In [None]:
# Print prediction(s)
print("Prediciton:")
# Use the bidirectional model instead of the simple model
print(logits_to_text(bidirectional_model.predict(tmp_x[:1])[0], hindi_tokenizer))

print("\nCorrect Translation:")
print(cleaned_hindi_sentences[:1])


print('\nOriginal text:')
print(cleaned_english_sentences[:1])

Prediciton:


NameError: name 'bidirectional_model' is not defined

In [None]:
import collections
import numpy as np
import json
import string
import re

from keras.utils import pad_sequences
from keras.models import Model, Sequential
from keras.layers import Input, Dense, Embedding, GRU, LSTM, Bidirectional, Dropout, Activation, TimeDistributed, RepeatVector
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow as tf # Import tensorflow

# Load Data
def load_data(path, num_sentences=None):
    input_file = path
    with open(input_file, "r") as f:
        data = f.read()
    sentences = data.split('\n')
    if num_sentences:
        return sentences[:num_sentences]
    return sentences

english_sentences = load_data('IITB.en-hi.en', 50000)
hindi_sentences = load_data('IITB.en-hi.hi', 50000)

# Clean Data
def preprocess_text(text):
  text = ''.join(ch for ch in text if ch not in string.punctuation)
  text = text.lower()
  text = re.sub(r'\d','',text)
  text = re.sub(r'\s+',' ',text)
  text = text.strip()
  return text

cleaned_english_sentences = [preprocess_text(sentence) for sentence in english_sentences]
cleaned_hindi_sentences = [preprocess_text(sentence) for sentence in hindi_sentences]

# Tokenize and Pad Data
def tokenize(x):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(x)
    return tokenizer.texts_to_sequences(x), tokenizer

def pad(x, length=None):
    if length is None:
        length = max([len(sentence) for sentence in x])
    return pad_sequences(x, maxlen=length, padding='post')

def preprocess_data(x_cleaned, y_cleaned):
    preprocess_x, x_tk = tokenize(x_cleaned)
    preprocess_y, y_tk = tokenize(y_cleaned)

    # Calculate max Hindi sequence length *before* padding English
    max_hindi_sequence_length_local = max([len(sentence) for sentence in preprocess_y])

    # Pad English sequences to the length of Hindi sequences
    preprocess_x = pad(preprocess_x, max_hindi_sequence_length_local)
    preprocess_y = pad(preprocess_y)

    print("Shape of preprocessed English sentences:", preprocess_x.shape)
    print("Shape of preprocessed Hindi sentences:", preprocess_y.shape)


    return preprocess_x, preprocess_y, x_tk, y_tk

preproc_english_sentences, preproc_hindi_sentences, english_tokenizer, hindi_tokenizer = preprocess_data(cleaned_english_sentences, cleaned_hindi_sentences)

# Calculate and print data statistics
max_english_sequence_length = preproc_english_sentences.shape[1]
max_hindi_sequence_length = preproc_hindi_sentences.shape[1]
english_vocab_size = len(english_tokenizer.word_index) + 1 # Add 1 for padding token
hindi_vocab_size = len(hindi_tokenizer.word_index) + 1   # Add 1 for padding token

print('Data Preprocessed')
print("Max English sentence length:", max_english_sequence_length)
print("Max Hindi sentence length:", max_hindi_sequence_length)
print("English vocabulary size:", english_vocab_size)
print("Hindi vocabulary size:", hindi_vocab_size)

# Build the Model
def simple_model_with_embedding(input_shape, output_sequence_length, english_vocab_size, hindi_vocab_size):

    learning_rate = 0.005

    # Build the layers
    model = Sequential()
    # Add Embedding layer to convert integer sequences to dense vectors
    model.add(Embedding(input_dim=english_vocab_size, output_dim=256)) # Removed input_length
    model.add(GRU(256, return_sequences=True))
    model.add(TimeDistributed(Dense(1024, activation='relu')))
    model.add(Dropout(0.5))
    model.add(TimeDistributed(Dense(hindi_vocab_size, activation='softmax')))

    # Compile model
    model.compile(loss = sparse_categorical_crossentropy,
                  optimizer = Adam(learning_rate),
                  metrics = ['accuracy'])

    return model

# Train the Model
tmp_x_fixed = preproc_english_sentences
tmp_y_fixed = preproc_hindi_sentences

# Explicitly cast target data to int32
tmp_y_fixed = tf.cast(tmp_y_fixed, dtype=tf.int32)


# Train the updated neural network
simple_rnn_model_fixed = simple_model_with_embedding(
    tmp_x_fixed.shape,
    max_hindi_sequence_length,
    english_vocab_size,
    hindi_vocab_size)

simple_rnn_model_fixed.fit(tmp_x_fixed, tmp_y_fixed, batch_size=256, epochs=20, validation_split=0.2,verbose=1)

# Make and Print Prediction
def logits_to_text(logits, tokenizer):
    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
    index_to_words[0] = '<PAD>'

    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

print("\nPrediciton:")
print(logits_to_text(simple_rnn_model_fixed.predict(tmp_x_fixed[:1])[0], hindi_tokenizer))

print("\nCorrect Translation:")
print(cleaned_hindi_sentences[:1])

print('\nOriginal text:')
print(cleaned_english_sentences[:1])

Shape of preprocessed English sentences: (50000, 111)
Shape of preprocessed Hindi sentences: (50000, 111)
Data Preprocessed
Max English sentence length: 111
Max Hindi sentence length: 111
English vocabulary size: 3629
Hindi vocabulary size: 4863
Epoch 1/20
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 349ms/step - accuracy: 0.9433 - loss: 0.5540 - val_accuracy: 0.9572 - val_loss: 0.3693
Epoch 2/20
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 351ms/step - accuracy: 0.9716 - loss: 0.1556 - val_accuracy: 0.9583 - val_loss: 0.3822
Epoch 3/20
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 350ms/step - accuracy: 0.9785 - loss: 0.1025 - val_accuracy: 0.9584 - val_loss: 0.3879
Epoch 4/20
[1m110/313[0m [32m━━━━━━━[0m[37m━━━━━━━━━━━━━[0m [1m1:04[0m 319ms/step - accuracy: 0.9822 - loss: 0.0808

KeyboardInterrupt: 

In [None]:
def translate_sentence(input_sentence, english_tokenizer, hindi_tokenizer, model, max_hindi_sequence_length):
    # Clean the input sentence
    cleaned_sentence = preprocess_text(input_sentence)

    # Tokenize and pad the input sentence
    input_sequence = english_tokenizer.texts_to_sequences([cleaned_sentence])
    padded_input_sequence = pad(input_sequence, max_hindi_sequence_length)

    # Get the model's prediction
    prediction = model.predict(padded_input_sequence)[0]

    # Convert the prediction to text
    translated_sentence = logits_to_text(prediction, hindi_tokenizer)

    return translated_sentence

# Example usage:
user_input = "This is a test sentence"
translated_output = translate_sentence(user_input, english_tokenizer, hindi_tokenizer, bidirectional_model, max_hindi_sequence_length)
print(f"Original: {user_input}")
print(f"Translated: {translated_output}")



NameError: name 'bidirectional_model' is not defined

In [None]:
def bidirectional_embed_model(input_shape, output_sequence_length, english_vocab_size, hindi_vocab_size):

    # Hyperparameters
    learning_rate = 0.005

    # Build the layers
    model = Sequential()
    # Removed input_shape from Embedding layer as it's deprecated when using input_length
    model.add(Embedding(input_dim=english_vocab_size, output_dim=256, input_length=input_shape[1]))
    model.add(Bidirectional(GRU(256, return_sequences=True)))
    model.add(TimeDistributed(Dense(1024, activation='relu')))
    model.add(Dropout(0.5))
    model.add(TimeDistributed(Dense(hindi_vocab_size, activation='softmax')))

    # Compile model
    model.compile(loss = sparse_categorical_crossentropy,
                  optimizer = Adam(learning_rate),
                  metrics = ['accuracy'])

    return model

# Use the preprocessed English and Hindi sentences from the previous steps
tmp_x = preproc_english_sentences
tmp_y = preproc_hindi_sentences

# Build the model
bidirectional_model = bidirectional_embed_model(
    tmp_x.shape,
    max_hindi_sequence_length,
    english_vocab_size,
    hindi_vocab_size)

print(bidirectional_model.summary())

# Ensure the target data is int32
tmp_y = tf.cast(tmp_y, dtype=tf.int32)

bidirectional_model.fit(tmp_x, tmp_y, batch_size=1024, epochs=10, validation_split=0.2)



None
Epoch 1/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 601ms/step - accuracy: 0.7527 - loss: 2.5860 - val_accuracy: 0.9211 - val_loss: 0.5505
Epoch 2/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 524ms/step - accuracy: 0.9141 - loss: 0.6733 - val_accuracy: 0.9223 - val_loss: 0.4936
Epoch 3/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 551ms/step - accuracy: 0.9173 - loss: 0.6206 - val_accuracy: 0.9279 - val_loss: 0.4393
Epoch 4/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 555ms/step - accuracy: 0.9215 - loss: 0.5688 - val_accuracy: 0.9366 - val_loss: 0.3741
Epoch 5/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 560ms/step - accuracy: 0.9279 - loss: 0.5188 - val_accuracy: 0.9450 - val_loss: 0.3126
Epoch 6/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 535ms/step - accuracy: 0.9359 - loss: 0.4581 - val_accuracy: 0.9548 - val_loss: 0.2588
Epoch 7/10
[1m20

<keras.src.callbacks.history.History at 0x7cb162c00890>

In [None]:
print("\nPrediciton (Bidirectional Model):")
print(logits_to_text(bidirectional_model.predict(tmp_x[:1])[0], hindi_tokenizer))

print("\nCorrect Translation:")
print(cleaned_hindi_sentences[:1])

print('\nOriginal text:')
print(cleaned_english_sentences[:1])


Prediciton (Bidirectional Model):
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 719ms/step
अपने अनुप्रयोग को पहुंचनीयता व्यायाम का <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

Correct Translation:
['अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें']

Original text:
['give your application an accessibility workout']


In [None]:
tmp_x = preproc_english_sentences
tmp_y = preproc_hindi_sentences

# Ensure the target data is int32
tmp_y = tf.cast(tmp_y, dtype=tf.int32)

# Train the model
bidirectional_model.fit(tmp_x, tmp_y, batch_size=1024, epochs=20, validation_split=0.2)

# Check with user input
def translate_sentence(input_sentence, english_tokenizer, hindi_tokenizer, model, max_hindi_sequence_length):
    # Clean the input sentence
    cleaned_sentence = preprocess_text(input_sentence)

    # Tokenize and pad the input sentence
    input_sequence = english_tokenizer.texts_to_sequences([cleaned_sentence])
    padded_input_sequence = pad(input_sequence, max_hindi_sequence_length)

    # Get the model's prediction
    prediction = model.predict(padded_input_sequence)[0]

    # Convert the prediction to text
    translated_sentence = logits_to_text(prediction, hindi_tokenizer)

    return translated_sentence

user_input = "How are you?"
translated_output = translate_sentence(user_input, english_tokenizer, hindi_tokenizer, bidirectional_model, max_hindi_sequence_length)
print(f"\nOriginal: {user_input}")
print(f"Translated: {translated_output}")

user_input = "What is your name?"
translated_output = translate_sentence(user_input, english_tokenizer, hindi_tokenizer, bidirectional_model, max_hindi_sequence_length)
print(f"Original: {user_input}")
print(f"Translated: {translated_output}")

Epoch 1/20
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 556ms/step - accuracy: 0.9636 - loss: 0.3171 - val_accuracy: 0.9839 - val_loss: 0.1236
Epoch 2/20
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 535ms/step - accuracy: 0.9668 - loss: 0.3006 - val_accuracy: 0.9851 - val_loss: 0.1157
Epoch 3/20
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 562ms/step - accuracy: 0.9689 - loss: 0.2913 - val_accuracy: 0.9872 - val_loss: 0.1069
Epoch 4/20
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 563ms/step - accuracy: 0.9705 - loss: 0.2814 - val_accuracy: 0.9892 - val_loss: 0.1002
Epoch 5/20
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 562ms/step - accuracy: 0.9719 - loss: 0.2773 - val_accuracy: 0.9905 - val_loss: 0.0961
Epoch 6/20
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 564ms/step - accuracy: 0.9737 - loss: 0.2650 - val_accuracy: 0.9903 - val_loss: 0.0959
Epoch 7/20
[1m20/20[

In [None]:
def translate_simple_model(input_sentence, english_tokenizer, hindi_tokenizer, model, max_hindi_sequence_length):
    cleaned_sentence = preprocess_text(input_sentence)

    input_sequence = english_tokenizer.texts_to_sequences([cleaned_sentence])
    padded_input_sequence = pad(input_sequence, max_hindi_sequence_length)

    prediction = model.predict(padded_input_sequence)[0]

    translated_sentence = logits_to_text(prediction, hindi_tokenizer)

    return translated_sentence

user_input = "This is a test sentence"
translated_output = translate_simple_model(user_input, english_tokenizer, hindi_tokenizer, simple_rnn_model_fixed, max_hindi_sequence_length)
print(f"Original: {user_input}")
print(f"Translated (Simple Model): {translated_output}")

user_input = "Give your application an accessibility workout"
translated_output = translate_simple_model(user_input, english_tokenizer, hindi_tokenizer, simple_rnn_model_fixed, max_hindi_sequence_length)
print(f"Original: {user_input}")
print(f"Translated (Simple Model): {translated_output}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
Original: This is a test sentence
Translated (Simple Model): यह मौजूदा सांकेतिक कर है है <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
Original: Give your application an accessibility workout
Translated (Simple Model): अपने को को पहुंचनीयता व्यायाम का लाभ <PAD> <PAD> <PAD> 

In [None]:
import tensorflow as tf

def bidirectional_embed_model(input_shape, output_sequence_length, english_vocab_size, hindi_vocab_size):

    learning_rate = 0.005

    # Build the layers
    model = Sequential()
    # Removed input_shape from Embedding layer as it's deprecated when using input_length
    model.add(Embedding(input_dim=english_vocab_size, output_dim=256, input_length=input_shape[1]))
    model.add(Bidirectional(GRU(256, return_sequences=True)))
    model.add(TimeDistributed(Dense(1024, activation='relu')))
    model.add(Dropout(0.5))
    model.add(TimeDistributed(Dense(hindi_vocab_size, activation='softmax')))

    # Compile model
    model.compile(loss = sparse_categorical_crossentropy,
                  optimizer = Adam(learning_rate),
                  metrics = ['accuracy'])

    return model

# Use the preprocessed English and Hindi sentences from the previous steps
tmp_x = preproc_english_sentences
tmp_y = preproc_hindi_sentences

# Build the model
bidirectional_model = bidirectional_embed_model(
    tmp_x.shape,
    max_hindi_sequence_length,
    english_vocab_size,
    hindi_vocab_size)

print(bidirectional_model.summary())


tmp_y = tf.cast(tmp_y, dtype=tf.int32)

bidirectional_model.fit(tmp_x, tmp_y, batch_size=256, epochs=20, validation_split=0.2)



None
Epoch 1/20


In [None]:

def logits_to_text(logits, tokenizer):
    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
    index_to_words[0] = '<PAD>'

    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

print("\nPrediciton (Bidirectional Model):")
# Use tmp_x from the previous cell
print(logits_to_text(bidirectional_model.predict(tmp_x[:1])[0], hindi_tokenizer))

print("\nCorrect Translation:")
print(cleaned_hindi_sentences[:1])

print('\nOriginal text:')
print(cleaned_english_sentences[:1])


Prediciton (Bidirectional Model):


NameError: name 'bidirectional_model' is not defined

In [None]:
def preprocess_text(text):
  text = ''.join(ch for ch in text if ch not in string.punctuation)
  text = text.lower()
  text = re.sub(r'\d','',text)
  text = re.sub(r'\s+',' ',text)
  text = text.strip()
  return text

def translate_sentence(input_sentence, english_tokenizer, hindi_tokenizer, model, max_hindi_sequence_length):
    # Clean the input sentence
    cleaned_sentence = preprocess_text(input_sentence)

    # Tokenize and pad the input sentence
    input_sequence = english_tokenizer.texts_to_sequences([cleaned_sentence])
    padded_input_sequence = pad(input_sequence, max_hindi_sequence_length)

    # Get the model's prediction
    prediction = model.predict(padded_input_sequence)[0]

    # Convert the prediction to text
    translated_sentence = logits_to_text(prediction, hindi_tokenizer)

    return translated_sentence

# Example usage:
user_input = "This is a test sentence"
# Use the bidirectional model for translation
translated_output = translate_sentence(user_input, english_tokenizer, hindi_tokenizer, bidirectional_model, max_hindi_sequence_length)
print(f"Original: {user_input}")
print(f"Translated: {translated_output}")

# You can now modify the 'user_input' variable to test with different English sentences.

NameError: name 'english_tokenizer' is not defined

In [None]:
import collections
import numpy as np
import json
import string
import re

from keras.utils import pad_sequences
from keras.models import Model, Sequential
from keras.layers import Input, Dense, Embedding, GRU, LSTM, Bidirectional, Dropout, Activation, TimeDistributed, RepeatVector
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow as tf # Import tensorflow

# Load Data
def load_data(path, num_sentences=None):
    input_file = path
    with open(input_file, "r") as f:
        data = f.read()
    sentences = data.split('\n')
    if num_sentences:
        return sentences[:num_sentences]
    return sentences

english_sentences = load_data('IITB.en-hi.en', 50000)
hindi_sentences = load_data('IITB.en-hi.hi', 50000)

# Clean Data
def preprocess_text(text):
  text = ''.join(ch for ch in text if ch not in string.punctuation)
  text = text.lower()
  text = re.sub(r'\d','',text)
  text = re.sub(r'\s+',' ',text)  #Remove extra spaces
  text = text.strip()
  return text

cleaned_english_sentences = [preprocess_text(sentence) for sentence in english_sentences]
cleaned_hindi_sentences = [preprocess_text(sentence) for sentence in hindi_sentences]

# Tokenize and Pad Data
def tokenize(x):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(x)
    return tokenizer.texts_to_sequences(x), tokenizer

def pad(x, length=None):
    if length is None:
        length = max([len(sentence) for sentence in x])
    return pad_sequences(x, maxlen=length, padding='post')

def preprocess_data(x_cleaned, y_cleaned):
    preprocess_x, x_tk = tokenize(x_cleaned)
    preprocess_y, y_tk = tokenize(y_cleaned)

    # Calculate max Hindi sequence length *before* padding English
    max_hindi_sequence_length_local = max([len(sentence) for sentence in preprocess_y])

    # Pad English sequences to the length of Hindi sequences
    preprocess_x = pad(preprocess_x, max_hindi_sequence_length_local)
    preprocess_y = pad(preprocess_y)


    return preprocess_x, preprocess_y, x_tk, y_tk

preproc_english_sentences, preproc_hindi_sentences, english_tokenizer, hindi_tokenizer = preprocess_data(cleaned_english_sentences, cleaned_hindi_sentences)

# Calculate and print data statistics
max_english_sequence_length = preproc_english_sentences.shape[1]
max_hindi_sequence_length = preproc_hindi_sentences.shape[1]
english_vocab_size = len(english_tokenizer.word_index) + 1 # Add 1 for padding token
hindi_vocab_size = len(hindi_tokenizer.word_index) + 1   # Add 1 for padding token

print('Data Preprocessed')
print("Max English sentence length:", max_english_sequence_length)
print("Max Hindi sentence length:", max_hindi_sequence_length)
print("English vocabulary size:", english_vocab_size)
print("Hindi vocabulary size:", hindi_vocab_size)

Data Preprocessed
Max English sentence length: 111
Max Hindi sentence length: 111
English vocabulary size: 3629
Hindi vocabulary size: 4863


In [None]:
import tensorflow as tf

# Build the Bidirectional Model (from cell 64e0f865)
def bidirectional_embed_model(input_shape, output_sequence_length, english_vocab_size, hindi_vocab_size):

    # Hyperparameters
    learning_rate = 0.005

    # Build the layers
    model = Sequential()
    # Removed input_shape from Embedding layer as it's deprecated when using input_length
    model.add(Embedding(input_dim=english_vocab_size, output_dim=256, input_length=input_shape[1]))
    model.add(Bidirectional(GRU(256, return_sequences=True)))
    model.add(TimeDistributed(Dense(256, activation='relu')))
    model.add(Dropout(0.5))
    model.add(TimeDistributed(Dense(hindi_vocab_size, activation='softmax')))

    # Compile model
    model.compile(loss = sparse_categorical_crossentropy,
                  optimizer = Adam(learning_rate),
                  metrics = ['accuracy'])

    return model

# Use the preprocessed English and Hindi sentences from the previous steps
tmp_x = preproc_english_sentences
tmp_y = preproc_hindi_sentences

# Build the model
bidirectional_model = bidirectional_embed_model(
    tmp_x.shape,
    max_hindi_sequence_length,
    english_vocab_size,
    hindi_vocab_size)

print(bidirectional_model.summary())

# Train the model (from cell e23a7d5d)
# Ensure the target data is int32
tmp_y = tf.cast(tmp_y, dtype=tf.int32)

bidirectional_model.fit(tmp_x, tmp_y, batch_size=256, epochs=20, validation_split=0.2)



None
Epoch 1/20
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 429ms/step - accuracy: 0.9268 - loss: 0.8327 - val_accuracy: 0.9558 - val_loss: 0.4134
Epoch 2/20
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 404ms/step - accuracy: 0.9648 - loss: 0.2618 - val_accuracy: 0.9572 - val_loss: 0.4138
Epoch 3/20
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 404ms/step - accuracy: 0.9698 - loss: 0.2139 - val_accuracy: 0.9587 - val_loss: 0.4194
Epoch 4/20
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 407ms/step - accuracy: 0.9756 - loss: 0.1769 - val_accuracy: 0.9592 - val_loss: 0.4258
Epoch 5/20
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 405ms/step - accuracy: 0.9796 - loss: 0.1554 - val_accuracy: 0.9595 - val_loss: 0.4266
Epoch 6/20
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 404ms/step - accuracy: 0.9825 - loss: 0.1410 - val_accuracy: 0.9597 - val_loss: 0.4318
Epoch

<keras.src.callbacks.history.History at 0x785d11fb2f10>

In [None]:
# Make and Print Prediction using the bidirectional model (from cell 0d4bddb6)

def logits_to_text(logits, tokenizer):
    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
    index_to_words[0] = '<PAD>'

    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

print("\nPrediciton (Bidirectional Model):")
# Use tmp_x from the previous cell
print(logits_to_text(bidirectional_model.predict(tmp_x[:1])[0], hindi_tokenizer))

print("\nCorrect Translation:")
print(cleaned_hindi_sentences[:1])

print('\nOriginal text:')
print(cleaned_english_sentences[:1])


Prediciton (Bidirectional Model):
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
अपने अनुप्रयोग को पहुंचनीयता व्यायाम का <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

Correct Translation:
['अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें']

Original text:
['give your application an accessibility workout']


In [None]:
# prompt: save the model

bidirectional_model.save('bidirectional_translation_model.h5')


In [None]:
# prompt: code to check translation based on a saved model in h5 file

# Function to translate a new English sentence using the loaded model

def translate_sentence_from_loaded_model(input_sentence, english_tokenizer, hindi_tokenizer, model, max_hindi_sequence_length):
    # Clean the input sentence
    cleaned_sentence = preprocess_text(input_sentence) # Ensure preprocess_text is defined

    # Tokenize and pad the input sentence
    input_sequence = english_tokenizer.texts_to_sequences([cleaned_sentence])
    padded_input_sequence = pad(input_sequence, max_hindi_sequence_length) # Ensure pad is defined

    # Get the model's prediction
    prediction = model.predict(padded_input_sequence)[0]

    # Convert the prediction to text
    translated_sentence = logits_to_text(prediction, hindi_tokenizer) # Ensure logits_to_text is defined

    return translated_sentence

# Check translation based on the saved model
print("\nChecking translation with the loaded model:")

# Example user input
user_input_to_test = "This is a test translation using the saved model."

# Ensure the bidirectional_model is loaded from 'bidirectional_translation_model.h5'
# This should have been done in the preceding code block, but we add a check here
# in case the preceding block wasn't fully executed or failed.
try:
    bidirectional_model
except NameError:
    try:
        bidirectional_model = load_model('bidirectional_translation_model.h5')
        print("Model loaded successfully for testing.")
    except Exception as e:
        print(f"Error loading model for testing: {e}")
        bidirectional_model = None # Set to None if loading fails

if bidirectional_model:
    # Ensure english_tokenizer, hindi_tokenizer, and max_hindi_sequence_length are defined
    # These should be available from the data preprocessing steps in previous cells.
    try:
        translated_output_from_loaded = translate_sentence_from_loaded_model(
            user_input_to_test,
            english_tokenizer,
            hindi_tokenizer,
            bidirectional_model,
            max_hindi_sequence_length
        )
        print(f"Original (English): {user_input_to_test}")
        print(f"Translated (Hindi) from saved model: {translated_output_from_loaded}")
    except NameError as e:
         print(f"Error: {e}. Make sure english_tokenizer, hindi_tokenizer, and max_hindi_sequence_length are defined in previous cells.")
else:
    print("Model not available. Cannot perform translation test with the saved model.")



Checking translation with the loaded model:
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
Original (English): This is a test translation using the saved model.
Translated (Hindi) from saved model: यह एक की जानकारी सकता प्रयोग <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


In [None]:
# Function to translate a new English sentence using the bidirectional model (from cell f1065fa8)

def preprocess_text(text):
  text = ''.join(ch for ch in text if ch not in string.punctuation)
  text = text.lower()
  text = re.sub(r'\d','',text)
  text = re.sub(r'\s+',' ',text)  #Remove extra spaces
  text = text.strip()
  return text

def translate_sentence(input_sentence, english_tokenizer, hindi_tokenizer, model, max_hindi_sequence_length):
    # Clean the input sentence
    cleaned_sentence = preprocess_text(input_sentence)

    # Tokenize and pad the input sentence
    input_sequence = english_tokenizer.texts_to_sequences([cleaned_sentence])
    padded_input_sequence = pad(input_sequence, max_hindi_sequence_length)

    # Get the model's prediction
    prediction = model.predict(padded_input_sequence)[0]

    # Convert the prediction to text
    translated_sentence = logits_to_text(prediction, hindi_tokenizer)

    return translated_sentence

# Example usage:
user_input = "This is a test sentence"
# Use the bidirectional model for translation
translated_output = translate_sentence(user_input, english_tokenizer, hindi_tokenizer, bidirectional_model, max_hindi_sequence_length)
print(f"Original: {user_input}")
print(f"Translated: {translated_output}")

# You can now modify the 'user_input' variable to test with different English sentences.

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
Original: This is a test sentence
Translated: यह एक की <PAD> सकती <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
