<a href="https://colab.research.google.com/github/Munna-Prasad-Gupta/DL/blob/main/GRU_LanguageTranslation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd
import collections


hf = pd.read_excel('tenthou.xlsx')

hf.columns
hf.shape

(50225, 2)

In [7]:
hindi_sentences = hf['hindi']
telugu_sentences = hf['telugu']
hindi_sentences.head()

Unnamed: 0,hindi
0,आभूषणों की निंदा करना हमारा उद्देश्य नहीं है
1,हम असहयोग का उत्पीड़न सह सकते हैं पर ललनाओं के ...
2,तो भी इतना अवश्य कहेंगे कि इस तृष्णा की पूर्ति...
3,यद्यपि हमने किसी रूप हीना महिला को आभूषणों की ...
4,किन्तु शारीरिक शोभा के लिए हम तन को कितना मलिन...


In [8]:
hindi_words_counter = collections.Counter(
    word for sentence in hindi_sentences if isinstance(sentence, str) for word in sentence.lower().split()
)
telugu_words_counter = collections.Counter(
    word for sentence in telugu_sentences if isinstance(sentence, str) for word in sentence.lower().split()
)

print('{} hindi words.'.format(len([word for sentence in hindi_sentences if isinstance(sentence,str) for word in sentence.split()])))
print('{} unique hindi words.'.format(len(hindi_words_counter)))
print('10 Most common words in the Hindi dataset:')
print('"' + '" "'.join(list(zip(*hindi_words_counter.most_common(10)))[0]) + '"')
print()
print('{} telugu words.'.format(len([word for sentence in telugu_sentences if isinstance(sentence,str) for word in sentence.split()])))
print('{} unique telugu words.'.format(len(telugu_words_counter)))
print('10 Most common words in the telugu dataset:')
print('"' + '" "'.join(list(zip(*telugu_words_counter.most_common(10)))[0])+'"')

584354 hindi words.
25748 unique hindi words.
10 Most common words in the Hindi dataset:
"है" "में" "तो" "नहीं" "के" "से" "और" "की" "का" "हो"

388883 telugu words.
60400 unique telugu words.
10 Most common words in the telugu dataset:
"మరియు" "నేను" "ఈ" "కూడా" "మీరు" "కానీ" "నా" "అతను" "అతని" "చాలా"


In [11]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


tokenizer_hindi = Tokenizer()
tokenizer_telugu=Tokenizer()

In [13]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 847570824963107596
xla_global_id: -1
]


In [14]:
def word_tokenize(sentences):
    words = []
    for s in sentences:
        if isinstance(s, str):  # Check if the entry is a string
            words.append(s.split())
        else:
            words.append([])  # Append an empty list or handle as needed
    return words

# Now call the function
tokenized_hindi_words = word_tokenize(hindi_sentences)
tokenized_telugu_words = word_tokenize(telugu_sentences)

In [15]:
tokenizer_hindi.fit_on_texts(tokenized_hindi_words)

len(tokenizer_hindi.word_index)

25748

In [16]:
tokenizer_telugu.fit_on_texts(tokenized_telugu_words)

len(tokenizer_telugu.word_index)

60400

In [18]:
def sentence_to_vector(sentences,tokenizer):
  input = []
  for s in sentences:
    if isinstance(s, str):  # Check if the entry is a string
      input.append(tokenizer.texts_to_sequences([s])[0])
  return input


hindi_sentence_vectors = sentence_to_vector(hindi_sentences,tokenizer_hindi)
telugu_sentence_vectors = sentence_to_vector(telugu_sentences,tokenizer_telugu)

len(hindi_sentence_vectors)
len(telugu_sentence_vectors)

50216

In [19]:
hindi_sentence_vectors[0]
telugu_sentence_vectors[0]

[4482, 7049, 79, 2095, 70]

In [20]:
maxlength_hindi_sent =max([len(x) for x in hindi_sentence_vectors])
maxlength_telugu_sent =max([len(x) for x in telugu_sentence_vectors])

print(f"max length of hindi sentence is {maxlength_hindi_sent}")
print(f"max length of telugu sentence is {maxlength_telugu_sent}")

max length of hindi sentence is 87
max length of telugu sentence is 59


In [21]:
# max length of hindi sentence is 85
# max length of telugu sentence is 57

padded_hindi_sentences = pad_sequences(hindi_sentence_vectors,maxlen=maxlength_hindi_sent,padding='post')
padded_telugu_sentences = pad_sequences(telugu_sentence_vectors,maxlen=maxlength_telugu_sent,padding='post')

X=padded_hindi_sentences
y=padded_telugu_sentences

len(padded_hindi_sentences)
len(padded_telugu_sentences)

hindi_vocab_size = len(hindi_words_counter) + 1
telugu_vocab_size= len(telugu_words_counter) +1

print(f"hindi vocab size is {hindi_vocab_size}")
print(f"telugu vocab size is {telugu_vocab_size}")

hindi vocab size is 25749
telugu vocab size is 60401


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, TimeDistributed, Dense, Dropout, Embedding, RepeatVector
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy

# Model parameters
embedding_size = 256  # Size of the embedding vector
units = 512  # Number of GRU units
learning_rate = 0.005

def build_gru_model(input_length, output_length, hindi_vocab_size, telugu_vocab_size):
    model = Sequential()

    # Embedding layer for Hindi sentences
    model.add(Embedding(hindi_vocab_size, embedding_size, input_length=input_length))

    # GRU layer that outputs a fixed-length vector
    model.add(GRU(units))

    # Repeat the output vector to match the length of the target sequence (Telugu sentence length)
    model.add(RepeatVector(output_length))

    # GRU layer with return_sequences=True to match the Telugu sentence length (57)
    model.add(GRU(units, return_sequences=True))

    # Time-distributed dense layer to output the same length as the target (Telugu)
    model.add(TimeDistributed(Dense(1024, activation='relu')))

    # Dropout layer for regularization
    model.add(Dropout(0.5))

    # Output layer with softmax activation for predicting the Telugu vocabulary words
    model.add(TimeDistributed(Dense(telugu_vocab_size, activation='softmax')))

    # Compile the model
    model.compile(optimizer=Adam(learning_rate),
                  loss=sparse_categorical_crossentropy,
                  metrics=['accuracy'])

    return model

# Create the model with Hindi input length (85) and Telugu output length (57)
model = build_gru_model(input_length=85, output_length=59, hindi_vocab_size=hindi_vocab_size, telugu_vocab_size=telugu_vocab_size)

# Print model summary
model.summary()

# Now train the model
model.fit(X, y, batch_size=64, epochs=20, validation_split=0.2)



# Save the trained model
model.save('hindi_to_telugu_gru_model.h5')
print("Model saved as 'hindi_to_telugu_gru_model.h5'")


Epoch 1/20
[1m  2/628[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m9:47:46[0m 56s/step - accuracy: 0.2176 - loss: 10.8454 

In [24]:
from tensorflow.keras.models import load_model

# Load the saved model
model = load_model('hindi_to_telugu_gru_model.h5')
print("Model loaded successfully.")

FileNotFoundError: [Errno 2] Unable to synchronously open file (unable to open file: name = 'hindi_to_telugu_gru_model.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

In [None]:
def translate_hindi_to_telugu(model, hindi_sentence, hindi_tokenizer, telugu_tokenizer, input_length=85):
    """
    Translate a given Hindi sentence to Telugu using the trained GRU model.

    :param model: Trained GRU model
    :param hindi_sentence: The Hindi sentence to translate (string)
    :param hindi_tokenizer: Tokenizer used for Hindi language
    :param telugu_tokenizer: Tokenizer used for Telugu language
    :param input_length: Max length of the Hindi sentence (default is 85)
    :return: Translated Telugu sentence (string)
    """
    # Tokenize and pad the Hindi sentence
    hindi_sequence = hindi_tokenizer.texts_to_sequences([hindi_sentence])
    hindi_padded = pad_sequences(hindi_sequence, maxlen=input_length, padding='post')

    # Get the model's prediction
    predicted_logits = model.predict(hindi_padded)

    # Convert logits to Telugu sentence
    translated_sentence = logits_to_text(predicted_logits[0], telugu_tokenizer)

    return translated_sentence

def logits_to_text(logits, tokenizer):
    """
    Convert predicted logits to a human-readable sentence using the tokenizer.

    :param logits: The predicted output from the model (logits)
    :param tokenizer: The tokenizer for the target language (Telugu)
    :return: Decoded sentence (string)
    """
    index_to_word = {id: word for word, id in tokenizer.word_index.items()}
    decoded_sentence = ' '.join([index_to_word.get(np.argmax(logit), '') for logit in logits if np.argmax(logit) > 0])
    return decoded_sentence.strip()

# Example usage:
# Assuming the model has been trained and hindi_tokenizer and telugu_tokenizer are available
hindi_sentence = "आपका नाम क्या है?"  # Example Hindi sentence
translated_sentence = translate_hindi_to_telugu(model, hindi_sentence, hindi_tokenizer, telugu_tokenizer)

print("Translated Telugu sentence:", translated_sentence)
