In [None]:
import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = ':https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F200079%2F441417%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240425%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240425T062030Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D9bc56eedd5af96acd65978cc9080b5da89869d8a53c1c5b7df172a11ac719171b19e5c01e80f045c05de4a8d4ff061516482d21801b2b106792437b2f85b94ce5b64f30c8641e803a9a034ab4868f7f85e5c818ec3b87caf257ed88c73b5eb2d9dbdaaccf30b05cfe787d086cf1b9aaf7c21bf1fc513872d0362f8b34d2866751a63b6605321f9be55e60271b9a0158310bad6da9595fa8c6ce2b48614c37dbaa7af4f31aa652645dad3048e93ca27e167054bfffdb348cc640d84f7742707419db58bb0cae16513ccfed993529a3af27dd9a5a30abcb2d47dc224904ab8c311caef281c35900d8147014f3857aa94845ac699e6cc76db2e6770df82257565ba'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading , 14623176 bytes compressed
Downloaded and uncompressed: 
Data source import complete.


In [None]:


import numpy as np
import pandas as pd

import os
import string
from string import digits
import matplotlib.pyplot as plt
%matplotlib inline
import re

import seaborn as sns
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model

print(os.listdir("../input"))

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth',0)

# Any results you write to the current directory are saved as output.

['Hindi_English_Truncated_Corpus.csv']


In [None]:
lines=pd.read_csv("../input/Hindi_English_Truncated_Corpus.csv",encoding='utf-8')

In [None]:
lines['source'].value_counts()

source
tides        50000
ted          39881
indic2012    37726
Name: count, dtype: int64

In [None]:
lines=lines[lines['source']=='ted']

In [None]:
lines.head(20)

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what needs to be done.,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह करने कि अनुमति नहीं है ."
1,ted,"I'd like to tell you about one such child,","मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहूंगी,"
3,ted,what we really mean is that they're bad at not paying attention.,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
7,ted,"And who are we to say, even, that they are wrong",और हम होते कौन हैं यह कहने भी वाले कि वे गलत हैं
13,ted,So there is some sort of justice,तो वहाँ न्याय है
23,ted,This changed slowly,धीरे धीरे ये सब बदला
26,ted,were being produced.,उत्पन्न नहीं कि जाती थी.
30,ted,"And you can see, this LED is going to glow.","और जैसा आप देख रहे है, ये एल.ई.डी. जल उठेगी।"
32,ted,"to turn on the lights or to bring him a glass of water,","लाईट जलाने के लिए या उनके लिए पानी लाने के लिए,"
35,ted,Can you imagine saying that?,क्या आप ये कल्पना कर सकते है


In [None]:
pd.isnull(lines).sum()

source              0
english_sentence    0
hindi_sentence      0
dtype: int64

In [None]:
lines=lines[~pd.isnull(lines['english_sentence'])]

In [None]:
lines.drop_duplicates(inplace=True)

* ### Let us pick any 25000 rows from the dataset.

In [None]:
lines=lines.sample(n=25000,random_state=42)
lines.shape

(25000, 3)

In [None]:

lines['english_sentence']=lines['english_sentence'].apply(lambda x: x.lower())
lines['hindi_sentence']=lines['hindi_sentence'].apply(lambda x: x.lower())

In [None]:

lines['english_sentence']=lines['english_sentence'].apply(lambda x: re.sub("'", '', x))
lines['hindi_sentence']=lines['hindi_sentence'].apply(lambda x: re.sub("'", '', x))

In [None]:
exclude = set(string.punctuation)

lines['english_sentence']=lines['english_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
lines['hindi_sentence']=lines['hindi_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

In [None]:

remove_digits = str.maketrans('', '', digits)
lines['english_sentence']=lines['english_sentence'].apply(lambda x: x.translate(remove_digits))
lines['hindi_sentence']=lines['hindi_sentence'].apply(lambda x: x.translate(remove_digits))

lines['hindi_sentence'] = lines['hindi_sentence'].apply(lambda x: re.sub("[२३०८१५७९४६]", "", x))

lines['english_sentence']=lines['english_sentence'].apply(lambda x: x.strip())
lines['hindi_sentence']=lines['hindi_sentence'].apply(lambda x: x.strip())
lines['english_sentence']=lines['english_sentence'].apply(lambda x: re.sub(" +", " ", x))
lines['hindi_sentence']=lines['hindi_sentence'].apply(lambda x: re.sub(" +", " ", x))


In [None]:

lines['hindi_sentence'] = lines['hindi_sentence'].apply(lambda x : 'START_ '+ x + ' _END')

In [None]:
lines.head()

Unnamed: 0,source,english_sentence,hindi_sentence
82040,ted,we still dont know who her parents are who she is,START_ हम अभी तक नहीं जानते हैं कि उसके मातापिता कौन हैं वह कौन है _END
85038,ted,no keyboard,START_ कोई कुंजीपटल नहीं _END
58018,ted,but as far as being a performer,START_ लेकिन एक कलाकार होने के साथ _END
74470,ted,and this particular balloon,START_ और यह खास गुब्बारा _END
122330,ted,and its not as hard as you think integrate climate solutions into all of your innovations,START_ और जितना आपको लगता है यह उतना कठिन नहीं हैअपने सभी नवाचारों में जलवायु समाधान को एकीकृत करें _END


In [None]:

all_eng_words=set()
for eng in lines['english_sentence']:
    for word in eng.split():
        if word not in all_eng_words:
            all_eng_words.add(word)

all_hindi_words=set()
for hin in lines['hindi_sentence']:
    for word in hin.split():
        if word not in all_hindi_words:
            all_hindi_words.add(word)

In [None]:
len(all_eng_words)

14030

In [None]:
all_eng_words

{'rebellion',
 'coopt',
 'screened',
 'shelter',
 'instances',
 'pollen',
 'million',
 'intervene',
 'variety',
 'cents”',
 'um',
 'traffickers',
 'spector',
 'case”',
 'own',
 'align',
 'described',
 'coming”',
 'started',
 'occasion',
 'mindshift',
 'hut',
 'adrenaline',
 'azores',
 'defect',
 'communicating',
 'trendy',
 'mystical',
 'impaired',
 'percolator',
 'surgery',
 'lanka',
 'scaled',
 'visibility',
 'povertystricken',
 'congratulate',
 'synthesize',
 'borlaug',
 'selfdriving',
 'possibility',
 'mainstream',
 '“like',
 'accent',
 'uncapitalized',
 'sky',
 'friction',
 'subject',
 'selection',
 'invent',
 'wired',
 'torturing',
 'burglar',
 'prepaid',
 'too”',
 'vancouvers',
 'pixie”',
 'proven',
 'published',
 'resigned',
 'diamond',
 'greeks',
 'discoveries',
 'businesses',
 'become',
 'gaming',
 'calcify',
 'consolidated',
 'academically',
 'dawning',
 'pleasure',
 'buses',
 'near',
 'shiny',
 'four',
 'childlike',
 'vain',
 'incoming',
 'implement',
 'block',
 'democracie

In [None]:
len(all_hindi_words)

17540

In [None]:
all_hindi_words

{'संस्थानों',
 'कनाडा',
 'कोषाणुओं',
 'बोलने',
 'प्रबन्धन',
 'क्रम',
 'नियमो',
 'अन्याय',
 'आईआईटी',
 'जहाज',
 'pollen',
 'सुनो।',
 'ताक',
 'पढे',
 'सेण्टीमीटर',
 'स्पेक्टर',
 'सुसस्किंद',
 'आत्म',
 'मॉनिटर',
 'तीव्रबुद्धि',
 'अवसंरचना',
 'ग्रन्थ',
 'सोनो',
 'जिए',
 'मुख्यत',
 'महज',
 'प्रकाशनाधिकार',
 'मिथेन',
 'दोग्वूड्स',
 '“नैतिक',
 'चाहूँगी',
 'सान्त्वना',
 'सुगंध',
 'जुड़ती',
 'मांगो',
 'चुनें',
 'दिखने',
 'ग्रेट',
 'पूंजीवाद',
 'नामुमकिन',
 'निपटे',
 'दायीं',
 'चैलेंज',
 '“हमने',
 'बहु',
 'स्नेह',
 'कुरेद',
 'सिखायेंगी।',
 'ट्रायंगल',
 'रिपोर्टिंगसिस्टम',
 'प्रबल',
 'भविष्य',
 'बनाता',
 'इसीलिए',
 'लड़ने',
 'हवाना',
 'युवक',
 'सामयिक',
 'परमिट',
 'नर्सों',
 'रुकुंगा',
 'बदले',
 'सौदे',
 'रंजनी',
 'नज़दीक',
 'ताज़िंदगी',
 'चमचों',
 'नीला',
 'लगी',
 'मारमार',
 '“विकलांग”',
 'ब्राज़ील',
 'बग',
 'प्रमुक्ख',
 'डरावना',
 'बायोलोजिस्ट',
 'न्यूयार्क',
 'ह्म्म',
 'कीजिएगा',
 'शुगरप्लेट',
 'विस्व',
 'चुंबक।',
 'अजीबोगरीब',
 'राष्ट्रव्यापी',
 'फुटबाल',
 'प्रतिबिंबित',
 'शुरु',
 'मजमा',
 'ट

In [None]:
lines['length_eng_sentence']=lines['english_sentence'].apply(lambda x:len(x.split(" ")))
lines['length_hin_sentence']=lines['hindi_sentence'].apply(lambda x:len(x.split(" ")))

In [None]:
lines.head()

Unnamed: 0,source,english_sentence,hindi_sentence,length_eng_sentence,length_hin_sentence
82040,ted,we still dont know who her parents are who she is,START_ हम अभी तक नहीं जानते हैं कि उसके मातापिता कौन हैं वह कौन है _END,11,16
85038,ted,no keyboard,START_ कोई कुंजीपटल नहीं _END,2,5
58018,ted,but as far as being a performer,START_ लेकिन एक कलाकार होने के साथ _END,7,8
74470,ted,and this particular balloon,START_ और यह खास गुब्बारा _END,4,6
122330,ted,and its not as hard as you think integrate climate solutions into all of your innovations,START_ और जितना आपको लगता है यह उतना कठिन नहीं हैअपने सभी नवाचारों में जलवायु समाधान को एकीकृत करें _END,16,20


In [None]:
lines[lines['length_eng_sentence']>30].shape

(0, 5)

In [None]:
lines=lines[lines['length_eng_sentence']<=20]
lines=lines[lines['length_hin_sentence']<=20]

In [None]:
lines

Unnamed: 0,source,english_sentence,hindi_sentence,length_eng_sentence,length_hin_sentence
82040,ted,we still dont know who her parents are who she is,START_ हम अभी तक नहीं जानते हैं कि उसके मातापिता कौन हैं वह कौन है _END,11,16
85038,ted,no keyboard,START_ कोई कुंजीपटल नहीं _END,2,5
58018,ted,but as far as being a performer,START_ लेकिन एक कलाकार होने के साथ _END,7,8
74470,ted,and this particular balloon,START_ और यह खास गुब्बारा _END,4,6
122330,ted,and its not as hard as you think integrate climate solutions into all of your innovations,START_ और जितना आपको लगता है यह उतना कठिन नहीं हैअपने सभी नवाचारों में जलवायु समाधान को एकीकृत करें _END,16,20
...,...,...,...,...,...
49566,ted,using either image recognition or marker technology,START_ छवि मान्यता या मार्कर प्रौद्योगिकी का इस्तेमाल करते हुए _END,7,11
118399,ted,and theyve started doing dna tests on kids,START_ और उन्होंने बच्चो पर dna परीक्षण शुरू कर दिये है _END,8,12
20473,ted,so there is not a lot of competition,START_ तो ज्यादा प्रतियोगिता नहीं है _END,8,7
20729,ted,a woman with indefatigable stamina,START_ एक अजेय बलवाली महिला _END,5,6


In [None]:
lines.shape

(24774, 5)

In [None]:
print("maximum length of Hindi Sentence ",max(lines['length_hin_sentence']))
print("maximum length of English Sentence ",max(lines['length_eng_sentence']))

maximum length of Hindi Sentence  20
maximum length of English Sentence  20


In [None]:
max_length_src=max(lines['length_hin_sentence'])
max_length_tar=max(lines['length_eng_sentence'])

In [None]:
print(max_length_src)

20


In [None]:
input_words = sorted(list(all_eng_words))
target_words = sorted(list(all_hindi_words))
num_encoder_tokens = len(all_eng_words)
num_decoder_tokens = len(all_hindi_words)
num_encoder_tokens, num_decoder_tokens

(14030, 17540)

In [None]:
num_decoder_tokens += 1


In [None]:
input_token_index = dict([(word, i+1) for i, word in enumerate(input_words)])
target_token_index = dict([(word, i+1) for i, word in enumerate(target_words)])

In [None]:
reverse_input_char_index = dict((i, word) for word, i in input_token_index.items())
reverse_target_char_index = dict((i, word) for word, i in target_token_index.items())

In [None]:
lines = shuffle(lines)
lines.head(10)

Unnamed: 0,source,english_sentence,hindi_sentence,length_eng_sentence,length_hin_sentence
11167,ted,chris anderson that was fascinating how do you reconcile,START_ क्रिस एंडरसन यह बहुत आकर्षक था आप कैसे सामंजस्य करते है _END,9,13
82746,ted,im now going to talk about feasibility,START_ अब मैं बात करूगा व्यवहार्यता की _END,7,8
16629,ted,physical company,START_ शारीरिक साथ _END,2,4
112294,ted,into the graphic formats where you can instantly understand them,START_ जहां आप उन्हें आसानी से समझ सकते हैं में मदद करता है। _END,10,14
95010,ted,something i truly believe,START_ ये ऐसा कुछ है जिसमें मैं सच में विश्वास रखती हूँ। _END,4,13
77853,ted,is that abraham is not just a figure out of a book for those people,START_ अब्राहम महज एक किताबी किरदार नहीं है इन लोगों के लिये _END,15,13
3894,ted,its to culture that we should look,START_ हमें मार्गदर्शन सांत्वना और नैतिकता के लिये _END,7,9
112581,ted,sh well i think once you admit,START_ सैम हैरिस देखिये मैं समझता हूँ कि जब आप मानते हैं _END,7,13
81297,ted,but this story only happened a few years ago,START_ पर यह कहानी अभी कुछ साल पहले की ही है _END,9,12
118683,ted,put poison in it closed it up put it back on the shelf,START_ की उसमे विष भर दिया उसे बंद किया और उसे दराज में वापस रख दिया _END,13,17


### Split the data into train and test

In [None]:
X, y = lines['english_sentence'], lines['hindi_sentence']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,random_state=42)
X_train.shape, X_test.shape

((19819,), (4955,))

In [None]:
X_train

25870     youre going to drink a cup of tea                        
50164     its also understanding                                   
87547     like abraham moses                                       
108331    having a sense of awareness                              
61667     about a more sustainable future of aviation              
                             ...                                   
41740     and so patient capital also works                        
42251     but modernization actually brought communication         
52798     last year we earned about odd million dollars            
15843     a blind person driving a vehicle safely and independently
120899    there is no reason why patients doctors and nurses       
Name: english_sentence, Length: 19819, dtype: object

In [None]:
y_train

25870     START_ एक कप चाय पीयेंगे _END                                           
50164     START_ और ये समज्दारिभी है _END                                         
87547     START_ जैसे कि अब्राहम मुसा _END                                        
108331    START_ मन में जागरूकता का भाव जगाने का संबंध _END                       
61667     START_ उड्डयन के एक अधिक टिकाऊ भविष्य के बारे में । _END                
                                    ...                                           
41740     START_ और इसलिये धैर्यवान पूँजी कुछ हद तक _END                          
42251     START_ लेकिन आधुनिकीकरण वास्तव में संचार लाया _END                      
52798     START_ पिछ्ले साल हमने करीब दो करोड डॉलर की आमदनी की _END               
15843     START_ एक नेत्रहीन व्यक्ति का सुरक्षित और स्वतंत्र रूप से कार चलाना _END
120899    START_ कोई कारण नहीं है की मरीज डॉक्टरों और नर्स _END                   
Name: hindi_sentence, Length: 19819, dtype: object

### Let us save this data

In [None]:
X_train.to_pickle('X_train.pkl')
X_test.to_pickle('X_test.pkl')


In [None]:
def generate_batch(X = X_train, y = y_train, batch_size = 128):
    ''' Generate a batch of data '''
    while True:
        for j in range(0, len(X), batch_size):
            encoder_input_data = np.zeros((batch_size, max_length_src),dtype='float32')
            decoder_input_data = np.zeros((batch_size, max_length_tar),dtype='float32')
            decoder_target_data = np.zeros((batch_size, max_length_tar, num_decoder_tokens),dtype='float32')
            for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i, t] = input_token_index[word]
                for t, word in enumerate(target_text.split()):
                    if t<len(target_text.split())-1:
                        decoder_input_data[i, t] = target_token_index[word]
                    if t>0:

                        decoder_target_data[i, t - 1, target_token_index[word]] = 1.
            yield([encoder_input_data, decoder_input_data], decoder_target_data)

### Encoder-Decoder Architecture

In [None]:
latent_dim=300

In [None]:
# Encoder
encoder_inputs = Input(shape=(None,))
enc_emb =  Embedding(num_encoder_tokens, latent_dim, mask_zero = True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)

encoder_states = [state_h, state_c]

In [None]:

decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(num_decoder_tokens, latent_dim, mask_zero = True)
dec_emb = dec_emb_layer(decoder_inputs)

decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [None]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [None]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, None, 300)            4209000   ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, None, 300)            5262300   ['input_2[0][0]']             
                                                                                              

In [None]:
train_samples = len(X_train)
val_samples = len(X_test)
batch_size = 128
epochs = 100

In [None]:
model.fit(x=generate_batch(X_train, y_train, batch_size=batch_size),
          steps_per_epoch=train_samples//batch_size,
          epochs=epochs,
          validation_data=generate_batch(X_test, y_test, batch_size=batch_size),
          validation_steps=val_samples//batch_size)


Epoch 1/100
  1/154 [..............................] - ETA: 59:09 - loss: 9.7723

InvalidArgumentError: Graph execution error:

Detected at node model/embedding/embedding_lookup defined at (most recent call last):
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main

  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code

  File "/usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py", line 37, in <module>

  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 992, in launch_instance

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelapp.py", line 619, in start

  File "/usr/local/lib/python3.10/dist-packages/tornado/platform/asyncio.py", line 195, in start

  File "/usr/lib/python3.10/asyncio/base_events.py", line 603, in run_forever

  File "/usr/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once

  File "/usr/lib/python3.10/asyncio/events.py", line 80, in _run

  File "/usr/local/lib/python3.10/dist-packages/tornado/ioloop.py", line 685, in <lambda>

  File "/usr/local/lib/python3.10/dist-packages/tornado/ioloop.py", line 738, in _run_callback

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 825, in inner

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 786, in run

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelbase.py", line 377, in dispatch_queue

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 250, in wrapper

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 748, in __init__

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 786, in run

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelbase.py", line 361, in process_one

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 234, in wrapper

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelbase.py", line 261, in dispatch_shell

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 234, in wrapper

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelbase.py", line 539, in execute_request

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 234, in wrapper

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/ipkernel.py", line 302, in do_execute

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/zmqshell.py", line 539, in run_cell

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 2975, in run_cell

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3030, in _run_cell

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/async_helpers.py", line 78, in _pseudo_sync_runner

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3257, in run_cell_async

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3473, in run_ast_nodes

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code

  File "<ipython-input-47-7d99fa8d71e3>", line 1, in <cell line: 1>

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1807, in fit

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1401, in train_function

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1384, in step_function

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1373, in run_step

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1150, in train_step

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 590, in __call__

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/base_layer.py", line 1149, in __call__

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/functional.py", line 515, in call

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/functional.py", line 672, in _run_internal_graph

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/base_layer.py", line 1149, in __call__

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/keras/src/layers/core/embedding.py", line 272, in call

indices[1,0] = 14030 is not in [0, 14030)
	 [[{{node model/embedding/embedding_lookup}}]] [Op:__inference_train_function_13317]

In [None]:
model.save_weights('nmt_weights.h5')

In [None]:
# Encode the input sequence to get the "thought vectors"
encoder_model = Model(encoder_inputs, encoder_states)

# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2= dec_emb_layer(decoder_inputs)

# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2) # A dense softmax layer to generate prob dist. over the target vocabulary

# Final decoder model
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)


In [None]:
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)
    target_seq = np.zeros((1,1))
    target_seq[0, 0] = target_token_index['START_']

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += ' '+sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '_END' or
           len(decoded_sentence) > 50):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return decoded_sentence

In [None]:
train_gen = generate_batch(X_train, y_train, batch_size = 1)
k=-1


In [None]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Hindi Translation:', decoded_sentence[:-4])

Input English sentence: youre going to drink a cup of tea
Actual Hindi Translation:  एक कप चाय पीयेंगे 
Predicted Hindi Translation:  हैं। किसी मे इस्तेमाल इस्तेमाल इस्तेमाल कर 


In [None]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Hindi Translation:', decoded_sentence[:-4])

Input English sentence: its also understanding
Actual Hindi Translation:  और ये समज्दारिभी है 
Predicted Hindi Translation:  


In [None]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Hindi Translation:', decoded_sentence[:-4])

Input English sentence: like abraham moses
Actual Hindi Translation:  जैसे कि अब्राहम मुसा 
Predicted Hindi Translation:  


In [None]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Hindi Translation:', decoded_sentence[:-4])

Input English sentence: having a sense of awareness
Actual Hindi Translation:  मन में जागरूकता का भाव जगाने का संबंध 
Predicted Hindi Translation:  एक 


In [None]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Hindi Translation:', decoded_sentence[:-4])

Input English sentence: about a more sustainable future of aviation
Actual Hindi Translation:  उड्डयन के एक अधिक टिकाऊ भविष्य के बारे में । 
Predicted Hindi Translation:  
