In [1]:
import keras

In [2]:
import numpy as np
import tensorflow as tf

In [3]:
from tensorflow.keras.layers import Input,GRU,Dense,Embedding,Bidirectional

In [4]:
from keras.models import Model

In [5]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [139]:
inputs = []
outputs = []

data_file = open('spa.txt', encoding='utf-8')

count = 0
for line in data_file:
    count += 1
    if count > 7000:
        break
    if '\t' not in line:
        continue
    ip, temp_op, extra = line.rstrip().split('\t')
    op = temp_op
    inputs.append(ip)
    outputs.append(op)

In [140]:
inputs1 = []
outputs1 = []

data_file = open('spa.txt', encoding='utf-8')

count1 = 4000
for line in data_file:
    count1 += 1
    if count1 > 4200:
        break
    if '\t' not in line:
        continue
    ip1, temp_op1, extra1 = line.rstrip().split('\t')
    op1 = temp_op1
    inputs1.append(ip)
    outputs1.append(op)

In [141]:
inputs1=[sentence.lower() for sentence in inputs1]
outputs1=[sentence.lower() for sentence in outputs1]

In [142]:
inputs=[sentence.lower() for sentence in inputs]
outputs=[sentence.lower() for sentence in outputs]

In [143]:
outputs = ["<start>"+" "+sentence+" "+"<end>" for sentence in outputs]

In [144]:
outputs1 = ["<start>"+" "+sentence+" "+"<end>" for sentence in outputs1]

In [145]:
EnglishTokenizer=Tokenizer()
EnglishTokenizer.fit_on_texts(inputs)
inp_sequences=EnglishTokenizer.texts_to_sequences(inputs)
max_inp_len=max(len(i) for i in inp_sequences)
src_sequences=pad_sequences(inp_sequences,maxlen=max_inp_len,padding="post")
Englishword2index=EnglishTokenizer.word_index
Englishindex2word=EnglishTokenizer.index_word

In [146]:
test_inp_sequences=EnglishTokenizer.texts_to_sequences(inputs1)
test_inp_sequences=pad_sequences(test_inp_sequences,maxlen=max_inp_len,padding="post")

In [147]:
SpanishTokenizer=Tokenizer()
SpanishTokenizer.fit_on_texts(outputs)
op_sequences=SpanishTokenizer.texts_to_sequences(outputs)
max_tar_len=max(len(i) for i in op_sequences)
tar_sequences=pad_sequences(op_sequences,maxlen=max_tar_len,padding="post")
Spanishword2index=SpanishTokenizer.word_index
Spanishindex2word=SpanishTokenizer.index_word

In [148]:
test_out_sequences=SpanishTokenizer.texts_to_sequences(outputs1)
test_out_sequences=pad_sequences(test_out_sequences,maxlen=max_tar_len,padding="post")

In [149]:
src_vocab_size=len(Englishword2index)+1
trg_vocab_size=len(Spanishword2index)+1
print("src_vocab_size:",src_vocab_size)
print("tar_vocab_size:",trg_vocab_size)

src_vocab_size: 1807
tar_vocab_size: 3913


In [150]:
test_inp_sequences.shape,test_out_sequences.shape

((200, 5), (200, 10))

In [151]:
print("max_inp_len:",max_inp_len)
print("max_tar_len:",max_tar_len)

max_inp_len: 5
max_tar_len: 10


In [152]:
gru_units=100
embed_size=100

## Word Embedding(Glove)

In [153]:
from numpy import asarray, zeros

embeddings_dict = dict()

glove_file = open('glove.6B.100d.txt', encoding='utf8')

for line in glove_file:
    records = line.split()
    word = records[0]
    vector = asarray(records[1:], dtype='float32')
    embeddings_dict[word] = vector

glove_file.close()

embedding_matrix = zeros((src_vocab_size,embed_size))

for word, index in Englishword2index.items():
    embedding_vector = embeddings_dict.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [154]:
embedding_matrix.shape

(1807, 100)

## Bahdanau Attention 

In [155]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)
        
    def call(self, inputs):
        query,values=inputs
        query_with_time_axis = tf.expand_dims(query, 1)
        score1=self.W1(query_with_time_axis)
        score2=self.W2(values)
        combined_score=tf.nn.tanh(score1 + score2)
        score = self.V(combined_score)
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector

## Encoder Decoder Architecture with attention

In [156]:
class EncoderDecoder(tf.keras.Model):
    def __init__(self,max_inp_len,max_tar_len,inp_vocab_size,tar_vocab_size,gru_units,embed_size,embedding_matrix):
        super(EncoderDecoder,self).__init__()
        self.encoder_embed=Embedding(inp_vocab_size,embed_size,input_length=max_inp_len,weights=[embedding_matrix])
        
        self.encoder_gru=Bidirectional(GRU(gru_units,return_sequences=True,return_state=True))
        self.encoder_fc=Dense(gru_units)
        
        self.decoder_embed=Embedding(tar_vocab_size,embed_size,input_length=max_tar_len)
        self.decoder_gru=GRU(gru_units,return_sequences=True,return_state=True)
        self.decoder_fc=Dense(tar_vocab_size,activation="softmax")
        self.attention=BahdanauAttention(gru_units)
        
    def call(self,inputs):
        encoder_input,decoder_input=inputs
        encoder_embed=self.encoder_embed(encoder_input)
        encoder_op,forward,backward=self.encoder_gru(encoder_embed)
        hidden = tf.tanh(self.encoder_fc(tf.concat([forward,backward], axis = -1)))
        

        decoder_embed=self.decoder_embed(decoder_input)
        context_vector=self.attention([hidden,encoder_op])
        context_vector=tf.expand_dims(context_vector,1)
        context_vector=tf.tile(context_vector,[1,tf.shape(decoder_embed)[1],1])
        encoder_op=tf.transpose(encoder_op,perm=(0,2,1))
        decoder_combined=tf.matmul(context_vector,encoder_op)
                
        
        weighted=tf.concat([decoder_embed, context_vector],axis=2)
        
        decoder_op,_=self.decoder_gru(weighted,initial_state=hidden)
        
        ouput=tf.concat([decoder_op,decoder_combined,decoder_embed],axis=2)
        
        dec_op=self.decoder_fc(ouput)
        
        return dec_op

In [157]:
model=EncoderDecoder(max_inp_len,max_tar_len,src_vocab_size,trg_vocab_size,gru_units,embed_size,embedding_matrix)

In [158]:
model.compile(optimizer='adam',loss="categorical_crossentropy",metrics=['accuracy'])

In [159]:
tar_one_hot=tf.one_hot(tar_sequences,depth=trg_vocab_size)

In [160]:
model.fit([src_sequences,tar_sequences],tar_one_hot,epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x17ac7ce8280>

In [161]:
model.summary()

Model: "encoder_decoder_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     multiple                  180700    
                                                                 
 bidirectional_2 (Bidirecti  multiple                  121200    
 onal)                                                           
                                                                 
 dense_43 (Dense)            multiple                  20100     
                                                                 
 embedding_5 (Embedding)     multiple                  391300    
                                                                 
 gru_5 (GRU)                 multiple                  120600    
                                                                 
 dense_44 (Dense)            multiple                  806078    
                                                 

In [162]:
test_one_hot=tf.one_hot(test_out_sequences,depth=trg_vocab_size)

In [163]:
loss,accuracy=model.evaluate([test_inp_sequences,test_out_sequences],test_one_hot)



In [164]:
new_english_text = "what is my name."

In [165]:
new_english_text=new_english_text+" "+"end"

In [166]:
sample=EnglishTokenizer.texts_to_sequences([new_english_text])
sample=pad_sequences(sample,maxlen=max_inp_len,padding="post")

In [167]:
sample

array([[  60,    5,   26, 1120,    0]])

In [172]:
Spanishindex2word[100]

'todo'

In [174]:
trans = []
tar_seq1 = np.array([[Spanishword2index['start']]])

for i in range(max_tar_len):
    translated1 = model.predict([sample, tar_seq1])
    p = np.argmax(translated1[0, 0, :])
    translated_text = Spanishindex2word[p]
    trans.append(translated_text)
    tar_seq1 = np.array([[p]])  
    
    sample = np.append(sample, p)
    
    if translated_text == '<end>': 
        break

print("Translated sentence:")
print(" ".join(trans))




ValueError: Data cardinality is ambiguous:
  x sizes: 6, 1
Make sure all arrays contain the same number of samples.

In [173]:
trans = []
tar_seq1 = np.array([[Spanishword2index['todo']]])

for i in range(max_tar_len):
    translated1 = model.predict([sample, tar_seq1])
    p = np.argmax(translated1[0, 0, :])
    translated_text = Spanishindex2word[p]
    trans.append(translated_text)
    tar_seq1 = np.array([[p]]) 
    if translated_text == 'end': 
        break

print("Translated sentence:")
print(" ".join(trans))


Translated sentence:
todo todo todo todo todo todo todo todo todo todo


In [170]:
trans

['start',
 'start',
 'start',
 'start',
 'start',
 'start',
 'start',
 'start',
 'start',
 'start']

In [105]:
encoder_inputs=Input(shape=(max_inp_len,))
encoder_embed=model.layers[0](encoder_inputs)
encoder_op,forward,backward=model.layers[1](encoder_embed)
hidden=tf.nn.tanh(model.layers[2]((tf.concat([forward,backward], axis = -1))))
encoder_model=Model(encoder_inputs,[encoder_op,hidden])

In [106]:
encoder_model.summary()

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_9 (InputLayer)        [(None, 4)]                  0         []                            
                                                                                                  
 embedding (Embedding)       (None, 4, 100)               97800     ['input_9[0][0]']             
                                                                                                  
 bidirectional (Bidirection  [(None, 4, 200),             121200    ['embedding[1][0]']           
 al)                          (None, 100),                                                        
                              (None, 100)]                                                        
                                                                                            

In [112]:
decoder_inputs=Input(shape=(max_tar_len,))
decoder_embed=model.layers[3](decoder_inputs)
print("decoder_embed:",decoder_embed.shape)
attention=BahdanauAttention(gru_units)
context_vector=attention([hidden,encoder_op])
context_vector=tf.expand_dims(context_vector,1)
context_vector=tf.tile(context_vector,[1,tf.shape(decoder_embed)[1],1])
print("context_vector:",context_vector.shape)
encoder_op=tf.transpose(encoder_op,perm=(0,2,1))
print("encoder_op:",encoder_op.shape)
decoder_combined=tf.matmul(context_vector,encoder_op)
print("decoder_combined:",decoder_combined.shape)
weighted=tf.concat([decoder_embed, context_vector],axis=2)
print("weighted:",weighted.shape)
decoder_op,_=model.layers[4](weighted,initial_state=hidden)
print("decoder_op:",decoder_op.shape)
ouput=tf.concat([decoder_op,decoder_combined,decoder_embed],axis=2)
print("ouput:",ouput.shape)
dec_op=model.layers[5](ouput)
print("dec_op:",dec_ouput.shape)

decoder_model=Model([decoder_inputs,encoder_op,hidden],dec_op)

decoder_embed: (None, 9, 100)
context_vector: (None, 9, 4)
encoder_op: (None, 4, 200)
decoder_combined: (None, 9, 200)
weighted: (None, 9, 104)


ValueError: Exception encountered when calling layer "gru_1" (type GRU).

Dimensions must be equal, but are 104 and 300 for '{{node MatMul}} = MatMul[T=DT_FLOAT, transpose_a=false, transpose_b=false](strided_slice_1, kernel)' with input shapes: [?,104], [300,300].

Call arguments received by layer "gru_1" (type GRU):
  • inputs=['tf.Tensor(shape=(None, 9, 104), dtype=float32)', 'tf.Tensor(shape=(None, 100), dtype=float32)']
  • mask=None
  • training=None
  • initial_state=None

In [109]:
decoder_model.summary()

NameError: name 'decoder_model' is not defined

In [91]:
Spanishword2index["start"]

1

In [115]:
trans=[]
for i in range(max_tar_len):
    translated1= model.predict([sample ,tar_seq1])
    p=np.argmax(translated1[0,0,:])
    translated_text=Spanishindex2word[p]
    trans.append(translated_text)
    tar_seq1=np.array([[Spanishword2index[translated_text]]])
    print(tar_seq1)
    trans.append(translated_text)

[[36]]
[[36]]
[[36]]
[[36]]
[[36]]
[[36]]
[[36]]
[[36]]
[[36]]


In [116]:
print(trans)

['¿es', '¿es', '¿es', '¿es', '¿es', '¿es', '¿es', '¿es', '¿es', '¿es', '¿es', '¿es', '¿es', '¿es', '¿es', '¿es', '¿es', '¿es']


In [62]:
translated1= model.predict([sample ,tar_seq1])



In [74]:
p=np.argmax(translated1[0,0,:])

In [64]:
translated1.shape

(1, 1, 2223)

In [72]:
count=1.0864681e-05
for i in translated1[0,0,:]:
    if i>count:
        count=i

In [76]:
Spanishindex2word[p]

'start'

In [41]:
translatedtext 

['']

In [77]:
Englishword2index["start"]

172