In [82]:
import numpy as np

In [83]:
import keras

In [84]:
import tensorflow as tf

In [85]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [86]:
inputs = []
outputs = []

data_file = open('spa.txt', encoding='utf-8')

count = 0
for line in data_file:
    count += 1
    if count > 20000:
        break
    if '\t' not in line:
        continue
    ip, temp_op, extra = line.rstrip().split('\t')
    op = temp_op
    inputs.append(ip)
    outputs.append(op)

In [87]:
inputs=[sentence.lower() for sentence in inputs]
outputs=[sentence.lower() for sentence in outputs]

In [88]:
outputs = ['<start> '+sentence+' <end>' for sentence in outputs]

In [89]:
EnglishTokenizer=Tokenizer(oov_token="<UNK>")
EnglishTokenizer.fit_on_texts(inputs)
inp_sequences=EnglishTokenizer.texts_to_sequences(inputs)
max_inp_len=max(len(i) for i in inp_sequences)
src_sequences=pad_sequences(inp_sequences,maxlen=max_inp_len,padding="post")
Englishword2index=EnglishTokenizer.word_index
Englishindex2word=EnglishTokenizer.index_word

In [90]:
SpanishTokenizer=Tokenizer(oov_token="<UNK>")
SpanishTokenizer.fit_on_texts(outputs)
op_sequences=SpanishTokenizer.texts_to_sequences(outputs)
max_tar_len=max(len(i) for i in op_sequences)
tar_sequences=pad_sequences(op_sequences,maxlen=max_tar_len,padding="post")
Spanishword2index=SpanishTokenizer.word_index
Spanishindex2word=SpanishTokenizer.index_word

In [91]:
src_vocab_size=len(Englishword2index)+1
trg_vocab_size=len(Spanishword2index)+1
print("src_vocab_size:",src_vocab_size)
print("tar_vocab_size:",trg_vocab_size)

src_vocab_size: 3771
tar_vocab_size: 7853


In [92]:
print("max_inp_len:",max_inp_len)
print("max_tar_len:",max_tar_len)

max_inp_len: 6
max_tar_len: 14


In [93]:
lstm_units=100
embed_dim=200

In [94]:
from keras import Model
from keras.layers import Bidirectional,LSTM,Input,Embedding,Dense

In [95]:
class LuongGlobalAttention(tf.keras.layers.Layer):
    def __init__(self,units,method="Dot"):
        super(LuongGlobalAttention,self).__init__()
        self.method=method
        self.w1=Dense(units,use_bias=False)
        if method=="Concat":
            self.weight = tf.Variable(initial_value=tf.zeros((units,1)), trainable=True, dtype=tf.float32)
            
    def call(self,inputs):
        encoder_op,decoder_op=inputs
        if self.method=="General":
            decoder_op=tf.transpose(decoder_op,perm=(0,2,1))
            ou1=self.w1(encoder_op)
            score=tf.matmul(ou1,decoder_op)
            
        elif self.method=="Dot":
            decoder_op=tf.transpose(decoder_op,perm=(0,2,1))
            score=tf.matmul(encoder_op,decoder_op)
            
        elif self.method=="Concat":
            concat=tf.nn.tanh(self.w1(encoder_op)+self.w1(decoder_op))
            score=tf.matmul(concat,self.weight)
            
        else:
            try:
                raise ValueError("Try valid alignment")
            except ValueError as e:
                print("Error:", e)
                return
            
        attention_weights=tf.nn.softmax(score,axis=1)
        attention_weights=tf.transpose(attention_weights,perm=(0,2,1))
        context_vector=tf.matmul(attention_weights,encoder_op)
        return context_vector

In [96]:
encoder_input=Input(shape=(max_inp_len,))
decoder_input=Input(shape=(None,))

encoder_embedding=Embedding(src_vocab_size,embed_dim)
decoder_embedding=Embedding(trg_vocab_size,embed_dim)

encoder_embed=encoder_embedding(encoder_input)
decoder_embed=decoder_embedding(decoder_input)

encoder_lstm=Bidirectional(LSTM(lstm_units,return_sequences=True,return_state=True))
encoder_op,forward_h,forward_c,backward_h,backward_c=encoder_lstm(encoder_embed)
encoder_dense=Dense(lstm_units)
h=tf.concat([forward_h,backward_h],axis=-1)
c=tf.concat([forward_c,backward_c],axis=-1)
encoder_op=encoder_dense(encoder_op)
h=encoder_dense(h)
c=encoder_dense(c)

decoder_lstm=LSTM(lstm_units,return_sequences=True,return_state=True)
decoder_op,h1,c1=decoder_lstm(decoder_embed,initial_state=[h,c])
attention=LuongGlobalAttention(lstm_units,method="General")
context_vector=attention([encoder_op,decoder_op])

decoder_op=tf.concat([context_vector, decoder_op],axis=-1)
decoder_op=tf.nn.tanh(decoder_op)
decoder_dense=Dense(trg_vocab_size,activation='softmax')
decoder_op=decoder_dense(decoder_op)

model=Model([encoder_input,decoder_input],[decoder_op])

In [97]:
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_6 (InputLayer)        [(None, 6)]                  0         []                            
                                                                                                  
 embedding_2 (Embedding)     (None, 6, 200)               754200    ['input_6[0][0]']             
                                                                                                  
 bidirectional_1 (Bidirecti  [(None, 6, 200),             240800    ['embedding_2[0][0]']         
 onal)                        (None, 100),                                                        
                              (None, 100),                                                        
                              (None, 100),                                                  

In [98]:
src_sequences.shape,tar_sequences.shape

((20000, 6), (20000, 14))

In [99]:
from sklearn.model_selection import train_test_split

In [108]:
X_train,X_test,y_train,y_test=train_test_split(src_sequences,tar_sequences,test_size=0.05)

In [109]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((19000, 6), (1000, 6), (19000, 14), (1000, 14))

In [110]:
model.compile(optimizer='rmsprop',loss='categorical_crossentropy',metrics=['accuracy'])

In [111]:
y_test_onehot=tf.one_hot(y_test,trg_vocab_size)

In [112]:
y_train_onehot=tf.one_hot(y_train,trg_vocab_size)

In [113]:
model.fit([X_train,y_train],y_train_onehot,batch_size=32,epochs=5,validation_data=([X_test,y_test],y_test_onehot))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x211af268be0>

In [114]:
model.save_weights("luong.h5")

In [115]:
encoder_model=Model(encoder_input,[encoder_op,h,c])

In [116]:
encoder_model.summary()

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_6 (InputLayer)        [(None, 6)]                  0         []                            
                                                                                                  
 embedding_2 (Embedding)     (None, 6, 200)               754200    ['input_6[0][0]']             
                                                                                                  
 bidirectional_1 (Bidirecti  [(None, 6, 200),             240800    ['embedding_2[0][0]']         
 onal)                        (None, 100),                                                        
                              (None, 100),                                                        
                              (None, 100),                                                  

In [117]:
decoder_model=Model([decoder_input,encoder_op,h,c],[decoder_op,h1,c1])

In [118]:
decoder_model.summary()

Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_7 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 embedding_3 (Embedding)     (None, None, 200)            1570600   ['input_7[0][0]']             
                                                                                                  
 input_9 (InputLayer)        [(None, 100)]                0         []                            
                                                                                                  
 input_10 (InputLayer)       [(None, 100)]                0         []                            
                                                                                            

In [119]:
model.evaluate([X_test,y_test],y_test_onehot)



[0.9746941924095154, 0.8709999918937683]

In [276]:
sentences=inputs[900]

In [277]:
sentences

'tom tried.'

In [278]:
sequences=EnglishTokenizer.texts_to_sequences([sentences])

In [279]:
sequences

[[3, 425]]

In [280]:
seq=pad_sequences(sequences,maxlen=max_inp_len,padding="post")

In [281]:
seq.shape

(1, 6)

In [282]:
def predict_encoder(seq):
    return encoder_model.predict(seq)

In [283]:
def predict_decoder(tar_seq, encoder_op, h, c):
    return decoder_model.predict([tar_seq, encoder_op, h, c])

In [None]:
out1 = []
encoder_op, h, c = predict_encoder(seq)
stop_condition = False
tar_seq = np.array([[Spanishword2index["start"]]])

while not stop_condition:
    decoder_op, h1, c1 = predict_decoder(tar_seq, encoder_op, h, c)
    index = np.argmax(decoder_op[0, -1, :])

    char = Spanishindex2word.get(index, "")
    
    if char == 'end':
        stop_condition = True
    elif char != 'start':  
        out1.append(char)

    tar_seq = np.array([[index]])
    h = h1
    c = c1

print(' '.join(out1))





In [295]:
out1 = []
encoder_op, h, c = predict_encoder(seq)
stop_condition = False
tar_seq = np.array([[Spanishword2index["start"]]])

while not stop_condition:
    decoder_op, h1, c1 = predict_decoder(tar_seq, encoder_op, h, c)
    index = np.argmax(decoder_op[0, -1, :])
    if index==0:
        break
    char = Spanishindex2word[index]
    out1.append(char)

    if char == 'end':
        stop_condition = True

    tar_seq = np.array([[index]])
    h = h1
    c = c1
print(' '.join(out1))

start ¿es ahora ahora eso eso eso


In [290]:
def beam_search(encoder_model, decoder_model, seq, k=3):
    def top_k_indices(predictions, k):
        return np.argpartition(-predictions, k)[:k]

    out = []
    encoder_op, h, c = encoder_model.predict(seq)
    stop_condition = False
    tar_seq = np.array([[Spanishword2index["start"]]])

    while not stop_condition:
        all_candidates = []
        decoder_op, h1, c1 = decoder_model.predict([tar_seq, encoder_op, h, c])

        # Get top-k indices for the next token
        top_k = top_k_indices(decoder_op[0, -1, :], k)

        for index in top_k:
            word = Spanishindex2word.get(index, None)
            if word is not None and word not in ['start', 'end']:
                candidate_seq = out + [word]
                candidate_prob = decoder_op[0, -1, index]
                all_candidates.append((candidate_seq, candidate_prob))

        # Sort candidates by probability (in decreasing order)
        all_candidates.sort(key=lambda x: x[1], reverse=True)

        # Keep only the top-k candidates
        all_candidates = all_candidates[:k]

        # Update current output and check stop condition
        out, next_prob = all_candidates[0]
        if 'end' in out or len(out) > 6:
            stop_condition = True

        # Prepare inputs for the next decoding step
        tar_seq = np.array([[Spanishword2index[word] for word in out]])
        h, c = h1, c1

    return ' '.join(out)

# Call the beam_search function with the encoder and decoder models
beam_width = 3
output_sequence = beam_search(encoder_model, decoder_model, seq, k=beam_width)
print("Beam Search Output:", output_sequence)


Beam Search Output: ¿cómo ¿cómo esto esto esto esto esto


In [291]:
outputs[900]

'<start> tomás lo intentó. <end>'

## Colab developed model

In [None]:
encoder_input=Input(shape=(max_inp_len,))
decoder_input=Input(shape=(None,))

encoder_embedding=Embedding(src_vocab_size,embed_dim)
decoder_embedding=Embedding(trg_vocab_size,embed_dim)

encoder_embed=encoder_embedding(encoder_input)
decoder_embed=decoder_embedding(decoder_input)

encoder_lstm=Bidirectional(LSTM(lstm_units,return_sequences=True,return_state=True))
encoder_op,forward_h,forward_c,backward_h,backward_c=encoder_lstm(encoder_embed)
encoder_dense=Dense(lstm_units)
encoder_dropout=Dropout(rate=dropout_rate)
h=tf.concat([forward_h,backward_h],axis=-1)
c=tf.concat([forward_c,backward_c],axis=-1)
encoder_op=encoder_dense(encoder_op)
encoder_op=encoder_dropout(encoder_op)
h=encoder_dense(h)
c=encoder_dense(c)

decoder_lstm=LSTM(lstm_units,return_sequences=True,return_state=True)
decoder_op,h1,c1=decoder_lstm(decoder_embed,initial_state=[h,c])
decoder_dropout=Dropout(rate=dropout_rate)
decoder_op=decoder_dropout(decoder_op)
attention=LuongGlobalAttention(lstm_units,method="General")
context_vector=attention([encoder_op,decoder_op])

decoder_op=tf.concat([context_vector, decoder_op],axis=-1)
decoder_op=tf.nn.tanh(decoder_op)
decoder_dense=Dense(trg_vocab_size,activation='softmax')
decoder_op=decoder_dense(decoder_op)

model=Model([encoder_input,decoder_input],[decoder_op])