In [3]:
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import pandas as pd
from tqdm import tqdm
import string
import re
import random

In [4]:
# Location of the corpus file, resolved relative to the current working directory.
text_file_path = 'rus.txt'
# Read the whole file into a single string, decoding it explicitly as UTF-8.
with open(text_file_path, encoding='utf-8') as t:
    text = t.read()

In [5]:
def preprocess_text(text):
    """Normalise raw text before it is split into words.

    Apostrophes, every other character listed in ``string.punctuation`` and
    the ASCII digits 0-9 are deleted; the remainder is lower-cased.
    Whitespace (including tabs and newlines) is left untouched.
    """
    without_apostrophes=re.sub("'",'',text)
    # Keep only the characters that are not ASCII punctuation.
    kept_chars=[char for char in without_apostrophes if char not in string.punctuation]
    without_digits=re.sub("[0-9]",'',''.join(kept_chars))
    return without_digits.lower()

In [6]:
def return_sentences(text,num_lines=20000):
    """Extract parallel English/Russian sentences and their vocabularies.

    Each line of ``text`` is expected to hold an English sentence and its
    Russian translation separated by a tab; any further tab-separated fields
    are ignored. Only the first ``num_lines`` lines are considered. Every
    line is normalised with ``preprocess_text`` and the Russian side is
    wrapped in ``<sos>`` / ``<eos>`` markers.

    Lines that do not contain at least two tab-separated fields (for example
    the empty string left after a trailing newline) are skipped instead of
    raising ``IndexError``.

    Args:
        text: the raw corpus as one string with newline-separated lines.
        num_lines: maximum number of leading lines to read.

    Returns:
        A 4-tuple ``(english_texts, russian_texts, english_words,
        russian_words)``: the two aligned sentence lists followed by the
        sorted lists of distinct words of each language (the Russian
        vocabulary includes the ``<sos>`` and ``<eos>`` markers).
    """
    english_texts,russian_texts=[],[]
    english_words,russian_words=set(),set()
    # max(...,0) keeps a negative num_lines meaning "no lines", as range() did.
    for text_line in tqdm(text.split('\n')[:max(num_lines,0)]):
        tab_split_text=preprocess_text(text_line).split('\t')
        if len(tab_split_text)<2:
            # Blank or malformed line: there is no sentence pair to keep.
            continue
        english_text=tab_split_text[0]
        russian_text='<sos> '+tab_split_text[1]+' <eos>'
        english_texts.append(english_text)
        russian_texts.append(russian_text)
        # set.update only adds the words that are not present yet.
        english_words.update(english_text.split())
        russian_words.update(russian_text.split())
    return english_texts,russian_texts,sorted(english_words),sorted(russian_words)

In [7]:
# Parse the corpus with the default num_lines limit; yields the two aligned
# sentence lists and the sorted vocabulary of each language.
english_texts,russian_texts,english_words,russian_words=return_sentences(text)

100%|██████████| 20000/20000 [00:00<00:00, 42458.08it/s]


In [8]:
# Column label -> list of sentences; the keys become the DataFrame column names.
text_map={'English sentences':english_texts,'Russian sentences':russian_texts}

In [9]:
def dataframe_text(text_map,random_state=None):
    """Build a shuffled DataFrame of sentences together with their word counts.

    Args:
        text_map: mapping from column label to a list of sentence strings;
            all lists must have the same length.
        random_state: optional seed forwarded to ``DataFrame.sample`` so the
            shuffle (and any split derived from the row order) can be
            reproduced. The default ``None`` keeps the unseeded behaviour.

    Returns:
        A DataFrame with one column per key of ``text_map`` plus a
        ``'<key> length'`` column holding the number of whitespace-separated
        words of each sentence. Rows are in random order and keep their
        original index labels.
    """
    text_df=pd.DataFrame(text_map,columns=list(text_map))
    for key in text_map:
        text_df[key+' length']=text_df[key].apply(lambda sentence:len(sentence.split()))
    # frac=1 samples every row without replacement, i.e. a full shuffle.
    return text_df.sample(frac=1,random_state=random_state)

In [10]:
# Assemble the shuffled sentence table (with per-sentence word counts) and
# preview its first rows.
text_df=dataframe_text(text_map)
text_df.head()

Unnamed: 0,English sentences,Russian sentences,English sentences length,Russian sentences length
15839,just wait here,<sos> просто подожди здесь <eos>,3,5
5784,give it back,<sos> верни это <eos>,3,4
19659,he made a robot,<sos> он создал робота <eos>,4,5
8568,are you awake,<sos> ты не спишь <eos>,3,5
18775,call the doctor,<sos> вызови врача <eos>,3,4


In [11]:
# Longest sentence of each language, counted in words; these set the padded
# widths of the encoder and decoder arrays.
encoder_seq_length=int(text_df['English sentences length'].max())
decoder_seq_length=int(text_df['Russian sentences length'].max())

In [12]:
num_encoder_tokens=len(english_words)
num_decoder_tokens=len(russian_words)+1

In [13]:
english_lookup_table={word:num for num,word in enumerate(english_words)}
russian_lookup_table={word:num+1 for num,word in enumerate(russian_words)}

In [14]:
# Inverse vocabularies (integer id -> word), used to turn predicted ids back
# into text.
english_token_lookup_table=dict(zip(english_lookup_table.values(),english_lookup_table.keys()))
russian_token_lookup_table=dict(zip(russian_lookup_table.values(),russian_lookup_table.keys()))

In [15]:
def generate_batch(X,y,batch_size=32):
    """Endlessly yield padded batches for the encoder-decoder model.

    Reads the module-level ``encoder_seq_length``, ``decoder_seq_length``,
    ``num_decoder_tokens``, ``english_lookup_table`` and
    ``russian_lookup_table``.

    Args:
        X: sequence of preprocessed English sentences.
        y: sequence of Russian sentences aligned with ``X``.
        batch_size: maximum number of sentence pairs per batch.

    Yields:
        ``([encoder_input, decoder_input], decoder_target)`` where, for a
        batch of ``n`` pairs,

        * ``encoder_input`` is ``(n, encoder_seq_length)`` and holds the
          English word ids;
        * ``decoder_input`` is ``(n, decoder_seq_length)`` and holds the ids
          of every Russian token except the last one;
        * ``decoder_target`` is ``(n, decoder_seq_length,
          num_decoder_tokens)`` and one-hot encodes every Russian token
          except the first one, i.e. the decoder input shifted one step
          to the left.

        Unused positions stay 0. ``n`` equals ``batch_size`` except for the
        last batch of a pass, which is sized to the pairs that remain
        instead of being padded with all-zero rows.

    Raises:
        ValueError: on the first ``next()`` if ``X`` is empty (the generator
            would otherwise loop forever without yielding).
    """
    if len(X)==0:
        raise ValueError('generate_batch needs at least one sentence pair')
    while True:
        for start in range(0,len(X),batch_size):
            batch_pairs=list(zip(X[start:start+batch_size],y[start:start+batch_size]))
            rows=len(batch_pairs)
            encoder_input_vector=np.zeros((rows,encoder_seq_length),dtype=np.float32)
            decoder_input_vector=np.zeros((rows,decoder_seq_length),dtype=np.float32)
            decoder_target_vector=np.zeros((rows,decoder_seq_length,num_decoder_tokens),dtype=np.float32)
            for j,(encoder_text,decoder_text) in enumerate(batch_pairs):
                for time_step,encoder_word in enumerate(encoder_text.split()):
                    encoder_input_vector[j,time_step]=english_lookup_table[encoder_word]
                # Split once per sentence instead of once per token.
                decoder_words=decoder_text.split()
                last_step=len(decoder_words)-1
                for time_step,decoder_word in enumerate(decoder_words):
                    word_id=russian_lookup_table[decoder_word]
                    if time_step<last_step:
                        # The final token is never fed to the decoder.
                        decoder_input_vector[j,time_step]=word_id
                    if time_step>0:
                        # The target at step t-1 is the token at step t.
                        decoder_target_vector[j,time_step-1,word_id]=1
            yield ([encoder_input_vector,decoder_input_vector],decoder_target_vector)

In [16]:
# Pull the shuffled sentence columns out of the frame as plain Python lists.
X=text_df['English sentences'].tolist()
y=text_df['Russian sentences'].tolist()

In [17]:
# First 80% of the pairs go to training, the remaining 20% to validation.
train_size_X=len(X)*80//100
train_size_y=len(y)*80//100
X_train,X_valid=X[:train_size_X],X[train_size_X:]
y_train,y_valid=y[:train_size_y],y[train_size_y:]

In [18]:
# Number of units of the encoder and decoder LSTM layers (and the width of
# the extra state inputs of the inference decoder).
latent_dim=512
# Output dimension of both Embedding layers.
embedding_dim=256
# Batch size handed to model.fit and used to derive the steps per epoch.
batch_size=32

In [19]:
# Encoder input: one variable-length sequence of English word ids per sample.
encoder_inputs=layers.Input(shape=(None,))
# mask_zero=True makes id 0 a padding value that downstream layers skip.
encoder_embedding_layer=layers.Embedding(
    input_dim=num_encoder_tokens,output_dim=embedding_dim,
    mask_zero=True
)
encoder_embeddings=encoder_embedding_layer(encoder_inputs)
# With return_state=True the bidirectional wrapper returns its output followed
# by the hidden and cell state of each of the two directions.
encoder_lstm=layers.Bidirectional(layers.LSTM(units=latent_dim,return_state=True))
_,encoder_hidden_state1,encoder_cell_state1,encoder_hidden_state2,encoder_cell_state2=encoder_lstm(encoder_embeddings)
# Sum the states of the two directions element-wise so the result keeps
# latent_dim units and can initialise the (unidirectional) decoder LSTM.
encoder_state=[encoder_hidden_state1+encoder_hidden_state2,encoder_cell_state1+encoder_cell_state2]



In [20]:
# Decoder input: one variable-length sequence of Russian word ids per sample.
decoder_inputs=layers.Input(shape=(None,))
# mask_zero=True makes id 0 a padding value that downstream layers skip.
decoder_embedding_layer=layers.Embedding(
    input_dim=num_decoder_tokens,output_dim=embedding_dim,
    mask_zero=True
)
decoder_embeddings=decoder_embedding_layer(decoder_inputs)
# return_sequences=True gives one output per time step; the returned states
# are not needed during training and are discarded.
decoder_lstm=layers.LSTM(units=latent_dim,return_sequences=True,return_state=True)
decoder_output,_,_=decoder_lstm(decoder_embeddings,initial_state=encoder_state)
# Project every time step onto the Russian vocabulary and normalise the
# scores into probabilities.
output_layer=layers.Dense(units=num_decoder_tokens)
decoder_output=output_layer(decoder_output)
output_probs=tf.nn.softmax(decoder_output)

In [21]:
# Training model: (English ids, Russian input ids) -> per-step probabilities
# over the Russian vocabulary.
model=tf.keras.models.Model(inputs=(encoder_inputs,decoder_inputs),outputs=output_probs)

In [22]:
# Print the layers, output shapes and parameter counts of the training model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, None, 256)    702976      ['input_1[0][0]']                
                                                                                                  
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 bidirectional (Bidirectional)  [(None, 1024),       3149824     ['embedding[0][0]']              
                                 (None, 512),                                                 

In [23]:
# Categorical cross-entropy matches the one-hot targets built by generate_batch.
model.compile(optimizer='adam',loss='categorical_crossentropy')

In [24]:
# The generators already deliver complete batches, so the batch size is given
# to them rather than to fit(); Keras documents that batch_size should not be
# specified for generator input. Passing the same value to both generators
# keeps the step counts below consistent with the batches actually produced.
history=model.fit(
    x=generate_batch(X_train,y_train,batch_size=batch_size),
    validation_data=generate_batch(X_valid,y_valid,batch_size=batch_size),
    epochs=10,
    # Both generators never terminate, so the length of an epoch and of a
    # validation pass has to be stated explicitly.
    steps_per_epoch=len(X_train)//batch_size,
    validation_steps=len(X_valid)//batch_size
)



In [25]:
# Inference encoder: reuses the trained encoder layers to map English word
# ids to the [hidden, cell] state pair that initialises the decoder.
encoder_model=tf.keras.models.Model(inputs=encoder_inputs,outputs=encoder_state)

In [27]:
# Inference decoder: same embedding, LSTM and Dense layers as the training
# graph, but the initial state is fed in explicitly and the final state is
# returned, so decoding can proceed step by step.
decoder_hidden_state=layers.Input(shape=(latent_dim,))
decoder_cell_state=layers.Input(shape=(latent_dim,))
decoder_init_state=[decoder_hidden_state,decoder_cell_state]
# NOTE: from here on decoder_embeddings and decoder_output are rebound to the
# inference graph; the training tensors of the same name are no longer reachable.
decoder_embeddings=decoder_embedding_layer(decoder_inputs)
decoder_output,decoder_output_hidden_state,decoder_output_cell_state=decoder_lstm(decoder_embeddings
                                                                    ,initial_state=decoder_init_state)
decoder_final_state=[decoder_output_hidden_state,decoder_output_cell_state]
decoder_output=output_layer(decoder_output)
decoder_probs=tf.nn.softmax(decoder_output)
# Inputs: [token ids, hidden state, cell state]; outputs: [probabilities,
# new hidden state, new cell state].
decoder_model=tf.keras.models.Model(inputs=[decoder_inputs]+decoder_init_state
                                    ,outputs=[decoder_probs]+decoder_final_state)

In [28]:
# Print the layers, output shapes and parameter counts of the inference encoder.
encoder_model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, None, 256)    702976      ['input_1[0][0]']                
                                                                                                  
 bidirectional (Bidirectional)  [(None, 1024),       3149824     ['embedding[0][0]']              
                                 (None, 512),                                                     
                                 (None, 512),                                                     
                                 (None, 512),                                               

In [29]:
def generate_text(text):
    """Greedily decode a Russian translation for one encoded English sentence.

    Uses the module-level ``encoder_model``, ``decoder_model``,
    ``russian_lookup_table``, ``russian_token_lookup_table`` and
    ``decoder_seq_length``.

    Args:
        text: a batch holding exactly one English sentence encoded as word
            ids, e.g. the encoder array yielded by
            ``generate_batch(..., batch_size=1)``.

    Returns:
        The predicted words, each preceded by a single space (so a non-empty
        result starts with a space); ``''`` if the first prediction already
        ends the sentence.

    Decoding stops at ``<eos>``, at an id that has no word (such as the
    padding id 0), or after ``decoder_seq_length`` words. The limit counts
    words, not characters of the output string.
    """
    # list() so the states can be concatenated with [target] below even if
    # the encoder hands them back as a tuple.
    states_value=list(encoder_model(text))
    # The decoder is fed one token at a time, starting from <sos>.
    target=np.zeros((1,1))
    target[0,0]=russian_lookup_table['<sos>']
    predicted_words=[]
    while len(predicted_words)<decoder_seq_length:
        output_token,hidden_state,cell_state=decoder_model([target]+states_value)
        # Greedy choice: the most probable token of the last time step.
        token_index=int(np.argmax(output_token[0,-1,:]))
        word=russian_token_lookup_table.get(token_index)
        if word is None or word=='<eos>':
            break
        predicted_words.append(word)
        # Carry the recurrent state forward and feed the prediction back in.
        states_value=[hidden_state,cell_state]
        target[0,0]=token_index
    return ''.join(' '+word for word in predicted_words)

In [30]:
# Stream the validation pairs one at a time: every next() yields one sentence.
text_gen=generate_batch(X_valid,y_valid,batch_size=1)
# NOTE(review): this bare expression is not the last statement of the cell,
# so it displays nothing.
text_gen
# Index of the validation pair shown most recently; -1 means "none yet"
# because the demo cell increments k before using it.
k=-1

In [31]:
# Advance to the next validation pair. text_gen cycles through X_valid
# forever, so k wraps around as well to stay aligned with the generator.
k=(k+1)%len(X_valid)
# Unpack into dedicated names: reusing encoder_inputs / decoder_inputs here
# would overwrite the Keras Input tensors that the model-building cells
# refer to and break them on a later re-run.
[encoder_batch,decoder_batch],decoder_target=next(text_gen)
print(f'Input sentence: {X_valid[k]}')
# [5:-5] cuts off the leading '<sos>' and the trailing '<eos>' marker.
print(f'Actual translation: {y_valid[k][5:-5]}')
print(f"Model's translation: {generate_text(encoder_batch)}" )

Input sentence: bring him to me
Actual translation:  приводи его ко мне 
Model's translation:  я
