In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np


import random
import math
import time

In [1]:
import tensorflow as tf

# Report how many GPU devices TensorFlow can see in this session.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

Num GPUs Available:  0


In [2]:
# Read the whole phrase-pair corpus into memory as a single string.
# The encoding is pinned to UTF-8 so the Cyrillic half of the file decodes the
# same way everywhere instead of depending on the platform's locale default.
with open('../input/englishrussian-dictionary-for-machine-translate/rus.txt', encoding='utf-8') as f:
    text = f.read()

In [5]:
import string

def preprocess_lines(text):
    """Return ``text`` lower-cased with all ASCII punctuation removed.

    Only characters listed in ``string.punctuation`` are dropped; whitespace
    (tabs and newlines included) and non-ASCII characters are kept as they are.
    """
    punctuation_remover = str.maketrans('', '', string.punctuation)
    stripped = text.translate(punctuation_remover)
    return stripped.lower()

In [6]:
# Longest English / Russian phrase seen so far, measured in characters.
# Both are updated as a side effect of preprocess_data().
maxlen_phrase_encoder = 0
maxlen_phrase_decoder = 0

def preprocess_data(text, count_lines=70000):
    """Split the raw corpus into cleaned phrase pairs and sorted vocabularies.

    Each line of ``text`` is expected to hold tab-separated columns whose
    first two entries are the English and the Russian phrase. Lines are
    cleaned with ``preprocess_lines`` and the Russian phrase is wrapped in
    ``<sos>`` / ``<eos>`` markers.

    Parameters
    ----------
    text : str
        Raw corpus, one phrase pair per line.
    count_lines : int, default 70000
        Maximum number of leading lines to read. Fewer are used when the
        corpus is shorter; lines without at least two columns are skipped.

    Returns
    -------
    tuple of four lists
        ``(english_texts, russian_texts, english_words, russian_words)`` where
        the first two are parallel lists of phrases and the last two are
        sorted lists of the distinct words.

    Side effects
    ------------
    Raises the module-level ``maxlen_phrase_encoder`` and
    ``maxlen_phrase_decoder`` to the longest phrase seen, in characters.
    A character count is never smaller than the word count, so arrays sized
    with these values have room for every word position.
    """
    global maxlen_phrase_encoder, maxlen_phrase_decoder

    text_lines = text.split('\n')

    english_texts, russian_texts = [], []
    english_words, russian_words = set(), set()

    # Slicing (rather than indexing up to count_lines) keeps short corpora
    # from raising IndexError.
    for text_line in text_lines[:max(count_lines, 0)]:
        columns = preprocess_lines(text_line).split('\t')
        if len(columns) < 2:
            # Blank or malformed line, e.g. the empty string after a trailing newline.
            continue

        english_text = columns[0]
        russian_text = '<sos> ' + columns[1] + ' <eos>'

        english_texts.append(english_text)
        maxlen_phrase_encoder = max(maxlen_phrase_encoder, len(english_text))

        russian_texts.append(russian_text)
        maxlen_phrase_decoder = max(maxlen_phrase_decoder, len(russian_text))

        english_words.update(english_text.split())
        russian_words.update(russian_text.split())

    # Sets have no .sort(); sorted() also hands back lists, which callers can
    # append to and enumerate in a stable order.
    return english_texts, russian_texts, sorted(english_words), sorted(russian_words)

In [7]:
english_texts, russian_texts, english_words, russian_words = preprocess_data(text)

# ' ' is added to both vocabularies as the fallback entry: the batch builder
# maps any word it cannot find to the key of ' '.
english_words += [' ']
russian_words += [' ']

In [8]:
# Two-column table of the cleaned phrase pairs (English first, Russian second).
data = pd.DataFrame(dict(English=english_texts, Russian=russian_texts))
data

Unnamed: 0,English,Russian
0,go,<sos> марш <eos>
1,go,<sos> иди <eos>
2,go,<sos> идите <eos>
3,hi,<sos> здравствуйте <eos>
4,hi,<sos> привет <eos>
...,...,...
69995,were unimaginative,<sos> у нас нет воображения <eos>
69996,were very flexible,<sos> мы очень гибкие <eos>
69997,were very grateful,<sos> мы весьма благодарны <eos>
69998,were very grateful,<sos> мы очень признательны <eos>


In [10]:
# Forward lookups: word -> integer key (the word's position in its vocabulary list).
# NOTE(review): key 0 is given to a real word here, while both Embedding layers
# are created with mask_zero=True — confirm that word is not meant to be masked.
russian_word_to_key = dict(zip(russian_words, range(len(russian_words))))
english_word_to_key = dict(zip(english_words, range(len(english_words))))

# Reverse lookups: integer key -> word.
russian_key_to_word = dict(enumerate(russian_words))
english_key_to_word = dict(enumerate(english_words))


In [11]:
def new_batch(X, y, batch_size=64):
    """Endlessly yield ``([encoder_input, decoder_input], decoder_output)`` batches.

    The data is walked in order, ``batch_size`` phrase pairs at a time, and the
    walk restarts from the beginning after the last full batch; a trailing
    partial batch is never produced.

    Parameters
    ----------
    X : sequence of str
        Source phrases; each is split on whitespace into words.
    y : sequence of str
        Target phrases, parallel to ``X``.
    batch_size : int, default 64
        Number of phrase pairs per yielded batch.

    Yields
    ------
    tuple
        ``([encoder_input, decoder_input], decoder_output)`` as float32 arrays:
        ``encoder_input`` is ``(batch_size, maxlen_phrase_encoder)`` word keys,
        ``decoder_input`` is ``(batch_size, maxlen_phrase_decoder)`` word keys
        for every target word except the last, and ``decoder_output`` is
        ``(batch_size, maxlen_phrase_decoder, count_decoder_tokens)`` one-hot
        rows for every target word except the first (i.e. shifted one step
        ahead of ``decoder_input``). Unused positions stay zero.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive or ``X`` holds fewer than
        ``batch_size`` items.

    Notes
    -----
    Reads the module-level ``maxlen_phrase_encoder``, ``maxlen_phrase_decoder``,
    ``count_decoder_tokens``, ``english_word_to_key`` and ``russian_word_to_key``.
    Words missing from a lookup fall back to the key of ``' '``.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    batches_per_pass = len(X) // batch_size
    if batches_per_pass == 0:
        # Without this guard the endless loop below would spin forever
        # without ever yielding a batch.
        raise ValueError(
            f"need at least batch_size={batch_size} samples to form a batch, got {len(X)}"
        )

    while True:
        for i in range(batches_per_pass):
            l, r = i * batch_size, (i + 1) * batch_size
            encoder_input = np.zeros((batch_size, maxlen_phrase_encoder), dtype='float32')
            decoder_input = np.zeros((batch_size, maxlen_phrase_decoder), dtype='float32')
            decoder_output = np.zeros((batch_size, maxlen_phrase_decoder, count_decoder_tokens), dtype='float32')

            unknown_english_key = english_word_to_key[' ']
            unknown_russian_key = russian_word_to_key[' ']

            for row, j in enumerate(range(l, r)):
                encoder_text = X[j].split()
                decoder_text = y[j].split()

                for k, word in enumerate(encoder_text):
                    encoder_input[row][k] = english_word_to_key.get(word, unknown_english_key)

                last = len(decoder_text) - 1
                for k, word in enumerate(decoder_text):
                    key = russian_word_to_key.get(word, unknown_russian_key)
                    if k < last:
                        # The decoder is fed every word except the final one ...
                        decoder_input[row][k] = key
                    if k > 0:
                        # ... and must predict every word except the first one.
                        decoder_output[row][k - 1][key] = 1

            yield ([encoder_input, decoder_input], decoder_output)
    

In [12]:
# Model / training hyper-parameters.
embedding_dim = 256  # size of each word-embedding vector
hidden_dim = 512     # number of units in each LSTM
batch_size = 64

# Vocabulary sizes. They are the ``input_dim`` of the two Embedding layers; the
# decoder size is also the width of the one-hot targets and of the output layer.
count_encoder_tokens = len(english_words)
count_decoder_tokens = len(russian_words) + 1

In [13]:
# --- Encoder: embed the source word keys and run a bidirectional LSTM over them ---
encoder_inputs = layers.Input(shape=(None, ))
encoder_embedding_layer = layers.Embedding(input_dim=count_encoder_tokens, output_dim=embedding_dim, mask_zero=True)
encoder_embeddings = encoder_embedding_layer(encoder_inputs)
encoder_lstm = layers.Bidirectional(layers.LSTM(units=hidden_dim, return_state=True))
# The first returned tensor is discarded; only the four state tensors are kept
# (presumably forward h/c followed by backward h/c — confirm against the Keras
# Bidirectional documentation).
_,encoder_hidden_state1,encoder_cell_state1,encoder_hidden_state2,encoder_cell_state2=encoder_lstm(encoder_embeddings)
# The two hidden states and the two cell states are added element-wise, giving
# the single [h, c] pair that seeds the decoder LSTM below.
encoder_state=[encoder_hidden_state1 + encoder_hidden_state2,encoder_cell_state1 + encoder_cell_state2]


# --- Decoder: embed the target word keys and run an LSTM started from the encoder state ---
decoder_inputs = layers.Input(shape=(None, ))
decoder_embedding_layer = layers.Embedding(input_dim=count_decoder_tokens, output_dim=embedding_dim, mask_zero=True)
decoder_embeddings = decoder_embedding_layer(decoder_inputs)
decoder_lstm = layers.LSTM(units=hidden_dim, return_sequences=True, return_state=True)
# The decoder's own final states are not needed for training, so they are dropped.
decoder_output, _, _ = decoder_lstm(decoder_embeddings, initial_state=encoder_state)
# Project every time step to one score per decoder token, then normalise with softmax.
output_layer = layers.Dense(units=count_decoder_tokens)
decoder_output = output_layer(decoder_output)
output = tf.nn.softmax(decoder_output)


# Training model: (encoder word keys, decoder word keys) -> per-step token probabilities.
model = tf.keras.models.Model(inputs=(encoder_inputs, decoder_inputs), outputs=output)




In [14]:
# Print the layer-by-layer overview of the training model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, None, 256)    1528576     ['input_1[0][0]']                
                                                                                                  
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 bidirectional (Bidirectional)  [(None, 1024),       3149824     ['embedding[0][0]']              
                                 (None, 512),                                                 

In [15]:

# Targets are one-hot rows, hence categorical cross-entropy; the same quantity
# is also reported as a metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['categorical_crossentropy'],
)

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 7% of the phrase pairs for validation (fixed seed for a repeatable
# split) and convert each resulting piece to a NumPy array.
X_train, X_val, y_train, y_val = map(
    lambda split: split.to_numpy(),
    train_test_split(data['English'], data['Russian'], test_size=0.07, random_state=31),
)

In [None]:
# Train from the endless batch generators. Because they never stop, the number
# of steps per epoch / validation pass has to be given explicitly.
# `batch_size` is handed to the generators (which do the batching) rather than
# to fit(): Keras documents that it must not be specified for generator input,
# and this keeps the generated batch size in sync with the step counts below.
history = model.fit(
    x=new_batch(X_train, y_train, batch_size=batch_size),
    validation_data=new_batch(X_val, y_val, batch_size=batch_size),
    epochs=6,
    steps_per_epoch=len(X_train) // batch_size,
    validation_steps=len(X_val) // batch_size,
)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6

In [None]:
# Inference-time encoder: maps source word keys to the combined [h, c] state pair.
encoder_model = tf.keras.models.Model(
    inputs=encoder_inputs,
    outputs=encoder_state,
)

In [None]:
# Inference-time decoder. It reuses the trained embedding, LSTM and output
# layers, but takes its initial LSTM state from explicit inputs and also
# returns the updated state, so decoding can be driven step by step.
decoder_hidden_state = layers.Input(shape=(hidden_dim,))
decoder_cell_state = layers.Input(shape=(hidden_dim,))
decoder_init_state = [decoder_hidden_state, decoder_cell_state]

decoder_embeddings = decoder_embedding_layer(decoder_inputs)
decoder_output, decoder_output_hidden_state, decoder_output_cell_state = decoder_lstm(
    decoder_embeddings,
    initial_state=decoder_init_state,
)
decoder_final_state = [decoder_output_hidden_state, decoder_output_cell_state]

decoder_output = output_layer(decoder_output)
decoder_probs = tf.nn.softmax(decoder_output)

decoder_model = tf.keras.models.Model(
    inputs=[decoder_inputs, *decoder_init_state],
    outputs=[decoder_probs, *decoder_final_state],
)

In [None]:
def generate_text(text):
    """Greedily decode a Russian translation for one English phrase.

    Parameters
    ----------
    text : str or array-like
        Either a raw English phrase, which is cleaned with ``preprocess_lines``
        and mapped to word keys the same way the training batches are, or an
        already encoded batch holding one phrase, which is passed to
        ``encoder_model`` unchanged.

    Returns
    -------
    str
        The predicted words, each preceded by one space (so a non-empty result
        starts with ``' '``). Decoding stops at ``<eos>`` or once the result
        is at least ``maxlen_phrase_decoder`` characters long. An input string
        with no words yields ``""``.

    Notes
    -----
    Reads the module-level ``encoder_model``, ``decoder_model``, the word/key
    lookups and ``maxlen_phrase_decoder``.
    """
    if isinstance(text, str):
        unknown_key = english_word_to_key[' ']
        word_keys = [
            english_word_to_key.get(word, unknown_key)
            for word in preprocess_lines(text).split()
        ]
        if not word_keys:
            # Nothing to encode; an empty sequence cannot be fed to the LSTM.
            return ""
        text = np.array([word_keys], dtype='float32')

    states_value = list(encoder_model(text))

    # The decoder is driven one word at a time, starting from <sos>.
    target = np.zeros((1, 1))
    target[0, 0] = russian_word_to_key['<sos>']

    translation = ""
    while len(translation) < maxlen_phrase_decoder:
        output_token, hidden_state, cell_state = decoder_model([target] + states_value)
        word_index = int(np.argmax(output_token[0, -1, :]))
        # The output layer has more classes than there are vocabulary entries,
        # so an index without a word falls back to the ' ' placeholder instead
        # of raising KeyError.
        word = russian_key_to_word.get(word_index, ' ')
        if word == '<eos>':
            break
        translation += ' ' + word
        # Feed the prediction and the updated LSTM state into the next step.
        states_value = [hidden_state, cell_state]
        target[0, 0] = russian_word_to_key[word]
    return translation

In [None]:
# Save the full training model into the Kaggle working directory.
# NOTE(review): the target is the directory root itself, while the encoder and
# decoder are saved into sub-folders of that same directory further down —
# confirm that sharing the directory is intended.
model.save('/kaggle/working/')


In [None]:
# Persist the inference-time encoder and decoder models, each to its own folder.
encoder_model.save('/kaggle/working/encoder')
decoder_model.save('/kaggle/working/decoder')