# Language Translation Using Recurring Neural Networks

## Preprocessing and Model

In [None]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

### Data Loading and Preprocessing

In [None]:
from google.colab import files

uploads = files.upload()

Saving EN_DE.csv to EN_DE (1).csv


In [None]:
# Load language datasets
file = "EN_DE.csv"
df = pd.read_csv(file)

In [None]:
df.head()

Unnamed: 0,EN,DE,ENCounter,DECounter,CountDiff
0,"['commission', 'regulation', '(ec)', 'no', '17...","['verordnung', '(eg)', 'nr.', '1788/2004', 'de...",5,6,1
1,"['of', '15', 'october', '2004']","['vom', '15.', 'oktober', '2004']",4,4,0
2,"['fixing', 'the', 'minimum', 'selling', 'price...","['zur', 'festsetzung', 'der', 'mindestverkaufs...",20,17,3
3,"['the', 'commission', 'of', 'the', 'european',...","['die', 'kommission', 'der', 'europäischen', '...",5,6,1
4,"['having', 'regard', 'to', 'the', 'treaty', 'e...","['gestützt', 'auf', 'den', 'vertrag', 'zur', '...",8,9,1


In [None]:
df_test = df.sample(frac=0.15, replace=True, random_state=1)

In [None]:
df_test.shape

(7500, 5)

In [None]:
df_test.head()

Unnamed: 0,EN,DE,ENCounter,DECounter,CountDiff
33003,"['these', 'aggregate', 'values', 'shall', 'be'...","['sie', 'teilen', 'diese', 'gesamtwerte', 'der...",9,7,2
12172,['whereas:'],"['in', 'erwägung', 'nachstehender', 'gründe:']",1,4,3
5192,"['commission', 'regulation', '(ec)', 'no', '19...","['die', 'verordnung', '(eg)', 'nr.', '1961/200...",20,18,2
32511,"['whether', 'the', 'products', 'concerned', 'a...","['angabe,', 'ob', 'es', 'sich', 'bei', 'den', ...",10,15,5
43723,"['complexul', 'de', 'porci', 'brăila', 'sa', '...","['complexul', 'de', 'porci', 'brăila', 'sa', '...",11,10,1


In [None]:
cols = ['ENCounter', 'DECounter', 'CountDiff']
df_test[cols] = df_test[cols].apply(pd.to_numeric, errors='coerce', axis=1)

In [None]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7500 entries, 33003 to 41347
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   EN         7500 non-null   object
 1   DE         7500 non-null   object
 2   ENCounter  7500 non-null   int64 
 3   DECounter  7500 non-null   int64 
 4   CountDiff  7500 non-null   int64 
dtypes: int64(3), object(2)
memory usage: 351.6+ KB


### Model Preparation

In [None]:
import collections

from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional, Dropout, LSTM
#from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

In [None]:
# Tokenizer - to prep data for RNN
def tokenize(x):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(x)
    return tokenizer.texts_to_sequences(x), tokenizer
#Test
test = df.iloc[:5,0]
text_tokenized, text_tokenizer = tokenize(test)
print(text_tokenizer.word_index)
for sample_i, (sent, token_sent) in enumerate(zip(test, text_tokenized)):
    print('Sequence {} in x'.format(sample_i + 1))
    print('  Input:  {}'.format(sent))
    print('  Output: {}'.format(token_sent))

{"'the'": 1, "'": 2, "'for'": 3, "'to'": 4, "'commission'": 5, "'regulation'": 6, 'ec': 7, "'no'": 8, "'of'": 9, "'invitation'": 10, "'tender'": 11, "'european'": 12, "'1788": 13, "2004'": 14, "'15'": 15, "'october'": 16, "'2004'": 17, "'fixing'": 18, "'minimum'": 19, "'selling'": 20, "'prices'": 21, "'butter'": 22, "'150th'": 23, "'individual'": 24, "'under'": 25, "'standing'": 26, "'provided'": 27, "'in'": 28, "'2571": 29, "97'": 30, "'communities": 31, "'having'": 32, "'regard'": 33, "'treaty'": 34, "'establishing'": 35, "'community": 36}
Sequence 1 in x
  Input:  ['commission', 'regulation', '(ec)', 'no', '1788/2004']
  Output: [5, 6, 2, 7, 2, 8, 13, 14]
Sequence 2 in x
  Input:  ['of', '15', 'october', '2004']
  Output: [9, 15, 16, 17]
Sequence 3 in x
  Input:  ['fixing', 'the', 'minimum', 'selling', 'prices', 'for', 'butter', 'for', 'the', '150th', 'individual', 'invitation', 'to', 'tender', 'under', 'the', 'standing', 'invitation', 'to', 'tender', 'provided', 'for', 'in', 'regul

In [None]:
# Pad function to standardize the length of the sentences 
def pad(x, length=None):
    return pad_sequences(x, maxlen=length, padding='post')

#Test
test_pad = pad(text_tokenized)
for sample_i, (token_sent, pad_sent) in enumerate(zip(text_tokenized, test_pad)):
    print('Sequence {} in x'.format(sample_i + 1))
    print('  Input:  {}'.format(np.array(token_sent)))
    print('  Output: {}'.format(pad_sent))

Sequence 1 in x
  Input:  [ 5  6  2  7  2  8 13 14]
  Output: [ 5  6  2  7  2  8 13 14  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0]
Sequence 2 in x
  Input:  [ 9 15 16 17]
  Output: [ 9 15 16 17  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0]
Sequence 3 in x
  Input:  [18  1 19 20 21  3 22  3  1 23 24 10  4 11 25  1 26 10  4 11 27  3 28  6
  2  7  2  8 29 30]
  Output: [18  1 19 20 21  3 22  3  1 23 24 10  4 11 25  1 26 10  4 11 27  3 28  6
  2  7  2  8 29 30]
Sequence 4 in x
  Input:  [ 1  5  9  1 12 31  2]
  Output: [ 1  5  9  1 12 31  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0]
Sequence 5 in x
  Input:  [32 33  4  1 34 35  1 12 36  2]
  Output: [32 33  4  1 34 35  1 12 36  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0]


In [None]:
from sklearn.model_selection import train_test_split

X = df['EN']
y = df['DE']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.85, random_state=42)

In [None]:
# Preprocess function
def preprocess(x,y):
    preprocess_x, x_token = tokenize(x)
    preprocess_y, y_token = tokenize(y)
    
    preprocess_x = pad(preprocess_x)
    preprocess_y = pad(preprocess_y)
    
    # Keras's sparse_categorical_crossentropy function requires the labels to be in 3 dimensions
    preprocess_y = preprocess_y.reshape(*preprocess_y.shape, 1)
   
    return preprocess_x, preprocess_y, x_token, y_token 

#Apply to data
pre_EN, pre_DE, EN_token, DE_token= preprocess(X_train, y_train)

max_EN_length = pre_EN.shape[1]
max_DE_length = pre_DE.shape[1]
EN_vocab_size = len(EN_token.word_index)
DE_vocab_size = len(DE_token.word_index)

print('Data Preprocessed')
print("Max English sentence length:", max_EN_length)
print("Max German sentence length:", max_DE_length)
print("English vocabulary size:", EN_vocab_size)
print("German vocabulary size:", DE_vocab_size)

Data Preprocessed
Max English sentence length: 212
Max German sentence length: 203
English vocabulary size: 11364
German vocabulary size: 15938


In [None]:
# Function to give the German translation
def logits_to_text(logits, tokenizer):
    """
    Turn logits from a neural network into text using the tokenizer
    :param logits: Logits from a neural network
    :param tokenizer: Keras Tokenizer fit on the labels
    :return: String that represents the text of the logits
    """
    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
    index_to_words[0] = ''

    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

In [None]:
# Model builder
def simple_model(input_shape, output_sequence_length, english_vocab_size, german_vocab_size):
    learning_rate = 0.005
    model = Sequential()
    model.add(GRU(256, input_shape=input_shape[1:], return_sequences=True))
    model.add(TimeDistributed(Dense(1024, activation='relu')))
    model.add(Dropout(0.5))
    model.add(TimeDistributed(Dense(german_vocab_size, activation='softmax'))) 

    # Compile model
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])
    return model

# Reshaping the input to work with a basic RNN
tmp_x = pad(pre_EN, max_DE_length)
tmp_x = tmp_x.reshape((-1, pre_DE.shape[-2], 1))

# Train the neural network
simple_rnn_model = simple_model(
    tmp_x.shape,
    max_DE_length,
    EN_vocab_size,
    DE_vocab_size)

print(simple_rnn_model.summary())

simple_rnn_model.fit(tmp_x, pre_DE, batch_size=512, epochs=10, validation_split=0.2)

# Print prediction(s)
print(logits_to_text(simple_rnn_model.predict(tmp_x[:1])[0], DE_token))

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru_4 (GRU)                 (None, 203, 256)          198912    
                                                                 
 time_distributed_8 (TimeDis  (None, 203, 1024)        263168    
 tributed)                                                       
                                                                 
 dropout_4 (Dropout)         (None, 203, 1024)         0         
                                                                 
 time_distributed_9 (TimeDis  (None, 203, 15938)       16336450  
 tributed)                                                       
                                                                 
Total params: 16,798,530
Trainable params: 16,798,530
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoc

In [None]:
tmp_x.shape

(7500, 203, 1)

<keras.preprocessing.text.Tokenizer at 0x7fda50bd30d0>

In [72]:
# Print prediction(s)
print("Prediction:")
print(logits_to_text(simple_rnn_model.predict(tmp_x[:1])[0], DE_token))
print(' ')
print('Correct German translation: ', y_train[:2])
print(' ')
print('Original English text: ', X_train[:1])

Prediction:
'artikel'                                                                                                                                                                                                          
 
Correct German translation:  48243                                     ['artikel', '1']
3057     ['das', 'im', 'innern', 'des', 'packstücks', '...
Name: DE, dtype: object
 
Original English text:  48243    ['article', '1']
Name: EN, dtype: object
