# Neural Machine Translation Example

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

'''
author: aggarwal

'''

In [None]:
import sys

In [None]:
!{sys.executable} -m pip install pandas numpy scikit-learn

In [None]:
import pandas as pd

In [None]:
# Location of the tab-separated English/German sentence-pair file.
# NOTE(review): this is an absolute, machine-specific path -- adjust it before
# running the notebook on another machine.
filename='/home/yasmeen/Documents/Uni DuE/lang-tech-teaching/langtech/notebooks/New_Notebooks/data/eng_deu.txt'

# The file has no header row; the two tab-separated columns are labelled
# 'eng' and 'deu'.
parallel_corpora = pd.read_csv(filename, sep='\t', header = None, names=['eng', 'deu'], encoding = "UTF-8")

In [None]:
parallel_corpora.head(5)

In [None]:
# remove punctuation -- exercise
import string

# Delete every ASCII punctuation character through a translation table.
# Series.str.replace treats its pattern as a literal string by default since
# pandas 2.0, so the former '[...]' character-class pattern would no longer
# match anything there; str.translate behaves the same on every pandas version
# and needs no regex escaping.
punctuation_table = str.maketrans('', '', string.punctuation)
parallel_corpora['eng'] = parallel_corpora['eng'].str.translate(punctuation_table)
parallel_corpora['deu'] = parallel_corpora['deu'].str.translate(punctuation_table)

In [None]:
parallel_corpora.head(5)

In [None]:
limited_parallel_corpora = parallel_corpora[:10000]

In [None]:
# take first 10000 and do train-test

from sklearn.model_selection import train_test_split
# Hold out 20% of the limited corpus as the test split.
# NOTE(review): no random_state is passed, so the rows in each split are
# expected to differ between runs -- confirm whether that is intended.
train, test = train_test_split(limited_parallel_corpora, test_size=0.2)

In [None]:
train[:5]

In [None]:
test[:5]

In [None]:
#install keras

!{sys.executable} -m pip install keras

In [None]:
!{sys.executable} -m pip install tensorflow

In [None]:
# Fit one word-level tokenizer per language on the limited corpus
from keras.preprocessing.text import Tokenizer

# English side
tokenizer_eng = Tokenizer()
tokenizer_eng.fit_on_texts(limited_parallel_corpora['eng'].tolist())

# German side
tokenizer_ger = Tokenizer()
tokenizer_ger.fit_on_texts(limited_parallel_corpora['deu'].tolist())


In [None]:
tokenizer_eng.word_index.items()

In [None]:
# vocab size

'''
+1 is needed because the Keras Tokenizer numbers words starting at 1, while pad_sequences uses 0 as the padding value.
The vocabulary size must therefore cover the indices 0..len(word_index), i.e. len(word_index) + 1 entries.
'''
eng_vocab_size = len(tokenizer_eng.word_index) + 1
ger_vocab_size = len(tokenizer_ger.word_index) + 1

In [None]:
print(eng_vocab_size)
print(ger_vocab_size)

In [None]:
# sequencing and padding
#
# German sentences are the model input (X) and English sentences the target
# (Y). Every sentence is converted to a list of word indices and then
# post-padded to max_length entries.

from keras.preprocessing.sequence import pad_sequences
max_length = 30

# training set
trainX = pad_sequences(
    tokenizer_ger.texts_to_sequences(train['deu'].values),
    maxlen=max_length,
    padding='post',
)
trainY = pad_sequences(
    tokenizer_eng.texts_to_sequences(train['eng'].values),
    maxlen=max_length,
    padding='post',
)

# validation set
testX = pad_sequences(
    tokenizer_ger.texts_to_sequences(test['deu'].values),
    maxlen=max_length,
    padding='post',
)
testY = pad_sequences(
    tokenizer_eng.texts_to_sequences(test['eng'].values),
    maxlen=max_length,
    padding='post',
)

In [None]:
trainY[:1]

In [None]:
#label encoding
import numpy as np
from keras.utils import to_categorical

def encode_output(sequences, vocab_size):
    """One-hot encode a 2-D array of padded word-index sequences.

    Args:
        sequences: array with a ``shape`` of (number of sentences, sequence
            length) holding integer word indices.
        vocab_size: number of classes of the one-hot encoding.

    Returns:
        Array of shape (sentences, sequence length, vocab_size).
    """
    one_hot = np.array(
        [to_categorical(row, num_classes=vocab_size) for row in sequences]
    )
    n_sentences, n_steps = sequences.shape[0], sequences.shape[1]
    return one_hot.reshape(n_sentences, n_steps, vocab_size)

trainY = encode_output(trainY, eng_vocab_size)
testY = encode_output(testY, eng_vocab_size)

In [None]:
trainY[:1]

In [None]:
# generate model
from keras.models import Sequential
from keras.models import load_model
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    """Assemble the (uncompiled) encoder-decoder translation network.

    Args:
        src_vocab: size of the source vocabulary (embedding input dimension).
        tar_vocab: size of the target vocabulary (softmax output dimension).
        src_timesteps: length of the padded source sequences.
        tar_timesteps: number of output timesteps to generate.
        n_units: embedding dimension and number of LSTM units.

    Returns:
        The Sequential model; compiling it is left to the caller.
    """
    network = Sequential()
    stack = [
        # Encoder: embed the source indices (index 0 is masked) and squeeze
        # the sentence into the final LSTM output vector.
        Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
        LSTM(n_units),
        # Decoder: repeat that vector once per target timestep, run a second
        # LSTM over it and predict a word distribution at every step.
        RepeatVector(tar_timesteps),
        LSTM(n_units, return_sequences=True),
        TimeDistributed(Dense(tar_vocab, activation='softmax')),
    ]
    for layer in stack:
        network.add(layer)
    return network

In [None]:
# define model
# Use max_length (the length trainX/trainY were padded to) for the source and
# target timesteps instead of repeating the literal 30, so the model shape
# cannot drift from the padded data when max_length is changed.
model = define_model(ger_vocab_size, eng_vocab_size, max_length, max_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')
# summarize defined model
model.summary()

In [None]:
# fit model
# Checkpoint file; note this rebinds `filename`, which previously held the
# corpus path. The same 'model.h5' file is read back by load_model afterwards.
filename = 'model.h5'
# Write the model to disk only when the validation loss improves.
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# Train for 10 epochs in batches of 64, validating on the test split.
model.fit(trainX, trainY, epochs=10, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint])

In [None]:
model = load_model('model.h5')

In [None]:
testX[:1]

In [None]:
source_sentence = test['deu'].values[0]

print(source_sentence)

In [None]:
# see predictions of first test sentence
pridiction_probabilities = model.predict(testX[:1], verbose=0)[0]
    
print(pridiction_probabilities)

In [None]:
# get word-encoded integers out of the probability map

integers = [np.argmax(vector) for vector in pridiction_probabilities]

print(integers)

In [None]:
# convert integers into words
# Invert the word -> index vocabulary once, so each predicted index is a
# single dictionary lookup instead of a scan over the whole vocabulary.
index_to_word = {index: word for word, index in tokenizer_eng.word_index.items()}

# Indices without a vocabulary entry (e.g. the padding index 0) are skipped.
eng_sentence = [index_to_word[each_int] for each_int in integers if each_int in index_to_word]

translated_sentence = ' '.join(eng_sentence)

print(translated_sentence)

In [None]:
actual_sentence = test['eng'].values[0]

print(actual_sentence)

In [None]:
# evaluate BLEU score


from nltk.translate.bleu_score import corpus_bleu


# corpus_bleu expects one list of tokenized references per hypothesis, hence
# the double brackets around the single reference.
# The Keras Tokenizer lower-cases text by default, so the predicted words are
# lower-case; the reference is lower-cased too, otherwise words that differ
# only in casing (e.g. at the sentence start) would count as mismatches.
bleu_score = corpus_bleu([[actual_sentence.lower().split()]], [translated_sentence.split()])

print(bleu_score)

In [None]:
input_sentence = input('please write a German source sentence')

In [None]:
input_sentence

In [None]:
input_sentence_encoded = tokenizer_ger.texts_to_sequences([input_sentence])
print(input_sentence_encoded)

input_sentence_encoded = pad_sequences(input_sentence_encoded, maxlen=max_length, padding='post')
print(input_sentence_encoded)

In [None]:
input_sentence_encoded

In [None]:
pridiction_probabilities_inp = model.predict(input_sentence_encoded[:1], verbose=0)[0]
    
print(pridiction_probabilities_inp)
integers_inp = [np.argmax(vector) for vector in pridiction_probabilities_inp]

print(integers_inp)

In [None]:
# convert integers into words
# Invert the word -> index vocabulary once, so each predicted index is a
# single dictionary lookup instead of a scan over the whole vocabulary.
index_to_word = {index: word for word, index in tokenizer_eng.word_index.items()}

# Indices without a vocabulary entry (e.g. the padding index 0) are skipped.
eng_sentence = [index_to_word[each_int] for each_int in integers_inp if each_int in index_to_word]

translated_sentence = ' '.join(eng_sentence)

print(translated_sentence)

In [None]:
# EVALUATION

# Tab-separated evaluation pairs without a header row; note the German column
# is called 'ger' here (the training corpus uses 'deu').
eval_file =  pd.read_csv("eng_deu_evaluation.txt", sep='\t', header = None, names=['eng', 'ger'], encoding = "UTF-8")

# Delete every ASCII punctuation character through a translation table.
# Series.str.replace treats its pattern as a literal string by default since
# pandas 2.0, so the former '[...]' character-class pattern would no longer
# match anything there; str.translate behaves the same on every pandas version.
punctuation_table = str.maketrans('', '', string.punctuation)
eval_file['eng'] = eval_file['eng'].str.translate(punctuation_table)
eval_file['ger'] = eval_file['ger'].str.translate(punctuation_table)

# Encode the German side with the tokenizer fitted on the training corpus and
# pad it to the length used for the model input.
evalX = tokenizer_ger.texts_to_sequences(eval_file['ger'].values)
evalX = pad_sequences(evalX, maxlen=max_length, padding='post')

In [None]:
prediction_probabilities = model.predict(evalX, verbose=0)

In [None]:
# Most probable word index at every timestep of every predicted sentence.
integers = [[np.argmax(vector) for vector in array] for array in prediction_probabilities]

# Invert the word -> index vocabulary once; the previous version rescanned the
# whole vocabulary for every predicted index of every sentence.
index_to_word = {index: word for word, index in tokenizer_eng.word_index.items()}

predicted_sentences = []
for array in integers:
    # Indices without a vocabulary entry (e.g. the padding index 0) are skipped.
    eng_sentence = [index_to_word[each_int] for each_int in array if each_int in index_to_word]
    predicted_sentences.append(' '.join(eng_sentence))
print(predicted_sentences)

In [None]:
#gold sentences
print(eval_file['eng'].values)

In [None]:
# corpus_bleu expects, for every hypothesis, a *list* of tokenized references.
# Each gold sentence is therefore wrapped in its own one-element list; without
# that extra nesting every token string is treated as a separate reference.
# The gold text is lower-cased because the Keras Tokenizer lower-cases by
# default, so the predicted words are always lower-case.
references = [[sentence.lower().split()] for sentence in eval_file['eng'].values]
hypotheses = [sentence.split() for sentence in predicted_sentences]
bleu_score = corpus_bleu(references, hypotheses)
print(bleu_score)