<a href="https://colab.research.google.com/github/Hesham-14/Neural-Network-Projects/blob/main/english_to_german_RNN_translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Hesham Mohamed Mohamed
# 8 / 10 / 2021

In [2]:
!pip install reprint

Collecting reprint
  Downloading reprint-0.6.0-py2.py3-none-any.whl (6.0 kB)
Collecting colorama
  Downloading colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Collecting backports.shutil-get-terminal-size
  Downloading backports.shutil_get_terminal_size-1.0.0-py2.py3-none-any.whl (6.5 kB)
Installing collected packages: colorama, backports.shutil-get-terminal-size, reprint
Successfully installed backports.shutil-get-terminal-size-1.0.0 colorama-0.4.4 reprint-0.6.0


In [3]:
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array
import reprint

# function: load doc into memory
def load_doc(filename):
  # open the file as read only
  file = open(filename, mode='rt', encoding='utf-8')
  # read all text
  text = file.read()
  # close the file
  file.close()
  return text

# function: split a loaded document into sentences
def to_pair(doc):
  lines = doc.strip().split('\n')
  pairs = [line.split('\t') for line in lines]
  return pairs

# function: clean a list of lines
def clean_pairs(lines):
  cleaned = list()
  # prepare regex for char filtering
  reprint = re.compile('[^%s]' % re.escape(string.printable)) 
  # prepare translation table for removing punctuation
  table = str.maketrans('', '', string.punctuation)
  for pair in lines:
    clean_pair = list()
    for line in pair:
      # normalize unicode characters
      line = normalize('NFD', line).encode('ascii', 'ignore')
      line = line.decode('UTF-8')
      # line = line.decode('unicode-escape')
      # tokenize on white space
      line = line.split()
      # convert to lowercase
      line = [word.lower() for word in line]
      #remove punctuation from each token
      line = [word.translate(table) for word in line]
      # remove non-printable char from each token
      line = [reprint.sub('', w) for w in line]
      # remove tokens with numbers in them
      line = [word for word in line if word.isalpha()]
      # store as string
      clean_pair.append(' '.join(line))
    cleaned.append(clean_pair)
  return array(cleaned)

# function: save a list of clean sentences to file
def save_clean_data(sentences, filename):
  dump(sentences, open(filename, 'wb'))
  print('Saved: %s' % filename)


#   *******   Main function   *******

# load dataset
filename = '/content/drive/MyDrive/deu.txt'
doc = load_doc(filename)
# split into english-germanpairs
pairs = to_pair(doc)
# clean sentences 
clean_pairs = clean_pairs(pairs)
# save clean pairs to file
save_clean_data(clean_pairs, 'english-gernam.pk1')
# spot check
for i in range(100):
  print('[%s] => [%s]' % (clean_pairs[i,0], clean_pairs[i,1]))


Saved: english-gernam.pk1
[go] => [geh]
[hi] => [hallo]
[hi] => [gru gott]
[run] => [lauf]
[run] => [lauf]
[wow] => [potzdonner]
[wow] => [donnerwetter]
[duck] => [kopf runter]
[fire] => [feuer]
[help] => [hilfe]
[help] => [zu hulf]
[stay] => [bleib]
[stop] => [stopp]
[stop] => [anhalten]
[wait] => [warte]
[wait] => [warte]
[begin] => [fang an]
[do it] => [mache es]
[do it] => [tue es]
[go on] => [mach weiter]
[hello] => [hallo]
[hello] => [sers]
[hurry] => [beeil dich]
[hurry] => [schnell]
[i hid] => [ich versteckte mich]
[i hid] => [ich habe mich versteckt]
[i ran] => [ich rannte]
[i see] => [ich verstehe]
[i see] => [aha]
[i try] => [ich versuche es]
[i try] => [ich probiere es]
[i won] => [ich hab gewonnen]
[i won] => [ich habe gewonnen]
[i won] => [ich habe gewonnen]
[oh no] => [oh nein]
[relax] => [entspann dich]
[shoot] => [feuer]
[shoot] => [schie]
[smile] => [lacheln]
[sorry] => [entschuldigung]
[ask me] => [frag mich]
[ask me] => [fragt mich]
[ask me] => [fragen sie mich]
[at

In [4]:
# Training the model

from pickle import load
from pickle import dump
from numpy.random import rand
from numpy.random import shuffle

# load a clean dataset
def load_clean_sentences(filename):
  return load(open(filename, 'rb'))

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
  dump(sentences, open(filename, 'wb'))
  print('Saved: %s' % filename)

# load dataset
raw_dataset = load_clean_sentences('/content/english-gernam.pk1')

# reduce dataset size
n_sentences = 15000
dataset = raw_dataset[:n_sentences, :]
# random shuffle
shuffle(dataset)
# split into train/test
train, test = dataset[:12000], dataset[12000:]
# save
save_clean_data(dataset, 'english-german-both.pk1')
save_clean_data(train, 'english-german-train.pk1')
save_clean_data(test, 'english-german-test.pk1')

Saved: english-german-both.pk1
Saved: english-german-train.pk1
Saved: english-german-test.pk1


In [None]:
# MODELING

# Initializing the required Libraries
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
#from keras.preprocessing.text.Tokenizer import word_index
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

# load a clean dataset
def load_clean_sentences(filename):
  return load(open(filename, 'rb'))

# fit a tokenizer
def create_tokenizer(lines):
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(lines)
  return tokenizer

# max sentence length
def max_length(lines):
  return max(len(line.split()) for line in lines)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
  # integer encode sequences
  X = tokenizer.texts_to_sequences(lines)
  # pad sequences with 0 values
  X = pad_sequences(X, maxlen=length, padding='post')
  return X

# one hot encode terget sequence
def encode_output(sequences, vocab_size):
  ylist = list()
  for sequence in sequences:
    encoded = to_categorical(sequence, num_classes=vocab_size)
    ylist.append(encoded)
  y = array(ylist)
  y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
  return y

# define Neural Machine Translation (NMT) model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
  model = Sequential()

  model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
  model.add(LSTM(n_units))
  model.add(RepeatVector(tar_timesteps))
  model.add(LSTM(n_units, return_sequences=True))
  model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
  
  return model


# load datasets
dataset = load_clean_sentences('/content/english-german-both.pk1')
train = load_clean_sentences('/content/english-german-train.pk1')
test = load_clean_sentences('/content/english-german-test.pk1')

# Prepare English tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))

# Prepare german tokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
print('English Vocabulary Size: %d' % ger_vocab_size)
print('English Max Length: %d' % (ger_length))

# Preapare Training Data
trainE = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
trainG = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
trainE = encode_output(trainE, eng_vocab_size)

# Preapare Validation Data
testE = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
testG = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
testE = encode_output(testE, eng_vocab_size)

# define our model
model = define_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')
# Summarize defined model
print(model.summary())
plot_model(model, to_file='model.png', show_shapes=True)
# fit Model
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(trainG, trainE, epochs=30,
          batch_size=64, 
          validation_data=(testG, testE), 
          callbacks=[checkpoint],
          verbose=2)



In [81]:
# **** Training the model ****

from pickle import load
from numpy import array
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

# load a clean dataset
def load_clean_sentences(filename):
  return load(open(filename, 'rb'))

# fit a tokenizer
def create_tokenizer(lines):
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(lines)
  return tokenizer


# max sentence length
def max_length(lines):
  return max(len(line.split()) for line in lines)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
  # integer encode sequences
  X = tokenizer.texts_to_sequences(lines)
  # pad sequences with 0 values
  X = pad_sequences(X, maxlen=length, padding='post')
  return X

# map an integer to a word
def word_for_id(integer, tokenizer):
  for word, index in tokenizer.word_index.items():
    if index == integer:
      return word
  return None

# generate target given source sequence
def predict_sequence(model, tokenizer, source):
  prediction = model.predict(source, verbose=0)[0]
  integers = [argmax(vector) for vector in prediction]
  target = list()
  for i in integers:
    word = word_for_id(i, tokenizer)
    if word is None:
      break
    target.append(word)
  return ' '.join(target)

# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
  actual, predicted = list(), list()
  for i, source in enumerate(sources):
    # translate encoded source text
    source = source.reshape((1, source.shape[0]))
    translation = predict_sequence(model, eng_tokenizer, source)
    raw_src, raw_target = raw_dataset[i,1],raw_dataset[i,0]
    if i < 10:
      print('src=[',raw_src, '], ', 'target=[',raw_target, '], ','predicted=[',translation, ']' )
     # print('src=[%s], target=[%s], predicted=[%s]' % raw_src, raw_target, translation)
    if i > 10:
      break
    actual.append([raw_target.split()])
    predicted.append(translation.split())

  # calculate BLEU score
  print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
  print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
  print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
  print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

# load datatsets
dataset = load_clean_sentences('/content/english-german-both.pk1')
train = load_clean_sentences('/content/english-german-train.pk1')
test = load_clean_sentences('/content/english-german-test.pk1')

# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:,0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])

# Prepare german tokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])

# prepare data
trainG = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
testG = encode_sequences(ger_tokenizer, ger_length, test[:, 1])

# load model
model = load_model('/content/model.h5')
# test on some training sequences
print('train')
evaluate_model(model, eng_tokenizer, trainG, train)
# test on some test sequences
print('test')
evaluate_model(model, eng_tokenizer, testG, test)



train
src=[ sie braucht es ],  target=[ she needs it ],  predicted=[ she needs it ]
src=[ ist es versalzen ],  target=[ is it too salty ],  predicted=[ is it too salty ]
src=[ wir lieben es ],  target=[ we do love it ],  predicted=[ we love it ]
src=[ zeig mir das mal ],  target=[ show me that ],  predicted=[ let me see that ]
src=[ es ist fast sechs uhr ],  target=[ its almost six ],  predicted=[ its almost six ]
src=[ wir brauchen es jetzt ],  target=[ we need it now ],  predicted=[ we need it now ]
src=[ niederlegen ],  target=[ lie low ],  predicted=[ lie low ]
src=[ ich bin auf dem weg ],  target=[ im on my way ],  predicted=[ im on at way ]
src=[ tom war nicht schlecht ],  target=[ tom wasnt bad ],  predicted=[ tom wasnt bad ]
src=[ rate mal wer gewonnen hat ],  target=[ guess who won ],  predicted=[ guess who won ]
BLEU-1: 0.916667
BLEU-2: 0.834666
BLEU-3: 0.769756
BLEU-4: 0.525053
test
src=[ ich habe gegahnt ],  target=[ i yawned ],  predicted=[ i have homesick ]
src=[ lassen s