In [None]:
import string
import re
import numpy as np
from pickle import dump
from unicodedata import normalize
from numpy import array
def load_doc(filename):
    """Read an entire UTF-8 text file into memory.

    Args:
        filename: Path of the text file to read.

    Returns:
        The complete contents of the file as a single string.
    """
    # The context manager closes the handle even when read() raises.
    with open(filename, mode='rt', encoding='utf-8') as file:
        return file.read()
def to_pairs(doc):
    """Split a loaded document into rows of tab-separated fields.

    Args:
        doc: Full document text, one record per line.

    Returns:
        A list with one entry per line; each entry is the list of fields
        obtained by splitting that line on tab characters.
    """
    rows = []
    # Leading/trailing whitespace is stripped from the document before splitting.
    for record in doc.strip().split('\n'):
        rows.append(record.split('\t'))
    return rows
def clean_pairs(lines):
    """Normalize every field of every row to lowercase ASCII alphabetic words.

    Each field is transliterated to ASCII, split on whitespace, lowercased,
    stripped of punctuation and non-printable characters, and reduced to the
    tokens that are purely alphabetic. The surviving tokens are re-joined
    with single spaces.

    Args:
        lines: Iterable of rows, each row an iterable of strings.

    Returns:
        A numpy array built from the list of cleaned rows.
    """
    # Matches any character that is not in string.printable.
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    # Translation table that deletes every ASCII punctuation character.
    table = str.maketrans('', '', string.punctuation)
    cleaned = list()
    for pair in lines:
        clean_pair = list()
        for line in pair:
            # The German sharp s (U+00DF / U+1E9E) has no Unicode decomposition,
            # so NFD + ASCII-ignore would delete it and truncate the word
            # (e.g. "gru" instead of "gruss"). Transliterate it first.
            line = line.replace('\u00df', 'ss').replace('\u1e9e', 'SS')
            # Decompose accented characters, then drop the non-ASCII marks.
            line = normalize('NFD', line).encode('ascii', 'ignore')
            line = line.decode('UTF-8')
            words = [word.lower().translate(table) for word in line.split()]
            words = [re_print.sub('', word) for word in words]
            # Tokens containing digits (or left empty) are discarded.
            words = [word for word in words if word.isalpha()]
            clean_pair.append(' '.join(words))
        cleaned.append(clean_pair)
    return array(cleaned)
def save_clean_data(sentences, filename):
    """Pickle ``sentences`` to ``filename`` and print a confirmation line.

    Args:
        sentences: Object to serialize.
        filename: Destination path; the file is opened in binary write mode.
    """
    # Close the handle deterministically so the pickle is flushed to disk.
    with open(filename, 'wb') as file:
        dump(sentences, file)
    print('Saved: %s' % filename)
# Load the raw tab-separated corpus, clean it, and persist the result.
filename = '/content/deu.txt'
doc = load_doc(filename)
pairs = to_pairs(doc)
# NOTE: this rebinds the name `clean_pairs` from the function to its result.
clean_pairs = clean_pairs(pairs)
save_clean_data(clean_pairs, 'english-duetch.pkl')
# Preview at most 100 rows; a smaller corpus must not raise IndexError.
for i in range(min(100, len(clean_pairs))):
    print('[%s] => [%s]' % (clean_pairs[i,0], clean_pairs[i,1]))

Saved: english-duetch.pkl
[go] => [geh]
[hi] => [hallo]
[hi] => [gru gott]
[run] => [lauf]
[run] => [lauf]
[wow] => [potzdonner]
[wow] => [donnerwetter]
[duck] => [kopf runter]
[fire] => [feuer]
[help] => [hilfe]
[help] => [zu hulf]
[hide] => [versteck dich]
[hide] => [versteckt euch]
[stay] => [bleib]
[stop] => [stopp]
[stop] => [anhalten]
[wait] => [warte]
[wait] => [warte]
[begin] => [fang an]
[do it] => [mache es]
[do it] => [tue es]
[go on] => [mach weiter]
[hello] => [hallo]
[hello] => [sers]
[hello] => [hallo]
[hurry] => [beeil dich]
[hurry] => [schnell]
[i hid] => [ich versteckte mich]
[i hid] => [ich habe mich versteckt]
[i ran] => [ich rannte]
[i see] => [ich verstehe]
[i see] => [aha]
[i try] => [ich versuche es]
[i try] => [ich probiere es]
[i won] => [ich hab gewonnen]
[i won] => [ich habe gewonnen]
[i won] => [ich habe gewonnen]
[oh no] => [oh nein]
[relax] => [entspann dich]
[shoot] => [feuer]
[shoot] => [schie]
[smile] => [lacheln]
[sorry] => [entschuldigung]
[ask me] =

In [None]:
from pickle import load
from pickle import dump
from numpy.random import rand
from numpy.random import seed
from numpy.random import shuffle
def load_clean_sentences(filename):
    """Unpickle and return the object stored in ``filename``.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        Whatever object was pickled into the file.
    """
    # NOTE(review): pickle can execute arbitrary code on load; only open
    # files that were produced by a trusted source.
    with open(filename, 'rb') as file:
        return load(file)
def save_clean_data(sentences, filename):
    """Pickle ``sentences`` to ``filename`` and print a confirmation line.

    Args:
        sentences: Object to serialize.
        filename: Destination path; the file is opened in binary write mode.
    """
    # Close the handle deterministically so the pickle is flushed to disk.
    with open(filename, 'wb') as file:
        dump(sentences, file)
    print('Saved: %s' % filename)

raw_dataset = load_clean_sentences('/content/english-duetch.pkl')
# Number of leading rows to keep, and how many of those go to training.
n_sentences = 15000
n_train = 12000
# Seed the global numpy RNG so the shuffle, and therefore the train/test
# split written below, is the same on every run.
seed(1)
dataset = raw_dataset[:n_sentences, :]
# shuffle() reorders the rows in place.
shuffle(dataset)
train, test = dataset[:n_train], dataset[n_train:]
save_clean_data(dataset, 'english-duetch-both.pkl')
save_clean_data(train, 'english-duetch-train.pkl')
save_clean_data(test, 'english-duetch-test.pkl')


Saved: english-duetch-both.pkl
Saved: english-duetch-train.pkl
Saved: english-duetch-test.pkl


In [None]:
from pickle import load
from numpy import array
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import RepeatVector
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.callbacks import ModelCheckpoint

#load clean sentences
def load_clean_sentences(filename):
    """Unpickle and return the object stored in ``filename``.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        Whatever object was pickled into the file.
    """
    # NOTE(review): pickle can execute arbitrary code on load; only open
    # files that were produced by a trusted source.
    with open(filename, 'rb') as file:
        return load(file)

#build a tokenizer fitted on the given lines
def create_tokenizer(lines):
    """Create a new ``Tokenizer`` and fit it on ``lines``.

    Args:
        lines: Texts passed to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted tokenizer.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

#longest line, measured in whitespace-separated tokens
def max_length(lines):
    """Return the largest whitespace-separated token count among ``lines``.

    Args:
        lines: Iterable of strings.

    Returns:
        The maximum of ``len(line.split())`` over all lines.
    """
    token_counts = [len(text.split()) for text in lines]
    return max(token_counts)

#integer-encode lines and pad them to a fixed length
def encode_sequences(tokenizer, length, lines):
    """Convert ``lines`` to integer sequences padded to ``length``.

    Args:
        tokenizer: Object providing ``texts_to_sequences``.
        length: Value passed as ``maxlen`` to ``pad_sequences``.
        lines: Texts to encode.

    Returns:
        The result of ``pad_sequences`` with padding added at the end
        (``padding='post'``).
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(lines),
        maxlen=length,
        padding='post',
    )

#one-hot encode the target sequences
def encode_output(sequences, vocab_size):
    """One-hot encode every integer sequence in ``sequences``.

    Args:
        sequences: 2-D array of integer ids (its first two ``shape``
            entries are used for the final reshape).
        vocab_size: Number of classes for the one-hot encoding.

    Returns:
        Array reshaped to ``(sequences.shape[0], sequences.shape[1], vocab_size)``.
    """
    one_hot = array([to_categorical(row, num_classes=vocab_size) for row in sequences])
    n_rows, n_steps = sequences.shape[0], sequences.shape[1]
    return one_hot.reshape(n_rows, n_steps, vocab_size)

#build the encoder-decoder NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    """Assemble the Sequential translation network.

    Args:
        src_vocab: Input dimension of the embedding layer.
        tar_vocab: Number of units in the softmax output layer.
        src_timesteps: Passed as ``input_length`` to the embedding layer.
        tar_timesteps: Number of repetitions applied by ``RepeatVector``.
        n_units: Embedding size and number of units in both LSTM layers.

    Returns:
        The model, not yet compiled.
    """
    layers = [
        # Embedding with zero masking enabled.
        Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
        # First LSTM returns only its final output.
        LSTM(n_units),
        # Repeat that output once per target timestep.
        RepeatVector(tar_timesteps),
        # Second LSTM returns the full sequence.
        LSTM(n_units, return_sequences=True),
        # Softmax over the target vocabulary at every timestep.
        TimeDistributed(Dense(tar_vocab, activation='softmax')),
    ]
    return Sequential(layers)

#load datasets
dataset = load_clean_sentences('/content/english-duetch-both.pkl')
train = load_clean_sentences('/content/english-duetch-train.pkl')
test = load_clean_sentences('/content/english-duetch-test.pkl')

#prepare english tokenizer (column 0)
eng_tokenizer = create_tokenizer(dataset[:, 0])
# +1 on top of the number of known words
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])

#prepare duetch tokenizer (column 1)
duetch_tokenizer = create_tokenizer(dataset[:, 1])
duetch_vocab_size = len(duetch_tokenizer.word_index) + 1
duetch_length = max_length(dataset[:, 1])

#prepare training data: column 1 is the model input, column 0 the target
trainX = encode_sequences(duetch_tokenizer, duetch_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
trainY = encode_output(trainY, eng_vocab_size)

#prepare test data
testX = encode_sequences(duetch_tokenizer, duetch_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_output(testY, eng_vocab_size)

#define model
model = define_model(duetch_vocab_size, eng_vocab_size, duetch_length, eng_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')
# Build the model by specifying the input shape
model.build(input_shape=(None, duetch_length))
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()
plot_model(model, to_file='model.png', show_shapes=True)

#fit model, keeping only the checkpoint with the lowest validation loss
filename = 'model.keras'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(trainX, trainY, epochs=40, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)



None
Epoch 1/40

Epoch 1: val_loss improved from inf to 3.51934, saving model to model.keras
188/188 - 12s - 62ms/step - loss: 4.1338 - val_loss: 3.5193
Epoch 2/40

Epoch 2: val_loss improved from 3.51934 to 3.37888, saving model to model.keras
188/188 - 3s - 15ms/step - loss: 3.3593 - val_loss: 3.3789
Epoch 3/40

Epoch 3: val_loss improved from 3.37888 to 3.18646, saving model to model.keras
188/188 - 5s - 25ms/step - loss: 3.1526 - val_loss: 3.1865
Epoch 4/40

Epoch 4: val_loss improved from 3.18646 to 3.03208, saving model to model.keras
188/188 - 2s - 12ms/step - loss: 2.9484 - val_loss: 3.0321
Epoch 5/40

Epoch 5: val_loss improved from 3.03208 to 2.90493, saving model to model.keras
188/188 - 3s - 15ms/step - loss: 2.7596 - val_loss: 2.9049
Epoch 6/40

Epoch 6: val_loss improved from 2.90493 to 2.76886, saving model to model.keras
188/188 - 5s - 26ms/step - loss: 2.5801 - val_loss: 2.7689
Epoch 7/40

Epoch 7: val_loss improved from 2.76886 to 2.64434, saving model to model.keras


<keras.src.callbacks.history.History at 0x7c48d653fb50>

In [None]:
from pickle import load
from numpy import array
from numpy import argmax
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

#load clean dataset
def load_clean_sentences(filename):
    """Unpickle and return the object stored in ``filename``.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        Whatever object was pickled into the file.
    """
    # NOTE(review): pickle can execute arbitrary code on load; only open
    # files that were produced by a trusted source.
    with open(filename, 'rb') as file:
        return load(file)

#build a tokenizer fitted on the given lines
def create_tokenizer(lines):
    """Create a new ``Tokenizer`` and fit it on ``lines``.

    Args:
        lines: Texts passed to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted tokenizer.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted
#longest sentence, measured in whitespace-separated tokens
def max_length(lines):
    """Return the largest whitespace-separated token count among ``lines``.

    Args:
        lines: Iterable of strings.

    Returns:
        The maximum of ``len(line.split())`` over all lines.
    """
    token_counts = [len(text.split()) for text in lines]
    return max(token_counts)
#integer-encode lines and pad them to a fixed length
def encode_sequences(tokenizer, length, lines):
    """Convert ``lines`` to integer sequences padded to ``length``.

    Args:
        tokenizer: Object providing ``texts_to_sequences``.
        length: Value passed as ``maxlen`` to ``pad_sequences``.
        lines: Texts to encode.

    Returns:
        The result of ``pad_sequences`` with padding added at the end
        (``padding='post'``).
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(lines),
        maxlen=length,
        padding='post',
    )

#look up the word that belongs to an integer id
def word_for_id(integer, tokenizer):
    """Return the first word in ``tokenizer.word_index`` mapped to ``integer``.

    Args:
        integer: Id to look up.
        tokenizer: Object exposing a ``word_index`` mapping of word -> id.

    Returns:
        The matching word, or ``None`` when no word has that id.
    """
    matches = (
        word
        for word, index in tokenizer.word_index.items()
        if index == integer
    )
    return next(matches, None)

#generate the target sentence for one encoded source sequence
def predicr_target(model, tokenizer, source):
    """Decode the model's prediction for ``source`` into a sentence.

    Args:
        model: Object whose ``predict(source, verbose=0)`` returns a batch
            of per-timestep score vectors; only the first item is used.
        tokenizer: Object exposing a ``word_index`` mapping of word -> id,
            used to turn predicted ids back into words.
        source: Encoded input passed straight to ``model.predict``.

    Returns:
        The predicted words joined by single spaces. Decoding stops at the
        first id that has no word in ``tokenizer.word_index``
        (presumably the padding id 0 -- verify against the tokenizer).
    """
    prediction = model.predict(source, verbose=0)[0]
    integers = [argmax(vector) for vector in prediction]
    # Build the id -> word table once per call instead of scanning the whole
    # vocabulary for every predicted token. setdefault keeps the first word
    # seen for an id, matching a first-match linear search.
    id_to_word = dict()
    for word, index in tokenizer.word_index.items():
        id_to_word.setdefault(index, word)
    target = list()
    for i in integers:
        word = id_to_word.get(i)
        if word is None:
            break
        target.append(word)
    return ' '.join(target)

#evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
    """Translate every source sequence and print corpus BLEU-1..4 scores.

    The first ten examples are also printed as source / target / prediction.

    Args:
        model: Trained model handed to ``predicr_target``.
        tokenizer: Tokenizer used to decode predicted ids into words.
        sources: Encoded source sequences, one row per example.
        raw_dataset: Cleaned text rows aligned with ``sources``; column 0 is
            the reference translation and column 1 the source sentence.
            Any further columns are ignored.
    """
    actual, predicted = list(), list()
    for i, source in enumerate(sources):
        #translate encoded source text (predict expects a batch of one)
        source = source.reshape((1, source.shape[0]))
        # Decode with the tokenizer that was passed in rather than the
        # notebook-level global.
        translation = predicr_target(model, tokenizer, source)
        # Index the two text columns explicitly: a 3-way unpack bound the
        # "source" to the trailing attribution column instead of the
        # sentence, and failed outright on two-column rows.
        raw_target, raw_src = raw_dataset[i][0], raw_dataset[i][1]
        if i < 10:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
        # corpus_bleu takes a list of reference lists per hypothesis.
        actual.append([raw_target.split()])
        predicted.append(translation.split())
    #calculate the BLEU Score
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    # NOTE(review): these weights sum to 0.9, not 1; left unchanged so the
    # reported BLEU-3 stays comparable with earlier runs.
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

#read the combined, training and test splits back from disk
dataset = load_clean_sentences('/content/english-duetch-both.pkl')
train = load_clean_sentences('/content/english-duetch-train.pkl')
test = load_clean_sentences('/content/english-duetch-test.pkl')

#english side: tokenizer, vocabulary size, longest sentence (column 0)
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = 1 + len(eng_tokenizer.word_index)
eng_length = max_length(dataset[:, 0])

#duetch side: tokenizer, vocabulary size, longest sentence (column 1)
duetch_tokenizer = create_tokenizer(dataset[:, 1])
duetch_vocab_size = 1 + len(duetch_tokenizer.word_index)
duetch_length = max_length(dataset[:, 1])

#encode the model inputs for both splits
trainX = encode_sequences(duetch_tokenizer, duetch_length, train[:, 1])
testX = encode_sequences(duetch_tokenizer, duetch_length, test[:, 1])

#restore the saved model
model = load_model('/content/model.keras')

#evaluate on the training split first, then on the test split
for split_name, encoded_sources, raw_rows in (('train', trainX, train), ('test', testX, test)):
    print(split_name)
    evaluate_model(model, eng_tokenizer, encoded_sources, raw_rows)

train
src=[ccby france attribution tatoebaorg spamster pfirsichbaeumchen], target=[shes too loud], predicted=[shes too loud]
src=[ccby france attribution tatoebaorg ck vortarulo], target=[hes a gardener], predicted=[hes a gardener]
src=[ccby france attribution tatoebaorg ck muiriel], target=[dinners ready], predicted=[dinner is ready]
src=[ccby france attribution tatoebaorg ck wolfgangth], target=[we were close], predicted=[we were close]
src=[ccby france attribution tatoebaorg ck kolonjano], target=[im optimistic], predicted=[im optimistic]
src=[ccby france attribution tatoebaorg hybrid jensodo], target=[are you blind], predicted=[are you blind]
src=[ccby france attribution tatoebaorg ck alexander], target=[its about time], predicted=[its so time]
src=[ccby france attribution tatoebaorg ck esperantostern], target=[tom lied], predicted=[tom was lying]
src=[ccby france attribution tatoebaorg ck robroy], target=[ill go], predicted=[im go go]
src=[ccby france attribution tatoebaorg ck pne