In [None]:
import os
import urllib.request

url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/deu.txt"
filename = "deu.txt"

# Download the corpus only when no non-empty local copy exists, so that
# re-running the notebook does not fetch the same file from the network again.
if os.path.exists(filename) and os.path.getsize(filename) > 0:
    print(f"File '{filename}' already present, skipping download.")
else:
    urllib.request.urlretrieve(url, filename)
    print(f"File '{filename}' downloaded successfully.")

File 'deu.txt' downloaded successfully.


In [None]:
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array
import urllib.request

# load doc into memory
def load_doc(filename):
	"""Read a UTF-8 text file and return its whole content as one string.

	Args:
		filename: path of the text file to read.

	Returns:
		The complete file content as a str.
	"""
	# The context manager closes the handle even if read() raises.
	with open(filename, mode='rt', encoding='utf-8') as file:
		return file.read()

# split a loaded document into sentences
def to_pairs(doc):
	"""Split a tab-separated document into a list of per-line field lists.

	Leading/trailing whitespace of the whole document is stripped first, then
	every line is split on tab characters.
	"""
	rows = []
	for raw_line in doc.strip().split('\n'):
		rows.append(raw_line.split('\t'))
	return rows

# clean a list of lines
def clean_pairs(lines):
	"""Normalise every sentence of every pair to lowercase ASCII words.

	Each sentence is transliterated to ASCII, split on whitespace, lowercased,
	stripped of punctuation and non-printable characters, and reduced to its
	purely alphabetic tokens, which are re-joined with single spaces.

	Args:
		lines: iterable of pairs (iterables of str), e.g. [english, german].

	Returns:
		A numpy array built from the list of cleaned pairs.
	"""
	cleaned = list()
	# prepare regex for char filtering
	re_print = re.compile('[^%s]' % re.escape(string.printable))
	# prepare translation table for removing punctuation
	table = str.maketrans('', '', string.punctuation)
	for pair in lines:
		clean_pair = list()
		for line in pair:
			# The sharp s (U+00DF / U+1E9E) has no NFD decomposition, so the
			# ASCII encode below would silently delete it ("weiss" -> "wei").
			# Transliterate it to "ss" first.
			line = line.replace('\u00df', 'ss').replace('\u1e9e', 'SS')
			# decompose accented characters, then drop the combining marks
			line = normalize('NFD', line).encode('ascii', 'ignore')
			line = line.decode('UTF-8')
			# tokenize on white space
			line = line.split()
			# convert to lowercase
			line = [word.lower() for word in line]
			# remove punctuation from each token
			line = [word.translate(table) for word in line]
			# remove non-printable chars from each token
			line = [re_print.sub('', w) for w in line]
			# keep only purely alphabetic tokens (drops numbers and empty tokens)
			line = [word for word in line if word.isalpha()]
			# store as string
			clean_pair.append(' '.join(line))
		cleaned.append(clean_pair)
	return array(cleaned)

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
	"""Pickle `sentences` to `filename` and print a confirmation line.

	Args:
		sentences: object to serialise (here the array of cleaned pairs).
		filename: destination path of the pickle file.
	"""
	# Close the handle deterministically so the data is flushed to disk before
	# anything tries to read the file back.
	with open(filename, 'wb') as handle:
		dump(sentences, handle)
	print('Saved: %s' % filename)

# load dataset
filename = "deu.txt"
doc = load_doc(filename)
# split into english-german pairs
pairs = to_pairs(doc)
# clean sentences; the result gets its own name so that the clean_pairs()
# function is not overwritten by its return value
cleaned_pairs = clean_pairs(pairs)
# save clean pairs to file
save_clean_data(cleaned_pairs, 'english-german.pkl')
# spot check: print at most the first 100 pairs (fewer if the corpus is shorter)
for i in range(min(100, len(cleaned_pairs))):
	print('[%s] => [%s]' % (cleaned_pairs[i,0], cleaned_pairs[i,1]))

Saved: english-german.pkl
[hi] => [hallo]
[hi] => [gru gott]
[run] => [lauf]
[wow] => [potzdonner]
[wow] => [donnerwetter]
[fire] => [feuer]
[help] => [hilfe]
[help] => [zu hulf]
[stop] => [stopp]
[wait] => [warte]
[hello] => [hallo]
[i try] => [ich probiere es]
[i won] => [ich hab gewonnen]
[i won] => [ich habe gewonnen]
[smile] => [lacheln]
[cheers] => [zum wohl]
[freeze] => [keine bewegung]
[freeze] => [stehenbleiben]
[got it] => [verstanden]
[got it] => [einverstanden]
[he ran] => [er rannte]
[he ran] => [er lief]
[hop in] => [mach mit]
[hug me] => [druck mich]
[hug me] => [nimm mich in den arm]
[hug me] => [umarme mich]
[i fell] => [ich fiel]
[i fell] => [ich fiel hin]
[i fell] => [ich sturzte]
[i fell] => [ich bin hingefallen]
[i fell] => [ich bin gesturzt]
[i know] => [ich wei]
[i lied] => [ich habe gelogen]
[i lost] => [ich habe verloren]
[im] => [ich bin jahre alt]
[im] => [ich bin]
[im ok] => [mir gehts gut]
[im ok] => [es geht mir gut]
[no way] => [unmoglich]
[no way] => [da

In [None]:
from pickle import dump, load

from numpy import array
from numpy.random import default_rng

# Build the train/test pickles from the cleaned corpus saved in
# 'english-german.pkl' rather than from hand-written placeholder pairs
# (with placeholders the [N_TRAIN:] test slice ends up empty).
#
# The defaults are deliberately small: the training cell one-hot encodes every
# target sentence against the vocabulary of the whole corpus (14849 words x 47
# steps in the recorded run, i.e. roughly 2.8 MB per sentence), so memory use
# grows quickly with N_TRAIN. Raise them if enough memory is available.
N_SENTENCES = 1000  # number of leading corpus pairs to keep
N_TRAIN = 800       # pairs used for training; the remainder is the test set
SPLIT_SEED = 1      # fixed seed so the shuffle is reproducible

# NOTE: only unpickle files written by this notebook; pickle can run code.
with open('english-german.pkl', 'rb') as source_file:
    all_pairs = array(load(source_file))

subset = all_pairs[:N_SENTENCES]
# Shuffle rows through a permutation index so `all_pairs` is left untouched.
cleaned_data = subset[default_rng(SPLIT_SEED).permutation(len(subset))]

# Save the cleaned data as pickle files
for path, rows in (
    ('english-german-both.pkl', cleaned_data),
    ('english-german-train.pkl', cleaned_data[:N_TRAIN]),
    ('english-german-test.pkl', cleaned_data[N_TRAIN:]),
):
    # Close each handle so the data is flushed before it is read back.
    with open(path, 'wb') as target_file:
        dump(rows, target_file)
    print('Saved: %s (%d pairs)' % (path, len(rows)))


In [None]:
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

# load a clean dataset
def load_clean_sentences(filename):
	"""Unpickle and return the object stored in `filename`.

	NOTE: pickle can execute arbitrary code while loading; only use this on
	files written by this notebook.
	"""
	# The context manager closes the handle instead of leaking it.
	with open(filename, 'rb') as handle:
		return load(handle)

# fit a tokenizer
def create_tokenizer(lines):
	"""Return a default-configured Tokenizer fitted on the given texts."""
	fitted = Tokenizer()
	fitted.fit_on_texts(lines)
	return fitted

# max sentence length
def max_length(lines):
	"""Return the largest whitespace-separated token count among `lines`."""
	token_counts = [len(sentence.split()) for sentence in lines]
	return max(token_counts)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
	"""Integer-encode `lines` with `tokenizer` and pad them to `length`.

	Padding is appended after each sequence ('post'); pad_sequences fills
	with zeros by default.
	"""
	token_ids = tokenizer.texts_to_sequences(lines)
	return pad_sequences(token_ids, maxlen=length, padding='post')

# one hot encode target sequence
def encode_output(sequences, vocab_size):
	"""One-hot encode every integer sequence over `vocab_size` classes.

	Args:
		sequences: array whose first two dimensions are (samples, timesteps).
		vocab_size: number of classes for the one-hot encoding.

	Returns:
		Array reshaped to (samples, timesteps, vocab_size).
	"""
	one_hot = [to_categorical(seq, num_classes=vocab_size) for seq in sequences]
	n_samples, n_steps = sequences.shape[0], sequences.shape[1]
	return array(one_hot).reshape(n_samples, n_steps, vocab_size)

# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
	"""Build the (uncompiled) sequence-to-sequence translation model.

	Layer stack: Embedding (zero-masked) -> LSTM -> RepeatVector ->
	LSTM returning sequences -> TimeDistributed softmax Dense.

	Args:
		src_vocab: input dimension of the embedding (source vocabulary size).
		tar_vocab: width of the softmax output (target vocabulary size).
		src_timesteps: input sequence length given to the embedding.
		tar_timesteps: how many times the encoder output is repeated.
		n_units: embedding size and number of units of both LSTM layers.
	"""
	source_embedding = Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True)
	encoder = LSTM(n_units)
	bridge = RepeatVector(tar_timesteps)
	decoder = LSTM(n_units, return_sequences=True)
	projection = TimeDistributed(Dense(tar_vocab, activation='softmax'))
	return Sequential([source_embedding, encoder, bridge, decoder, projection])

# Load datasets
dataset = load_clean_sentences('english-german.pkl')
train = load_clean_sentences('english-german-train.pkl')
test = load_clean_sentences('english-german-test.pkl')

# Prepare English tokenizer (fitted on the full corpus in 'english-german.pkl')
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % eng_length)

# Prepare German tokenizer (fitted on the full corpus as well)
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
print('German Vocabulary Size: %d' % ger_vocab_size)
print('German Max Length: %d' % ger_length)

# prepare training data: German (column 1) is the input, English (column 0) the target
trainX = encode_sequences(ger_tokenizer, ger_length, [pair[1] for pair in train])
trainY = encode_sequences(eng_tokenizer, eng_length, [pair[0] for pair in train])
trainY = encode_output(trainY, eng_vocab_size)

# prepare validation data
testX = encode_sequences(ger_tokenizer, ger_length, [pair[1] for pair in test])
testY = encode_sequences(eng_tokenizer, eng_length, [pair[0] for pair in test])
testY = encode_output(testY, eng_vocab_size)

# define model
model = define_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')
# summarize defined model; summary() prints by itself and returns None, so it
# must not be wrapped in print()
model.summary()
plot_model(model, to_file='model.png', show_shapes=True)
# fit model
filename = 'model.h5'
# Without validation samples Keras reports no 'val_loss', and a checkpoint that
# monitors it never saves a file; fall back to the training loss in that case.
has_validation = len(testX) > 0
monitor = 'val_loss' if has_validation else 'loss'
checkpoint = ModelCheckpoint(filename, monitor=monitor, verbose=1, save_best_only=True, mode='min')
history = model.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(testX, testY) if has_validation else None, callbacks=[checkpoint], verbose=2)

English Vocabulary Size: 14849
English Max Length: 47
German Vocabulary Size: 30944
German Max Length: 53
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 53, 256)           7921664   
                                                                 
 lstm (LSTM)                 (None, 256)               525312    
                                                                 
 repeat_vector (RepeatVector  (None, 47, 256)          0         
 )                                                               
                                                                 
 lstm_1 (LSTM)               (None, 47, 256)           525312    
                                                                 
 time_distributed (TimeDistr  (None, 47, 14849)        3816193   
 ibuted)                                                         
                



1/1 - 5s - loss: 9.6059 - 5s/epoch - 5s/step
Epoch 2/30




1/1 - 1s - loss: 9.5936 - 567ms/epoch - 567ms/step
Epoch 3/30




1/1 - 1s - loss: 9.5759 - 695ms/epoch - 695ms/step
Epoch 4/30




1/1 - 1s - loss: 9.5418 - 679ms/epoch - 679ms/step
Epoch 5/30




1/1 - 0s - loss: 9.4645 - 452ms/epoch - 452ms/step
Epoch 6/30




1/1 - 0s - loss: 9.2651 - 432ms/epoch - 432ms/step
Epoch 7/30




1/1 - 0s - loss: 8.8644 - 436ms/epoch - 436ms/step
Epoch 8/30




1/1 - 0s - loss: 8.3662 - 459ms/epoch - 459ms/step
Epoch 9/30




1/1 - 0s - loss: 7.8609 - 425ms/epoch - 425ms/step
Epoch 10/30




1/1 - 0s - loss: 7.3569 - 462ms/epoch - 462ms/step
Epoch 11/30




1/1 - 0s - loss: 6.8360 - 433ms/epoch - 433ms/step
Epoch 12/30




1/1 - 0s - loss: 6.2784 - 463ms/epoch - 463ms/step
Epoch 13/30




1/1 - 0s - loss: 5.6761 - 424ms/epoch - 424ms/step
Epoch 14/30




1/1 - 0s - loss: 5.0340 - 472ms/epoch - 472ms/step
Epoch 15/30




1/1 - 0s - loss: 4.3672 - 464ms/epoch - 464ms/step
Epoch 16/30




1/1 - 1s - loss: 3.6965 - 549ms/epoch - 549ms/step
Epoch 17/30




1/1 - 1s - loss: 3.0439 - 671ms/epoch - 671ms/step
Epoch 18/30




1/1 - 0s - loss: 2.4301 - 444ms/epoch - 444ms/step
Epoch 19/30




1/1 - 0s - loss: 1.8758 - 427ms/epoch - 427ms/step
Epoch 20/30




1/1 - 0s - loss: 1.4020 - 447ms/epoch - 447ms/step
Epoch 21/30




1/1 - 0s - loss: 1.0257 - 444ms/epoch - 444ms/step
Epoch 22/30




1/1 - 0s - loss: 0.7505 - 428ms/epoch - 428ms/step
Epoch 23/30




1/1 - 0s - loss: 0.5640 - 455ms/epoch - 455ms/step
Epoch 24/30




1/1 - 0s - loss: 0.4440 - 430ms/epoch - 430ms/step
Epoch 25/30




1/1 - 0s - loss: 0.3687 - 478ms/epoch - 478ms/step
Epoch 26/30




1/1 - 1s - loss: 0.3214 - 540ms/epoch - 540ms/step
Epoch 27/30




1/1 - 1s - loss: 0.2902 - 680ms/epoch - 680ms/step
Epoch 28/30




1/1 - 1s - loss: 0.2680 - 679ms/epoch - 679ms/step
Epoch 29/30




1/1 - 0s - loss: 0.2507 - 469ms/epoch - 469ms/step
Epoch 30/30




1/1 - 0s - loss: 0.2363 - 469ms/epoch - 469ms/step


<keras.callbacks.History at 0x7fe3b46b0040>

In [None]:
from pickle import load
from numpy import array
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu
import itertools

# load a clean dataset
def load_clean_sentences(filename):
	"""Unpickle and return the object stored in `filename`.

	NOTE: pickle can execute arbitrary code while loading; only use this on
	files written by this notebook.
	"""
	# The context manager closes the handle instead of leaking it.
	with open(filename, 'rb') as handle:
		return load(handle)

# fit a tokenizer
def create_tokenizer(lines):
	"""Fit a default-configured Tokenizer on `lines` and return it."""
	text_tokenizer = Tokenizer()
	text_tokenizer.fit_on_texts(lines)
	return text_tokenizer

# max sentence length
def max_length(lines):
	"""Return the highest number of whitespace-separated tokens in any line."""
	return max(map(lambda text: len(text.split()), lines))

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
	"""Convert `lines` to integer sequences and pad each one to `length`.

	Padding goes at the end of each sequence ('post'); pad_sequences fills
	with zeros by default.
	"""
	encoded = tokenizer.texts_to_sequences(lines)
	padded = pad_sequences(encoded, maxlen=length, padding='post')
	return padded

# map an integer to a word
def word_for_id(integer, tokenizer):
	"""Return the first word in `tokenizer.word_index` mapped to `integer`.

	Returns None when no word has that index.
	"""
	matches = (word for word, index in tokenizer.word_index.items() if index == integer)
	return next(matches, None)

# generate target given source sequence
def predict_sequence(model, tokenizer, source):
	"""Translate one encoded source sequence into a space-joined word string.

	Args:
		model: object whose predict(source, verbose=0) returns, for the first
			sample, one score vector per output step.
		tokenizer: object exposing a `word_index` mapping of word -> integer.
		source: encoded input passed to model.predict unchanged.

	Returns:
		The predicted words joined by single spaces. Decoding stops at the
		first step whose best-scoring index has no word (e.g. padding index 0).
	"""
	prediction = model.predict(source, verbose=0)[0]
	# Invert the vocabulary once instead of scanning the whole word_index for
	# every output step; setdefault keeps the first word seen for an index.
	index_to_word = {}
	for word, index in tokenizer.word_index.items():
		index_to_word.setdefault(index, word)
	target = list()
	# best-scoring class of every step in a single vectorised call
	for i in argmax(prediction, axis=-1):
		word = index_to_word.get(int(i))
		if word is None:
			break
		target.append(word)
	return ' '.join(target)

def evaluate_model(model, tokenizer, sources, raw_dataset):
    """Translate every encoded source and print corpus BLEU-1..4 scores.

    Args:
        model: trained model handed to predict_sequence.
        tokenizer: tokenizer used to map predicted indices back to words.
        sources: array of encoded source sequences, one row per sentence.
        raw_dataset: indexable collection of (target, source) text pairs,
            aligned with `sources` by position.

    The first ten source/target/prediction triples are printed as a spot check.
    """
    actual, predicted = list(), list()
    for i, source in enumerate(sources):
        if source.shape[0] == 0:
            continue  # Skip empty sequences
        # translate encoded source text
        source = source.reshape((1, source.shape[0]))
        # use the tokenizer passed in rather than a module-level global
        translation = predict_sequence(model, tokenizer, source)
        raw_target, raw_src = raw_dataset[i]
        if i < 10:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
        # corpus_bleu expects, for each hypothesis, a LIST of references where
        # every reference is a token list; a bare token list would make each
        # word count as a separate reference.
        actual.append([raw_target.split()])
        predicted.append(translation.split())

    if actual and predicted:  # Check if there are non-empty sequences
        # calculate BLEU score
        print('Calculating BLEU scores...')
        print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
        print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
        # cumulative BLEU-3 weights the first three n-gram orders equally (sum 1)
        print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(1 / 3, 1 / 3, 1 / 3, 0)))
        print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))
    else:
        print("No non-empty sequences to evaluate.")

# Load datasets
dataset = load_clean_sentences('english-german.pkl')
train = load_clean_sentences('english-german-train.pkl')
test = load_clean_sentences('english-german-test.pkl')
# Prepare English tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % eng_length)

# Prepare German tokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
print('German Vocabulary Size: %d' % ger_vocab_size)
print('German Max Length: %d' % ger_length)

# prepare data: encode only the German sentence (column 1) of each pair, the
# same way the test set is encoded; passing the whole pair would hand both
# sentences to the tokenizer as if they were two tokens
trainX = encode_sequences(ger_tokenizer, ger_length, [pair[1] for pair in train])
testX = encode_sequences(ger_tokenizer, ger_length, [pair[1] for pair in test])

model_path = 'model.h5'
# save model and architecture to single file
# NOTE: this uses the `model` object left in memory by the training cell and
# overwrites any file already stored at model_path.
model.save(model_path)
print("Saved model to disk")
model = load_model(model_path)
# summarize model.
model.summary()
# test on some training sequences
print('train')
evaluate_model(model, eng_tokenizer, trainX, train)
# test on some test sequences
print('test')
# Evaluate the model on the test set
evaluate_model(model, eng_tokenizer, testX, test)

English Vocabulary Size: 14849
English Max Length: 47
German Vocabulary Size: 30944
German Max Length: 53
Saved model to disk
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 53, 256)           7921664   
                                                                 
 lstm (LSTM)                 (None, 256)               525312    
                                                                 
 repeat_vector (RepeatVector  (None, 47, 256)          0         
 )                                                               
                                                                 
 lstm_1 (LSTM)               (None, 47, 256)           525312    
                                                                 
 time_distributed (TimeDistr  (None, 47, 14849)        3816193   
 ibuted)                                                      

# New Section