In [0]:
from __future__ import absolute_import, division, print_function, unicode_literals
%tensorflow_version 2.x
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping as EarlyStopping
from tensorflow.python.client import device_lib
from tensorflow.keras.layers import Activation
print(device_lib.list_local_devices())
import numpy as np
import os
import re
import time
import string
import glob
from string import maketrans

In [0]:
!git clone https://github.com/michalovsky/trilogy_data.git

In [147]:
#Read all file paths from directory
directory = "trilogy_data/"
file_paths = glob.glob(directory +"*.txt")    
print("Found", len(file_paths), "text files in directory:", directory)

Found 6 text files in directory: trilogy_data/


In [187]:
# Extract text from all text files
text = ""

for file_path in file_paths:
    with open(file_path, 'r') as file:
        file_content = file.read()
        #remove file beginning
        file_content = file_content[file_content.find("ISBN") + len("ISBN"):]
        #remove file ending
        file_content = file_content[:file_content.rfind("-----")]
        text+=file_content
print ('Length of text: {} characters'.format(len(text)))

Length of text: 5077327 characters


In [188]:
# Preprocess text

punctuation_translator = str.maketrans('–—”„…«»', '       ', string.punctuation)
digits_translator = str.maketrans('', '', string.digits)
polish_characters_translator = str.maketrans('ąćęłńóśźż', 'acelnoszz', 'äöü')

# remove redundant characters and replace polish characters
text = text.lower().translate(punctuation_translator).translate(digits_translator).translate(polish_characters_translator)

# remove "tom <number>" strings 
text = re.sub(r"\ntom\s(.*)\n", "", text)

# remove "rozdzial <number>" strings 
text = re.sub(r"\nrozdzial\s(.*)\n", "", text)

#remove extra spaces and new lines
text = ' '.join(text.split())

print ('Length of text after preprocessing: {} characters'.format(len(text)))

['i', 'trzeci', 'ii', 'pierwszy', 'drugi']
Length of text after preprocessing: 4827189 characters


In [168]:
# split text into words
words = text.split(" ")
print ('Amount of words: {}'.format(len(words)))

vocab = sorted(set(text))
print ('{} unique characters'.format(len(vocab)))

Amount of words: 779507
27 unique characters


In [0]:
# Creating a mapping from unique characters to indices
char2idx = {u:i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)

#encode text from characters to numbers  
encoded = np.array([char2idx[ch] for ch in text])  

In [170]:
# Print unique characters
print('{')
for char in char2idx:
    print('  {:4s}: {:3d},'.format(repr(char), char2idx[char]))
print('}')

{
  ' ' :   0,
  'a' :   1,
  'b' :   2,
  'c' :   3,
  'd' :   4,
  'e' :   5,
  'f' :   6,
  'g' :   7,
  'h' :   8,
  'i' :   9,
  'j' :  10,
  'k' :  11,
  'l' :  12,
  'm' :  13,
  'n' :  14,
  'o' :  15,
  'p' :  16,
  'q' :  17,
  'r' :  18,
  's' :  19,
  't' :  20,
  'u' :  21,
  'v' :  22,
  'w' :  23,
  'x' :  24,
  'y' :  25,
  'z' :  26,
}


In [0]:
# The maximum length sentence we want for a single input in characters
sequence_length = 100
examples_per_epoch = len(encoded)//sequence_length

# Create trainging examples
char_dataset = tf.data.Dataset.from_tensor_slices(encoded)

# Create sequences from dataset
sequences = char_dataset.batch(sequence_length+1, drop_remainder=True)

In [172]:
#first 10 sentence batches
for item in sequences.take(10):
  print(repr(''.join(idx2char[item.numpy()])))

'tom i rok byl to dziwny rok w ktorym rozmaite znaki na niebie i ziemi zwiastowaly jakowes kleski i na'
'dzwyczajne zdarzenia wspolczesni kronikarze wspominaja iz z wiosny szarancza w nieslychanej ilosci wy'
'roila sie z dzikich pol i zniszczyla zasiewy i trawy co bylo przepowiednia napadow tatarskich latem z'
'darzylo sie wielkie zacmienie slonca a wkrotce potem kometa pojawila sie na niebie w warszawie widywa'
'no tez nad miastem mogile i krzyz ognisty w oblokach odprawiano wiec posty i dawano jalmuzny gdyz nie'
'ktorzy twierdzili ze zaraza spadnie na kraj i wygubi rodzaj ludzki nareszcie zima nastala tak lekka z'
'e najstarsi ludzie nie pamietali podobnej w poludniowych wojewodztwach lody nie popetaly wcale wod kt'
'ore podsycane topniejacym kazdego ranka sniegiem wystapily z lozysk i pozalewaly brzegi padaly czeste'
' deszcze step rozmokl i zmienil sie w wielka kaluze slonce zas w poludnie dogrzewalo tak mocno ze dzi'
'w nad dziwy w wojewodztwie braclawskim i na dzikich polach ziel

In [0]:
# Transform each sequence into two sequences: input(same as sequence), target (shifted by one index)

def split_input_target(chunk):
  input_text = chunk[:-1]
  target_text = chunk[1:]
  return input_text, target_text

dataset = sequences.map(split_input_target)

In [0]:
# First input data and corresponding target data
for input_example, target_example in  dataset.take(1):
  print ('Input data: ', repr(''.join(idx2char[input_example.numpy()])))
  print ('Target data:', repr(''.join(idx2char[target_example.numpy()])))

Input data:  'in a hole in the ground there lived a hobbit not a nasty dirty wet hole filled with the ends of worm'
Target data: 'n a hole in the ground there lived a hobbit not a nasty dirty wet hole filled with the ends of worms'


In [0]:
# Shuffle dataset
batch_size = 64
steps_per_epoch = examples_per_epoch//batch_size
buffer_size = 10000
dataset = dataset.shuffle(buffer_size).batch(batch_size, drop_remainder=True)

In [0]:
# Testing the GPU presence before feeding the model to take advantage of the tensorflow GRU gpu implemenation
if tf.test.is_gpu_available():
  rnn = tf.keras.layers.CuDNNGRU
else:
  import functools
  rnn = functools.partial(
    tf.keras.layers.GRU, recurrent_activation='sigmoid')

In [0]:
# Defining function building model with two GRU Rnn layers and output to dense layer 
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim,
                                  batch_input_shape=[batch_size, None]),
        rnn(rnn_units,
            return_sequences=True,
            recurrent_initializer='glorot_uniform',
            stateful=True),
        rnn(rnn_units,
            return_sequences=True,
            recurrent_initializer='glorot_uniform',
            stateful=True),
    
        tf.keras.layers.Dense(vocab_size)])
    return model

In [0]:
# Length of the vocabulary (amount of unique characters)
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

In [0]:
# Build model
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size)

In [0]:
# Model informations
for input_example_batch, target_example_batch in dataset.take(1):
  example_batch_predictions = model(input_example_batch)
  print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")
    
model.summary()

(64, 100, 37) # (batch_size, sequence_length, vocab_size)
Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (64, None, 256)           9472      
_________________________________________________________________
gru_10 (GRU)                 (64, None, 1024)          3938304   
_________________________________________________________________
gru_11 (GRU)                 (64, None, 1024)          6297600   
_________________________________________________________________
dense_5 (Dense)              (64, None, 37)            37925     
Total params: 10,283,301
Trainable params: 10,283,301
Non-trainable params: 0
_________________________________________________________________


In [0]:
# Get first predictions

sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()

print("Input: \n", repr("".join(idx2char[input_example_batch[0]])))
print("Next character predictions: \n", repr("".join(idx2char[sampled_indices ])))

In [0]:
# Define loss function

def loss(labels, logits):
  return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

example_batch_loss  = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", example_batch_loss.numpy().mean())

In [0]:
model.compile(
    optimizer = tf.optimizers.Adam(),
    loss = loss)