In [1]:
from tensorflow import keras
import numpy as np
import tensorflow as tf

########################################################
print(tf.__version__)

from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

if tf.test.gpu_device_name(): 
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
else:
    print("Please install GPU version of TF")


########################################################

2.1.0
[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 9806952563394038630
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 10068632535
locality {
  bus_id: 1
  links {
  }
}
incarnation: 6795952132436190250
physical_device_desc: "device: 0, name: TITAN X (Pascal), pci bus id: 0000:01:00.0, compute capability: 6.1"
]
Default GPU Device: /device:GPU:0


In [2]:
from tika import parser
filepath = "input/karinthy_összes.txt"

with open(filepath, "r", encoding="utf8") as f:
    raw_text = f.read()

In [3]:
print(raw_text[:300])


Karinthy Frigyes 
összes költeménye
Vérmező, 795. május
                 I
Ó, csudacsöndes májusi éj!
Alszik remegőn a hűs Duna tükre -
Lent fekszik a hold a víz fenekén.
Távol a Gellért. - Messze az éjben
Nyulnak a sávok, zöld-feketén.
A Várhegy fekete tömbje
Omlik az éjbe vakon -
Tompa rezes fény


In [4]:
# raw_text = raw['content']
"".join(sorted(set(raw_text.lower())))

'\n !()*,-.0123456789:;?[]abcdefghijklmnopqrstuvwxyzáäéíóöúüčőű‘’”„'

In [22]:
filtered = [char for char in "()*+0123456789§«»–…✝'[]#\\"]
romai = ["I.","II.","III.","IV.","V.","VI.","VII.","VIII.","IX.","X.",
         "XI.","XII.","XIII.","XIV.","XV.","XVI.","XVII.",]

for char in filtered:
    raw_text = raw_text.replace(char, '')


for char in romai:
    raw_text = raw_text.replace(char, '')


raw_text = raw_text.replace('\n', ' ')
raw_text = raw_text.replace('\t', ' ')

"".join(sorted(set(raw_text.lower())))


' !,-.:;?abcdefghijklmnopqrstuvwxyzáäéíóöúüčőű‘’”„'

In [23]:
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts([raw_text])

In [24]:
tokenizer.texts_to_sequences(["First"])

[[25, 12, 11, 5, 4]]

In [25]:
tokenizer.sequences_to_texts(tokenizer.texts_to_sequences(["First"]))

['f i r s t']

In [26]:
max_id = len(tokenizer.word_index) # number of distinct characters
dataset_size = tokenizer.document_count # total number of characters

max_id, dataset_size

(49, 1)

In [27]:
# Encoding the full text (substract 1 to get IDs from 0-38 rather than 1-39)
[encoded] = np.array(tokenizer.texts_to_sequences([raw_text])) - 1
# train_size = dataset_size * 90 // 100
train_size = len(encoded) * 90 // 100
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])


# dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])
# [encoded] = np.array(tokenizer.texts_to_sequences([raw_text])) -1

n_steps = 100
window_length = n_steps + 1 
dataset = dataset.repeat().window(window_length, shift=1, drop_remainder=True)
# dataset = dataset.window(window_length, shift=1, drop_remainder=True)
dataset = dataset.flat_map(lambda window: window.batch(window_length))

np.random.seed(42)
tf.random.set_seed(42)

batch_size = 50
dataset= dataset.shuffle(10000).batch(batch_size)
dataset = dataset.map(lambda windows: (windows[:,:-1], windows[:, 1:]))

dataset = dataset.map(lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))
dataset = dataset.prefetch(1)

In [28]:
for X_batch, Y_batch in dataset.take(1):
    print(X_batch.shape, Y_batch.shape)

(50, 100, 49) (50, 100)


In [29]:
# model = keras.models.Sequential([
#     keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id], dropout=0.2, recurrent_dropout=0.2),
#     keras.layers.GRU(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
#     keras.layers.TimeDistributed(keras.layers.Dense(max_id, activation='softmax')),
# ])


model = keras.models.Sequential([
    keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id],
                     # no dropout in stateful RNN (https://github.com/ageron/handson-ml2/issues/32)
                     # dropout=0.2, recurrent_dropout=0.2,
                     ),
    keras.layers.GRU(128, return_sequences=True,
                     # dropout=0.2, recurrent_dropout=0.2
                    ),
    keras.layers.TimeDistributed(keras.layers.Dense(max_id,
                                                    activation="softmax"))
])

model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")
history = model.fit(dataset, steps_per_epoch=train_size // batch_size, epochs=20)






# model.compile(loss=keras.losses.sparse_categorical_crossentropy, optimizer="adam")


Train for 3550 steps
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [30]:
model_name = "Karinthy_2_GRU_1_TimeDist_Dense_no_dropout"
# !mkdir -p model_name
model.save(model_name)

# new_model = tf.keras.models.load_model('saved_model/my_model')

INFO:tensorflow:Assets written to: Karinthy_2_GRU_1_TimeDist_Dense_no_dropout\assets


In [31]:
# USING THE MODEL TO PRED CHAR
def preprocess(texts):
    X = np.array(tokenizer.texts_to_sequences(texts)) - 1
    return tf.one_hot(X, max_id)

X_new = preprocess(["ej mi a k"])
Y_pred = model.predict_classes(X_new)
tokenizer.sequences_to_texts(Y_pred + 1)[0][-1] # 1st sentence, last char

'i'

In [None]:
# tf.random.set_seed(42)
# tf.random.categorical([[np.log(0.5), np.log(0.4), np.log(0.1)]], num_samples=40).numpy()


In [32]:
def next_char(text, temperature=1):
    X_new = preprocess([text])
    y_proba = model.predict(X_new)[0, -1:, :]
    rescaled_logits = tf.math.log(y_proba) / temperature
    char_id = tf.random.categorical(rescaled_logits, num_samples=1) + 1
    return tokenizer.sequences_to_texts(char_id.numpy())[0]

In [33]:
tf.random.set_seed(42)
next_char("valam", temperature=1)

'i'

In [34]:

def complete_text(text, n_chars=50, temperature=1):
    for _ in range(n_chars):
        text += next_char(text, temperature)
    return text

In [35]:
tf.random.set_seed(42)

# print(complete_text("t", temperature=0.2))
print(complete_text("Zsuzsanna ",n_chars=250 ,temperature=0.8))
# print(complete_text("t", temperature=2))



































Zsuzsanna földrajuk a sárga magaddal ide létsza saradin, hortok nem ruhái telfe bízodéli lomuljam. de minden! zöld és nékül és itthagynem, sóhajzalmod utána eltenő fordítottam belőle s még mintha velőte ő a világon oltól szépen a nyílábna. . sor.   jaj is... k
