In [None]:
from tensorflow import keras
import numpy as np
import tensorflow as tf
from tika import parser
import pickle
import os
import re
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
# ------------------------------------------------------------------
# Environment report: TensorFlow version, visible devices, GPU status
# ------------------------------------------------------------------
print(tf.__version__)

from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

# Query the GPU device name once; an empty string means no GPU was found.
gpu_name = tf.test.gpu_device_name()
if not gpu_name:
    print("Please install GPU version of TF")
else:
    print('Default GPU Device: {}'.format(gpu_name))
# ------------------------------------------------------------------

In [None]:
# inputs
input_folder = 'input/'
# Sorted so that the loading order (and therefore the order of the combined
# corpus and of the training runs) does not depend on the arbitrary ordering
# returned by os.listdir().
files = sorted(os.listdir(input_folder))

# Maps "file name without extension" -> full text of that file.
input_data = {}

for file in files:
    path = os.path.join(input_folder, file)
    # splitext() splits on the LAST dot and returns an empty extension when
    # there is none, unlike str.find('.') which splits on the first dot and
    # returns -1 for dot-less names.
    name, extension = os.path.splitext(file)
    extension = extension.lower()

    if extension == '.pdf':
        content = parser.from_file(path)['content']
    elif extension == '.txt':
        with open(path, "r", encoding="utf8") as f:
            content = f.read()
    else:
        # Skip only this entry (e.g. a sub-folder or an unrelated file)
        # instead of aborting the whole load and dropping the remaining files.
        print("ERROR Unidentified type, skipping:", path)
        continue

    if content is None:
        # NOTE(review): tika appears to return None as 'content' when no text
        # can be extracted from a PDF -- confirm. A None here would break the
        # string concatenation performed on the loaded texts later on.
        print("ERROR No text extracted, skipping:", path)
        continue

    print(path)
    input_data[name] = content


input_data.keys()


In [None]:
# Sanity check: show the first 500 characters of one of the loaded documents.
print(input_data['karinthy_összes'][:500])

In [None]:
# General tokenization pipeline: combine -> filter -> tokenize.
# Step 1: combine every loaded document into a single corpus string.

for name in input_data:
    print(name, "is added")

# Concatenate in dictionary (insertion) order, without any separator.
all_texts = "".join(input_data.values())

# Keep a copy of the raw combined corpus on disk.
with open('all_text.txt', 'w', encoding='utf-8') as out_file:
    out_file.write(all_texts)



In [None]:
# filter

# Characters that are removed from the text entirely.
# The backslash is written as an explicit "\\": the previous "\/" was an
# invalid escape sequence (it only worked because Python keeps unknown
# escapes verbatim, and it triggers a warning on current interpreters).
filter_char = list("()*+0123456789§«»–✝'[]#\\/:…;")
# filter_char.extend(["\n", "\t"])

# Replacement table: each key is the text to insert, each value is the list of
# substrings that get replaced by that key.
# ORDER MATTERS: the entries are applied sequentially, top to bottom, so later
# entries see the result of earlier ones (e.g. '.' -> '. ' turns '...' into
# '. . . ', which the '. . .' entry then folds back into '...').
replace_dict = {'á': ["â","ä"],
                "é":["è", "ë"],
                'í':['î'],
                'ő':['ô'],
                'c':['č'],
                '"' : ['”','„'],
                "'": ['‘','’'],
                ".": ["- - - - - - -- - - - - - -- - - - - - -"],
                "! ":['!'],
                "? ":['?'],
                ", ":[','],
                ". ":['.'],
                "...":['. . .'],
                "  ": ["\n", "\t", "     ", "    ", "   ", "   ", "   ", "   "],
                " ":["  ","  ","  "]}

def filter_text(raw_text, filter_char, replace):
    """Clean a raw text before tokenization.

    Steps, in this order:
      1. remove everything from each 'http' occurrence to the end of its line;
      2. apply the replacement table sequentially (every substring listed in a
         value is replaced by the corresponding key);
      3. delete every character listed in ``filter_char``.

    Args:
        raw_text: the text to clean.
        filter_char: iterable of substrings to delete.
        replace: mapping ``replacement -> list of substrings to replace``;
            applied in the mapping's iteration order.

    Returns:
        The cleaned text.
    """
    cleaned = re.sub(r'http(.*)', '', raw_text)

    # Flatten the table into ordered (old, new) pairs, then apply them one by one.
    substitutions = [(old, new) for new, olds in replace.items() for old in olds]
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)

    for unwanted in filter_char:
        cleaned = cleaned.replace(unwanted, '')

    return cleaned

# Clean the combined corpus.
filtered = filter_text(all_texts, filter_char, replace_dict)

# Compare the lower-cased character inventory before and after cleaning.
chars_before = "".join(sorted(set(all_texts.lower())))
print("BEFORE FILTERING:", chars_before)
print("\n")
chars_after = "".join(sorted(set(filtered.lower())))
print("AFTER FILTERING:\n", chars_after)

# Keep a copy of the cleaned corpus on disk.
with open('filtered_text.txt', 'w', encoding='utf-8') as out_file:
    out_file.write(filtered)


In [None]:
# Character-level tokenizer fitted on the cleaned, combined corpus.
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts([filtered])

# saving
# Create the target folder first: open(..., 'wb') does not create missing
# directories and would fail on a fresh checkout.
os.makedirs('models', exist_ok=True)
with open('models/tokenizer_magyar.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Probe the fitted tokenizer: encode a sample string into character ids.
tokenizer.texts_to_sequences(["First"])

In [None]:
# Probe the fitted tokenizer: encode the sample and decode it back to text.
tokenizer.sequences_to_texts(tokenizer.texts_to_sequences(["First"]))

In [None]:
# Seed NumPy and TensorFlow before building the model and the input pipeline.
np.random.seed(42)
tf.random.set_seed(42)

# Vocabulary size: number of distinct characters known to the fitted tokenizer.
# It determines the one-hot depth, the model's input width and the width of
# the softmax output. These were previously tied to `batch_size`, which is
# only correct when the vocabulary happens to contain at most that many
# characters; ids beyond it would be one-hot encoded as all zeros and would
# be invalid class labels for the loss.
max_id = len(tokenizer.word_index)

batch_size = 52  # number of windows per training batch
n_steps = 100    # length of each input sequence, in characters

# model.save() below writes into this folder.
os.makedirs('models', exist_ok=True)

model = keras.models.Sequential([
    keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id],
                     # no dropout in stateful RNN (https://github.com/ageron/handson-ml2/issues/32)
                     # dropout=0.2, recurrent_dropout=0.2,
                     ),
    keras.layers.GRU(128, return_sequences=True,
                     # dropout=0.2, recurrent_dropout=0.2
                    ),
    keras.layers.TimeDistributed(keras.layers.Dense(max_id, activation="softmax"))
])

model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")


# TOKENIZE THE GIVEN INPUT SET
# NOTE(review): the same `model` instance (and its optimizer state, including
# any learning-rate reduction) is reused for every entry, so each saved file
# contains weights that were also trained on the previous entries. Confirm
# that this sequential fine-tuning is intended; otherwise build and compile a
# fresh model inside the loop.
for poet, poems in input_data.items():
    filtered_input = filter_text(poems, filter_char, replace_dict)
    # Tokenizer ids start at 1; shift them so that they start at 0.
    [encoded] = np.array(tokenizer.texts_to_sequences([filtered_input])) - 1
    # Train on the first 90% of the encoded text.
    train_size = len(encoded) * 90 // 100
    if train_size == 0:
        # Nothing to train on; repeating an empty dataset would never yield a batch.
        print(poet, "is skipped: no training data after filtering")
        continue
    dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

    # Each window holds n_steps inputs plus one extra character, so that the
    # targets can be the inputs shifted by one position.
    window_length = n_steps + 1
    dataset = dataset.repeat().window(window_length, shift=1, drop_remainder=True)
    dataset = dataset.flat_map(lambda window: window.batch(window_length))
    dataset = dataset.shuffle(10000).batch(batch_size)
    # Inputs: all but the last character; targets: all but the first.
    dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))
    dataset = dataset.map(lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))
    dataset = dataset.prefetch(1)

    earlyStopping = EarlyStopping(monitor='loss', patience=10, verbose=0, mode='min')
    mcp_save = ModelCheckpoint(poet + '.mdl_wts.hdf5', save_best_only=True, monitor='loss', mode='min')
    reduce_lr_loss = ReduceLROnPlateau(monitor='loss', factor=0.1, patience=7, verbose=1, min_delta=1e-4, mode='min')
    # The dataset repeats forever, so the epoch length has to be given
    # explicitly; keep it at least 1 for very short texts.
    history = model.fit(dataset,
                        steps_per_epoch=max(1, train_size // batch_size),
                        epochs=50,
                        callbacks=[earlyStopping, mcp_save, reduce_lr_loss])
    model.save('models/' + poet + "_model.h5")

In [None]:
# Encoding the full text (subtract 1 to get IDs from 0-38 rather than 1-39)
# [encoded] = np.array(tokenizer.texts_to_sequences([raw_text])) - 1
# train_size = dataset_size * 90 // 100
# train_size = len(encoded) * 90 // 100
# dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])


# dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])
# [encoded] = np.array(tokenizer.texts_to_sequences([raw_text])) -1

# n_steps = 100
# window_length = n_steps + 1 
# dataset = dataset.repeat().window(window_length, shift=1, drop_remainder=True)
# # dataset = dataset.window(window_length, shift=1, drop_remainder=True)
# dataset = dataset.flat_map(lambda window: window.batch(window_length))

# np.random.seed(42)
# tf.random.set_seed(42)

# batch_size = 50
# dataset= dataset.shuffle(10000).batch(batch_size)
# dataset = dataset.map(lambda windows: (windows[:,:-1], windows[:, 1:]))

# dataset = dataset.map(lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))
# dataset = dataset.prefetch(1)

In [None]:
# model = keras.models.Sequential([
#     keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id], dropout=0.2, recurrent_dropout=0.2),
#     keras.layers.GRU(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
#     keras.layers.TimeDistributed(keras.layers.Dense(max_id, activation='softmax')),
# ])



# history = model.fit(dataset, steps_per_epoch=train_size // batch_size, epochs=10)






# model.compile(loss=keras.losses.sparse_categorical_crossentropy, optimizer="adam")


In [None]:
# history = model.fit(dataset, epochs = 20)

# %load_ext tensorboard
# %tensorboard --logdir=logs

In [None]:
# model_name = "Babits_2_GRU_1_TimeDist_Dense_no_dropout"
# !mkdir -p model_name
# model.save(model_name)

# new_model = tf.keras.models.load_model('saved_model/my_model')

In [None]:
# # USING THE MODEL TO PRED CHAR
# def preprocess(texts):
#     X = np.array(tokenizer.texts_to_sequences(texts)) - 1
#     return tf.one_hot(X, max_id)

# X_new = preprocess(["ej mi a k"])
# Y_pred = model.predict_classes(X_new)
# tokenizer.sequences_to_texts(Y_pred + 1)[0][-1] # 1st sentence, last char

In [None]:
# tf.random.set_seed(42)
# tf.random.categorical([[np.log(0.5), np.log(0.4), np.log(0.1)]], num_samples=40).numpy()


In [None]:
# def next_char(text, temperature=1):
#     X_new = preprocess([text])
#     y_proba = model.predict(X_new)[0, -1:, :]
#     rescaled_logits = tf.math.log(y_proba) / temperature
#     char_id = tf.random.categorical(rescaled_logits, num_samples=1) + 1
#     return tokenizer.sequences_to_texts(char_id.numpy())[0]

In [None]:
# tf.random.set_seed(42)
# next_char("valam", temperature=1)

In [None]:

# def complete_text(text, n_chars=50, temperature=1):
#     for _ in range(n_chars):
#         text += next_char(text, temperature)
#     return text

In [None]:
# tf.random.set_seed(42)

# # print(complete_text("t", temperature=0.2))
# print(complete_text("Magyar ",n_chars=250 ,temperature=0.8))
# # print(complete_text("t", temperature=2))