In [None]:
# IMPORTS
import psutil
import re
import os
import numpy as np
import time
import tensorflow as tf
import datetime
import matplotlib.pyplot as plt
import sys
import keras
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, GRU
from keras.callbacks import ModelCheckpoint
from tensorboard.plugins.hparams import api as hp
from tensorflow.keras.utils import Sequence
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from keras import backend as K
import math
from collections import Counter

In [None]:
#DIRECTORIES
# Input directory whose non-weights files are copied into the working directory by the next cell.
DATASET = "../input/harry-potter-gru-text-generator"
# Plain-text corpus that is read, lower-cased and tokenised for training.
DATA_PATH = "../input/harry-potter-philosophers-stone-preprocessed/Harry_Potter_philosophers_stone.txt"
# Previously saved model that the "training from weights" cell loads with keras.models.load_model.
SAVED_MODEL_PATH = "../input/harry-potter-gru-text-generator/Best_weights.hdf5"
# File that ModelCheckpoint writes to (relative path, i.e. the current working directory).
CHECKPOINT_PATH = "Best_weights.hdf5"

In [None]:
#Copy file from Input to Output(to easier create a new dataset with updated weights)
for file in os.listdir(DATASET):
    if file.endswith('hdf5') == False:
        path = os.path.join(DATASET, file)
        !cp -r $path ./

In [None]:
#Load the file
# Read the whole corpus and lower-case it. The context manager closes the file
# handle as soon as the read finishes instead of leaving it open until garbage
# collection.
with open(DATA_PATH, "r", encoding="utf-8") as corpus_file:
    text = corpus_file.read().lower()
# Whitespace tokenisation: punctuation stays attached to the neighbouring word.
words = text.split()

In [None]:
#trying to remove sentences with low usage words. OPTIONAL
# (word, count) pairs ordered from most to least frequent.
words_unique = Counter(words).most_common()
# word -> count, same order as words_unique.
dictionary = dict(words_unique)
dict_values = list(dictionary.values())

# Rare words: seen at most 3 times and at least 3 characters long.
# Iterating the (word, count) pairs directly avoids rebuilding the key list for
# every element, which made the previous formulation quadratic in vocabulary size.
words_remove = [word for word, count in words_unique if count <= 3 and len(word) >= 3]
print(len(words_remove))

# Split into sentences on . ! ? and drop newline characters inside each sentence.
sentences = re.split('[.!?]', text)
sent = [re.sub('[\n]', '', sentence) for sentence in sentences]
# Keep only sentences that contain none of the rare words.
# NOTE(review): `word in sentence` is a substring test on the sentence string, so a
# rare word also matches inside longer words — confirm this is intended.
sentences = [sentence for sentence in sent if not any(word in sentence for word in words_remove)]
print(len(sentences))

In [None]:
#text preprocessing
# Sentence terminators that are stripped from the corpus below.
endings  = ('.', '!', '?')

# This cell used to start with a loop that rewrote `words` in place (splitting
# . ! ? into separate tokens while inserting into the list it was iterating over).
# `words` is rebuilt from `text` at the end of this cell, so that loop never
# affected the result; it has been removed. The tokens produced here are unchanged.

# Cut the corpus at every . ! ? and delete newline characters from each piece.
sentences = re.split('[.!?]', text)
sent = [re.sub('[\n]', '', sentence) for sentence in sentences]
# NOTE(review): newlines are deleted without inserting a space and the pieces are
# joined with an empty separator, so two words separated only by a line break (or
# by a terminator directly followed by a line break) end up fused into one token —
# verify against the layout of the corpus file.
new_text = ''.join(sent)
new_text = re.sub('  ', ' ', new_text)
# Final token list used by every later cell: whitespace-separated, no . ! ? characters.
words = new_text.split()

In [None]:
#Plotting the occurrence count of every unique word (most frequent first)
print(len(words_unique))
fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(dict_values)
plt.show()

In [None]:
#DATA PREPARATION
# Number of consecutive words in each input window; the word that follows is the target.
LEN_SEQUENCE = 5
# Total number of tokens in the preprocessed corpus.
WORDS_IN_TEXT = len(words)
# Sorted vocabulary; a word's position in this list is used as its integer id.
ALL_WORDS = sorted(set(words))

#Creating generator
class data_sequence(Sequence):
    """Serve pre-built input/target collections to Keras in fixed-size batches."""

    def __init__(self, x_set, y_set, batch_size):
        """Keep references to the full inputs, the full targets and the batch size."""
        self.x = x_set
        self.y = y_set
        self.batch_size = batch_size

    def __len__(self):
        """Return the number of batches; the final batch may be shorter than the rest."""
        return math.ceil(len(self.x) / self.batch_size)

    def __getitem__(self, idx):
        """Return batch number ``idx`` as an ``(inputs, targets)`` pair."""
        first = idx * self.batch_size
        last = first + self.batch_size
        return self.x[first:last], self.y[first:last]

# Vocabulary lookups in both directions: word -> id and id -> word.
word_to_int = {word: i for i, word in enumerate(ALL_WORDS)}
int_to_word = dict(enumerate(ALL_WORDS))

# NOTE(review): `perc` and `j` are not read by any of the visible cells.
perc = list(range(10, 101, 10))
j = 0

# Slide a LEN_SEQUENCE-wide window over the corpus one word at a time:
# X holds the ids inside each window, y the id of the word right after it.
window_starts = range(WORDS_IN_TEXT - LEN_SEQUENCE)
X = [[word_to_int[c] for c in words[i:i + LEN_SEQUENCE]] for i in window_starts]
y = [word_to_int[words[i + LEN_SEQUENCE]] for i in window_starts]

patterns = len(X)
# Inputs become (patterns, LEN_SEQUENCE, 1) holding raw word ids; targets are one-hot rows.
X_data = np.reshape(X, (patterns, LEN_SEQUENCE, 1))
y_data = np_utils.to_categorical(y)

#Data split
# Despite its name, `val_samples` is the index where the validation part begins:
# the first 80% of the windows are used for training, the rest for validation.
val_samples = int(patterns * 0.80)
X_train, X_test = X_data[:val_samples], X_data[val_samples:]
Y_train, Y_test = y_data[:val_samples], y_data[val_samples:]

In [None]:
#TRAINING PIPELINE (from weights)

#PARAMETERS
# UNITS and DROPOUT only label the TensorBoard run in this cell; the network
# itself comes from the saved model file.
UNITS = 512
DROPOUT = 0.2
BATCH = 64
EPOCH = 1

#LOAD MODEL (from dataset) and batch the complete data set (no validation split here)
model = keras.models.load_model(SAVED_MODEL_PATH)
sequence = data_sequence(X_data, y_data, BATCH)

#CHECKPOINT INITIALIZATION
# Writes to CHECKPOINT_PATH only when the training loss improves.
checkpoint = ModelCheckpoint(
    CHECKPOINT_PATH,
    monitor='loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)

#STOP TRAINING IF NO IMPROVE
#early_stop = tf.keras.callbacks.EarlyStopping("loss", patience=2, restore_best_weights=True)

# One TensorBoard run directory per launch, named after the timestamp and the parameters above.
tensorboard_log = tf.keras.callbacks.TensorBoard(
    f"./logs/{datetime.datetime.now().strftime('%d-%m-%y-%H:%M')}-L:{LEN_SEQUENCE}_U:{UNITS}_D:{DROPOUT}_B:{BATCH}",
    histogram_freq=1,
)

#CHANGE lr
K.set_value(model.optimizer.learning_rate, 0.0001)

# START TRAINING
resume_callbacks = [checkpoint, tensorboard_log]
model.fit(sequence, epochs=EPOCH, callbacks=resume_callbacks)

In [None]:
#TRAIN FROM SCRATCH

#PARAMETERS
UNITS = 512
DROPOUT = 0.5
BATCH = 64
EPOCH = 100

#CREATE MODEL
# Two stacked GRU layers, each followed by dropout, then a softmax with one
# output per column of the one-hot targets.
model = Sequential([
    GRU(UNITS, input_shape=(X_data.shape[1], X_data.shape[2]), return_sequences=True),
    Dropout(DROPOUT),
    GRU(UNITS),
    Dropout(DROPOUT),
    Dense(y_data.shape[1], activation="softmax"),
])
model.compile(loss="categorical_crossentropy", optimizer="adam")

#CHECKPOINT INITIALIZATION
# The checkpoint tracks the training loss ('loss'), not the validation loss,
# even though validation data is passed to fit() below.
checkpoint = ModelCheckpoint(
    CHECKPOINT_PATH,
    monitor='loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)

# Batch generators for the training part and the held-out part of the data.
train_sequence = data_sequence(X_train, Y_train, BATCH)
test_sequence = data_sequence(X_test, Y_test, BATCH)
tensorboard_log = tf.keras.callbacks.TensorBoard(
    f"./logs/{datetime.datetime.now().strftime('%d-%m-%y-%H:%M')}-L:{LEN_SEQUENCE}_U:{UNITS}_D:{DROPOUT}_B:{BATCH}",
    histogram_freq=1,
)
model.fit(
    train_sequence,
    epochs=EPOCH,
    callbacks=[checkpoint, tensorboard_log],
    validation_data=test_sequence,
)

In [None]:
#parameter tuning with tensorboard

#PARAMETERS
# Epochs trained per hyper-parameter combination.
EPOCHS = 5
# Grid values explored by the tuning loop.
HP_NUM_UNITS = hp.HParam('num_units', hp.Discrete([256, 512]))
HP_DROPOUT = hp.HParam('dropout', hp.Discrete([0.1, 0.2]))
HP_BATCH = hp.HParam('batch', hp.Discrete([32, 64]))
# Declared without a domain: these two are recorded per run rather than searched over.
HP_TIME = hp.HParam('time')
HP_SEQUENCE = hp.HParam('Len_sequence')

#HP_OPTIMIZER = hp.HParam('optimizer', hp.Discrete(['adam', 'sgd']))

#Directory to save logs
log_dir = "./logs/"

# Register the hyper-parameters and the reported metric once, at the root of the log directory.
config_writer = tf.summary.create_file_writer(log_dir)
with config_writer.as_default():
    hp.hparams_config(
        hparams=[HP_NUM_UNITS, HP_DROPOUT, HP_BATCH, HP_TIME, HP_SEQUENCE],
        metrics=[hp.Metric(tag="categorical_crossentropy", display_name="Loss")],
    )
        
    
#defining model with changing parameters
def train_test_model(hparams, directory):
    """Build and train one model for a single hyper-parameter combination.

    Args:
        hparams: mapping that contains values for HP_NUM_UNITS, HP_DROPOUT and HP_BATCH.
        directory: prefix of the TensorBoard log directory for this run; the
            batch size is appended after a colon.

    Returns:
        A ``(history, duration)`` tuple: the object returned by ``model.fit`` and
        the wall-clock seconds spent inside ``fit``.
    """
    units = hparams[HP_NUM_UNITS]
    dropout = hparams[HP_DROPOUT]
    # Taken from `hparams` rather than from a global `batch_size`, so the function
    # no longer depends on a loop variable leaking from the calling cell.
    batch = hparams[HP_BATCH]

    # Two stacked LSTM layers, each followed by dropout, then a softmax over the target columns.
    model = Sequential()
    model.add(LSTM(units, input_shape=(X_data.shape[1], X_data.shape[2]), return_sequences=True))
    model.add(Dropout(dropout))
    model.add(LSTM(units))
    model.add(Dropout(dropout))
    model.add(Dense(y_data.shape[1], activation="softmax"))
    model.compile(
        optimizer="adam",
        loss='categorical_crossentropy'
    )

    # Trains on the complete data set; no validation split is used during tuning.
    sequence = data_sequence(X_data, y_data, batch)
    start_time = time.time()
    history = model.fit(sequence, epochs=EPOCHS, callbacks=[tf.keras.callbacks.TensorBoard(f"{directory}:{batch}", histogram_freq=1)])
    duration = time.time() - start_time
    return history, duration

def run_experiment(directory, hparams):
    """Train one configuration and log its hyper-parameters and per-epoch loss.

    ``hparams`` is extended in place with the average seconds per epoch under
    "time" and the window length under "Len_sequence" before being logged.
    """
    writer = tf.summary.create_file_writer(directory)
    with writer.as_default():
        hist, elapsed = train_test_model(hparams, directory)
        hparams["time"] = elapsed / EPOCHS
        hparams["Len_sequence"] = LEN_SEQUENCE
        hp.hparams(hparams)
        # One scalar per epoch so the loss curve is available for this run.
        for step, loss in enumerate(hist.history["loss"]):
            tf.summary.scalar("categorical_crossentropy", loss, step=step)

# Running index used to name each trial's log directory.
session_num = 0

#training loop
# Full grid search over units x dropout x batch size.
for num_units in HP_NUM_UNITS.domain.values:
    for dropout_rate in HP_DROPOUT.domain.values:
        # The loop variable keeps the name `batch_size`: it stays visible
        # module-wide while the trials run.
        for batch_size in HP_BATCH.domain.values:
            hparams = {HP_NUM_UNITS: num_units, HP_DROPOUT: dropout_rate, HP_BATCH: batch_size}
            run_name = "/run-%d" % session_num
            print('--- Starting trial: %s' % run_name)
            print({h.name: value for h, value in hparams.items()})
            run_experiment(log_dir+run_name, hparams)

            session_num += 1

In [None]:
#compress folder to zip file
import shutil
# Packs everything under ./logs into a zip archive with base name "tensorboard"
# in the working directory.
shutil.make_archive("tensorboard", 'zip', "./logs")

In [None]:
#GENERATE TEXT

#Load model
#SAVED_MODEL_PATH = '../input/harry-potter-gru-text-generator/Best_weights.hdf5'
#model = keras.models.load_model(SAVED_MODEL_PATH)
#model.compile(loss='categorical_crossentropy', optimizer='adam')

# Function which makes our output more random
def temp_index(preds, temperature=1.0):
    """Sample a class index from a probability vector after temperature scaling.

    Args:
        preds: probabilities, either a 1-D vector or a 2-D array whose first
            row is used (the shape returned by ``model.predict`` for one sample).
        temperature: values below 1 sharpen the distribution, values above 1
            flatten it. ``temperature <= 0`` selects the most likely index
            deterministically.

    Returns:
        The sampled index into the probability vector.
    """
    preds = np.asarray(preds, dtype='float64')
    probs = preds[0] if preds.ndim > 1 else preds

    # Zero temperature is the greedy limit; dividing by it would yield NaNs.
    if temperature <= 0:
        return np.argmax(probs)

    # log(0) = -inf is fine here (it becomes probability 0 again); silence the warning.
    with np.errstate(divide='ignore'):
        logits = np.log(probs) / temperature
    # Shift by the maximum so that exp() cannot underflow every entry to zero at
    # low temperatures, which previously produced 0/0 = NaN probabilities.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    probs = exp_preds / np.sum(exp_preds)

    # One multinomial draw gives a one-hot vector; argmax recovers the drawn index.
    probas = np.random.multinomial(1, probs, 1)
    return np.argmax(probas)

# Index of the training window used as the seed text. (A random index used to be
# drawn here and was immediately overwritten by this fixed value.)
start = 70
# Work on a copy: appending to X[start] itself would permanently lengthen that
# training window and break a later reshape of X.
pattern = list(X[start])
pred_text = []
print("Seed:")
print("\"", ' '.join([int_to_word[value] for value in pattern]), "\"")
print("")
temperature = 1 #LOWER VALUE-MORE GENERIC/REPETITIVE/PREDICTIVE TEXT : HIGHER VALUE-MORE SURPRISING/UNPREDICTABLE TEXT
for _ in range(120):
    x = np.reshape(pattern, (1, len(pattern), 1))
    # NOTE(review): the input is scaled by the vocabulary size here, while X_data is
    # built from raw word ids without this scaling — confirm which convention the
    # loaded weights were trained with and make the two agree.
    x = x / float(len(ALL_WORDS))
    prediction = model.predict(x, verbose=0)
    index = temp_index(prediction, temperature)
    pred_text.append(int_to_word[index])
    # Slide the window: drop the oldest word id and append the newly generated one.
    pattern = pattern[1:] + [index]
print(' '.join(pred_text))
print("\nDone.")

In [None]:
#change LR
# Same import as in the imports cell; repeated so this cell can be run on its own.
from keras import backend as K
# Show the optimizer's learning rate before the change.
print(model.optimizer.lr)
# Overwrite the learning-rate value of the existing optimizer.
K.set_value(model.optimizer.learning_rate, 0.0001)
# Last expression of the cell: displays the learning rate after the change.
# NOTE(review): both `lr` and `learning_rate` are used in this cell — presumably
# aliases of the same variable; confirm for the installed Keras version.
model.optimizer.lr