# Text Generation with LSTM

In [None]:
# !pip install -r requirements.txt

In [None]:
import os
import uuid
import warnings
from distutils.version import LooseVersion

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

from tensorflow.keras import layers, models
from tensorflow.keras import optimizers

from elephas.utils.rdd_utils import to_simple_rdd
from elephas.spark_model import SparkModel

# import horovod.spark.keras as hvd
# from horovod.spark.common.store import Store

import functions as f
from Text import *
from LSTM_class import *

In [None]:
# ========================= #
# CODE GENERATION WITH LSTM #
# ========================= #

# ignore warnings
warnings.filterwarnings('ignore') # ignorer les signes lol !
warnings.simplefilter(action='ignore',  category=FutureWarning) # cacher les alerts (les ignorer)

In [None]:
# Seed the random number generators so that runs are reproducible.
# NOTE(review): the numpy seed only helps if the data generator shuffles with
# numpy's global RNG - confirm against TextDataGenerator.
np.random.seed(2)
tf.random.set_seed(2)

# Spark configuration: local master on all cores, with GPU resources requested
# for the executor and for each task.
conf = SparkConf()
conf.setAppName('NLG_with_LSTM').setMaster('local[*]')
conf.set("spark.executor.resource.gpu.amount", '4')
conf.set("spark.task.resource.gpu.amount", '1')
conf.set("spark.sql.shuffle.partitions", '16')
conf.set("spark.driver.memory", "4g")

# getOrCreate() returns the already-running context when this cell is executed
# again; constructing SparkContext(conf=conf) a second time raises
# "Cannot run multiple SparkContexts at once".
# Note: despite its name, `spark` is a SparkContext, not a SparkSession.
spark = SparkContext.getOrCreate(conf=conf)


In [None]:
# Show the SparkContext created above (a bare last expression is rendered as the cell output).
spark

## Text preprocessing 
This step creates the merged text corpus and the vocabulary, both of which are essential for the final prediction.

In [None]:
# Read the set of training files through the project helper.
# NOTE(review): presumably read_dir() returns the files' content as one sized object
# (its length is printed later as a character count) - confirm in functions.py.
input_train = f.read_dir()

In [None]:
# Windowing parameters used to cut the token stream into training sequences:
# every sequence holds `max_len` words and consecutive sequences start `step` word(s) apart.
max_len, step = 2, 1

The text is split into sequences of length 2 (the `max_len` parameter) with a step of 1. The first sequence of 2 words therefore starts at the first word (index 0), and the second sequence starts 1 word later, i.e. at the second word (index 1).

In [None]:
# Report the size of the raw input (the printed label is French for "total characters").
print("Total des caracteres", len(input_train))
# Wrap the raw input in the project's Text class and print its token statistics.
text_train = Text(input_train)
text_train.tokens_info()

# Build the training sequences from the text using the window length and step defined above,
# then print their statistics.
seq_train = Sequences(text_train, max_len, step)
seq_train.sequences_info()

In [None]:
# Peek at the first ten tokens and the matching entries of tokens_ind
# (presumably their vocabulary indices - confirm in the Text class).
print(text_train.tokens[:10])
print(text_train.tokens_ind[:10])

# Show the first two sequences as an array (rendered as the cell output).
np.array(seq_train.sequences[:2])

`TextDataGenerator` is a Python generator that outputs batches of data (sequences and their corresponding next words).
Since the vocabulary size is over 800K, it is impossible to fit all the data in memory, which is why a **batch generator** is extremely useful.

In [None]:
# Training hyper-parameters.
batch_size = 4096  # number of sequences per batch used during fit
layer_size = 64    # number of units in the recurrent layer
nb_epoch = 2       # number of training epochs

# Keyword arguments shared by the data generators.
params = dict(
    sequence_length=max_len,
    vocab_size=len(text_train),
    batch_size=batch_size,
    shuffle=True,
)

# Batch generator over the training sequences and their next words.
train_generator = TextDataGenerator(
    spark=spark,
    sequences=seq_train.sequences,
    next_words=seq_train.next_words,
    **params
)

---

## Training the LSTM model

We'll build a simple model with one LSTM layer, dropout and dense layer with softmax activation (to return word probabilities).

In [None]:
def lstm_model(sequence_length, vocab_size, layer_size, embedding=False, dropout_rate=0.5):
    """Build the (uncompiled) next-word prediction network.

    Args:
        sequence_length: Number of words in each input sequence.
        vocab_size: Size of the vocabulary; also the width of the output layer.
        layer_size: Number of LSTM units (and the embedding dimension when
            ``embedding`` is True).
        embedding: When True, the network starts with an Embedding layer followed
            by a bidirectional LSTM. Otherwise a single LSTM reads inputs of shape
            ``(sequence_length, vocab_size)``.
        dropout_rate: Rate of the Dropout layer placed after the recurrent layer.
            Defaults to 0.5, the value that was previously hard-coded.

    Returns:
        A ``Sequential`` model whose last layer is a softmax over the vocabulary.
    """
    model = models.Sequential()
    if embedding:
        model.add(layers.Embedding(vocab_size, layer_size))
        model.add(layers.Bidirectional(layers.LSTM(layer_size)))
    else:
        model.add(layers.LSTM(layer_size, input_shape=(sequence_length, vocab_size)))
    model.add(layers.Dropout(dropout_rate))
    # The output must be a probability distribution over the vocabulary so that
    # the next word can be sampled from it; the previous 'relu' activation
    # produced unnormalised, possibly all-zero scores.
    model.add(layers.Dense(vocab_size, activation='softmax'))
    return model

In [None]:
# Build the network that takes one-hot encoded sequences (no embedding layer).
model = lstm_model(
    sequence_length=max_len,
    vocab_size=len(text_train),
    layer_size=layer_size,
)

# Optimizer: Adam with a 0.01 learning rate (alternatives kept for reference).
# optimizer = optimizers.Adamax(learning_rate=0.01)
# optimizer = optimizers.RMSprop(learning_rate=0.01)
optimizer = optimizers.Adam(learning_rate=0.01)

# Loss function: mean squared error.
# NOTE(review): categorical cross-entropy is the usual loss for next-word
# classification - confirm that MSE is intended here.
loss = tf.keras.losses.mean_squared_error

# Compile the model and track accuracy during training.
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

In [None]:
# Print a per-layer recap of the model's parameters.
model.summary()

In [None]:
# Hide the GPUs from TensorFlow to avoid running out of GPU memory.
# NOTE(review): the model was already built in an earlier cell, so TensorFlow may
# have initialised its devices before this runs - confirm the setting still takes effect.
if LooseVersion(tf.__version__) >= LooseVersion('2.0.0'):
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
else:
    # TF 1.x: install a session that exposes zero GPU devices. Go through
    # `tf.keras` because a bare `keras` name is not imported in the import cell.
    tf.keras.backend.set_session(tf.Session(config=tf.ConfigProto(device_count={'GPU': 0})))

In [None]:
# Distributed training with Spark + elephas (disabled)
# spark_model = SparkModel(model, frequency='epoch', mode='asynchronous')

# Run the fit on all the data
# spark_model.fit(train_generator.generate_rdds(), epochs=nb_epoch, batch_size=batch_size, verbose=1, validation_split=0.1)

# Run the fit on a single batch
# spark_model.fit(train_generator.generate_1_rdd(index=0), epochs=nb_epoch, batch_size=batch_size, verbose=1, validation_split=0.1)

---

In [None]:
# Train the LSTM + dropout network directly with Keras.
# The generator was created with its own batch size and already yields whole
# batches; Keras documents that `batch_size` must not be specified for
# generator / Sequence inputs, so it is not forwarded to fit().
model.fit(train_generator, steps_per_epoch=len(train_generator), epochs=nb_epoch, verbose=1)

In [None]:
# Save the model (disabled)
# model.save('data/out/lstm_model_simple')
#f.save_pickle(model, 'data/pkl/lstm_model_simple')

# Load a previously saved model.
# NOTE(review): this replaces the model trained in the previous cell while the save
# calls above are commented out - confirm the file on disk is the intended model.
model = models.load_model('data/out/lstm_model_simple')
# model = f.load_pickle('data/out/lstm_model_simple')

---

## Text generation with LSTM model

Generating text with LSTM model requires building the prediction loop which starts with choosing a prefix and setting the number of words to generate. Then we need to predict the next word using our LSTM model and use this word as part of the prefix for the next model input. The loop is executed until the expected number of words is generated.

In [None]:
# Take the token <-> index mappings from the training text so they can be passed to Text below.
token2ind, ind2token = text_train.token2ind, text_train.ind2token
# Initial sequence (the prefix the generation starts from).
input_prefix = """
    from tensorflow.python.framework import dtypes
    from tensorflow.python.framework import ops
"""
# Tokenize the initial sequence, passing the training mappings.
text_prefix = Text(input_prefix, token2ind, ind2token)

In [None]:
# Predictor built from the first model and the tokenized initial sequence.
pred = ModelPredict(model, text_prefix, token2ind, ind2token, max_len)

In [None]:
# Temperatures to compare when generating text.
temperatures = [1, 0.7, 0.4, 0.1]

for temperature in temperatures:
    print('temperature:', temperature)
    # Generate 50 items with the current temperature, then show them.
    generated = pred.generate_sequence(50, temperature=temperature)
    print(generated)
    print('\n')

---

## Text generation with LSTM model with Embedding layer

The previous model took sequences of words represented as one-hot vectors as input. In this second approach, we feed word indices to the model and train an Embedding layer, which learns the word representations.

In [None]:
# Same generator settings as defined earlier, with the `embedding` flag switched on.
# NOTE(review): the original comment ties this flag to the generator's use of
# to_categorical() - confirm in TextDataGenerator.
params_emb = {**params, 'embedding': True}

train_generator_emb = TextDataGenerator(
    spark,
    seq_train.sequences,
    seq_train.next_words,
    **params_emb
)

In [None]:
# Second network: an Embedding layer followed by a bidirectional LSTM.
model_emb = lstm_model(
    sequence_length=max_len,
    vocab_size=len(text_train),
    layer_size=layer_size,
    embedding=True,
)

# Optimizer: Adam with a 0.01 learning rate (alternatives kept for reference).
# optimizer = optimizers.Adamax(learning_rate=0.01)
# optimizer = optimizers.RMSprop(learning_rate=0.01)
optimizer = optimizers.Adam(learning_rate=0.01)

# Loss function: mean squared error.
loss = tf.keras.losses.mean_squared_error

# Alternative compile call kept for reference:
# model_emb.compile(loss='BinaryCrossentropy', optimizer=optimizer, metrics=['accuracy'])
model_emb.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

In [None]:
# Print a per-layer recap of the embedding model's parameters.
model_emb.summary()

In [None]:
# Wrap the embedding model for distributed training with Spark + elephas.
spark_model = SparkModel(model_emb, frequency='epoch', mode='asynchronous')

# Run the fit on all the data (disabled)
# NOTE(review): the disabled calls below reference train_generator, not
# train_generator_emb - check before re-enabling them.
# spark_model.fit(train_generator.generate_rdds(), epochs=nb_epoch, batch_size=batch_size, verbose=1, validation_split=0.1)

# Run the fit on a single batch (disabled)
# spark_model.fit(train_generator.__getitem__(0), epochs=nb_epoch, batch_size=batch_size, verbose=1, validation_split=0.1)

In [None]:
# Train the Embedding + LSTM network with Keras (disabled)
# model_emb.fit(train_generator_emb, batch_size=batch_size, steps_per_epoch=len(train_generator_emb), epochs=nb_epoch, verbose=1)

In [None]:
# Save the model (disabled)
# model_emb.save('data/out/lstm_model_emb')
# f.save_pickle(model_emb, 'data/out/lstm_model_emb')

# Load a previously saved model (disabled)
# model_emb = models.load_model('data/out/lstm_model_emb')
# model_emb = f.load_pickle('data/out/lstm_model_emb')

In [None]:
# Same prefix preparation as in the earlier generation section: take the
# token <-> index mappings from the training text.
token2ind, ind2token = text_train.token2ind, text_train.ind2token
# Initial sequence (the prefix the generation starts from).
input_prefix = """
    from tensorflow.python.framework import dtypes
    from tensorflow.python.framework import ops
"""
# Tokenize the initial sequence, passing the training mappings.
text_prefix = Text(input_prefix, token2ind, ind2token)

In [None]:
# Predictor for the embedding model, built from the same initial sequence.
# NOTE(review): the fit and load_model calls for model_emb are commented out in the
# cells above, so unless one of them is re-enabled the model used here is untrained.
pred_emb = ModelPredict(model_emb, text_prefix, token2ind, ind2token, max_len, embedding=True)

In [None]:
# Temperatures to compare when generating text with the embedding model.
temperatures = [1, 0.7, 0.4, 0.1]

for temperature in temperatures:
    print('temperature:', temperature)
    # Use the embedding-model predictor: the previous code called `pred`, which
    # generated text from the first (one-hot) model again.
    print(pred_emb.generate_sequence(50, temperature=temperature))
    print('\n')

---