# Text Generation with LSTM

In [1]:
# !pip install -r requirements.txt

In [2]:
import warnings, os, uuid
from distutils.version import LooseVersion

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

from tensorflow.keras import layers, models
from tensorflow.keras import optimizers

from elephas.utils.rdd_utils import to_simple_rdd
from elephas.spark_model import SparkModel

# import horovod.spark.keras as hvd
# from horovod.spark.common.store import Store

import functions as f
from Text import *
from LSTM_class import *

In [3]:
# ========================= #
# CODE GENERATION WITH LSTM #
# ========================= #

# ignore warnings
warnings.filterwarnings('ignore') # ignorer les signes lol !
warnings.simplefilter(action='ignore',  category=FutureWarning) # cacher les alerts (les ignorer)

In [4]:
# set seeds for reproducibility
tf.random.set_seed(2)

# Initialise Spark; GPU resources are requested through the Spark configuration.
conf = SparkConf()
conf.setAppName('NLG_with_LSTM').setMaster('local[*]')
conf.set("spark.executor.resource.gpu.amount", '4')
conf.set("spark.task.resource.gpu.amount", '1')
conf.set("spark.sql.shuffle.partitions", '16')
conf.set("spark.driver.memory", "4g")

# getOrCreate() keeps this cell re-runnable: calling SparkContext(conf=conf) a
# second time in the same kernel fails because a context is already active.
# NOTE: despite its name, `spark` is a SparkContext, not a SparkSession.
spark = SparkContext.getOrCreate(conf=conf)


In [5]:
spark

## Text preprocessing 
Creates the merged text and the vocabulary that are essential for the final prediction.

In [6]:
# read the set of input files
input_train = f.read_dir()

In [7]:
max_len = 2 # sequence length, in words
step = 1 # stride: advance by 1 word from one sequence to the next

The text is split into sequences of length 2 (the `max_len` parameter) with a step of 1. The first sequence of 2 words starts at the first word (index 0), and the second sequence starts 1 word later, i.e. at the 2nd word (index 1).

In [8]:
# report the size of the raw input
print("Total des caracteres", len(input_train))
# wrap the raw input in a Text object and print its token statistics
text_train = Text(input_train)
text_train.tokens_info()

# cut the token stream into sequences of `max_len` tokens, advancing by `step`
seq_train = Sequences(text_train, max_len, step)
seq_train.sequences_info()

Total des caracteres 5568656
total tokens: 442748, distinct tokens: 88118
number of sequences of length 2: 442746


In [9]:
# first 10 tokens, followed by their integer indices
print(text_train.tokens[:10])
print(text_train.tokens_ind[:10])

# first two sequences, shown as an array of token indices
np.array(seq_train.sequences[:2])

['import', 'numpy', 'as', 'np', 'from', 'tensorflow.python.framework', 'import', 'dtypes', 'from', 'tensorflow.python.framework']
[3545, 65639, 58014, 81302, 26162, 47942, 3545, 9652, 26162, 47942]


array([[ 3545, 65639],
       [65639, 58014]])

TextDataGenerator is a Python generator that outputs batches of data (sequences and their corresponding next words). 
Since the vocabulary size is over 88K (88,118 distinct tokens), it is impossible to fit all the one-hot encoded data in memory, which is why a **batch generator** is extremely useful.

In [10]:
batch_size = 4096 # number of sequences per batch during fit
layer_size = 64 # number of neurons
nb_epoch = 10 # number of training epochs

# keyword arguments shared by the data generators
params = {
    'sequence_length': max_len,
    'vocab_size': len(text_train),
    'batch_size': batch_size,
    'shuffle': True
}

# NOTE(review): the generator below is commented out, yet a later cell calls
# model.fit(train_generator, ...) — re-enable it before running that cell.
# train_generator = TextDataGenerator(spark=spark, sequences=seq_train.sequences, next_words=seq_train.next_words, **params)

---

## Training the LSTM model

We'll build a simple model with one LSTM layer, dropout and dense layer with softmax activation (to return word probabilities).

In [11]:
# factory for the next-word prediction model
def lstm_model(sequence_length, vocab_size, layer_size, embedding=False):
    """Build the (uncompiled) next-word prediction network.

    Parameters
    ----------
    sequence_length : int
        Number of tokens per input sequence; only used for the input shape
        of the non-embedding variant.
    vocab_size : int
        Number of distinct tokens. Sets the width of the output layer, the
        last input dimension (non-embedding variant) and the embedding input
        dimension (embedding variant).
    layer_size : int
        Number of LSTM units, also used as the embedding dimension.
    embedding : bool, default False
        False: LSTM over inputs of shape (sequence_length, vocab_size) -> Dropout.
        True:  Embedding -> Bidirectional(LSTM) -> Dropout.

    Returns
    -------
    A ``models.Sequential`` instance. Compiling it is left to the caller.
    """
    model = models.Sequential()
    if embedding:
        model.add(layers.Embedding(vocab_size, layer_size))
        model.add(layers.Bidirectional(layers.LSTM(layer_size)))
        model.add(layers.Dropout(0.5))
    else:
        model.add(layers.LSTM(layer_size, input_shape=(sequence_length, vocab_size)))
        model.add(layers.Dropout(0.5))
    # Softmax (not relu) so the output is a probability distribution over the
    # vocabulary: relu outputs are unbounded, do not sum to 1 and can be all zero.
    model.add(layers.Dense(vocab_size, activation='softmax'))
    return model

In [10]:
# build the model (one-hot input variant)
model = lstm_model(sequence_length=max_len, vocab_size=len(text_train), layer_size=layer_size)

# optimizer setup (every candidate is currently disabled)
# optimizer = optimizers.Adamax(learning_rate=0.01)
# optimizer = optimizers.RMSprop(learning_rate=0.01)
# optimizer = optimizers.Adam(learning_rate=0.01)

# loss function setup (currently disabled)
# loss = tf.keras.losses.mean_squared_error

# compile our model (currently disabled, so `model` is left uncompiled by this cell)
# model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])

In [11]:
# print a per-layer summary of the parameters
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 64)                22574848  
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense (Dense)               (None, 88118)             5727670   
                                                                 
Total params: 28,302,518
Trainable params: 28,302,518
Non-trainable params: 0
_________________________________________________________________


In [12]:
# disable the GPU while building the model to avoid running out of memory
# NOTE(review): `model` has already been built by an earlier cell at this point —
# confirm that this cell still has the intended effect where it sits.
if LooseVersion(tf.__version__) >= LooseVersion('2.0.0'):
    # TensorFlow 2.x: hide every CUDA device through the environment
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
else:
    # TensorFlow 1.x: install a session configured with zero GPU devices
    # NOTE(review): `keras` is not imported explicitly in the import cell — presumably
    # it comes from one of the star imports; verify before relying on this branch.
    keras.backend.set_session(tf.Session(config=tf.ConfigProto(device_count={'GPU': 0})))

In [None]:
# fit le model (train) spark - elephas
# spark_model = SparkModel(model, frequency='epoch', mode='asynchronous')

# lancement du fit avec toutes les données
# spark_model.fit(train_generator.generate_rdds(), epochs=nb_epoch, batch_size=batch_size, verbose=1, validation_split=0.1)

# lancement du fit avec un seil batch
# spark_model.fit(train_generator.generate_1_rdd(index=0), epochs=nb_epoch, batch_size=batch_size, verbose=1, validation_split=0.1)

---

In [None]:
# train the model (LSTM and dropout layers) with Keras
# NOTE(review): `train_generator` is only defined in a commented-out line earlier in the
# notebook, and the matching `model.compile(...)` call is commented out as well —
# re-enable both before running this cell.
model.fit(train_generator, batch_size=batch_size, steps_per_epoch=len(train_generator), epochs=nb_epoch, verbose=1)

In [12]:
# save the model (disabled)
# model.save('data/out/lstm_model_simple')
#f.save_pickle(model, 'data/pkl/lstm_model_simple')

# load a previously saved model; this replaces the `model` built above
model = models.load_model('data/out/lstm_model_simple')
# model = f.load_pickle('data/out/lstm_model_simple')

---

## Text generation with LSTM model

Generating text with LSTM model requires building the prediction loop which starts with choosing a prefix and setting the number of words to generate. Then we need to predict the next word using our LSTM model and use this word as part of the prefix for the next model input. The loop is executed until the expected number of words is generated.

In [16]:
# vocabulary lookups taken from the training text
token2ind = text_train.token2ind
ind2token = text_train.ind2token

# seed text the generation starts from
input_prefix = """
    from tensorflow.python.framework import dtypes
"""

# tokenize the seed text with the training vocabulary
text_prefix = Text(input_prefix, token2ind, ind2token)

In [17]:
# build the predictor that generates text starting from the seed sequence
pred = ModelPredict(model, text_prefix, token2ind, ind2token, max_len)

In [18]:
# sampling temperatures to try; one generation of 50 tokens is printed per value
temperatures = [1, 0.7, 0.4, 0.1]

for temperature in temperatures:
    print('temperature:', temperature)
    generated_text = pred.generate_sequence(50, temperature=temperature)
    print(generated_text)
    print('\n')

temperature: 1
from tensorflow.python.framework import dtypes count_val) name='output')(x) trivial shard_index create_variable_like_keras_layer( random_ops.random_normal([1, dtypes.as_dtype(out_type) 46]]]], `suffix`. summary_ops.set_step(42) 8]).astype(np.float32) self.all_reduce( self.assertEqual(xla_shape.dimensions(), array_ops.split(c, assertAllEqualUnicode(self, degradation." row_lengths=[4, arguments, FLAG_NAME_EXCLUDED_OPNAMES) testRunUIWithOnUIExitCallback(self): checkpoint.") `{self.__class__.__name__}.summary()` is_both_nonscalar num_bits=8, allowed return_state=False, enumerate(inputs): super(XlaDeviceGpuTest, convertible(tensor, new_restore_ops(self, CustomUnhashable: self._sharding_policies) self.collect_summary_per_core 6.02427039364742014255E0, row_partition=self._row_partition.with_dtype(dtype),"delimiter DT_STRING compatibility, trt_convert Context. tf.compat.v1.placeholder( _inspect.isbuiltin(tf_decorator.unwrap(object)[1]) get_lib()) bounding tape: `run_eagerly` sna

---

## Text generation with LSTM model with Embedding layer

The previous model took as input sequences of words represented as one-hot vectors. In this second approach, we feed word indices to the model and train an Embedding layer, which learns the word representations.

In [13]:
params_emb = params.copy() # copy the parameters defined above
params_emb['embedding'] = True # turn on embedding mode (original note: so that to_categorical() is used)

# batch generator for the embedding model
train_generator_emb = TextDataGenerator(spark, seq_train.sequences, seq_train.next_words, **params_emb)

In [14]:
# create a new model that uses the embedding layers
model_emb = lstm_model(max_len, len(text_train), layer_size, embedding=True)

# optimizer setup
# optimizer = optimizers.Adamax(learning_rate=0.01)
# optimizer = optimizers.RMSprop(learning_rate=0.01)
optimizer = optimizers.Adam(learning_rate=0.01)

# loss function setup
# NOTE(review): mean squared error is an unusual loss for next-word prediction over a
# vocabulary; a categorical cross-entropy is the usual pairing with a softmax output.
# TODO confirm how the generator encodes the targets and revisit this choice.
loss = tf.keras.losses.mean_squared_error

# model_emb.compile(loss='BinaryCrossentropy', optimizer=optimizer, metrics=['accuracy'])
model_emb.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])

In [15]:
# print a per-layer summary of the parameters
model_emb.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 64)          5639552   
                                                                 
 bidirectional (Bidirectiona  (None, 128)              66048     
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense (Dense)               (None, 88118)             11367222  
                                                                 
Total params: 17,072,822
Trainable params: 17,072,822
Non-trainable params: 0
_________________________________________________________________


In [16]:
# wrap the model for training through Spark (elephas)
# NOTE: both fit calls below are commented out, so `spark_model` is created but not used in this cell
spark_model = SparkModel(model_emb, frequency='epoch', mode='asynchronous')

# run the fit on all the data
# spark_model.fit(train_generator.generate_rdds(), epochs=nb_epoch, batch_size=batch_size, verbose=1, validation_split=0.1)

# run the fit on a single batch
# spark_model.fit(train_generator.__getitem__(0), epochs=nb_epoch, batch_size=batch_size, verbose=1, validation_split=0.1)

In [17]:
# train the model (embedding and LSTM layers) with Keras
model_emb.fit(train_generator_emb, batch_size=batch_size, steps_per_epoch=len(train_generator_emb), epochs=nb_epoch, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f1f342fd550>

In [18]:
# save the model, both in Keras format and as a pickle
model_emb.save('data/out/lstm_model_emb')
f.save_pickle(model_emb, 'data/pkl/lstm_model_emb')

# load a previously saved model (disabled)
# NOTE(review): the pickle above is written under data/pkl/, but the commented-out
# load_pickle below reads from data/out/ — check the path before re-enabling it.
# model_emb = models.load_model('data/out/lstm_model_emb')
# model_emb = f.load_pickle('data/out/lstm_model_emb')



INFO:tensorflow:Assets written to: data/out/lstm_model_emb/assets


INFO:tensorflow:Assets written to: data/out/lstm_model_emb/assets


INFO:tensorflow:Assets written to: ram://2e222d27-efcf-41e6-a95f-245684851bcb/assets


INFO:tensorflow:Assets written to: ram://2e222d27-efcf-41e6-a95f-245684851bcb/assets


In [19]:
# vocabulary lookups taken from the training text
token2ind = text_train.token2ind
ind2token = text_train.ind2token

# seed text the generation starts from
input_prefix = """from tensorflow.python.framework import ops"""

# tokenize the seed text with the training vocabulary
text_prefix = Text(input_prefix, token2ind, ind2token)

In [20]:
# build the predictor for the embedding model, starting from the seed sequence
pred_emb = ModelPredict(model_emb, text_prefix, token2ind, ind2token, max_len, embedding=True)

In [22]:
# sampling temperatures to try; one generation of 50 tokens is printed per value
temperatures = [1, 0.7, 0.4, 0.1]

for temperature in temperatures:
    print('temperature:', temperature)
    generated_text = pred_emb.generate_sequence(50, temperature=temperature)
    print(generated_text)
    print('\n')

temperature: 1
from tensorflow.python.framework import ops np_input"GetSessionHandle self.run_and_report_benchmark( centered: {y:.4f}".format(**batch)) self.assertAllEqual(grads_alpha_val.shape, self.assertEqual([1.], \[\]"): parameters.adagrad_momentum.epsilon space math_ops.cast(beta, `dataset` f(*args) 0.30485708, all(t.dtype(`tf.summary.trace_on`, @tf.function(input_signature=[[(handle_name, s_1 {"sizes": @dispatch.dispatch_for_api(nn_ops.dropout_v2) no_op b"abcd"]]), linear_initializer match.group('prefix_name') op.device) ops). Coming math_ops.greater_equal(axis, RaggedTensorSpec(self._shape[1:], v2.read_value()'tf.nn.convolution': reversed(_TYPE_CONVERSION_FUNCTION_REGISTRY): custom_objects=None, shape=[None]) predicate. tensor["data_buffer"] self.assertAllEqual(np.array([[b"a", dim_size_dtype) key[0].step"AVERAGE_POOL_2D" print(final_carry_state.shape) key_to_promote self._get_first_op_from_collection(ops.GraphKeys.READY_OP) [constant_op.constant(1.),'PreventGradient', variable

---