# INSTRUCTIONS TO RUN THE NOTEBOOK

1. Load the required libraries.
2. Load the pre-fitted tokenizer and tokenized sequence.
3. (3.1) In case you want to train the model from scratch, you can either explore the best hyperparameter values by running the code in the 'Hyperparameter tuning on a subset of data' section, or directly skip to the 'Training' section.
   (3.2) If you want to use the pretrained model, run the code in the 'Generate text' section, which loads the model and defines the text-generation function. This function accepts either a string containing any number of words or a sequence of token indices as input. Its second argument, 'seq_len', must be set to 20, but the input text and the desired number of output words can be chosen freely.

In [None]:
# Mount Google Drive at /content/drive (Google Colab only).
# Later cells read and write files under 'drive/MyDrive/Language_data'.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import load_model
from keras.preprocessing.sequence import TimeseriesGenerator
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from pickle import dump, load
from keras.utils import to_categorical
import numpy as np 
import random as rn
from keras import backend as K
import string
# from scikeras.wrappers import KerasClassifier
# from sklearn.model_selection import GridSearchCV

In [None]:
# Set seeds for reproducibility: the same seed is given to NumPy, Python's
# `random` module and TensorFlow (via the TF1 compatibility API).
seed_value = 0
np.random.seed(seed_value)
rn.seed(seed_value)
# Restrict TensorFlow to one intra-op and one inter-op thread
# (presumably to avoid nondeterminism from parallel op execution -- confirm).
session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
tf.compat.v1.set_random_seed(seed_value)
# Create a TF1-style session on the default graph with that config and
# register it as the session used by the Keras backend.
sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)
tf.compat.v1.keras.backend.set_session(sess)

In [None]:
# Normalise raw text into a list of lower-case, purely alphabetic words.
def clean(doc):
    """Split *doc* on whitespace and return its cleaned word tokens.

    Punctuation characters (``string.punctuation``) are stripped from every
    token; tokens that are not purely alphabetic afterwards (numbers, mixed
    alphanumerics, empty strings) are dropped, and the survivors are
    lower-cased.
    """
    strip_punctuation = str.maketrans("", "", string.punctuation)
    cleaned = []
    for raw_token in doc.split():
        word = raw_token.translate(strip_punctuation)
        if word.isalpha():
            cleaned.append(word.lower())
    return cleaned

# Load fitted tokenizer and tokenized sequence

In [None]:
# Load the tokenizer that was fitted on the entire initial data sequence.
# NOTE(security): unpickling can execute arbitrary code -- only load pickle
# files that you created yourself or otherwise trust.
# `with` guarantees the file handles are closed once the objects are loaded.
with open('drive/MyDrive/Language_data/tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = load(tokenizer_file)
# Load the tokenized sequence
with open('drive/MyDrive/Language_data/sequence_full_tokenized.pkl', 'rb') as sequences_file:
    sequences = load(sequences_file)

In [None]:
# Join the loaded sequences end-to-end and store the result as a flat Python list
# (assumes `sequences` is a collection of 1-D index sequences -- TODO confirm).
seq_good = list(np.concatenate(sequences))

# Hyperparameter tuning on a subset of data

In [None]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

# Hyperparameter search space: each hp.choice entry is sampled per trial,
# while 'nb_epochs' and 'activation' are fixed values.
space = dict(
    seq_len=hp.choice('seq_len', [10, 20, 50]),
    embedding=hp.choice('embedding', [500, 1000]),
    units1=hp.choice('units1', [64, 128]),
    units2=hp.choice('units2', [64, 128]),
    batch_size=hp.choice('batch_size', [128, 256]),
    nb_epochs=1,
    optimizer=hp.choice('optimizer', ['adam']),
    activation='relu',
)

# Only the first 10% of the data is used for tuning; that subset is split
# 80/20 into a training part and a validation part.
tuning_train_end = int(np.round(len(seq_good)*0.1*0.8))
tuning_subset_end = int(np.round(len(seq_good)*0.1))
train = seq_good[:tuning_train_end]
val = seq_good[tuning_train_end:tuning_subset_end]
# +1 because index 0 is not assigned to any word in tokenizer.word_index
# (presumably reserved for padding -- confirm against the fitted tokenizer).
vocab_size = len(tokenizer.word_index) + 1

def create_model(params):
    """Build (but do not compile) the next-word prediction network.

    Args:
        params: dict with keys 'embedding' (embedding dimension), 'seq_len'
            (input window length), 'units1' and 'units2' (sizes of the two
            stacked LSTM layers) and, optionally, 'activation' (activation
            of the hidden Dense layer; defaults to 'relu').

    Returns:
        An uncompiled Sequential model whose output is a softmax over
        `vocab_size` classes.
    """
    # The search space declares an 'activation' entry; honour it instead of
    # hard-coding the value, falling back to the previous constant.
    hidden_activation = params.get('activation', 'relu')

    model = Sequential()
    model.add(Embedding(vocab_size, params['embedding'], input_length=params['seq_len']))
    # First LSTM returns the full sequence so that the second LSTM can consume it.
    model.add(LSTM(params['units1'], return_sequences=True))
    model.add(LSTM(params['units2']))
    model.add(Dense(100, activation=hidden_activation))
    model.add(Dense(vocab_size, activation='softmax'))
    return model
    
def f_nn(params):
    """Hyperopt objective: train one candidate model and score it on `val`.

    Args:
        params: one sample from the search space (keys 'seq_len',
            'batch_size', 'nb_epochs', 'optimizer' plus the keys read by
            `create_model`).

    Returns:
        dict with 'loss' (negative validation accuracy, the value hyperopt
        minimises), 'acc' (same value, kept for backward compatibility)
        and 'status'.
    """
    print('Params testing: ', params)

    seq_len = params['seq_len']
    batch_size = params['batch_size']

    # The generators are built here because the window length is itself a
    # hyperparameter under test.
    train_data_gen = TimeseriesGenerator(train, train, length=seq_len, sampling_rate=1, stride=1, batch_size=batch_size)
    val_data_gen = TimeseriesGenerator(val, val, length=seq_len, sampling_rate=1, stride=1, batch_size=batch_size)

    model = create_model(params)
    es = EarlyStopping(monitor='accuracy', patience=10)
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=params.get('optimizer', 'adam'),
                  metrics=['accuracy'])
    model.fit(train_data_gen, epochs=params['nb_epochs'], callbacks=[es])

    # With stride=1 and no shuffling, the k-th validation sample is the window
    # val[k:k+seq_len] and its target is val[k+seq_len]; the generator exposes
    # no per-sample label array, so the targets are rebuilt from `val`.
    y_true = np.asarray(val[seq_len:])
    yhat = model.predict(val_data_gen)
    y_pred = np.argmax(yhat, axis=1)

    accuracy = float(np.mean(y_true == y_pred))
    # hyperopt requires the objective value under the key 'loss'.
    return {'loss': -accuracy, 'acc': -accuracy, 'status': STATUS_OK}

# Run 5 evaluations of f_nn with the TPE algorithm; per-trial results are recorded in `trials`.
trials = Trials()
best = fmin(f_nn, space, algo=tpe.suggest, max_evals=5, trials=trials)
# NOTE(review): for hp.choice parameters, fmin reports the *index* of the chosen
# option rather than the option itself; hyperopt's space_eval(space, best) maps
# the result back to actual values -- confirm before reading values off this print.
print('best: ', best)

# Training

In [None]:
# Params {'activation': 'relu', 'batch_size': 64, 'embedding': 100, 'nb_epochs': 100, 'optimizer': 'adam', 'seq_len': 20, 'units1': 64, 'units2': 128}

In [None]:
# Divide the full token sequence into training (first 80%) and validation (last 20%)
split_at = int(np.round(len(seq_good)*0.8))
train = seq_good[:split_at]
val = seq_good[split_at:]
# One more than the number of words in tokenizer.word_index
vocab_size = len(tokenizer.word_index) + 1

In [None]:
# Create the data generators: each sample is a window of 20 consecutive tokens and
# its target is taken from the same sequence (data and targets are the same list).
# stride=20 means consecutive windows start 20 tokens apart; batches hold 64 samples.
train_data_gen = TimeseriesGenerator(train, train, length=20, sampling_rate=1,stride=20, batch_size=64)
val_data_gen = TimeseriesGenerator(val, val, length=20, sampling_rate=1,stride=20, batch_size=64)

In [None]:
# Define the model architecture: a 100-dimensional embedding over 20-token inputs,
# two stacked LSTMs (64 then 128 units), two 512-unit ReLU layers and a softmax
# over the vocabulary.
model = Sequential([
    Embedding(vocab_size, 100, input_length=20),
    LSTM(64, return_sequences=True),  # full sequence is passed on to the next LSTM
    LSTM(128),
    Dense(512, activation='relu'),
    Dense(512, activation='relu'),
    Dense(vocab_size, activation='softmax'),
])

In [None]:
# Checkpoint callback: checked every epoch, the model is written to
# 'drive/MyDrive/Language_data/models' only when val_accuracy improves.
sav = tf.keras.callbacks.ModelCheckpoint('drive/MyDrive/Language_data/models',monitor='val_accuracy',save_freq='epoch',save_best_only=True)
# Early stopping after 10 epochs without improvement of the monitored metric.
# NOTE(review): this monitors the *training* accuracy while the checkpoint tracks
# val_accuracy -- confirm that stopping on the training metric is intended.
es = EarlyStopping(monitor='accuracy', patience=10)
callbacks_list = [es, sav]
# Targets are integer class ids, hence the sparse categorical cross-entropy loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train the model for up to 100 epochs, validating on val_data_gen after each
# epoch; the callbacks handle early stopping and checkpointing.
model.fit(train_data_gen, epochs=100, validation_data = val_data_gen, callbacks=callbacks_list)

# Continue training

In [None]:
# Load the pretrained model.
# (Switch the path to 'drive/MyDrive/Language_data/models_new' if further
# training has already been done.)
model = tf.keras.models.load_model('drive/MyDrive/Language_data/models')

In [None]:
# Checkpoint callback for the continued run: writes to
# 'drive/MyDrive/Language_data/models_new' only when val_accuracy improves.
sav = tf.keras.callbacks.ModelCheckpoint('drive/MyDrive/Language_data/models_new',monitor='val_accuracy',save_freq='epoch',save_best_only=True)
# Early stopping after 10 epochs without improvement of the training accuracy.
es = EarlyStopping(monitor='accuracy', patience=10)
callbacks_list = [es, sav]
# NOTE(review): compiling the loaded model again with optimizer='adam' presumably
# starts from a fresh optimizer state -- confirm this is the intended behaviour.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Continue training the loaded model for up to 100 further epochs with the
# callbacks defined for the continued run.
model.fit(train_data_gen, epochs=100, validation_data = val_data_gen, callbacks=callbacks_list)

# Generate text

In [None]:
# Load the pretrained model saved under 'drive/MyDrive/Language_data/models'
# for use in text generation.
model = tf.keras.models.load_model('drive/MyDrive/Language_data/models')

In [None]:
# Example seed given as token indices: elements 100..109 of the corpus,
# wrapped in an outer list so the array has a leading batch dimension.
seed_text = np.array([seq_good[100:110]])

In [None]:
def translate_wordindices_to_words(seed_text):
  """Translate a sequence of word indices into the corresponding words.

  Args:
    seed_text: nested sequence (list of lists or 2-D array) of word indices;
      only its first row, ``seed_text[0]``, is translated.

  Returns:
    A list with one string per index of the first row. An index that is not
    a value of ``tokenizer.word_index`` is translated to the empty string.
  """
  # Invert word -> index once, rather than scanning the whole vocabulary
  # for every index to translate.
  index_to_word = {index: word for word, index in tokenizer.word_index.items()}
  return [index_to_word.get(seed, '') for seed in seed_text[0]]

In [None]:
def translate_words_to_wordindices(textaco):
  """Convert raw text into a batch-of-one array of word indices.

  The text goes through the same `clean` pre-processing used earlier; each
  cleaned word is then passed to the fitted tokenizer, and the per-word index
  lists are flattened into a single row.
  """
  words = clean(textaco)
  per_word_indices = tokenizer.texts_to_sequences(words)
  flat_indices = []
  for word_indices in per_word_indices:
    flat_indices.extend(word_indices)
  return np.array([flat_indices])

In [None]:
def next_word_predictor(model, seq_len, n_words, seed_thing, typo):
  """Generate `n_words` words that continue a user-supplied seed.

  Args:
    model: trained model whose `predict` maps a (1, seq_len) array of word
      indices to a distribution over the vocabulary.
    seq_len: length of the input window the model expects.
    n_words: number of words to generate.
    seed_thing: the seed, either a string (typo == 'text') or a batch-of-one
      nested sequence / 2-D array of word indices (typo == 'array').
    typo: 'text' or 'array', selecting how `seed_thing` is interpreted and
      how the result is printed.

  Returns:
    The list of generated words (strings). The seed followed by the generated
    words is also printed when `typo` is 'text' or 'array'.
  """
  # A text seed is first converted to word indices with the usual pre-processing.
  if typo == 'text':
    seed_indices = translate_words_to_wordindices(seed_thing)
  else:
    seed_indices = seed_thing

  # Bring the seed to exactly seq_len tokens: shorter seeds are zero-padded on
  # the left, longer ones keep only their last seq_len tokens. The decision is
  # based on the seed actually passed in (not on any global), and the result
  # is a new array, so the caller's input is never modified.
  window = pad_sequences(seed_indices, maxlen=seq_len, truncating='pre')

  # Inverse vocabulary, built once instead of scanning it for every word.
  index_to_word = {index: word for word, index in tokenizer.word_index.items()}

  generated_words = []
  for _ in range(n_words):
    ypred = model.predict(window, verbose=0)  # predict next word using pretrained model
    next_index = int(np.argmax(ypred))
    generated_words.append(index_to_word.get(next_index, ''))

    # Slide the window one step: drop the oldest token and append the new one,
    # so the most recent seed words stay in the context until they are pushed
    # out by generated words.
    window = np.roll(window, -1, axis=1)
    window[0, -1] = next_index

  # Generate final sequence (each word preceded by a space)
  final = ''.join(' ' + word for word in generated_words)

  if typo == 'text':
    print(seed_thing + final)
  if typo == 'array':
    # Translate the seed as it was passed in, not the window that has since
    # been shifted and filled with generated words.
    trans_input = translate_wordindices_to_words(seed_indices)
    pre = ''.join(' ' + word for word in trans_input)
    print(pre + final)

  return generated_words

In [None]:
# Example: continue a raw-text prompt with 6 generated words (window length 20).
gentxt = next_word_predictor(model, 20, 6, 'The two young men thought that they could survive', 'text')

The two young men thought that they could survive somewhere unlikely entirely to exercise numbers


In [None]:
# Example: continue a longer raw-text prompt with 5 generated words (window length 20).
gentxt = next_word_predictor(model, 20, 5, 'They were having a fun time until the two young men had to', 'text')

They were having a fun time until the two young men had to pass directly above display eleven
