In [23]:
# Location of the plain-text lyrics corpus, relative to the working directory.
dataset_tumbado = "./datasetFinal.txt"

In [24]:
from tensorflow import keras as kn

In [25]:
# Read the entire corpus into memory as a single UTF-8 decoded string.
with open(dataset_tumbado, mode='r', encoding='utf-8') as archivo_letras:
    corridos_tumbados_text = archivo_letras.read()

In [26]:
# Character count of one sample stanza: title header, blank line, section tag
# and seven lyric lines, joined by newlines. The apostrophe that closes the
# last line is part of the measured text, so it is counted as well.
verso = len("\n".join([
    '[Letra de "Abriendo el Camino"]',
    "",
    "[Verso]",
    "Y los errores del pasado me hicieron ver en donde estoy",
    "Y adonde voy, también quién soy",
    "Salí cabrón, no soy de acción",
    "En California ven mis tráilers",
    "Ahí festejo y brindando estoy",
    "Gracias a Dios ahí va otro gol",
    "Billetes son al por mayor'",
]))

In [27]:
# Character count of a single sample lyric line.
linea_de_muestra = "Salí cabrón, no soy de acción"
linea = len(linea_de_muestra)

In [28]:
# Character-level tokenizer fitted on the raw corpus string. The string itself
# (not a list wrapping it) is passed on purpose: `document_count` is read
# afterwards and used as the corpus size.
tokenizar = kn.preprocessing.text.Tokenizer(
    char_level=True,
)
tokenizar.fit_on_texts(texts=corridos_tumbados_text)

In [29]:
# Sanity check: encode a short sample into the tokenizer's character ids.
tokenizar.texts_to_sequences(texts=["Corridos Tumbados"])

[[15, 4, 7, 7, 8, 13, 4, 6, 1, 9, 11, 14, 19, 3, 13, 4, 6]]

In [30]:
# Round-trip check: decode the ids produced by the previous cell back to text.
ids_de_muestra = [15, 4, 7, 7, 8, 13, 4, 6, 1, 9, 11, 14, 19, 3, 13, 4, 6]
tokenizar.sequences_to_texts(sequences=[ids_de_muestra])

['c o r r i d o s   t u m b a d o s']

In [31]:
# Corpus size as counted by the tokenizer while fitting; used below to compute
# the train split.
dataset_size = tokenizar.document_count
# Vocabulary size; used below as the one-hot depth and the output layer width.
max_id = len(tokenizar.word_index)

In [32]:
import numpy as np
# Encode the whole corpus as one id sequence and shift it down by one so the
# ids start at 0 instead of 1 (they are fed to one_hot with depth=max_id).
secuencias_corpus = tokenizar.texts_to_sequences([corridos_tumbados_text])
encoded = np.array(secuencias_corpus[0]) - 1

In [33]:
import tensorflow as tf
# Use the first 90% of the encoded corpus for training.
train_size = int(dataset_size * 90 / 100)
ids_entrenamiento = encoded[:train_size]
dataset = tf.data.Dataset.from_tensor_slices(ids_entrenamiento)

In [34]:
# Stride between consecutive training windows: one stanza worth of characters.
n_steps = verso
# Each window is one stanza plus one extra lyric line long.
# NOTE(review): windows are cut every n_steps characters but are window_length
# long, so consecutive windows overlap by `linea` characters while the model is
# stateful - confirm this overlap is intended.
window_length = verso + linea


In [37]:
tf.random.set_seed(42)
batch_size = 32

# Split the training ids into `batch_size` contiguous parts; each part feeds
# one fixed row of every batch.
encoded_parts = np.array_split(encoded[:train_size], batch_size)

# One stream of fixed-length windows per part, stepping n_steps at a time.
datasets = [
    tf.data.Dataset.from_tensor_slices(parte)
    .window(window_length, shift=n_steps, drop_remainder=True)
    .flat_map(lambda ventana: ventana.batch(window_length))
    for parte in encoded_parts
]

dataset = (
    tf.data.Dataset.zip(tuple(datasets))
    # Stack the i-th window of every part into a single batch tensor.
    .map(lambda *ventanas: tf.stack(ventanas))
    # Inputs drop the last character; targets are the same window shifted by one.
    .map(lambda lote: (lote[:, :-1], lote[:, 1:]))
    # One-hot encode the inputs only; targets stay as integer ids.
    .map(lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))
    .prefetch(1)
)
    

In [38]:
from keras.mixed_precision.loss_scale_optimizer import optimizer
# Stateful two-layer GRU network that outputs, at every time step, a softmax
# distribution over the max_id character ids. The batch size is fixed in the
# input shape because the layers are stateful.
model = kn.models.Sequential()
model.add(
    kn.layers.GRU(
        128,
        return_sequences=True,
        stateful=True,
        dropout=0.3,
        recurrent_dropout=0.3,
        batch_input_shape=[batch_size, None, max_id],
    )
)
model.add(
    kn.layers.GRU(
        128,
        return_sequences=True,
        stateful=True,
        dropout=0.3,
        recurrent_dropout=0.3,
    )
)
model.add(
    kn.layers.TimeDistributed(kn.layers.Dense(max_id, activation="softmax"))
)


In [39]:
class ResetStatesCallback(kn.callbacks.Callback):
    """Keras callback that clears the model's recurrent state at the start of each epoch."""

    def on_epoch_begin(self, epoch, logs=None):
        """Reset the attached model's states.

        Args:
            epoch: Index of the epoch about to start (unused).
            logs: Optional metrics dict supplied by Keras (unused). It defaults
                to None so the hook can also be called without it.
        """
        self.model.reset_states()

In [40]:
# Targets are integer character ids, hence the sparse cross-entropy loss.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
)

In [41]:
# Train for up to 50 epochs, resetting the recurrent state before each one.
reinicio_de_estados = ResetStatesCallback()
history = model.fit(
    dataset,
    epochs=50,
    callbacks=[reinicio_de_estados],
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50

In [None]:
import joblib
# Persist the trained model itself, not the History object returned by fit():
# the weights are later restored by calling get_weights() on the object loaded
# from this file, and a History has no such method.
neuronas_tumbadas = model
joblib.dump(neuronas_tumbadas,"neuronas_tumbadas.pkl")

In [None]:
# Stateless copy of the training architecture, used for generation so that
# prompts of any batch size and length can be fed. The layer layout must match
# the trained model exactly for the weights to transfer, so the softmax
# activation belongs to the Dense layer, not to the TimeDistributed wrapper
# (which does not accept an `activation` argument).
stateless_model = kn.models.Sequential([
    kn.layers.GRU(128,return_sequences=True,input_shape=[None,max_id]),
    kn.layers.GRU(128,return_sequences=True),
    kn.layers.TimeDistributed(kn.layers.Dense(max_id,activation='softmax'))
])

In [None]:
# NOTE: joblib.load unpickles arbitrary objects; only load files you created.
artefacto_entrenado = joblib.load("neuronas_tumbadas.pkl")
# The pickle may hold the History returned by fit() rather than the model.
# A History has no get_weights(), but as a Keras callback it keeps the trained
# network in its `model` attribute, so unwrap it when needed.
if hasattr(artefacto_entrenado, "get_weights"):
    modelo_entrenado = artefacto_entrenado
else:
    modelo_entrenado = artefacto_entrenado.model
stateless_model.build(tf.TensorShape([None,None,max_id]))
# Copy the trained weights into the stateless architecture.
stateless_model.set_weights(modelo_entrenado.get_weights())
modelo_neuronas_tumbadas = stateless_model

In [None]:
def preprocess(texts):
    """Encode a list of strings as a one-hot tensor for the model.

    Each text is converted to tokenizer ids, shifted down by one so ids start
    at 0, and one-hot encoded with depth ``max_id``.
    """
    secuencias = tokenizar.texts_to_sequences(texts)
    ids_base_cero = np.array(secuencias) - 1
    return tf.one_hot(ids_base_cero, max_id)

In [None]:
def next_char(text,temperature=1):
    """Sample the character that follows ``text``.

    The model's probabilities for the last time step are converted to logits,
    divided by ``temperature`` and used to draw one id at random; the id is
    shifted back up by one and decoded with the tokenizer.
    """
    prompt_codificado = preprocess([text])
    probabilidades = modelo_neuronas_tumbadas.predict(prompt_codificado)
    # Keep the last time step as a (1, vocab) slice, as tf.random.categorical
    # expects a 2-D input.
    ultimo_paso = probabilidades[0,-1:,:]
    logits_escalados = tf.math.log(ultimo_paso)/temperature
    id_muestreado = tf.random.categorical(logits_escalados,num_samples=1)+1
    decodificado = tokenizar.sequences_to_texts(id_muestreado.numpy())
    return decodificado[0]

In [None]:
def generating_corrido_tumbado(text,n_chars = 920,temperature=1):
    """Extend ``text`` by sampling ``n_chars`` characters, one at a time.

    Args:
        text: Seed text that the generated characters are appended to.
        n_chars: Number of characters to generate.
        temperature: Sampling temperature forwarded to ``next_char``.

    Returns:
        A 3-tuple ``(text, newline, text)`` where ``text`` is the seed followed
        by all generated characters. The tuple shape is kept from the previous
        implementation for compatibility.
    """
    for _ in range(n_chars):
        text += next_char(text,temperature)
    # Return only after the loop has finished: returning from inside it stopped
    # generation after the very first character.
    return text, '\n', text

In [None]:
# Quick check: the most likely character after "Soy belic" should be 'o'.
X_new = preprocess(["Soy belic"])
# Sequential.predict_classes is no longer available in current TensorFlow/Keras;
# take the argmax over the class axis of the predicted probabilities instead.
Y_pred = np.argmax(modelo_neuronas_tumbadas.predict(X_new), axis=-1)
# Shift ids back to the tokenizer's numbering and keep the last decoded character.
tokenizar.sequences_to_texts(Y_pred+1)[0][-1]
#expected result 'o'