In [17]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import pandas as pd
from sklearn.model_selection import train_test_split

# ============================
# Configuración de parámetros
# ============================
latent_dim = 256  # Reducido para menor consumo de memoria
num_samples = 10000  # Número máximo de muestras
max_input_length = 20  # Reducido para mejorar eficiencia
max_output_length = 200

# Leer el archivo TSV en español con pandas
file_path_es_algo = "massive_dataset_es.tsv"
data_es_algo = pd.read_csv(file_path_es_algo, sep="\t")

# Leer el archivo TSV en inglés con pandas
file_path_en_algo = "massive_dataset_en.tsv"
data_en_algo = pd.read_csv(file_path_en_algo, sep="\t")

# Extraer columnas "Question" y "Answer"
input_texts_es_algo = data_es_algo["Question"].tolist()
output_texts_es_algo = ["<start> " + str(answer_es) + " <end>" for answer_es in data_es_algo["Answer"].tolist()]

input_texts_en_algo = data_en_algo["Question"].tolist()
output_texts_en_algo = ["<start> " + str(answer_en) + " <end>" for answer_en in data_en_algo["Answer"].tolist()]

# Unificación de datasets
input_texts = input_texts_es_algo + input_texts_en_algo
output_texts = output_texts_es_algo + output_texts_en_algo

# ========================
# Preprocesamiento de datos
# ========================
# Tokenización de las secuencias
input_tokenizer = Tokenizer()
output_tokenizer = Tokenizer(filters="")

input_tokenizer.fit_on_texts(input_texts)
output_tokenizer.fit_on_texts(output_texts)

input_sequences = input_tokenizer.texts_to_sequences(input_texts)
output_sequences = output_tokenizer.texts_to_sequences(output_texts)

# Agregar padding para las secuencias
encoder_input_data = pad_sequences(input_sequences, maxlen=max_input_length, padding="post")
decoder_input_data = pad_sequences([seq[:-1] for seq in output_sequences], maxlen=max_output_length, padding="post")
decoder_target_data = pad_sequences([seq[1:] for seq in output_sequences], maxlen=max_output_length, padding="post")

# Dividir los datos en entrenamiento y validación
encoder_train, encoder_val, decoder_input_train, decoder_input_val, decoder_target_train, decoder_target_val = train_test_split(
    encoder_input_data, decoder_input_data, decoder_target_data, test_size=0.2, random_state=42
)

# ===================
# Construcción del modelo
# ===================
# Encoder
encoder_inputs = Input(shape=(None,), dtype="int32")
encoder_embedding = tf.keras.layers.Embedding(input_dim=len(input_tokenizer.word_index) + 1,
                                               output_dim=latent_dim,
                                               mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
_, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(None,), dtype="int32")
decoder_embedding = tf.keras.layers.Embedding(input_dim=len(output_tokenizer.word_index) + 1,
                                               output_dim=latent_dim,
                                               mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(len(output_tokenizer.word_index) + 1, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

# Modelo Seq2Seq
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compilar el modelo
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

# Resumen del modelo
model.summary()

# ========================
# Generador de datos
# ========================
def data_generator(encoder_data, decoder_input_data, decoder_target_data, batch_size):
    while True:
        for i in range(0, len(encoder_data), batch_size):
            yield ([encoder_data[i:i + batch_size], decoder_input_data[i:i + batch_size]],
                   decoder_target_data[i:i + batch_size])

# Generadores para entrenamiento y validación
batch_size = 16
train_generator = data_generator(encoder_train, decoder_input_train, decoder_target_train, batch_size)
val_generator = data_generator(encoder_val, decoder_input_val, decoder_target_val, batch_size)

def data_generator(encoder_data, decoder_input_data, decoder_target_data, batch_size):
    def generator():
        for i in range(0, len(encoder_data), batch_size):
            # Convert NumPy arrays to tf.int32 tensors
            encoder_input = tf.cast(encoder_data[i:i + batch_size], dtype=tf.int32)
            decoder_input = tf.cast(decoder_input_data[i:i + batch_size], dtype=tf.int32)
            decoder_target = tf.cast(decoder_target_data[i:i + batch_size], dtype=tf.int32)

            # Yield data in the expected structure (tuple of tuples)
            yield ((encoder_input, decoder_input), decoder_target)
    return generator

# Crear datasets usando tf.data.Dataset.from_generator
batch_size = 16

train_dataset = tf.data.Dataset.from_generator(
    data_generator(encoder_train, decoder_input_train, decoder_target_train, batch_size),
    output_signature=(
        (
            tf.TensorSpec(shape=(None, max_input_length), dtype=tf.int32),
            tf.TensorSpec(shape=(None, max_output_length), dtype=tf.int32),
        ),
        tf.TensorSpec(shape=(None, max_output_length), dtype=tf.int32)
    )
).prefetch(tf.data.AUTOTUNE)

val_dataset = tf.data.Dataset.from_generator(
    data_generator(encoder_val, decoder_input_val, decoder_target_val, batch_size),
    output_signature=(
        (
            tf.TensorSpec(shape=(None, max_input_length), dtype=tf.int32),
            tf.TensorSpec(shape=(None, max_output_length), dtype=tf.int32),
        ),
        tf.TensorSpec(shape=(None, max_output_length), dtype=tf.int32)
    )
).prefetch(tf.data.AUTOTUNE)

# Entrenar el modelo usando los datasets
model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=50,
    callbacks=[
        EarlyStopping(patience=5, monitor="val_loss"),
        ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=3)
    ]
)

# ===================
# Modelos para inferencia
# ===================
# Encoder para inferencia
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder para inferencia
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_lstm_outputs, state_h, state_c = decoder_lstm(
    decoder_embedding, initial_state=decoder_states_inputs
)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_lstm_outputs)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states,
)

# ===================
# Función para decodificar secuencias
# ===================
reverse_input_word_index = dict((i, word) for word, i in input_tokenizer.word_index.items())
reverse_output_word_index = dict((i, word) for word, i in output_tokenizer.word_index.items())

def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)

    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = output_tokenizer.word_index["<start>"]

    stop_condition = False
    decoded_sentence = ""
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = reverse_output_word_index.get(sampled_token_index, "")

        decoded_sentence += " " + sampled_word

        if sampled_word == "<end>" or len(decoded_sentence.split()) > max_output_length:
            stop_condition = True

        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        states_value = [h, c]

    return decoded_sentence

# Guardar el modelo
model.save("seq2seq_model.h5")
tf.saved_model.save(model, 'tf_model')

# Exportar tokenizadores a JSON
with open("input_tokenizer.json", "w") as f:
    f.write(input_tokenizer.to_json())

with open("output_tokenizer.json", "w") as f:
    f.write(output_tokenizer.to_json())

print("Model and tokenizers exported successfully!")


Epoch 1/50
   1586/Unknown [1m521s[0m 326ms/step - accuracy: 0.0596 - loss: 2.6504

  self.gen.throw(typ, value, traceback)


[1m1586/1586[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m564s[0m 353ms/step - accuracy: 0.0596 - loss: 2.6494 - val_accuracy: 0.1152 - val_loss: 0.1110 - learning_rate: 0.0010
Epoch 2/50
[1m1586/1586[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m569s[0m 359ms/step - accuracy: 0.1150 - loss: 0.0692 - val_accuracy: 0.1176 - val_loss: 0.0131 - learning_rate: 0.0010
Epoch 3/50
[1m1586/1586[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m561s[0m 354ms/step - accuracy: 0.1162 - loss: 0.0106 - val_accuracy: 0.1177 - val_loss: 0.0042 - learning_rate: 0.0010
Epoch 4/50
[1m1586/1586[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m557s[0m 351ms/step - accuracy: 0.1163 - loss: 0.0035 - val_accuracy: 0.1177 - val_loss: 0.0021 - learning_rate: 0.0010
Epoch 5/50
[1m1586/1586[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m569s[0m 359ms/step - accuracy: 0.1163 - loss: 0.0025 - val_accuracy: 0.1177 - val_loss: 0.0023 - learnin



Model and tokenizers exported successfully!


In [None]:
def chat_with_bot(input_text):
    """
    Genera una respuesta del chatbot para un texto de entrada.
    """
    # Convertir el texto de entrada a una secuencia indexada y aplicar padding
    input_seq = pad_sequences(
        input_tokenizer.texts_to_sequences([input_text]),
        maxlen=max_input_length,
        padding="post"
    )

    # Predecir los estados del encoder
    states_value = encoder_model.predict(input_seq)

    # Crear la secuencia inicial del decoder (<start>)
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = output_tokenizer.word_index["<start>"]

    # Inicializar la respuesta generada
    stop_condition = False
    decoded_sentence = []

    while not stop_condition:
        # Generar predicción del siguiente token
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Obtener el índice del token con mayor probabilidad
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = reverse_output_word_index.get(sampled_token_index, "")

        # Agregar la palabra generada a la respuesta
        if sampled_word != "<end>":
            decoded_sentence.append(sampled_word)

        # Condición de parada: token <end> o longitud máxima alcanzada
        if sampled_word == "<end>" or len(decoded_sentence) > max_output_length:
            stop_condition = True

        # Actualizar la secuencia de entrada para el decoder
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Actualizar los estados del decoder
        states_value = [h, c]

    # Retornar la respuesta generada como texto
    return " ".join(decoded_sentence)


In [18]:
print("----")
print(chat_with_bot("Quiero el algoritmo de la busqueda fibonacci en python.").replace("\\n","\n").replace("\\t","\t"))

----
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 205ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 176ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s

In [19]:
import shutil

folder_to_zip = './tf_model'
output_zip_file = 'tf_model.zip'

shutil.make_archive(output_zip_file.replace('.zip', ''), 'zip', folder_to_zip)

print(f'Carpeta comprimida exitosamente como: {output_zip_file}')

Carpeta comprimida exitosamente como: tf_model.zip
