In [None]:
# Step 1: Install the required libraries (notebook shell command)
!pip install tensorflow pandas numpy scikit-learn

# Step 2: Import the required libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint

# Step 3: Load the dataset from a .txt file
from google.colab import files
uploaded = files.upload()

# Each line of the file is expected to hold one dialog pair in the form
# "input \t response".
with open('dialogs.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

input_texts = []
response_texts = []
skipped_lines = 0

for line in lines:
    # partition() splits on the FIRST tab only, so a response that itself
    # contains a tab no longer breaks the two-value unpacking. Because the
    # line is stripped first, a found separator implies both sides are
    # non-empty.
    input_text, separator, response_text = line.strip().partition('\t')
    if not separator:
        # Blank line, or a line without an "input \t response" pair.
        skipped_lines += 1
        continue
    input_texts.append(input_text.lower())  # Lowercase to normalize the text
    response_texts.append(response_text.lower())

if skipped_lines:
    print(f"Skipped {skipped_lines} line(s) without an 'input \\t response' pair.")

if not input_texts:
    raise ValueError("No 'input \\t response' pairs found in dialogs.txt")

# Convert the lists to numpy arrays
input_texts = np.array(input_texts)
response_texts = np.array(response_texts)

# Step 4: Preprocess the data
# Restrict the vocabulary to the most frequent words. With num_words set,
# the tokenizer only emits indices strictly below max_vocab_size (index 0 is
# never assigned to a word and is used for padding below).
max_vocab_size = 5000
tokenizer = Tokenizer(num_words=max_vocab_size)
tokenizer.fit_on_texts(np.concatenate((input_texts, response_texts), axis=0))

# tokenizer.word_index lists EVERY word seen, regardless of num_words, so the
# number of classes that can actually occur is capped by max_vocab_size.
# Sizing the one-hot targets and the model with this value avoids allocating
# classes that never appear when the corpus has more than max_vocab_size words.
vocab_size = min(max_vocab_size, len(tokenizer.word_index) + 1)

input_sequences = tokenizer.texts_to_sequences(input_texts)
response_sequences = tokenizer.texts_to_sequences(response_texts)

# Pad (with trailing zeros) or truncate every sequence to a fixed length
max_seq_len = 30
input_sequences = pad_sequences(input_sequences, maxlen=max_seq_len, padding='post')
response_sequences = pad_sequences(response_sequences, maxlen=max_seq_len, padding='post')

# Convert the responses to one-hot encoding: (samples, max_seq_len, vocab_size)
response_sequences = tf.keras.utils.to_categorical(response_sequences, num_classes=vocab_size)

# Step 5: Build the model
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_seq_len),
    # return_sequences=True keeps one output per timestep, so the final Dense
    # layer predicts a token for every position of the padded response.
    LSTM(256, return_sequences=True),
    Dropout(0.5),
    LSTM(256, return_sequences=True),
    Dropout(0.5),
    Dense(vocab_size, activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Step 6: Train the model
X_train, X_val, y_train, y_val = train_test_split(input_sequences, response_sequences, test_size=0.2, random_state=42)

# Callbacks: reduce the learning rate when validation loss stops improving,
# and keep the best model (lowest val_loss) on disk.
# min_lr must be BELOW the optimizer's starting learning rate; the model is
# compiled with the default 'adam' optimizer (learning rate 0.001), so a
# min_lr of 0.001 would prevent the callback from ever lowering it.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=1e-5)
checkpoint = ModelCheckpoint('best_chatbot_model.keras', monitor='val_loss', save_best_only=True)

history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=50, batch_size=32, callbacks=[reduce_lr, checkpoint])

# Step 7: Reload the model stored in 'best_chatbot_model.keras' (the file the
# ModelCheckpoint callback writes), replacing the in-memory model.
model = tf.keras.models.load_model('best_chatbot_model.keras')

def generate_response(input_text):
    """Return the model's reply to ``input_text`` as a single string.

    The text is lowercased, tokenized with the module-level ``tokenizer``,
    post-padded to ``max_seq_len`` and passed to ``model.predict``. The
    highest-scoring token id at each timestep is then mapped back to words.

    Args:
        input_text: The user's message.

    Returns:
        The decoded prediction, joined with spaces.
    """
    normalized = input_text.lower()
    encoded = tokenizer.texts_to_sequences([normalized])
    padded = pad_sequences(encoded, maxlen=max_seq_len, padding='post')
    token_scores = model.predict(padded)
    # argmax over the last axis selects one token id per timestep.
    best_token_ids = np.argmax(token_scores, axis=-1)
    decoded = tokenizer.sequences_to_texts(best_token_ids)
    return ' '.join(decoded)

# Example usage
print(generate_response("Hello! How are you?"))