<img src="https://github.com/hernancontigiani/ceia_memorias_especializacion/raw/master/Figures/logoFIUBA.jpg" width="500" align="center">


# Procesamiento de lenguaje natural
## LSTM Bot QA

### Datos
El objecto es utilizar datos disponibles del challenge ConvAI2 (Conversational Intelligence Challenge 2) de conversaciones en inglés. Se construirá un BOT para responder a preguntas del usuario (QA).\
[LINK](http://convai.io/data/)

In [25]:
!pip install --upgrade --no-cache-dir gdown --quiet


[notice] A new release of pip available: 22.2.1 -> 24.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [26]:
import re

import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dropout, Dense
from keras.layers import Flatten, LSTM, SimpleRNN
from keras.models import Model
from tensorflow.keras.layers import Embedding
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.layers import Input
from tensorflow.keras.models import load_model

In [27]:
# Descargar la carpeta de dataset
import os
import gdown
if os.access('data_volunteers.json', os.F_OK) is False:
    url = 'https://drive.google.com/uc?id=1awUxYwImF84MIT5-jCaYAPe2QwSgS1hN&export=download'
    output = 'data_volunteers.json'
    gdown.download(url, output, quiet=False)
else:
    print("El dataset ya se encuentra descargado")

El dataset ya se encuentra descargado


In [28]:
# dataset_file
import json

text_file = "data_volunteers.json"
with open(text_file) as f:
    data = json.load(f) # la variable data será un diccionario



In [29]:
# Observar los campos disponibles en cada linea del dataset
data[0].keys()

dict_keys(['dialog', 'start_time', 'end_time', 'bot_profile', 'user_profile', 'eval_score', 'profile_match', 'participant1_id', 'participant2_id'])

In [30]:
chat_in = []
chat_out = []

input_sentences = []
output_sentences = []
output_sentences_inputs = []
max_len = 30

def clean_text(txt):
    txt = txt.lower()    
    txt.replace("\'d", " had")
    txt.replace("\'s", " is")
    txt.replace("\'m", " am")
    txt.replace("don't", "do not")
    txt = re.sub(r'\W+', ' ', txt)
    
    return txt

for line in data:
    for i in range(len(line['dialog'])-1):
        # vamos separando el texto en "preguntas" (chat_in)
        # y "respuestas" (chat_out)
        chat_in = clean_text(line['dialog'][i]['text'])
        chat_out = clean_text(line['dialog'][i+1]['text'])

        if len(chat_in) >= max_len or len(chat_out) >= max_len:
            continue

        input_sentence, output = chat_in, chat_out
        
        # output sentence (decoder_output) tiene <eos>
        output_sentence = output + ' <eos>'
        # output sentence input (decoder_input) tiene <sos>
        output_sentence_input = '<sos> ' + output

        input_sentences.append(input_sentence)
        output_sentences.append(output_sentence)
        output_sentences_inputs.append(output_sentence_input)

print("Cantidad de rows utilizadas:", len(input_sentences))

Cantidad de rows utilizadas: 6033


In [31]:
input_sentences[1], output_sentences[1], output_sentences_inputs[1]

('hi how are you ', 'not bad and you  <eos>', '<sos> not bad and you ')

### 2 - Preprocesamiento
Realizar el preprocesamiento necesario para obtener:
- word2idx_inputs, max_input_len
- word2idx_outputs, max_out_len, num_words_output
- encoder_input_sequences, decoder_output_sequences, decoder_targets

In [48]:
# Tokenizar las oraciones de entrada
tokenizer_inputs = Tokenizer()
tokenizer_inputs.fit_on_texts(input_sentences)
input_sequences = tokenizer_inputs.texts_to_sequences(input_sentences)

# Longitud máxima de las secuencias de entrada
max_input_len = max(len(seq) for seq in input_sequences)

# Crear word2idx para las entradas
word2idx_inputs = tokenizer_inputs.word_index
num_words_input = len(word2idx_inputs) + 1

# Tokenizar las oraciones de salida
tokenizer_outputs = Tokenizer()
tokenizer_outputs.fit_on_texts(output_sentences + output_sentences_inputs)
output_sequences = tokenizer_outputs.texts_to_sequences(output_sentences)
output_sequences_inputs = tokenizer_outputs.texts_to_sequences(output_sentences_inputs)

# Longitud máxima de las secuencias de salida
max_output_len = max(len(seq) for seq in output_sequences)

# Añadir tokens especiales a las oraciones de salida durante la tokenización
tokenizer_outputs = Tokenizer(filters='')
tokenizer_outputs.fit_on_texts(output_sentences + output_sentences_inputs)
output_sequences = tokenizer_outputs.texts_to_sequences(output_sentences)
output_sequences_inputs = tokenizer_outputs.texts_to_sequences(output_sentences_inputs)

# Crear word2idx para las salidas
word2idx_outputs = tokenizer_outputs.word_index
word2idx_outputs['<sos>'] = tokenizer_outputs.word_index.get('<sos>', len(word2idx_outputs) + 1)
word2idx_outputs['<eos>'] = tokenizer_outputs.word_index.get('<eos>', len(word2idx_outputs) + 1)
num_words_output = len(word2idx_outputs) + 1

# Padding de las secuencias
encoder_input_sequences = pad_sequences(input_sequences, maxlen=max_input_len)
decoder_input_sequences = pad_sequences(output_sequences_inputs, maxlen=max_output_len)
decoder_output_sequences = pad_sequences(output_sequences, maxlen=max_output_len)


### 3 - Preparar los embeddings
Utilizar los embeddings de Glove o FastText para transformar los tokens de entrada en vectores

In [39]:
# Suponiendo que estás usando el archivo GloVe de 300 dimensiones
embedding_dim = 300

# Cargar embeddings preentrenados (GloVe de 300 dimensiones)
embeddings_index = {}
with open('data/d4/glove_embeddings.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Crear matriz de embedding para entradas
embedding_matrix_input = np.zeros((num_words_input, embedding_dim))
for word, i in word2idx_inputs.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix_input[i] = embedding_vector

# Crear matriz de embedding para salidas
embedding_matrix_output = np.zeros((num_words_output, embedding_dim))
for word, i in word2idx_outputs.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix_output[i] = embedding_vector


### 4 - Entrenar el modelo
Entrenar un modelo basado en el esquema encoder-decoder utilizando los datos generados en los puntos anteriores. Utilce como referencias los ejemplos vistos en clase.

In [40]:
# Crear el modelo encoder
encoder_inputs = Input(shape=(max_input_len,))
encoder_embedding = Embedding(num_words_input, embedding_dim, weights=[embedding_matrix_input], trainable=False)(encoder_inputs)
encoder_lstm = LSTM(256, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Crear el modelo decoder
decoder_inputs = Input(shape=(max_output_len,))
decoder_embedding = Embedding(num_words_output, embedding_dim, weights=[embedding_matrix_output], trainable=False)(decoder_inputs)
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(num_words_output, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Definir el modelo
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compilar el modelo
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')


In [41]:
# Convertir las secuencias de salida en una forma compatible para entrenamiento
decoder_output_sequences = np.expand_dims(decoder_output_sequences, -1)

# Entrenar el modelo
model.fit([encoder_input_sequences, decoder_input_sequences], decoder_output_sequences, batch_size=64, epochs=100, validation_split=0.2)
# Guardar el modelo completo
model.save('models/chatbot_model.keras')



Epoch 1/100
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 62ms/step - loss: 4.2279 - val_loss: 2.0300
Epoch 2/100
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 61ms/step - loss: 1.7790 - val_loss: 1.7488
Epoch 3/100
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 59ms/step - loss: 1.4610 - val_loss: 1.6125
Epoch 4/100
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 59ms/step - loss: 1.3477 - val_loss: 1.5476
Epoch 5/100
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 58ms/step - loss: 1.2280 - val_loss: 1.5078
Epoch 6/100
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 58ms/step - loss: 1.1837 - val_loss: 1.4761
Epoch 7/100
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 58ms/step - loss: 1.1255 - val_loss: 1.4562
Epoch 8/100
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 57ms/step - loss: 1.0839 - val_loss: 1.4430
Epoch 9/100
[1m76/76[0m [32m━━━━━━━━━

### 5 - Inferencia
Experimentar el funcionamiento de su modelo. Recuerde que debe realizar la inferencia de los modelos por separado de encoder y decoder.

In [46]:
model = load_model('models/chatbot_model.keras')
# Modelo encoder para inferencia
encoder_model_inf = Model(encoder_inputs, encoder_states)

# Modelo decoder para inferencia
decoder_state_input_h = Input(shape=(256,))
decoder_state_input_c = Input(shape=(256,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(decoder_embedding, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model_inf = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

def decode_sequence(input_seq):
    states_value = encoder_model_inf.predict(input_seq)
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = word2idx_outputs['<sos>']

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model_inf.predict([target_seq] + states_value)

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = tokenizer_outputs.index_word.get(sampled_token_index, '')

        if sampled_word == '<eos>' or len(decoded_sentence) > max_output_len:
            stop_condition = True
        else:
            decoded_sentence += ' ' + sampled_word

        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        states_value = [h, c]

    return decoded_sentence


def preprocess_sentence(sentence):
    # Limpiar la oración de entrada
    sentence = clean_text(sentence)
    # Convertir la oración en una secuencia de índices
    sequence = tokenizer_inputs.texts_to_sequences([sentence])
    # Hacer padding a la secuencia para que tenga la longitud máxima de entrada
    padded_sequence = pad_sequences(sequence, maxlen=max_input_len)
    return padded_sequence

def decode_sequence(input_seq):
    # Obtener los estados internos del encoder
    states_value = encoder_model_inf.predict(input_seq)
    
    # Generar una secuencia vacía de longitud 1 con el índice del token de inicio
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = word2idx_outputs['<sos>']
    
    decoded_sentence = ''
    stop_condition = False
    while not stop_condition:
        # Obtener las predicciones del decoder
        output_tokens, h, c = decoder_model_inf.predict([target_seq] + states_value)
        
        # Obtener el índice del token con mayor probabilidad
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = tokenizer_outputs.index_word.get(sampled_token_index, '')
        
        if sampled_word == '<eos>' or len(decoded_sentence.split()) > max_output_len:
            stop_condition = True
        else:
            decoded_sentence += ' ' + sampled_word
        
        # Actualizar la secuencia objetivo (target_seq)
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        
        # Actualizar los estados del decoder
        states_value = [h, c]
    
    return decoded_sentence.strip()


In [53]:
# Probar el modelo con una oración de entrada
input_sentence = "Hello, how are you?"
preprocessed_sentence = preprocess_sentence(input_sentence)
decoded_sentence = decode_sequence(preprocessed_sentence)
print(f'Q: {input_sentence}')
print(f'A: {decoded_sentence}')


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
Q: Hello, how are you?
A: i m fine i ve got a job
