<img src="https://github.com/hernancontigiani/ceia_memorias_especializacion/raw/master/Figures/logoFIUBA.jpg" width="500" align="center">


# Procesamiento de lenguaje natural
## Bot 


### Alumno: Horn Martín


In [None]:
import re
import numpy as np
import json
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping

In [None]:

print("GPUs disponible:", tf.config.list_physical_devices('GPU'))

# hiperparametros seleccionados, aumente el max_len y max_vocab para evitar que responda siempre el mismo mensaje
MAX_VOCAB_SIZE = 16000
MAX_LEN = 15  
EMBEDDING_DIM = 300
LSTM_UNITS = 128

# limpieza de palabras frecuentes
def clean_text(txt):
    txt = txt.lower()
    txt = re.sub(r"\'d", " had", txt)
    txt = re.sub(r"\'s", " is", txt)
    txt = re.sub(r"\'m", " am", txt)
    txt = re.sub(r"don't", "do not", txt)
    txt = re.sub(r'\W+', ' ', txt)
    return txt.strip()

# carga del dataset
with open("data_volunteers.json", encoding="utf-8") as f:
    data = json.load(f)

input_sentences, output_sentences, output_sentences_inputs = [], [], []

for line in data:
    for i in range(len(line['dialog']) - 1):
        chat_in = clean_text(line['dialog'][i]['text'])
        chat_out = clean_text(line['dialog'][i + 1]['text'])
        if len(chat_in.split()) > MAX_LEN or len(chat_out.split()) > MAX_LEN:
            continue
        output_sentence = '<sos> ' + chat_out + ' <eos>'
        output_sentence_input = '<sos> ' + chat_out
        input_sentences.append(chat_in)
        output_sentences.append(output_sentence)
        output_sentences_inputs.append(output_sentence_input)

In [None]:

# Tokenización
tokenizer_inputs = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer_inputs.fit_on_texts(input_sentences)
input_sequences = tokenizer_inputs.texts_to_sequences(input_sentences)
word2idx_inputs = tokenizer_inputs.word_index
num_words_input = min(MAX_VOCAB_SIZE, len(word2idx_inputs) + 1)

tokenizer_outputs = Tokenizer(num_words=MAX_VOCAB_SIZE, filters='')
tokenizer_outputs.fit_on_texts(output_sentences + output_sentences_inputs)
output_sequences = tokenizer_outputs.texts_to_sequences(output_sentences)
output_sequences_inputs = tokenizer_outputs.texts_to_sequences(output_sentences_inputs)
word2idx_outputs = tokenizer_outputs.word_index
reverse_word2idx_outputs = {idx: word for word, idx in word2idx_outputs.items()}
num_words_output = min(MAX_VOCAB_SIZE, len(word2idx_outputs) + 1)

# pading
encoder_input_sequences = pad_sequences(input_sequences, maxlen=MAX_LEN)
decoder_input_sequences = pad_sequences(output_sequences_inputs, maxlen=MAX_LEN)
decoder_target_sequences = pad_sequences(output_sequences, maxlen=MAX_LEN)

# En este punto cargo los embeddings de FastText
embedding_index = {}
with open('crawl-300d-2M.vec', encoding='utf8') as f:
    next(f)
    for line in f:
        values = line.rstrip().split(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

embedding_matrix = np.zeros((num_words_input, EMBEDDING_DIM))
for word, idx in word2idx_inputs.items():
    if idx < num_words_input:
        vector = embedding_index.get(word)
        if vector is not None:
            embedding_matrix[idx] = vector

In [None]:


# Modelo propiamente dicho
encoder_inputs = Input(shape=(MAX_LEN,))
encoder_embedding = Embedding(num_words_input, EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_LEN, trainable=False)(encoder_inputs)
encoder_lstm = LSTM(LSTM_UNITS, return_state=True)
_, h, c = encoder_lstm(encoder_embedding)
encoder_states = [h, c]

decoder_inputs = Input(shape=(MAX_LEN,))
decoder_embedding_layer = Embedding(num_words_output, EMBEDDING_DIM)
decoder_embedding = decoder_embedding_layer(decoder_inputs)
decoder_lstm = LSTM(LSTM_UNITS, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(num_words_output, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:


# dividimos el dataset:
total_size = len(encoder_input_sequences)
val_size = int(0.2 * total_size)

train_dataset = tf.data.Dataset.from_tensor_slices((
    (encoder_input_sequences[:-val_size], decoder_input_sequences[:-val_size]),
    decoder_target_sequences[:-val_size]
)).shuffle(buffer_size=1024).batch(256).prefetch(tf.data.AUTOTUNE)

val_dataset = tf.data.Dataset.from_tensor_slices((
    (encoder_input_sequences[-val_size:], decoder_input_sequences[-val_size:]),
    decoder_target_sequences[-val_size:]
)).batch(256).prefetch(tf.data.AUTOTUNE)

# Entrenamiento
earlystop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
model.fit(train_dataset, validation_data=val_dataset, epochs=50, callbacks=[earlystop])

In [None]:


# Aca separamos los modelos por inferencia
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(LSTM_UNITS,))
decoder_state_input_c = Input(shape=(LSTM_UNITS,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_inputs_single = Input(shape=(1,))
decoder_embed2 = decoder_embedding_layer(decoder_inputs_single)
decoder_outputs2, state_h2, state_c2 = decoder_lstm(
    decoder_embed2, initial_state=decoder_states_inputs
)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)

decoder_model = Model(
    [decoder_inputs_single] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2
)

# Función de inferencia implementada:
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = word2idx_outputs['<sos>']
    stop_condition = False
    decoded_sentence = []
    word_counts = {}

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = reverse_word2idx_outputs.get(sampled_token_index, '')

        if sampled_word in word_counts:
            word_counts[sampled_word] += 1
        else:
            word_counts[sampled_word] = 1

        if (
            sampled_word == '<eos>' or
            sampled_word == '' or
            len(decoded_sentence) > MAX_LEN or
            word_counts[sampled_word] > 2
        ):
            stop_condition = True
        else:
            decoded_sentence.append(sampled_word)
            target_seq = np.zeros((1, 1))
            target_seq[0, 0] = sampled_token_index
            states_value = [h, c]

    return ' '.join(decoded_sentence)




✅ GPUs disponibles: []
Epoch 1/50
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 126ms/step - accuracy: 0.3896 - loss: 6.4897 - val_accuracy: 0.4972 - val_loss: 2.9451
Epoch 2/50
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 114ms/step - accuracy: 0.4382 - loss: 3.1560 - val_accuracy: 0.4975 - val_loss: 2.5844
Epoch 3/50
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 114ms/step - accuracy: 0.4488 - loss: 2.8064 - val_accuracy: 0.5617 - val_loss: 2.4294
Epoch 4/50
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 113ms/step - accuracy: 0.5200 - loss: 2.6514 - val_accuracy: 0.5820 - val_loss: 2.3261
Epoch 5/50
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 113ms/step - accuracy: 0.5336 - loss: 2.5335 - val_accuracy: 0.5977 - val_loss: 2.2290
Epoch 6/50
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 114ms/step - accuracy: 0.5531 - loss: 2.4094 - val_accuracy: 0.6121 - val_loss: 2.1413
Epoch

In [None]:
#  Función para consultar el modelo
def respond_to_question(input_text):
    seq = tokenizer_inputs.texts_to_sequences([clean_text(input_text)])
    padded = pad_sequences(seq, maxlen=MAX_LEN)
    return decode_sequence(padded)

# Pruebas realizadas. Son con las que obtuve resultados más cercanos a una respuesta "real" por parte del modelo

print(respond_to_question("Do you like pizza?"))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21

In [13]:
print(respond_to_question("What do you do for a living?"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
i am a teacher i am a teacher


No pude obtener respuestas mucho mejores que las antes mencionadas para este modelo