# LSTM - Long Short-Term Memory Networks
Las redes LSTM son un tipo de red neuronal recurrente (RNN) con capacidad de retener y utilizar información a largo plazo. Mediante el uso de unidades de memoria especializadas, las LSTM son capaces de manejar dependencias a largo plazo en secuencias de datos y evitar el desvanecimiento del gradiente, problema que podía ocasionar que las RNN simples olvidaran información relevante en secuencias largas.

Se trata de una RNN con una celda de memoria diseñada específicamente para superar el problema del desvanecimiento del gradiente. Además, su estructura de "puertas"...

![LSTM.png](.git/LSTM.png)

In [1]:
import tensorflow as tf

# Show the installed TensorFlow version so the run can be reproduced.
tf_version = tf.__version__
print(tf_version)

2.16.1


In [2]:
import os

import pandas as pd
import numpy as np

from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Embedding, Dense, Flatten, Dropout, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    confusion_matrix,
    classification_report
)

## Cargar los datos y dividirlos en training y validation
Se cargan los datos preprocesados y se dividen en dos conjuntos: training y validation.

In [4]:
# Load the preprocessed dataset (comma-separated CSV) and preview its first rows.
dataset_path = './data/goemotions_clean.csv'
df = pd.read_csv(dataset_path, sep=",")
df.head()

FileNotFoundError: [Errno 2] No such file or directory: './data/goemotions_clean.csv'

In [None]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    df['text'],
    df['emotion'],
    test_size=0.2,
    random_state=0,
)

## Preprocesamiento de los datos
Para poder utilizar los datos en la red neuronal, es necesario preprocesarlos. El texto debe ser convertido a números, ya que las redes neuronales no pueden trabajar con texto. Para ello, se utiliza la clase Tokenizer de Keras, que convierte el texto a secuencias de números. Además, se añade padding a las secuencias para que todas tengan la misma longitud.

### Tokenizer and Padding

In [None]:
def tokenization(
    tokenizer: Tokenizer, X_train: pd.Series, X_test: pd.Series
) -> tuple[np.ndarray, np.ndarray, int, int, Tokenizer]:
    """Fit ``tokenizer`` on the training texts and pad both splits to one length.

    The tokenizer is fitted on ``X_train`` only (it is modified in place), then
    used to convert both splits to integer sequences. Every sequence is
    post-padded to the length of the longest sequence found in either split.

    Args:
        tokenizer: Tokenizer instance to fit; it is also returned.
        X_train: Training texts, used for fitting and converted to sequences.
        X_test: Held-out texts, only converted to sequences.

    Returns:
        A 5-tuple ``(train_padded, test_padded, max_seq_len, vocab_size,
        tokenizer)`` where ``vocab_size`` is the number of fitted words plus
        one (index 0 is reserved for padding).
    """
    # Fit on the training split only so the vocabulary is built without
    # looking at the held-out texts.
    tokenizer.fit_on_texts(X_train)

    train_sequences = tokenizer.texts_to_sequences(X_train)
    test_sequences = tokenizer.texts_to_sequences(X_test)

    # Longest sequence across both splits; `default=0` keeps an empty split
    # from raising ValueError inside max().
    max_seq_len = max(
        (len(seq) for seq in (*train_sequences, *test_sequences)),
        default=0,
    )

    vocab_size = len(tokenizer.word_index) + 1  # +1 for zero padding

    train_padded = pad_sequences(train_sequences, maxlen=max_seq_len, padding="post")
    test_padded = pad_sequences(test_sequences, maxlen=max_seq_len, padding="post")

    return train_padded, test_padded, max_seq_len, vocab_size, tokenizer

In [None]:
# Create the tokenizer; "<OOV>" is the token used for out-of-vocabulary words.
tokenizer = Tokenizer(oov_token="<OOV>")

# Fit on the training texts and collect the padded train/validation sequences,
# the padded sequence length and the vocabulary size.
tokenization_result = tokenization(tokenizer, X_train, X_val)
train_padded, val_padded, max_seq_len, vocab_size, tokenizer = tokenization_result

## Modelo de la red neuronal LSTM

In [None]:
# Number of classes = count of distinct values in the 'emotion' column.
distinct_emotions = df['emotion'].unique()
num_classes = len(distinct_emotions)
print(f'Número de clases (emociones): {num_classes}')

In [None]:
# LSTM text classifier: token ids -> 128-d embeddings -> 32-unit LSTM -> softmax.
model = Sequential()

# Declare the input shape explicitly. The Embedding `input_length` argument is
# deprecated in Keras 3 (bundled with TF 2.16) and is ignored with a warning.
model.add(tf.keras.Input(shape=(max_seq_len,)))
model.add(Embedding(input_dim=vocab_size, output_dim=128))  # Embedding layer

model.add(LSTM(32))
# Output layer: one unit per class, softmax to obtain class probabilities.
model.add(Dense(num_classes, activation='softmax'))

# The labels are one-hot encoded with to_categorical before training and are
# decoded with argmax afterwards, so the matching loss is categorical
# cross-entropy; the sparse variant expects integer class ids instead.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])



In [14]:
# Folder that holds the saved models; created if missing, untouched if present.
models_dir = "./models"
os.makedirs(name=models_dir, exist_ok=True)

In [15]:
# Checkpoint file inside models_dir; ModelCheckpoint writes the model to this path.
# NOTE(review): the file name says "cnn" although this notebook trains an LSTM —
# presumably a copy-paste leftover; confirm before renaming, other code may load this path.
model_path = os.path.join(models_dir, "cnn_model.keras")

In [16]:
# Stop training when val_loss has not improved for 5 epochs.
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=5,
    verbose=1,
)

# Write the model to model_path only when val_loss reaches a new minimum.
mc = ModelCheckpoint(
    model_path,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

In [None]:
# Collect every distinct label that appears in the train or validation split.
all_possible_labels = np.unique(np.concatenate((y_train, y_val)))

# Fit the encoder on exactly those labels. No placeholder class such as
# 'desconocido' is registered: because the encoder sees the union of both
# splits, a validation label can never be unknown, and an extra class would
# make len(label_encoder.classes_) one larger than the number of emotions
# used to size the model's output layer (and it has no emoji mapping).
label_encoder = LabelEncoder()
label_encoder.fit(all_possible_labels)

# Map the emotion names of both splits to integer class ids.
y_train_encoded = label_encoder.transform(y_train)
y_val_encoded = label_encoder.transform(y_val)

In [None]:
# Imported here because the notebook's import cell does not bring in
# to_categorical; without it this cell fails with NameError.
from tensorflow.keras.utils import to_categorical

# One column per class known to the label encoder.
num_classes = len(label_encoder.classes_)

# Turn the integer class ids into one-hot rows of width num_classes.
y_train_encoded = to_categorical(y_train_encoded, num_classes)
y_val_encoded = to_categorical(y_val_encoded, num_classes)

In [None]:
# Sanity check: print the shape of every array that is passed to model.fit.
shape_report = (
    ("train_padded shape:", train_padded),
    ("y_train_encoded shape:", y_train_encoded),
    ("val_padded shape:", val_padded),
    ("y_val_encoded shape:", y_val_encoded),
)
for caption, array in shape_report:
    print(caption, array.shape)

In [None]:
# Train for up to 20 epochs, evaluating on the validation split after each one.
# Both callbacks are passed: `mc` checkpoints the best model and `es` stops
# training once val_loss stops improving. `es` is created for this run, so
# leaving it out of the list would silently disable early stopping.
history = model.fit(
    train_padded, y_train_encoded,
    epochs=20,
    batch_size=64,
    shuffle=True,
    validation_data=(val_padded, y_val_encoded),
    callbacks=[es, mc]
)

In [None]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() would add a stray "None" line to the output.
model.summary()

In [None]:
def plot_metric_curves(epochs, train_curve, val_curve, train_color, val_color, metric, epochs_interval: int = 1):
    """Plot a training curve and its validation counterpart on one figure.

    NOTE(review): this function uses ``plt``, ``sns`` and
    ``set_matplotlib_formats``; none of them is imported in the visible part
    of this notebook — confirm they are in scope before calling.

    Args:
        epochs: Sliceable, indexable sequence of x-axis values (one per epoch).
        train_curve: y-values of the training curve.
        val_curve: y-values of the validation curve.
        train_color: Line color of the training curve.
        val_color: Line color of the validation curve.
        metric: Metric name; used in the title, legend and y-axis label.
        epochs_interval: Step between the epoch ticks shown on the x-axis.
    """
    # Render figures as SVG, on a white grid background.
    set_matplotlib_formats('svg')
    sns.set_style("whitegrid")
    plt.figure(figsize=(12,8), dpi=200)

    metric_name = metric.lower()
    curves = (
        (train_curve, train_color, f'Training {metric_name}'),
        (val_curve, val_color, f'Validation {metric_name}'),
    )
    for values, line_color, legend_label in curves:
        plt.plot(epochs, values, color=line_color, linewidth=2, label=legend_label)

    plt.title(f'Training and validation {metric_name}', fontsize=20)
    plt.xlabel("Epochs", fontsize=15)
    plt.ylabel(metric.capitalize(), fontsize=15)

    # One tick every `epochs_interval` epochs; x-axis runs from 0 to last epoch + 1.
    plt.xticks(epochs[::epochs_interval])
    plt.xlim(0, epochs[-1] + 1)

    sns.despine(left=True, bottom=True)
    # Legend is anchored outside the axes, to the right of the plot.
    plt.legend(frameon=False, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., fontsize=13)
    plt.show()

In [None]:
# Per-epoch curves recorded by model.fit. The model is compiled with
# metrics=['accuracy'], so the history keys are 'accuracy' / 'val_accuracy'
# (the short 'acc' / 'val_acc' names raise KeyError).
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

# Zero-based epoch indices, one per recorded accuracy value.
epochs = range(len(acc))

### Predicciones

In [None]:
# Class names, in the order used by the label encoder.
emotion_labels = label_encoder.classes_

# Predicted class probabilities for every validation sample.
y_pred_probs = model.predict(val_padded)

# Recover the true labels: one-hot row -> class index -> emotion name.
y_val_label_encoded = np.argmax(y_val_encoded, axis=1)
y_val_labels = label_encoder.inverse_transform(y_val_label_encoded)

In [None]:
# Emoji shown for each emotion label when rendering predictions.
# A label that is not a key here raises KeyError when looked up with [].
# NOTE(review): presumably this covers every label left in the cleaned
# dataset — confirm against df['emotion'].unique().
emotion_to_emoji = {
    'admiration': '🤩',
    'amusement': '😄',
    'anger': '😡',
    'annoyance': '😑',
    'approval': '👍',
    'caring': '🥰',
    'confusion': '😕',
    'curiosity': '🤔',
    'desire': '😏',
    'disappointment': '😞',
    'disapproval': '👎',
    'disgust': '🤢',
    'embarrassment': '😳',
    'excitement': '😃',
    'fear': '😨',
    'gratitude': '🙏',
    'joy': '😀',
    'love': '❤️',
    'neutral': '😐',
    'optimism': '😊',
    'realization': '😲',
    'sadness': '😢',
    'surprise': '😮'
}

In [None]:
# Table with each validation text and its three most probable emotions,
# each rendered as "<emoji> <probability>%".

# Sort the class indices once per sample and reverse them so the most probable
# class comes first, then keep the three best columns.
ranked_classes = y_pred_probs.argsort()[:, ::-1]
top_classes = ranked_classes[:, :3]
# Probabilities of those classes, expressed as percentages.
top_probs = np.take_along_axis(y_pred_probs, top_classes, axis=1) * 100


def _render_prediction(class_index, percentage):
    """Format one prediction as '<emoji> <percentage>%'."""
    label = emotion_labels[class_index]
    # Fall back to the plain label text when no emoji is mapped for it,
    # instead of failing with KeyError.
    return f'{emotion_to_emoji.get(label, label)} {percentage:.0f}%'


df_results = pd.DataFrame({'text': X_val})
for rank in range(top_classes.shape[1]):
    df_results[f'top{rank + 1}'] = [
        _render_prediction(class_index, percentage)
        for class_index, percentage in zip(top_classes[:, rank], top_probs[:, rank])
    ]

# Drop the index inherited from the original dataframe rows.
df_results.reset_index(drop=True, inplace=True)

df_results.head()
