# LSTM classifier

Using the dataset `dataset_emails.csv` (or the same dataset you used in S08_1), create the following text classifiers:
* LSTM
* GRU 

Compare the results between LSTM and GRU. Compare the results with the S08_1 methods. 


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical

# End-to-end LSTM text classifier: load the e-mail prompt dataset, encode
# labels, tokenize and pad the texts, train the network, evaluate it on a
# held-out split and print predictions for a few hand-written examples.

# Local import: an explicit Input layer replaces the `input_length` argument
# of Embedding, which is deprecated in Keras 3, and lets `model.summary()`
# report concrete shapes and parameter counts before training.
from tensorflow.keras.layers import Input

# Load the data: the 'prompt' column holds the text, 'label' the class.
data = pd.read_csv('dataset_emails.csv')
texts = data['prompt'].values
labels = data['label'].values

# Map the labels to integer ids, then to one-hot vectors as required by the
# categorical cross-entropy loss used below.
le = LabelEncoder()
labels_encoded = le.fit_transform(labels)
num_classes = len(le.classes_)
labels_onehot = to_categorical(labels_encoded, num_classes=num_classes)

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels_onehot, test_size=0.2, random_state=42
)

# Tokenize. The vocabulary is fitted on the training texts only, so the test
# split does not influence the word index.
max_words = 10000  # Vocabulary size
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad/truncate every sequence to the same length so they can be batched.
max_len = 100
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

# Build the LSTM model: embedding -> single LSTM layer -> softmax classifier.
embedding_dim = 100

model = Sequential([
    Input(shape=(max_len,)),
    Embedding(input_dim=max_words, output_dim=embedding_dim),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(num_classes, activation='softmax'),
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Train; 10% of the training data is set aside for per-epoch validation.
history = model.fit(X_train_pad, y_train, batch_size=32, epochs=10, validation_split=0.1)

# Evaluate on the held-out test split.
loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f"Test Accuracy: {accuracy*100:.2f}%")

# Predict on new, hand-written examples using the same tokenizer and padding.
example_texts = [
    "I need to compose an email.",
    "Forward this email.",
    "Please reply to the sender.",
    "Open the email to read its content.",
]
example_seq = tokenizer.texts_to_sequences(example_texts)
example_pad = pad_sequences(example_seq, maxlen=max_len)
predictions = model.predict(example_pad)
# Take the most probable class per row and decode all ids back to label
# names in a single call instead of one inverse_transform call per example.
pred_labels = list(le.inverse_transform(np.argmax(predictions, axis=1)))

print("\nPredicciones de ejemplo:")
for text, pred in zip(example_texts, pred_labels):
    print(f"Input: {text} -> Predicted Label: {pred}")




Epoch 1/10
23/23 ━━━━━━━━━━━━━━━━━━━━ 4s 49ms/step - accuracy: 0.1758 - loss: 2.2911 - val_accuracy: 0.2250 - val_loss: 2.2569
Epoch 2/10
23/23 ━━━━━━━━━━━━━━━━━━━━ 1s 27ms/step - accuracy: 0.3843 - loss: 2.1911 - val_accuracy: 0.3125 - val_loss: 2.0845
Epoch 3/10
23/23 ━━━━━━━━━━━━━━━━━━━━ 1s 28ms/step - accuracy: 0.4426 - loss: 1.8765 - val_accuracy: 0.4875 - val_loss: 1.6132
Epoch 4/10
23/23 ━━━━━━━━━━━━━━━━━━━━ 1s 27ms/step - accuracy: 0.5552 - loss: 1.4212 - val_accuracy: 0.6125 - val_loss: 1.2832
Epoch 5/10
23/23 ━━━━━━━━━━━━━━━━━━━━ 1s 28ms/step - accuracy: 0.7344 - loss: 1.0023 - val_accuracy: 0.6625 - val_loss: 1.0401
Epoch 6/10
23/23 ━━━━━━━━━━━━━━━━━━━━ 1s 28ms/step - accuracy: 0.8282 - loss: 0.6685 - val_accuracy: 0.7875 - val_loss: 0.8232
Epoch 7/10
23/23 ━━━━

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout
from tensorflow.keras.utils import to_categorical

# End-to-end GRU text classifier: load the e-mail prompt dataset, encode
# labels, tokenize and pad the texts, train the network, evaluate it on a
# held-out split and print predictions for a few hand-written examples.

# Local import: an explicit Input layer replaces the `input_length` argument
# of Embedding, which is deprecated in Keras 3, and lets `model.summary()`
# report concrete shapes and parameter counts before training.
from tensorflow.keras.layers import Input

# Load the data: the 'prompt' column holds the text, 'label' the class.
data = pd.read_csv('dataset_emails.csv')
texts = data['prompt'].values
labels = data['label'].values

# Map the labels to integer ids, then to one-hot vectors as required by the
# categorical cross-entropy loss used below.
le = LabelEncoder()
labels_encoded = le.fit_transform(labels)
num_classes = len(le.classes_)
labels_onehot = to_categorical(labels_encoded, num_classes=num_classes)

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels_onehot, test_size=0.2, random_state=42
)

# Tokenize. The vocabulary is fitted on the training texts only, so the test
# split does not influence the word index.
max_words = 10000  # Vocabulary size
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad/truncate every sequence to the same length so they can be batched.
max_len = 100
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

# Build the GRU model: embedding -> single GRU layer -> softmax classifier.
embedding_dim = 100

model = Sequential([
    Input(shape=(max_len,)),
    Embedding(input_dim=max_words, output_dim=embedding_dim),
    GRU(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(num_classes, activation='softmax'),
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Train; 10% of the training data is set aside for per-epoch validation.
history = model.fit(X_train_pad, y_train, batch_size=32, epochs=10, validation_split=0.1)

# Evaluate on the held-out test split.
loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f"Test Accuracy: {accuracy*100:.2f}%")

# Predict on new, hand-written examples using the same tokenizer and padding.
example_texts = [
    "I need to compose an email.",
    "Forward this email.",
    "Please reply to the sender.",
    "Open the email to read its content.",
]
example_seq = tokenizer.texts_to_sequences(example_texts)
example_pad = pad_sequences(example_seq, maxlen=max_len)
predictions = model.predict(example_pad)
# Take the most probable class per row and decode all ids back to label
# names in a single call instead of one inverse_transform call per example.
pred_labels = list(le.inverse_transform(np.argmax(predictions, axis=1)))

print("\nPredicciones de ejemplo:")
for text, pred in zip(example_texts, pred_labels):
    print(f"Input: {text} -> Predicted Label: {pred}")




Epoch 1/10
23/23 ━━━━━━━━━━━━━━━━━━━━ 19s 192ms/step - accuracy: 0.1938 - loss: 2.2919 - val_accuracy: 0.4500 - val_loss: 2.2430
Epoch 2/10
23/23 ━━━━━━━━━━━━━━━━━━━━ 1s 48ms/step - accuracy: 0.5277 - loss: 2.1702 - val_accuracy: 0.4000 - val_loss: 1.9413
Epoch 3/10
23/23 ━━━━━━━━━━━━━━━━━━━━ 1s 49ms/step - accuracy: 0.5429 - loss: 1.6791 - val_accuracy: 0.5000 - val_loss: 1.4354
Epoch 4/10
23/23 ━━━━━━━━━━━━━━━━━━━━ 1s 46ms/step - accuracy: 0.6511 - loss: 1.2261 - val_accuracy: 0.5750 - val_loss: 1.1798
Epoch 5/10
23/23 ━━━━━━━━━━━━━━━━━━━━ 1s 47ms/step - accuracy: 0.7340 - loss: 0.8559 - val_accuracy: 0.6625 - val_loss: 1.0044
Epoch 6/10
23/23 ━━━━━━━━━━━━━━━━━━━━ 1s 46ms/step - accuracy: 0.8208 - loss: 0.6027 - val_accuracy: 0.7375 - val_loss: 0.8873
Epoch 7/10
23/23 ━━