# Generación de Texto con LSTM usando 'Cien Años de Soledad'

En este cuaderno, vamos a implementar un modelo LSTM que será entrenado usando el texto del libro 'Cien Años de Soledad' de Gabriel García Márquez.
El objetivo es que el modelo aprenda el estilo literario y sea capaz de generar texto similar al del autor.

## Requisitos previos
- Python 3.7+
- TensorFlow
- Numpy
- Matplotlib

## Objetivo
1. Modelo de clasificación con LSTM y sBERT

## 1. Cargando y Preprocesando el Texto
Cargaremos el texto de 'Cien Años de Soledad' y lo preprocesaremos para convertirlo en secuencias de texto adecuadas para el entrenamiento del modelo LSTM.

In [7]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd
import requests

# Cargar el texto de 'reviews_booking_limpio'
url = 'https://raw.githubusercontent.com/Izainea/nlp_ean/refs/heads/main/Datos/Datos%20Crudos/reviews_booking_limpio.csv'
DF=pd.read_csv(url)

# Tokenizar el texto
tokenizer = Tokenizer()
tokenizer.fit_on_texts(DF['Comentarios'])
total_words = len(tokenizer.word_index) + 1
max_sequence_len = 100

# sequences

sequences=tokenizer.texts_to_sequences(DF['Comentarios'])
padded = pad_sequences(sequences, maxlen=max_sequence_len, padding='post', truncating='post')

padded.shape


(15000, 100)

In [8]:
DF['Clas'].unique().shape[0]

2

In [9]:
# Dividimos el dataset en entrenamiento y test
from sklearn.model_selection import train_test_split

y = tf.keras.utils.to_categorical(DF['Clas']=='Positivos', num_classes=DF['Clas'].unique().shape[0])

X_train, X_test, y_train, y_test = train_test_split(padded, y, test_size=0.2, random_state=42)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(12000, 100) (12000, 2) (3000, 100) (3000, 2)


## 2. Creando el Modelo LSTM
Ahora crearemos el modelo LSTM que será entrenado para predecir la siguiente palabra en una secuencia, basado en el estilo literario del libro.

In [10]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1


In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense



# Crear el modelo LSTM
model = Sequential()
model.add(Embedding(total_words, 64, input_length=max_sequence_len-1))
model.add(LSTM(100))
model.add(Dense(2, activation='sigmoid'))

# Compilar el modelo
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Resumen del modelo
model.summary()



## 3. Entrenando el Modelo
Entrenaremos el modelo durante 100 épocas para que aprenda las secuencias de texto y las relaciones entre palabras.

In [12]:
# Entrenar el modelo
history = model.fit(X_train, y_train, epochs=3, verbose=1)

Epoch 1/3
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.5784 - loss: 0.6406
Epoch 2/3
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.6308 - loss: 0.5947
Epoch 3/3
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.9534 - loss: 0.1772


In [13]:
history = model.fit(X_train, y_train, epochs=3, verbose=1)

Epoch 1/3
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 16ms/step - accuracy: 0.9806 - loss: 0.0714
Epoch 2/3
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7ms/step - accuracy: 0.9901 - loss: 0.0443
Epoch 3/3
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.9931 - loss: 0.0327


## Evaluación del modelo



In [14]:
# Validemos el modelo con la base de testeo

loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

Test Loss: 0.1207
Test Accuracy: 0.9697


In [15]:
### Calculemos la matriz de confusión
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)




[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step


In [16]:
from sklearn.metrics import confusion_matrix

confusion_matrix(np.argmax(y_test, axis=1), y_pred_classes)


array([[1480,   27],
       [  64, 1429]])

In [17]:
## Veamos el reporte

from sklearn.metrics import classification_report

print(classification_report(np.argmax(y_test, axis=1), y_pred_classes))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97      1507
           1       0.98      0.96      0.97      1493

    accuracy                           0.97      3000
   macro avg       0.97      0.97      0.97      3000
weighted avg       0.97      0.97      0.97      3000



In [18]:
## Hagamos una función para evaluar comentarios

def evaluate_comment(comment, tokenizer, model, max_sequence_len):
  text = [comment]
  sequences = tokenizer.texts_to_sequences(text)
  padded = pad_sequences(sequences, maxlen=max_sequence_len, padding='post', truncating='post')
  prediction = model.predict(padded)
  prediction = np.argmax(prediction, axis=1)
  if prediction == 0:
    prediction = 'Negativo'
  else:
    prediction = 'Positivo'
  return prediction




In [19]:
### Testeo

evaluate_comment('Buena limpieza, buena locación y buena atención al cliente', tokenizer, model, max_sequence_len)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step


'Positivo'

In [20]:
evaluate_comment('No me gustó la experiencia', tokenizer, model, max_sequence_len)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step


'Negativo'

### Otro modelo de clasificación con sbert

In [21]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Cargar el texto de 'reviews_booking_limpio'
url = 'https://raw.githubusercontent.com/Izainea/nlp_ean/refs/heads/main/Datos/Datos%20Crudos/reviews_booking_limpio.csv'
DF = pd.read_csv(url)

  from tqdm.autonotebook import tqdm, trange


In [22]:
# Preprocesar los datos (codificar etiquetas)
DF['Clas'] = [1 if x == "Positivos" else 0 for x in DF["Clas"]]

# Usar Sentence Transformers para obtener embeddings
model_sbert = SentenceTransformer('all-mpnet-base-v2')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [23]:
# Obtener embeddings para los textos
DF['embeddings'] = DF['Comentarios'].apply(lambda x: model_sbert.encode(x))
DF

Unnamed: 0,Comentarios,Clas,embeddings
0,Se siente mucha bulla en las noches en las hab...,0,"[-0.015971113, 0.031234734, -0.02520684, 0.008..."
1,Ya lo expuse \nAdemás solicite un jabón adicio...,0,"[-0.012707304, 0.06309698, 0.009242087, 0.0164..."
2,Muy descuidado en los muebles...muy viejos y e...,0,"[-0.03520637, 0.021193232, -0.017398506, -0.00..."
3,La reservación de Booking no estaba coordinada...,0,"[-0.022038227, -0.023957377, -0.040487465, 0.0..."
4,parece desatendido las puertas no cierran muy ...,0,"[-0.041126333, -0.027302383, -0.030571846, 0.0..."
...,...,...,...
14995,La locación de el hotel -10 min caminando hast...,1,"[-0.040913243, -0.032258052, -0.03312944, 0.01..."
14996,"Tienen una buena ubicación, tiendas cerca para...",1,"[-0.055083778, 0.017223913, -0.01679453, -0.04..."
14997,"La ubicación, el desayuno delicioso y variado,...",1,"[-0.031009773, 0.061379317, -0.0065872753, -0...."
14998,Me ha gustado mucho la atención del personal y...,1,"[-0.041406993, 0.052447803, -0.032616425, 0.00..."


In [24]:

# Dividir el conjunto de datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(
    list(DF['embeddings']), DF['Clas'], test_size=0.2, random_state=42
)


In [25]:
# Entrenar un clasificador (Logistic Regression)
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# Hacer predicciones en el conjunto de prueba
y_pred = classifier.predict(X_test)

# Evaluar el modelo
print(classification_report(y_test, y_pred))

# Función para clasificar nuevos comentarios
def classify_comment(comment):
  embedding = model_sbert.encode(comment)
  prediction = classifier.predict([embedding])
  return "Positivos" if prediction[0] == 1 else "Negativos"

# Ejemplo de uso
comment_to_classify = "El hotel estuvo genial"
predicted_class = classify_comment(comment_to_classify)
print(f"El comentario '{comment_to_classify}' es: {predicted_class}")

              precision    recall  f1-score   support

           0       0.94      0.95      0.95      1507
           1       0.95      0.94      0.94      1493

    accuracy                           0.94      3000
   macro avg       0.95      0.94      0.94      3000
weighted avg       0.95      0.94      0.94      3000

El comentario 'El hotel estuvo genial' es: Positivos


## SBERT

In [29]:
pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

# Cargar el texto de 'reviews_booking_limpio'
url = 'https://raw.githubusercontent.com/Izainea/nlp_ean/refs/heads/main/Datos/Datos%20Crudos/reviews_booking_limpio.csv'
DF = pd.read_csv(url)

# Preprocesar los datos (codificar etiquetas)
DF['Clas'] = [1 if x == "Positivos" else 0 for x in DF["Clas"]]
DF = DF.rename(columns={"Comentarios": "text", "Clas": "label"})

# Dividir el conjunto de datos en entrenamiento y prueba
train_texts, val_texts, train_labels, val_labels = train_test_split(
    DF['text'].tolist(), DF['label'].tolist(), test_size=0.2, random_state=42
)

# Cargar el tokenizer y el modelo BERT preentrenado
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Función para tokenizar los datos
def preprocess_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Crear un dataset de Hugging Face
from datasets import Dataset
train_dataset = Dataset.from_dict({"text": train_texts, "label": train_labels})
val_dataset = Dataset.from_dict({"text": val_texts, "label": val_labels})

# Aplicar la función de preprocesamiento
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

# Definir los argumentos de entrenamiento
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    evaluation_strategy="epoch",
    logging_dir="./logs",
    learning_rate=2e-5,
    weight_decay=0.01,
    report_to="none"
)

# Definir las métricas de evaluación
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

# Crear el Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Entrenar el modelo
trainer.train()

# Evaluar el modelo
eval_results = trainer.evaluate()
print(f"**Resultados de la evaluación:**\n{eval_results}")

# Función para clasificar nuevos comentarios
def classify_comment(comment):
    inputs = tokenizer(comment, padding="max_length", truncation=True, return_tensors="pt")
    outputs = model(**inputs)
    predicted_class_id = outputs.logits.argmax(-1).item()
    return "Positivos" if predicted_class_id == 1 else "Negativos"

# Ejemplo de uso
comment_to_classify = "El hotel estuvo genial"
predicted_class = classify_comment(comment_to_classify)
print(f"El comentario '{comment_to_classify}' es: {predicted_class}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
