<div style="background:#FFFFE0;padding:20px;color:#000000;margin-top:10px;">
Imports necesarios para la ejecución de los módulos instalados con pip:

• pandas → import pandas as pd  
• numpy → import numpy as np  
• matplotlib → import matplotlib.pyplot as plt  
• seaborn → import seaborn as sns  
• scikit-learn → from sklearn.model_selection import train_test_split  
                    from sklearn.metrics import classification_report, confusion_matrix  
• torch → import torch  
• transformers → from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding  
• datasets → from datasets import Dataset  
</div>


In [1]:
# One-time environment setup; presumably left commented out so that
# "Run All" does not reinstall packages on every execution.
# NOTE(review): if these are re-enabled inside the notebook, `%pip` targets the
# running kernel's environment, and pinning versions keeps runs reproducible.
#pip install transformers datasets scikit-learn pandas matplotlib seaborn torch
#!pip install "transformers[torch]" --upgrade


In [2]:
# Disabled upgrade commands. The first line repeats the upgrade already listed
# in the first setup cell; the second upgrades transformers without the
# `[torch]` extra.
#!pip install "transformers[torch]" --upgrade
#!pip install --upgrade transformers


In [3]:
# Disabled upgrade of transformers together with accelerate.
# NOTE(review): presumably added because Trainer needs accelerate — confirm.
#!pip install --upgrade transformers accelerate


<div style="background:#FFFFE0;padding:20px;color:#000000;margin-top:10px;">
Este bloque de código verifica si PyTorch puede usar la GPU (usualmente con CUDA) y cuál GPU está disponible. Es útil para asegurarse de que el entrenamiento del modelo se pueda hacer con aceleración por hardware, lo que reduce significativamente el tiempo.</div>


In [4]:
import torch

# Query CUDA availability once, report it, and name device 0 when present.
hay_cuda = torch.cuda.is_available()
print("CUDA disponible:", hay_cuda)
if hay_cuda:
    print("GPU:", torch.cuda.get_device_name(0))

CUDA disponible: True
GPU: NVIDIA GeForce RTX 3050 Ti Laptop GPU


# Clasificación automática de poemas según su forma poética (usando la carpeta forms)

In [5]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from datasets import Dataset
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
import os
import pandas as pd

# Root folder of the corpus: one sub-folder per poetic form, one poem per file.
ruta_base = "archive/forms"

# Parallel lists: poem text and the name of the folder (class) it came from.
textos = []
etiquetas = []
# Files that could not be read, kept as (path, error name) so that data loss
# is visible instead of being silently swallowed.
archivos_omitidos = []

# Walk every class folder. Listings are sorted because os.listdir returns
# entries in arbitrary order, which would otherwise make the row order (and
# therefore any seeded split of the data) differ between machines.
for nombre_carpeta in sorted(os.listdir(ruta_base)):
    ruta_carpeta = os.path.join(ruta_base, nombre_carpeta)
    if not os.path.isdir(ruta_carpeta):
        continue
    for archivo in sorted(os.listdir(ruta_carpeta)):
        ruta_archivo = os.path.join(ruta_carpeta, archivo)
        if not os.path.isfile(ruta_archivo):
            # Nested directories are not poems.
            continue
        try:
            with open(ruta_archivo, 'r', encoding='utf-8') as f:
                contenido = f.read().strip()
        except (OSError, UnicodeDecodeError) as error:
            # Best effort: skip unreadable / non-UTF-8 files, but remember them.
            # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
            archivos_omitidos.append((ruta_archivo, type(error).__name__))
            continue
        textos.append(contenido)
        etiquetas.append(nombre_carpeta)

# Build the DataFrame in one go from the collected lists.
df = pd.DataFrame({'text': textos, 'label': etiquetas})

# Preview the data and the number of poems per class.
print(df.head())
print(df['label'].value_counts().to_string())

# Report skipped files so unexpectedly small classes can be explained.
if archivos_omitidos:
    print(f"Aviso: {len(archivos_omitidos)} archivo(s) omitidos por error de lectura")
    for ruta_omitida, nombre_error in archivos_omitidos[:10]:
        print(f"  {nombre_error}: {ruta_omitida}")


                                                text label
0  2 ABC of H.k. and China revised vision.\nBarre...   abc
1  Apparently life without love, is no life at al...   abc
2  A abc angles on angels flaws (poem)\nMix with ...   abc
3  A abc Brazil dance (poem)\nJack of crack in po...   abc
4  ABC... I can't go on\n123... what's the next o...   abc
label
acrostic                       100
allegory                       100
free-verse                     100
cinquain                       100
cavatina                       100
ballad                         100
ballade                        100
tetractys                      100
triolet                        100
villanelle                     100
stanza                         100
syllabic-verse                 100
epigram                        100
dirge                          100
clerihew                       100
epitaph                        100
elegy                          100
epistle                        100
verse     

In [7]:
from sklearn.preprocessing import LabelEncoder

# Keep only the two poetic forms used for the binary classifier.
clases_deseadas = ['haiku', 'sonnet']
df_binario = df.loc[df['label'].isin(clases_deseadas)].reset_index(drop=True)

# Encode the class names as integer ids and store them in a new column.
le = LabelEncoder()
df_binario['label_id'] = le.fit_transform(df_binario['label'])

# Preview the rows, then the per-class counts by name and by id.
print(df_binario.head())
for columna in ('label', 'label_id'):
    print(df_binario[columna].value_counts())
print(le.classes_)  # position in this array is the integer id of each class


                                                text  label  label_id
0  haiku one:\nin criss-crossing shadows\nof the ...  haiku         0
1  war memorial\nreccuring, late dad's words\ntha...  haiku         0
2  I’m down: <(here’s the news\nJust cause I’m wh...  haiku         0
3  The harvest soon comes\nA hawk's eye is needed...  haiku         0
4  I'm here\namongst the huge pile of haiku.\nCan...  haiku         0
label
haiku     99
sonnet    79
Name: count, dtype: int64
label_id
0    99
1    79
Name: count, dtype: int64
['haiku' 'sonnet']


In [8]:
from transformers import BertTokenizer

# Maximum number of tokens kept per poem; longer texts are truncated.
MAX_LENGTH = 128

# Load the BERT tokenizer that matches the checkpoint fine-tuned below.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(examples, max_length=MAX_LENGTH):
    """Tokenize a batch of poems, truncating each to ``max_length`` tokens.

    No padding is applied here on purpose: the Trainer is given a
    DataCollatorWithPadding, which pads each batch to its own longest
    sequence. Padding every example to ``max_length`` up front would make
    that collator a no-op and waste compute on short poems.

    Args:
        examples: batch mapping passed by ``Dataset.map(batched=True)``;
            its ``'text'`` entry is read.
        max_length: truncation limit in tokens.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(examples['text'], truncation=True, max_length=max_length)


# Build a datasets.Dataset from the text and integer-label columns, tokenize it,
# and rename the label column to "labels".
dataset = Dataset.from_pandas(df_binario[['text', 'label_id']])
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.rename_column("label_id", "labels")



Map: 100%|██████████| 178/178 [00:00<00:00, 646.15 examples/s]


In [9]:
# Hold out 20% of the tokenized examples for evaluation (fixed seed).
split_dataset = tokenized_dataset.train_test_split(seed=42, test_size=0.2)

# Unpack both partitions into clearly named variables.
train_dataset, eval_dataset = (split_dataset[parte] for parte in ('train', 'test'))

# Report the size of each partition.
print(f"Entrenamiento: {len(train_dataset)} ejemplos")
print(f"Evaluación: {len(eval_dataset)} ejemplos")



Entrenamiento: 142 ejemplos
Evaluación: 36 ejemplos


In [10]:
from transformers import BertForSequenceClassification

# Load BERT with a fresh classification head sized from the label encoder.
# The id <-> name mapping is stored in the model config so that it is saved
# with the checkpoint; a reloaded model can then name its predictions through
# `config.id2label` without needing the `le` object from this kernel session.
id2label = {int(indice): str(nombre) for indice, nombre in enumerate(le.classes_)}
label2id = {nombre: indice for indice, nombre in id2label.items()}
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding

# Batch collator built from the same tokenizer used for preprocessing.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Deliberately minimal training configuration: three epochs, batches of 8,
# checkpoints under ./resultados and logs under ./logs.
training_args = TrainingArguments(
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    output_dir="./resultados",
    logging_dir="./logs",
)

# Wire model, data and configuration together.
# NOTE(review): the warning recorded for this cell presumably refers to the
# `tokenizer=` argument being deprecated in recent transformers releases —
# confirm the installed version before switching to its replacement.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Run fine-tuning; the returned value is displayed as the cell output.
trainer.train()


  trainer = Trainer(
  return forward_call(*args, **kwargs)


Step,Training Loss


KeyboardInterrupt: 

In [None]:
# Write the model weights/config and the tokenizer files into the same
# directory, which the following cell reads back with `from_pretrained`.
# NOTE(review): the SafetensorError recorded below (OS error 1224) presumably
# means the existing weights file in ./modelo_poemas was still memory-mapped
# by a model loaded from that directory in this kernel — confirm, and release
# that model or save to a fresh directory before re-running.
model.save_pretrained("./modelo_poemas")
tokenizer.save_pretrained("./modelo_poemas")


SafetensorError: Error while serializing: IoError(Os { code: 1224, kind: Uncategorized, message: "The requested operation cannot be performed on a file with a user-mapped section open." })

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification

# Reload the saved tokenizer and classifier from the export directory.
tokenizer_entrenado = BertTokenizer.from_pretrained("./modelo_poemas")
modelo_entrenado = BertForSequenceClassification.from_pretrained("./modelo_poemas")


In [None]:
import torch

poema = """One of the four great masters of Japanese haiku, Matsuo Bashō is known for his simplistic yet thought-provoking haikus. “The Old Pond”, arguably his most famous piece, stays true to his style of couching observations of human nature within natural imagery. One interpretation is that by metaphorically using the ‘pond’ to symbolize the mind, Bashō brings to light the impact of external stimuli (embodied by the frog, a traditional subject of Japanese poetry) on the human mind. 
"""

# Tokenize the single text. No padding is requested: there is only one
# sequence, so there is nothing to align it with; it is still truncated to
# the 128-token limit used during training.
inputs = tokenizer_entrenado(poema, return_tensors="pt", truncation=True, max_length=128)
# Move every input tensor to the device the model lives on.
inputs = {k: v.to(modelo_entrenado.device) for k, v in inputs.items()}

# Inference only: switch off dropout and gradient tracking.
modelo_entrenado.eval()
with torch.no_grad():
    outputs = modelo_entrenado(**inputs)
    logits = outputs.logits
    # Take the best class along the class axis, then the first (only) input.
    # A bare argmax() flattens the tensor and would return a wrong index as
    # soon as more than one text is passed.
    predicted_class_id = logits.argmax(dim=-1)[0].item()

# NOTE: `le` is the LabelEncoder fitted earlier in this notebook; it must
# still be in memory for the id -> class-name lookup below.
print("Predicción:", le.classes_[predicted_class_id])


Predicción: haiku


<div style="background:#FFFFE0;padding:20px;color:#000000;margin-top:10px;">
Soneto de prueba: https://www.poetryfoundation.org/poems/45087/sonnet-18-shall-i-compare-thee-to-a-summers-day</div>
