In [1]:
!pip install transformers[torch]
!pip install sentencepiece evaluate --quiet

Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.21.0-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.21.0


In [2]:
import pandas as pd
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

In [3]:
# Disabled cell: the same device selection is executed in a later cell.
# #device = "cuda:0" # If you are on Windows and have a CUDA-compatible GPU installed
# device = "cuda:0" if torch.cuda.is_available() else "cpu" # Use the GPU if available, otherwise fall back to the CPU
# print(device)

In [4]:
# Download the four CSV files (train/test texts and train/test labels).
train_texts_frame = pd.read_csv('https://raw.githubusercontent.com/psochando/sanchez.github.io/main/X_train_5000_mayus.csv')
test_texts_frame = pd.read_csv('https://raw.githubusercontent.com/psochando/sanchez.github.io/main/X_test_5000_mayus.csv')
train_labels_frame = pd.read_csv('https://raw.githubusercontent.com/psochando/sanchez.github.io/main/y_train_5000.csv')
test_labels_frame = pd.read_csv('https://raw.githubusercontent.com/psochando/sanchez.github.io/main/y_test_5000.csv')

# Keep only the column that is used downstream: the text column for the
# inputs and the label column for the targets.
X_train = train_texts_frame['Cuerpo']
X_test = test_texts_frame['Cuerpo']
y_train = train_labels_frame['Periódico']
y_test = test_labels_frame['Periódico']

In [5]:
# Encode the class labels as consecutive integers, numbered in the sorted
# order of the distinct training labels.
label_map = {label: index for index, label in enumerate(np.unique(y_train))}

# Series.map returns NaN for any value that is missing from the mapping, which
# would silently turn the targets into floats containing NaN. Fail loudly
# instead if the test split holds a label that never occurs in training.
unseen_labels = set(y_test.unique()) - set(label_map)
if unseen_labels:
    raise ValueError(
        "y_test contains labels not present in y_train: "
        f"{sorted(map(str, unseen_labels))}"
    )

y_train = y_train.map(label_map)
y_test = y_test.map(label_map)

In [6]:
# Pick the first CUDA GPU when PyTorch reports one, otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
print(device)

cuda:0


In [7]:
# Display the test texts as a quick sanity check of the loaded data.
X_test

0       El mercado inmobiliario es, a veces, mucho más...
1       Aaron Donald es el mejor defensor de la última...
2       Don Mariano Rajoy Sobredo, padre del president...
3       Lío importante anoche en el estadio Santiago B...
4       Con huelga o sin huelga de caseteros, habrá Fe...
                              ...                        
1489    Se llama Optimus. Mide 1,73. Pesa 57 kilos. Ca...
1490    Si la taxonomía verde es un indicador, el año ...
1491    Nada más comprobar que la moción de censura pr...
1492    El Gobierno conservador del Reino Unido sabe q...
1493    La Consejería de Economía, Hacienda y Empleo h...
Name: Cuerpo, Length: 1494, dtype: object

In [8]:
# Count the distinct values among the encoded training labels
# (dropna=False keeps the count identical to len(y_train.unique())).
num_classes = y_train.nunique(dropna=False)

In [9]:
# A cased checkpoint is used, so the model now DOES distinguish
# upper-case from lower-case letters.
checkpoint_name = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

In [10]:
# Tokenize both splits with the same options (truncation and padding on).
encode_options = {"truncation": True, "padding": True}
train_encodings = tokenizer(X_train.tolist(), **encode_options)
test_encodings = tokenizer(X_test.tolist(), **encode_options)

In [11]:
# Custom PyTorch dataset pairing the tokenized texts with their labels
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding one tokenized example and its label.

    Args:
        encodings: Mapping with at least the keys ``'input_ids'`` and
            ``'attention_mask'``, each indexable by example position.
        labels: Sequence of integer class labels aligned with ``encodings``.
            May be a list, an array or a pandas Series; a Series is always
            read by position, never by its index labels.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def _label_at(self, idx):
        """Return the label stored at position ``idx``."""
        # A pandas Series indexed with [] performs a label-based lookup, which
        # fails (or returns the wrong row) when the index is not 0..n-1, e.g.
        # after filtering or shuffling. `.iloc` is always positional.
        positional = getattr(self.labels, "iloc", None)
        if positional is not None:
            return positional[idx]
        return self.labels[idx]

    def __getitem__(self, idx):
        """Return a dict of tensors: input_ids, attention_mask and labels."""
        item = {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
            'labels': torch.tensor(self._label_at(idx))
        }
        return item

    def __len__(self):
        """Number of examples, taken from the number of labels."""
        return len(self.labels)

In [12]:
# Wrap each split's encodings and labels in a MyDataset (train first, then test).
train_dataset, test_dataset = [
    MyDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, y_train),
        (test_encodings, y_test),
    )
]


In [13]:
def compute_metrics(pred):
    """Compute classification accuracy from a (logits, labels) pair.

    Args:
        pred: Two-element iterable ``(logits, labels)``; ``logits`` has one
            row per example and one column per class.

    Returns:
        dict with the single key ``"accuracy"``: the fraction of examples
        whose highest-scoring class equals the label.
    """
    scores, gold = pred
    predicted = np.argmax(scores, axis=1)
    hits = predicted == gold
    return {"accuracy": hits.mean()}


In [14]:
# Set up the training arguments
training_args = TrainingArguments(
    output_dir="./results",           # where the Trainer writes its outputs
    num_train_epochs=3,
    evaluation_strategy="epoch",      # evaluate once per epoch
    per_device_train_batch_size=8,
    per_device_eval_batch_size=64,
)

In [20]:
# Load the pretrained checkpoint with a classification head sized to the
# number of classes, then move it to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-cased',
    num_labels=num_classes,
)
model = model.to(device)

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
# Create the trainer that will fine-tune the model; the test split doubles as
# the evaluation set and accuracy is reported through compute_metrics.
trainer_options = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": test_dataset,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_options)

In [22]:
# Run the fine-tuning loop configured above; the returned TrainOutput is
# displayed as the cell result.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.487541,0.820616
2,0.675300,0.353866,0.854752
3,0.369000,0.348344,0.892905


TrainOutput(global_step=1308, training_loss=0.46423280931758587, metrics={'train_runtime': 1192.966, 'train_samples_per_second': 8.764, 'train_steps_per_second': 1.096, 'total_flos': 2750900179461120.0, 'train_loss': 0.46423280931758587, 'epoch': 3.0})

In [18]:
# Predict the class index of every test text, one example at a time.
model.eval()  # inference mode: make sure dropout is switched off
test_preds = []
with torch.no_grad():  # no gradients needed, so skip building the autograd graph
    for text in X_test:
        # A single sequence needs no padding; over-long texts are still truncated.
        val_encoding = tokenizer(text, truncation=True, return_tensors="pt").to(device)
        outputs = model(**val_encoding)
        logits = outputs.logits.cpu().numpy()
        # logits holds one row, so argmax over the flattened array is the class index.
        test_preds.append(np.argmax(logits))

In [26]:
# Evaluate the trainer's model on the training split and show its accuracy.
print('accuracy del train:')
train_metrics = trainer.evaluate(train_dataset)
train_metrics['eval_accuracy']

accuracy del train:


0.9133428981348637

In [27]:
# Evaluate the trainer's model on the test split and show its accuracy.
print('accuracy del test:')
test_metrics = trainer.evaluate(test_dataset)
test_metrics['eval_accuracy']

accuracy del test:


0.892904953145917