In [21]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Fri Jul 21 07:56:37 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   76C    P0    37W /  70W |   9927MiB / 15360MiB |     44%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [1]:
!pip install transformers[torch]
!pip install sentencepiece evaluate --quiet

Collecting transformers[torch]
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m34.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers[torch])
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers[torch])
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m89.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers[torch])
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m6

In [2]:
import pandas as pd
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

In [3]:
# #device = "cuda:0" # Si estás en Windows y tienes una GPU compatible con CUDA instalada
# device = "cuda:0" if torch.cuda.is_available() else "cpu" # Utiliza GPU si está disponible, de lo contrario, usa CPU
# print(device)

In [4]:
# Base location of the pre-split CSV files hosted on GitHub.
DATA_URL = 'https://raw.githubusercontent.com/psochando/sanchez.github.io/main'


def _load_column(file_name, column):
    """Download `file_name` from DATA_URL and return its `column` as a Series."""
    return pd.read_csv(f'{DATA_URL}/{file_name}')[column]


# Texts come from the 'Cuerpo' column, labels from the 'Periódico' column.
X_train = _load_column('X_train_5000.csv', 'Cuerpo')
X_test = _load_column('X_test_5000.csv', 'Cuerpo')
y_train = _load_column('y_train_5000.csv', 'Periódico')
y_test = _load_column('y_test_5000.csv', 'Periódico')

# Texts and labels are stored in separate files and are paired purely by row
# position later on, so a length mismatch must be caught here rather than
# surfacing as misaligned training examples.
assert len(X_train) == len(y_train), 'X_train and y_train have different lengths'
assert len(X_test) == len(y_test), 'X_test and y_test have different lengths'

In [5]:
# Assign each distinct training label a consecutive integer id (sorted order,
# as returned by np.unique).
label_map = {label: index for index, label in enumerate(np.unique(y_train))}

# Series.map yields NaN for values that are missing from the mapping, which
# would silently corrupt the test labels. Fail loudly instead, and do it before
# either Series is overwritten so the cell can be fixed and re-run.
unseen_labels = sorted(set(y_test.unique()) - set(label_map), key=str)
if unseen_labels:
    raise ValueError(f'y_test contains labels absent from y_train: {unseen_labels}')

y_train = y_train.map(label_map)
y_test = y_test.map(label_map)

In [6]:
# Use the first CUDA GPU when PyTorch can see one; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
print(device)

cuda:0


In [7]:
# Preview the test texts (the 'Cuerpo' column loaded above) as the cell output.
X_test

0       el mercado inmobiliario es, a veces, mucho más...
1       aaron donald es el mejor defensor de la última...
2       don mariano rajoy sobredo, padre del president...
3       lío importante anoche en el estadio santiago b...
4       con huelga o sin huelga de caseteros, habrá fe...
                              ...                        
1489    se llama optimus. mide 1,73. pesa 57 kilos. ca...
1490    si la taxonomía verde es un indicador, el año ...
1491    nada más comprobar que la moción de censura pr...
1492    el gobierno conservador del reino unido sabe q...
1493    la consejería de economía, hacienda y empleo h...
Name: Cuerpo, Length: 1494, dtype: object

In [8]:
# Number of distinct label ids in the training set; used to size the
# classification head of the model.
num_classes = y_train.unique().size

In [9]:
# Load the tokenizer that belongs to the pretrained checkpoint.
# NOTE(review): the texts previewed in this notebook are Spanish, while
# 'bert-base-uncased' is presumably an English-only checkpoint — confirm
# whether a Spanish or multilingual checkpoint would be a better fit. The same
# checkpoint name is used where the model is loaded, so both must change
# together.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [10]:
# Tokenize both splits with the same truncation/padding settings so the train
# and test encodings are produced identically.
_tokenize_options = dict(truncation=True, padding=True)
train_encodings = tokenizer(X_train.tolist(), **_tokenize_options)
test_encodings = tokenizer(X_test.tolist(), **_tokenize_options)

In [11]:
# Custom PyTorch dataset that pairs tokenizer encodings with their labels
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset serving tokenized examples together with their labels.

    Args:
        encodings: Mapping with 'input_ids' and 'attention_mask' entries, each
            indexable by example position.
        labels: Iterable of labels (list, NumPy array, pandas Series, ...) in
            the same order as the encoded examples.

    Raises:
        ValueError: If the number of encoded examples and labels differ.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Materialize the labels as a plain list so __getitem__ always indexes
        # by position. Indexing a pandas Series with an integer is a lookup by
        # index *label*, which raises KeyError (or silently returns another
        # row) whenever the Series does not carry a default 0..n-1 index.
        self.labels = list(labels)
        if len(self.encodings['input_ids']) != len(self.labels):
            raise ValueError(
                f"encodings hold {len(self.encodings['input_ids'])} examples "
                f"but {len(self.labels)} labels were given"
            )

    def __getitem__(self, idx):
        """Return the tensors for the example at position `idx` as a dict."""
        item = {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
            'labels': torch.tensor(self.labels[idx])
        }
        return item

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

In [12]:
# Wrap each split's encodings and labels in a MyDataset.
train_dataset, test_dataset = (
    MyDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, y_train),
        (test_encodings, y_test),
    )
)


In [13]:
def compute_metrics(pred):
    """Compute classification accuracy from a (logits, labels) pair.

    Args:
        pred: Object that unpacks into two items: the logits, with one row per
            example and one column per class, and the reference labels.

    Returns:
        dict: ``{"accuracy": value}`` where value is the fraction of examples
        whose highest-scoring class equals the reference label.
    """
    scores, references = pred
    # The predicted class is the column holding the largest logit in each row.
    predicted = np.argmax(scores, axis=1)
    is_correct = predicted == references
    return {"accuracy": is_correct.mean()}


In [14]:
# Configure the training arguments
training_args = TrainingArguments(
    # Where the Trainer writes its outputs
    output_dir="./results",
    # Schedule: three passes over the training set, evaluating once per epoch
    num_train_epochs=3,
    evaluation_strategy="epoch",
    # Batch sizes per device
    per_device_train_batch_size=8,
    per_device_eval_batch_size=64,
)

In [15]:
# Load the pretrained checkpoint with a sequence-classification head sized for
# `num_classes` labels, then move the model onto the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_classes,
)
model = model.to(device)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Create the trainer (training itself is launched in the next cell)
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    # The test split doubles as the evaluation set during training
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [17]:
# Run the fine-tuning loop; the returned TrainOutput is shown as the cell result.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.369916,0.858099
2,0.609900,0.327669,0.880857
3,0.352500,0.391125,0.890228


TrainOutput(global_step=1308, training_loss=0.4288633154072893, metrics={'train_runtime': 1129.0427, 'train_samples_per_second': 9.26, 'train_steps_per_second': 1.159, 'total_flos': 2750900179461120.0, 'train_loss': 0.4288633154072893, 'epoch': 3.0})

In [18]:
# test_preds = []
# for i in range(len(X_test)):
#     val_encoding = tokenizer(X_test.iloc[i], truncation=True, padding=True, return_tensors="pt").to(device)
#     outputs = model(**val_encoding)
#     logits = outputs.logits.cpu().detach().numpy()
#     test_preds.append(np.argmax(logits))

In [22]:
# Evaluate on the training split and show its accuracy as the cell result.
print('accuracy del train:')
train_metrics = trainer.evaluate(train_dataset)
train_metrics['eval_accuracy']

accuracy del train:


0.9388809182209469

In [23]:
# Evaluate on the test split and show its accuracy as the cell result.
print('accuracy del test:')
test_metrics = trainer.evaluate(test_dataset)
test_metrics['eval_accuracy']

accuracy del test:


0.8902275769745649