In [None]:
import torch
import numpy as np
import pandas as pd

In [None]:
# Prefer the first CUDA GPU when one is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
print(device)

cuda:0


In [None]:
# Features: the 'Cuerpo' column of each split (presumably the article body
# text -- confirm against the CSV schema).
X_train, X_test = [
    pd.read_csv(features_url)['Cuerpo']
    for features_url in (
        'https://raw.githubusercontent.com/psochando/sanchez.github.io/main/X_train_5000_mayus.csv',
        'https://raw.githubusercontent.com/psochando/sanchez.github.io/main/X_test_5000_mayus.csv',
    )
]

# Targets: the 'Periódico' column of each split (presumably the source
# newspaper of each article -- confirm against the CSV schema).
y_train, y_test = [
    pd.read_csv(labels_url)['Periódico']
    for labels_url in (
        'https://raw.githubusercontent.com/psochando/sanchez.github.io/main/y_train_5000.csv',
        'https://raw.githubusercontent.com/psochando/sanchez.github.io/main/y_test_5000.csv',
    )
]

In [None]:
# !pip install transformers[torch] --quiet
# !pip install accelerate -U --quiet

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Hub identifier shared by the tokenizer and the model so both always match.
checkpoint = "Geotrend/distilbert-base-es-cased"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# num_labels=5 sizes the classification head for five target classes.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=5)
model = model.to(device)

Some weights of the model checkpoint at Geotrend/distilbert-base-es-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at Geotrend/distilbert-base-es-cased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifier.wei

In [None]:
# Display the installed PyTorch version.
torch.__version__

'2.0.1+cu117'

In [None]:
# Tokenize the raw texts of both splits.
# The tokenizer's __call__ accepts a str or a list of str, not a pandas
# Series, so each Series is materialised as a plain Python list first.
# truncation=True clips texts that exceed the model's maximum length;
# padding=True pads every sequence to the longest one in the same call,
# so all rows of a split end up with equal length.
train_encodings = tokenizer(list(X_train), truncation=True, padding=True)
test_encodings = tokenizer(list(X_test), truncation=True, padding=True)

In [None]:
# Inspect the type of the object returned by the tokenizer.
type(train_encodings)

transformers.tokenization_utils_base.BatchEncoding

In [None]:
# List the feature names contained in the encoding.
train_encodings.keys()

dict_keys(['input_ids', 'attention_mask'])

In [None]:
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from feature name (e.g. 'input_ids',
            'attention_mask') to a per-example sequence of values, such as
            the object returned by a tokenizer call.
        labels: Sequence of labels aligned by position with the encodings.
            A pandas Series is accepted and is always indexed by position,
            regardless of its index labels.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of CPU tensors.

        The dict holds one tensor per key of ``encodings`` plus a 'labels'
        tensor with the label at position ``idx``.
        """
        # Tensors are deliberately left on the CPU: the training loop (the
        # Hugging Face Trainer used in this notebook) moves each batch to the
        # model's device itself, and CUDA tensors created inside a dataset
        # prevent pinned-memory transfers and multi-worker data loading.
        item = {
            key: torch.tensor(values[idx])
            for key, values in self.encodings.items()
        }

        # Positional lookup: Series[idx] is label-based and would pick the
        # wrong row (or raise KeyError) on a non-default index.
        labels = self.labels
        label = labels.iloc[idx] if hasattr(labels, "iloc") else labels[idx]
        item['labels'] = torch.tensor(label)
        return item

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

# Wrap each split's encodings and labels in a MyDataset.
train_dataset = MyDataset(train_encodings, y_train)
test_dataset = MyDataset(test_encodings, y_test)

In [None]:
# Number of examples in the training and test datasets.
print(len(train_dataset),len(test_dataset))

2738 2241


In [None]:
# Peek at the first training example as produced by MyDataset.__getitem__.
train_dataset[0]

{'input_ids': tensor([   11,   258, 24389, 10869, 22484,   205,  3935,    39,  3123,  3376,
            25,   551,   930,   638, 24718,   101,   259, 25928,  6853,   205,
          1775,  5918, 25422,   233,   222,  5253, 12951,   696,   238, 16451,
           233,   841,  3826,   315, 10722,   281, 10778,   391,   205,   210,
          1554,    25,   225,  4575,   101,   225,  6401,   205,  5509,   213,
           205,  2204,    27, 25422,   233,   222,   238,  3249,  1794,   210,
         21153,   205,  3935,  3088,   101,   205,   370, 14227,   227,  1778,
           315, 13268,  7119, 17289,   211,   210,  1503,   227,  6240,  1472,
          3521,    25,   225, 20478,   417,   233,  1305,  1417,   756,  9372,
           386,   263,  3680,  4243,  7423,  9489,   296,  1109,  2668,   205,
          5072,    30,    27, 25422,   233,   222,  2179, 18559,  5053,   205,
           370,   275,  2290, 15110, 15149,  6181,   205,  1528, 18904,  5667,
         25781,    25,  1625, 25939,   

In [None]:
!pip install evaluate --quiet

In [None]:
import evaluate
# Load the "accuracy" metric from the evaluate library; compute_metrics uses it.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: A pair ``(logits, labels)``; the class axis of ``logits``
            is the last one.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max predictions
        compared against ``labels``.
    """
    scores, gold = eval_pred
    # The predicted class is the index of the highest logit along the last axis.
    predicted = np.asarray(scores).argmax(axis=-1)
    return metric.compute(references=gold, predictions=predicted)

In [None]:
from transformers import Trainer, TrainingArguments

# Hyperparameters for the fine-tuning run, collected in one place.
training_options = dict(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=6,
    per_device_eval_batch_size=8,
    # Evaluate on the eval dataset at the end of every epoch.
    evaluation_strategy="epoch",
    # Pinned-memory loading is disabled; the original note says to remove this
    # when possible for faster training.
    # NOTE(review): likely needed only while the dataset yields tensors that
    # already live on `device` -- confirm before re-enabling.
    dataloader_pin_memory=False,
)
training_args = TrainingArguments(**training_options)

# Tie together the model, its hyperparameters, both data splits and the
# metric callback.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    # NOTE(review): the test split doubles as the per-epoch evaluation set.
    eval_dataset=test_dataset,
)

In [None]:
# Run fine-tuning with the Trainer configured above; the returned training
# summary is displayed as the cell output.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.143642,0.964748


TrainOutput(global_step=457, training_loss=0.26148804846164697, metrics={'train_runtime': 619.4795, 'train_samples_per_second': 4.42, 'train_steps_per_second': 0.738, 'total_flos': 362695737520128.0, 'train_loss': 0.26148804846164697, 'epoch': 1.0})

In [None]:
# import torch
# torch.cuda.empty_cache()

In [None]:
# test_preds = []
# for i in range(len(X_test)):
#     val_encoding = tokenizer(X_test[i], truncation=True, padding=True, return_tensors="pt").to(device)
#     outputs = model(**val_encoding)
#     logits = outputs.logits.cpu().detach().numpy()
#     test_preds.append(np.argmax(logits))

In [None]:
# Run configuration for the results reported below:
#   num_train_epochs=1
#   per_device_train_batch_size=6
#   per_device_eval_batch_size=8

In [None]:
# Accuracy of the fine-tuned model on the training split.
print('accuracy del train:')
train_metrics = trainer.evaluate(train_dataset)
train_metrics['eval_accuracy']

accuracy del train:


0.9737034331628927

In [None]:
# Accuracy of the fine-tuned model on the held-out test split.
print('accuracy del test:')
test_metrics = trainer.evaluate(test_dataset)
test_metrics['eval_accuracy']

accuracy del test:


0.9647478804105311