In [1]:
!pip install transformers[torch]
!pip install sentencepiece evaluate --quiet

Collecting transformers[torch]
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers[torch])
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers[torch])
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m57.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers[torch])
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m5

In [2]:
import pandas as pd
import torch
import numpy as np
from transformers import XLNetTokenizer, XLNetForSequenceClassification, Trainer, TrainingArguments

In [3]:
X_train = pd.read_csv('https://raw.githubusercontent.com/psochando/sanchez.github.io/main/X_train_5000_mayus.csv')['Cuerpo']
X_test = pd.read_csv('https://raw.githubusercontent.com/psochando/sanchez.github.io/main/X_test_5000_mayus.csv')['Cuerpo']
y_train = pd.read_csv('https://raw.githubusercontent.com/psochando/sanchez.github.io/main/y_train_5000.csv')['Periódico']
y_test = pd.read_csv('https://raw.githubusercontent.com/psochando/sanchez.github.io/main/y_test_5000.csv')['Periódico']

In [4]:
label_map = {label: index for index, label in enumerate(np.unique(y_train))}
y_train = y_train.map(label_map)
y_test = y_test.map(label_map)

In [5]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(device)

cuda:0


In [6]:
X_test

0       El mercado inmobiliario es, a veces, mucho más...
1       Aaron Donald es el mejor defensor de la última...
2       Don Mariano Rajoy Sobredo, padre del president...
3       Lío importante anoche en el estadio Santiago B...
4       Con huelga o sin huelga de caseteros, habrá Fe...
                              ...                        
1489    Se llama Optimus. Mide 1,73. Pesa 57 kilos. Ca...
1490    Si la taxonomía verde es un indicador, el año ...
1491    Nada más comprobar que la moción de censura pr...
1492    El Gobierno conservador del Reino Unido sabe q...
1493    La Consejería de Economía, Hacienda y Empleo h...
Name: Cuerpo, Length: 1494, dtype: object

In [7]:
num_classes = len(y_train.unique())

In [8]:
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/761 [00:00<?, ?B/s]

In [9]:
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length = 2000)
test_encodings = tokenizer(X_test.tolist(), truncation=True, padding=True, max_length = 2000)

In [10]:
# Crear el dataset personalizado para PyTorch
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
            'labels': torch.tensor(self.labels[idx])
        }
        return item

    def __len__(self):
        return len(self.labels)

In [11]:
train_dataset = MyDataset(train_encodings, y_train)
test_dataset = MyDataset(test_encodings, y_test)


In [12]:
def compute_metrics(pred):
    logits, labels = pred
    predictions = np.argmax(logits, axis=1)
    return {"accuracy": (predictions == labels).mean()}


In [13]:
# Configurar los argumentos de entrenamiento
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    per_device_eval_batch_size=48,
    num_train_epochs=3,
    evaluation_strategy="epoch",
    output_dir="./results"
)

In [14]:
model = XLNetForSequenceClassification.from_pretrained('xlnet-large-cased', num_labels=num_classes).to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/1.44G [00:00<?, ?B/s]

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['logits_proj.weight', 'sequence_summary.summary.weight', 'logits_proj.bias', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Crear el entrenador y entrenar el modelo
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [16]:
trainer.train()



OutOfMemoryError: ignored

In [None]:
# test_preds = []
# for i in range(len(X_test)):
#     val_encoding = tokenizer(X_test.iloc[i], truncation=True, padding=True, return_tensors="pt").to(device)
#     outputs = model(**val_encoding)
#     logits = outputs.logits.cpu().detach().numpy()
#     test_preds.append(np.argmax(logits))

In [None]:
print('accuracy del train:')
trainer.evaluate(train_dataset)['eval_accuracy']

In [None]:
print('accuracy del test:')
trainer.evaluate(test_dataset)['eval_accuracy']

In [None]:
trainer.save_model("./XLNET_large_1_mayus")