In [1]:
#PyTorch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import torch

#transformer
from transformers import BertForSequenceClassification, BertTokenizer, TrainingArguments, Trainer
from transformers.modeling_outputs import SequenceClassifierOutput

#dataset
from datasets import load_dataset, load_metric
import pandas as pd
import numpy as np

# # Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

#utils
from sklearn import metrics
from tqdm.notebook import tqdm_notebook

## Load datasets

In [2]:
alloy_ds = load_dataset("csv", data_files="../wp/alloy_dataset")["train"].train_test_split()

Using custom data configuration default-949b4c935a5a2702
Reusing dataset csv (/home/juan/.cache/huggingface/datasets/csv/default-949b4c935a5a2702/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached split indices for dataset at /home/juan/.cache/huggingface/datasets/csv/default-949b4c935a5a2702/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-07097a6952bc036c.arrow and /home/juan/.cache/huggingface/datasets/csv/default-949b4c935a5a2702/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-e997915aa4f47831.arrow


In [3]:
alloy_ds["train"].__getitem__(10)

{'label': 12, 'text': 'se descompuso mí hermano'}

## Load model and tokenizer

In [4]:
# create the tokenizer and the model

#tokenizer
tokenizer = BertTokenizer.from_pretrained("pytorch/", do_lower_case=False, num_labels=19)

#model
base_model = BertForSequenceClassification.from_pretrained("pytorch/", num_labels=19)

Some weights of the model checkpoint at pytorch/ were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at pytorch/

### Custom parameters

In [5]:
MAX_LEN = 200
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 15
LEARNING_RATE = 1e-05
OUTPUT_DIR = "./fine_tuned_BETO"

## Tokenize dataset

In [6]:
def tokenize_message(example):
    return tokenizer(example["text"], padding="max_length",truncation=True, max_length=MAX_LEN)

In [7]:
tokenized_datasets = alloy_ds.map(tokenize_message, batched=True)

Loading cached processed dataset at /home/juan/.cache/huggingface/datasets/csv/default-949b4c935a5a2702/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-652f350fec20e2ea.arrow
Loading cached processed dataset at /home/juan/.cache/huggingface/datasets/csv/default-949b4c935a5a2702/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-0e1583662dafa95d.arrow


In [8]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(10000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(10000))
full_train_dataset = tokenized_datasets["train"]
full_eval_dataset = tokenized_datasets["test"]

Loading cached shuffled indices for dataset at /home/juan/.cache/huggingface/datasets/csv/default-949b4c935a5a2702/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-9c7d8d9b18c721a7.arrow
Loading cached shuffled indices for dataset at /home/juan/.cache/huggingface/datasets/csv/default-949b4c935a5a2702/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-1bf39aac59652fc8.arrow


In [9]:
full_eval_dataset.__getitem__(0)

{'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  

In [10]:
training_args = TrainingArguments(num_train_epochs=EPOCHS, per_device_train_batch_size=TRAIN_BATCH_SIZE, output_dir=OUTPUT_DIR)

## Train model

In [11]:
metric = load_metric("accuracy")

In [12]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [13]:
trainer = Trainer(model=base_model, args=training_args,
                  train_dataset=small_train_dataset,
                  eval_dataset=small_eval_dataset,
                  compute_metrics=compute_metrics)

In [14]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 10000
  Num Epochs = 15
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 18750


Step,Training Loss
500,2.4251
1000,2.3688
1500,2.3595
2000,2.3482
2500,2.3478
3000,2.3543
3500,2.3156


Saving model checkpoint to ./fine_tuned_BETO/checkpoint-500
Configuration saved in ./fine_tuned_BETO/checkpoint-500/config.json
Model weights saved in ./fine_tuned_BETO/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./fine_tuned_BETO/checkpoint-1000
Configuration saved in ./fine_tuned_BETO/checkpoint-1000/config.json
Model weights saved in ./fine_tuned_BETO/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./fine_tuned_BETO/checkpoint-1500
Configuration saved in ./fine_tuned_BETO/checkpoint-1500/config.json
Model weights saved in ./fine_tuned_BETO/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to ./fine_tuned_BETO/checkpoint-2000
Configuration saved in ./fine_tuned_BETO/checkpoint-2000/config.json
Model weights saved in ./fine_tuned_BETO/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to ./fine_tuned_BETO/checkpoint-2500
Configuration saved in ./fine_tuned_BETO/checkpoint-2500/config.json
Model weights saved in ./fine_tuned_BETO/checkpoint-2

RuntimeError: [enforce fail at inline_container.cc:298] . unexpected pos 40576 vs 40516

In [None]:
trainer.evaluate()

In [None]:
trainer.predict()