# Entrenamiento usando las técnicas de HuggingFace

### Chequear GPU

In [1]:
from pynvml import *


def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

In [2]:
print_gpu_utilization()

NVMLError_LibraryNotFound: NVML Shared Library Not Found

## Cargar datasets

Notar que estos son de tipo "Dataset", un tipo que se usa en HuggingFace comunmente.

In [2]:
from datasets import load_dataset

dataset = load_dataset("glue", "cola")
dataset_train = dataset["train"]  # Just take the training split for now
dataset_test = dataset["test"]  

Found cached dataset glue (C:/Users/joaco/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
100%|██████████| 3/3 [00:00<00:00, 125.54it/s]


In [3]:
print(dataset_train[0])

{'sentence': "Our friends won't buy this analysis, let alone the next one we propose.", 'label': 1, 'idx': 0}


In [4]:
print(dataset_test[0])

{'sentence': 'Bill whistled past the house.', 'label': -1, 'idx': 0}


## Tokenizar

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_dataset(data):
    # Keys of the returned dictionary will be added to the dataset as columns
    return tokenizer(data["sentence"], truncation=True)

dataset_train = dataset_train.map(tokenize_dataset, batched=True)

Loading cached processed dataset at C:\Users\joaco\.cache\huggingface\datasets\glue\cola\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-362372c6dcd81c36.arrow


In [6]:
print(dataset_train)
print(dataset_train[0])

Dataset({
    features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 8551
})
{'sentence': "Our friends won't buy this analysis, let alone the next one we propose.", 'label': 1, 'idx': 0, 'input_ids': [101, 3458, 2053, 1281, 112, 189, 4417, 1142, 3622, 117, 1519, 2041, 1103, 1397, 1141, 1195, 17794, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [7]:
dataset_test = dataset_test.map(tokenize_dataset, batched=True)

Loading cached processed dataset at C:\Users\joaco\.cache\huggingface\datasets\glue\cola\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-24398a31ef968442.arrow


## Preparar dataset y modelo

In [8]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [9]:
import numpy as np
import evaluate

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [26]:
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

In [34]:
from transformers import create_optimizer
import tensorflow as tf

batch_size = 16
num_epochs = 5
batches_per_epoch = len(dataset_train) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)
optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)

In [38]:
from transformers import TFAutoModelForSequenceClassification
from tensorflow.keras.optimizers import Adam
import numpy as np

model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2, id2label=id2label, label2id=label2id)

tf_train_set = model.prepare_tf_dataset(
    dataset_train,
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    dataset_test,
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

model.compile(optimizer=optimizer) 

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
from transformers.keras_callbacks import KerasMetricCallback

metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)

## Entrenar

In [None]:
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=metric_callback)

## Predecir