<a href="https://colab.research.google.com/github/LCaravaggio/NLP/blob/main/09_Transformers/SequenceClf_FineTuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Transfer Learning

Vamos a hacer fine-tuning de un modelo tipo BERT pre-entrenado (DistilBERT, `distilbert-base-cased`) para clasificar secuencias.  

Vamos a ajustar solamente los pesos de las últimas capas y congelar el resto de la red.

In [1]:
!pip install transformers datasets accelerate watermark

Collecting accelerate
  Downloading accelerate-0.24.0-py3-none-any.whl (260 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.0/261.0 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.24.0


In [2]:
import numpy as np
import pandas as pd
import torch
import datasets
from datasets import load_dataset, load_metric
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
)
from IPython.display import display, HTML
from sklearn.linear_model import LogisticRegression

In [3]:
%reload_ext watermark

In [4]:
%watermark -vp torch,transformers,datasets,sklearn

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.34.0

torch       : 2.1.0+cu118
transformers: 4.34.1
datasets    : 2.14.6
sklearn     : 1.2.2



In [5]:
# Use the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


## Dataset

Vamos a resolver una de las tasks de GLUE:

[CoLA](https://nyu-mll.github.io/CoLA/) (Corpus of Linguistic Acceptability). El objetivo es determinar si una oración es gramaticalmente correcta (1) o no (0).

In [6]:
# Load the CoLA configuration of the GLUE benchmark (all available splits).
full_dataset = load_dataset("glue", "cola")

In [7]:
# Inspect the available splits, their columns and their row counts.
full_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1063
    })
})

In [8]:
def show_random_elements(dataset, num_examples=10):
    """Display a random sample of distinct rows of ``dataset`` as an HTML table.

    Adapted from the Hugging Face example notebooks.

    Args:
        dataset: object supporting ``len()``, indexing with a list of row
            positions, and a ``features`` mapping of column name to feature
            type (e.g. one split of the loaded dataset).
        num_examples: number of distinct rows to show.

    Raises:
        ValueError: if ``num_examples`` is larger than the number of rows.
    """
    num_rows = len(dataset)
    if num_examples > num_rows:
        raise ValueError("Can't pick more elements than there are in the dataset.")
    # Sample row positions without replacement over the *whole* range.
    # np.random.randint excludes its upper bound, so the previous
    # randint(0, len(dataset)-1) could never select the last rows, and its
    # retry loop could spin forever when too many examples were requested.
    picks = np.random.choice(num_rows, size=num_examples, replace=False).tolist()
    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            # Replace integer class ids by their readable names.
            # NOTE: a negative id (the unlabeled test split uses -1) would index
            # `names` from the end; in this notebook the helper is only called
            # on a labeled split.
            df[column] = df[column].transform(lambda i, names=typ.names: names[i])
    display(HTML(df.to_html()))

# Preview six random rows of the training split.
show_random_elements(full_dataset["train"], num_examples=6)

Unnamed: 0,sentence,label,idx
0,Jane played the piano.,acceptable,6947
1,"This boy must not go to France, but his father must go to France.",acceptable,7144
2,I searched treasure in the cave.,unacceptable,3024
3,Ellen conferred to Helen.,unacceptable,3076
4,What is to come is in this document.,acceptable,5088
5,Henry saw that Bill left.,acceptable,7583


In [9]:
# Print the label counts of every split, separated by a horizontal rule.
print("distribucion de clases:")
separator = "-" * 70
for split_name, split in full_dataset.items():
    print(split_name)
    label_counts = pd.Series(split["label"]).value_counts()
    print(label_counts)
    print(separator)

distribucion de clases:
train
1    6023
0    2528
dtype: int64
----------------------------------------------------------------------
validation
1    721
0    322
dtype: int64
----------------------------------------------------------------------
test
-1    1063
dtype: int64
----------------------------------------------------------------------


In [10]:
# Print, for every split, the deciles (0%, 10%, ..., 100%) of the sentence
# length measured in characters, truncated to integers.
print("Sentence length:")
deciles = np.arange(0, 1.1, .1)
for split_name in full_dataset:
    print(split_name)
    char_lengths = pd.Series(full_dataset[split_name]["sentence"]).str.len()
    length_quantiles = np.quantile(char_lengths, q=deciles)
    print(length_quantiles.astype(int))
    print("-"*70)

Sentence length:
train
[  6  21  26  30  33  37  41  46  52  65 231]
----------------------------------------------------------------------
validation
[  9  20  25  29  33  36  42  47  56  69 157]
----------------------------------------------------------------------
test
[  7  20  25  29  33  36  41  46  53  66 152]
----------------------------------------------------------------------


## Tokenización y modelo

In [11]:
# Pretrained checkpoint used below for both the tokenizer and the classifier.
model_checkpoint = "distilbert-base-cased"

In [12]:
# Load the tokenizer that matches the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [13]:
def tokenize_fn(examples):
    """Tokenize the ``"sentence"`` field of ``examples`` with truncation enabled.

    No padding is applied here: it is applied later, on each training batch.
    """
    sentences = examples["sentence"]
    encoded = tokenizer(sentences, truncation=True)
    return encoded

In [14]:
# Sanity check: tokenize the first three training examples.
tokenize_fn(full_dataset['train'][:3])

{'input_ids': [[101, 3458, 2053, 1281, 112, 189, 4417, 1142, 3622, 117, 1519, 2041, 1103, 1397, 1141, 1195, 17794, 119, 102], [101, 1448, 1167, 23563, 1704, 2734, 1105, 146, 112, 182, 2368, 1146, 119, 102], [101, 1448, 1167, 23563, 1704, 2734, 1137, 146, 112, 182, 2368, 1146, 119, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [15]:
# Tokenize every split. The batch_size given to map() is NOT the training
# batch size (any other value could be used here).
tokenized_dataset = full_dataset.map(tokenize_fn, batched=True, batch_size=32)

In [16]:
# Return only the model inputs and the label, as PyTorch tensors.
# This is done after map() because, per the original note, map ignores tensor
# formatting while writing a cache file.
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [17]:
# full_dataset is intentionally kept: its 'sentence' column is reused in the error analysis below.

In [18]:
# Model whose classification head (2 output labels) is randomly initialized.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)
# Move the model to the selected device.
model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

## Fine-tuning

Tenemos que definir una métrica para evaluar nuestro modelo en validación durante el entrenamiento.

Como el mejor modelo puede no ser el del final del entrenamiento, vamos a usar el mejor modelo guardado según nuestra métrica en validación al final del entrenamiento.

No hacemos búsqueda de hiperparámetros (como learning rate, regularización L2, etc.). Ver esto en [la notebook de HF](https://github.com/huggingface/notebooks/blob/main/examples/text_classification.ipynb).

In [19]:
# Freeze every layer: turn off gradient tracking for all parameters.
for param in model.parameters():
    param.requires_grad_(False)

In [20]:
# Unfreeze the last layers: the two classification-head modules and the last
# transformer block.
trainable_modules = (
    model.pre_classifier,
    model.classifier,
    model.distilbert.transformer.layer[-1],
)
for module in trainable_modules:
    for param in module.parameters():
        param.requires_grad = True

# All layers could also be fine-tuned instead (i.e. freezing none of them).

In [21]:
# Name of the validation metric; also used below as `metric_for_best_model`.
metric_name = "matthews_correlation"
# NOTE(review): the recorded output suggests this call emits a warning;
# `datasets.load_metric` is deprecated in favor of the separate `evaluate`
# library -- confirm against the installed `datasets` version before upgrading.
metric = load_metric(metric_name)

  metric = load_metric(metric_name)


In [22]:
# Keep only the text after the last "/" of the checkpoint id; used to name the
# training output directory below.
model_name = model_checkpoint.split("/")[-1]

In [23]:
# Training configuration. No hyperparameter search is performed: the values
# below are fixed.
args = TrainingArguments(
    f"{model_name}-finetuned-cola",  # output directory
    evaluation_strategy = "epoch",  # evaluate once per epoch
    save_strategy = "epoch",  # save a checkpoint once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    # The best model may not be the last one, so the best checkpoint according
    # to the validation metric is restored when training ends.
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    push_to_hub=False,
    seed=33,  # fixed seed passed to the training run
)

In [24]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the metric loaded in ``metric``.

    ``eval_pred`` unpacks into ``(predictions, labels)``. The predicted class
    of each row is the argmax of the predictions along axis 1, and it is
    compared against the reference labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_classes, references=references)

In [25]:
# The tokenizer is passed so that padding is applied on each batch;
# the alternative is to use a custom data_collator.
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [26]:
# Run fine-tuning with the configuration defined above.
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.5932,0.570368,0.242094
2,0.53,0.533467,0.318074
3,0.5108,0.531042,0.340853
4,0.4932,0.522352,0.35531
5,0.4754,0.526696,0.347737
6,0.469,0.521548,0.355225
7,0.4607,0.530288,0.347719
8,0.4485,0.537274,0.347737
9,0.4529,0.521917,0.354337
10,0.4372,0.521777,0.354956


TrainOutput(global_step=5350, training_loss=0.48321293340665156, metrics={'train_runtime': 138.0281, 'train_samples_per_second': 619.512, 'train_steps_per_second': 38.76, 'total_flos': 464443635957336.0, 'train_loss': 0.48321293340665156, 'epoch': 10.0})

In [27]:
# Run evaluate() on the validation data to verify that the best-performing
# model was the one kept.
trainer.evaluate()

{'eval_loss': 0.5223519206047058,
 'eval_matthews_correlation': 0.3553101537388247,
 'eval_runtime': 0.7719,
 'eval_samples_per_second': 1351.287,
 'eval_steps_per_second': 85.508,
 'epoch': 10.0}

In [28]:
# Check performance on the training split:
trainer.evaluate(tokenized_dataset["train"])

{'eval_loss': 0.46142926812171936,
 'eval_matthews_correlation': 0.43186975368736646,
 'eval_runtime': 7.8912,
 'eval_samples_per_second': 1083.607,
 'eval_steps_per_second': 67.797,
 'epoch': 10.0}

### Error analysis

Ejemplos con mayor loss

In [29]:
# Reuse the Trainer's data collator to assemble each batch.
data_collator = trainer.data_collator

def loss_per_example(examples):
    """Add the probability, prediction and loss of every example to a batch.

    Only the ``label``, ``input_ids`` and ``attention_mask`` entries of
    ``examples`` are collated (the collated batch exposes the targets under
    ``labels``). The returned batch gains three entries:

    * ``proba``: softmax probability of class index 1,
    * ``predicted_label``: argmax of the logits,
    * ``loss``: cross-entropy of each example (not reduced over the batch).
    """
    wanted_keys = ('label', 'input_ids', 'attention_mask')
    model_inputs = {name: values for name, values in examples.items() if name in wanted_keys}
    batch = data_collator(model_inputs)
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    labels = batch["labels"].to(device)
    with torch.inference_mode():
        output = model(input_ids, attention_mask)
        logits = output.logits
        batch["proba"] = torch.softmax(logits, dim=1)[:, 1]
        batch["predicted_label"] = torch.argmax(logits, axis=1)
    # reduction="none" keeps one loss value per example.
    batch["loss"] = torch.nn.functional.cross_entropy(logits, labels, reduction="none")
    # Per the original author's note, earlier `datasets` releases required
    # converting every value to a NumPy array before returning; the tensors are
    # returned as they are.
    return batch

In [30]:
# Put the model in evaluation mode before scoring the validation split.
model.eval()
# Apply loss_per_example to the validation split in batches of 16 rows.
errors_dataset = tokenized_dataset['validation'].map(
    loss_per_example, batched=True, batch_size=16)

Map:   0%|          | 0/1043 [00:00<?, ? examples/s]

In [32]:
# Switch the output format to pandas so that slicing returns a DataFrame.
errors_dataset.set_format('pandas')
# Keep the per-example columns and re-attach the raw sentence text from the
# untokenized validation split (assumes both have the same row order).
# NOTE(review): the original comment says the trainer removes string features
# in place, which is why the column is recovered here -- confirm.
errors_df = errors_dataset[:][['label', 'proba', 'predicted_label', 'loss']].assign(
    sentence=full_dataset['validation']['sentence']
)

In [33]:
# Do not truncate long cell contents, so that full sentences are displayed.
pd.set_option("display.max_colwidth", None)

In [34]:
# False positives: examples whose true label is 0, highest loss first.
errors_df.query("label == 0").sort_values("loss", ascending=False).head()

Unnamed: 0,label,proba,predicted_label,loss,sentence
206,0,0.964914,1,3.349948,I squeaked the door.
1040,0,0.961583,1,3.259269,John bought a dog for himself to play with.
202,0,0.959315,1,3.201896,My heart is pounding me.
585,0,0.955838,1,3.119897,"John and someone were dancing together, but I don't know who."
433,0,0.954712,1,3.09472,Kim and Terry is happy.


In [35]:
# False negatives: examples whose true label is 1, highest loss first.
errors_df.query("label == 1").sort_values("loss", ascending=False).head()

Unnamed: 0,label,proba,predicted_label,loss,sentence
544,1,0.123934,0,2.088009,Joan ate dinner with someone but I don't know who with.
407,1,0.149191,0,1.902528,She asked was Alison coming to the party.
652,1,0.157415,0,1.848872,"Bob tried to wash himself, and Mary to read the funnies."
580,1,0.157599,0,1.847704,"She was dancing with somebody, but I don't know who with."
1001,1,0.192304,0,1.648676,John's arriving dead surprised me.


## Referencias

* [Notebooks de rasbt](https://github.com/rasbt/deeplearning-models#transformers)
* [Notebooks de HuggingFace](https://huggingface.co/docs/transformers/notebooks)
* [Blog de Lewis Tunstall](https://lewtun.github.io/blog/til/nlp/huggingface/transformers/2021/01/01/til-data-collator.html)