In [1]:
!pip install transformers==4.36.2
!pip install peft==0.7.1
!pip install accelerate==0.25.0
!pip install datasets==2.15.0
!pip install tqdm==4.66.1
!pip install wandb==0.16.2

Collecting transformers==4.36.2
  Downloading transformers-4.36.2-py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.37.2
    Uninstalling transformers-4.37.2:
      Successfully uninstalled transformers-4.37.2
Successfully installed transformers-4.36.2
Collecting peft==0.7.1
  Downloading peft-0.7.1-py3-none-any.whl (168 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.3/168.3 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.21.0 (from peft==0.7.1)
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate, peft
Successfully installed accelerate-0.27.2 peft-0.7.

In [2]:
!accelerate estimate-memory t5-small --library_name transformers

Loading pretrained config for `t5-small` from `transformers`...
config.json: 100% 1.21k/1.21k [00:00<00:00, 4.98MB/s]
┌────────────────────────────────────────────────────┐
│        Memory Usage for loading `t5-small`         │
├───────┬─────────────┬──────────┬───────────────────┤
│ dtype │Largest Layer│Total Size│Training using Adam│
├───────┼─────────────┼──────────┼───────────────────┤
│float32│   62.75 MB  │230.81 MB │     923.26 MB     │
│float16│   31.38 MB  │115.41 MB │     461.63 MB     │
│  int8 │   15.69 MB  │ 57.7 MB  │     230.81 MB     │
│  int4 │   7.84 MB   │ 28.85 MB │     115.41 MB     │
└───────┴─────────────┴──────────┴───────────────────┘


In [3]:
from datasets import load_dataset

dataset_name="financial_phrasebank"
subset_name="sentences_allagree"

dataset=load_dataset(dataset_name, subset_name)
dataset=dataset["train"].train_test_split(test_size=0.1)
dataset["validation"]=dataset["test"]
del dataset["test"]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/6.04k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.88k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/682k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2264 [00:00<?, ? examples/s]

In [4]:
classes=dataset["train"].features["label"].names
dataset=dataset.map(
    lambda x: {"text_label": [classes[label] for label in x["label"]]},
    batched=True,
    num_proc=1,
)

Map:   0%|          | 0/2037 [00:00<?, ? examples/s]

Map:   0%|          | 0/227 [00:00<?, ? examples/s]

In [5]:
dataset["train"][1]

{'sentence': 'Earnings per share were higher at 0.48 against 0.37 a year before and ahead of market consensus of 0.40 eur .',
 'label': 2,
 'text_label': 'positive'}

In [6]:
from transformers import AutoTokenizer

model_name="t5-small"

tokenizer=AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [7]:
text_column="sentence"
label_column="text_label"
max_length=128
batch_size=8

def preprocess_function(examples):
    inputs=examples[text_column]
    targets=examples[label_column]
    model_inputs=tokenizer(
        inputs,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )
    labels=tokenizer(
        targets,
        max_length=2,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )

    labels=labels["input_ids"]
    labels[labels==tokenizer.pad_token_id]=-100 # ignore tokenizer pad token in the loss
    model_inputs["labels"]=labels
    return model_inputs

processed_datasets=dataset.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

Running tokenizer on dataset:   0%|          | 0/2037 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/227 [00:00<?, ? examples/s]

In [8]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

train_dataset=processed_datasets["train"]
eval_dataset=processed_datasets["validation"]

train_dataloader=DataLoader(
    train_dataset,
    shuffle=True,
    collate_fn=default_data_collator,
    batch_size=batch_size,
    pin_memory=True
)

eval_dataloader=DataLoader(
    eval_dataset,
    collate_fn=default_data_collator,
    batch_size=batch_size,
    pin_memory=True
)

In [9]:
from peft import TaskType, get_peft_model
from peft import PrefixTuningConfig
from transformers import AutoModelForSeq2SeqLM

peft_config=PrefixTuningConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    num_virtual_tokens=20
)

model=AutoModelForSeq2SeqLM.from_pretrained(model_name)
model=get_peft_model(model, peft_config)
model.print_trainable_parameters()

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

trainable params: 122,880 || all params: 60,629,504 || trainable%: 0.20267360260773368


In [10]:
import torch
from transformers import get_linear_schedule_with_warmup

num_epochs=5

optimizer=torch.optim.AdamW(model.parameters(), lr=1e-2)
lr_scheduler=get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader)*num_epochs),
)

In [11]:
from accelerate import Accelerator
from tqdm import tqdm

accelerator=Accelerator(log_with="wandb")
accelerator.init_trackers("prefix_tuning")

model, optimizer, train_dataloader, eval_dataloader, lr_scheduler=accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
)

device=accelerator.device
model.to(device)

for epoch in range(num_epochs):
    model.train()
    total_loss=0

    for step, batch in enumerate(tqdm(train_dataloader)):
        batch={k:v.to(device) for k,v in batch.items()}
        outputs=model(**batch)
        loss=outputs.loss

        accelerator.log({"step":step, "training_loss":loss})
        total_loss+=loss.detach().float()

        accelerator.backward(loss)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    model.eval()
    eval_loss=0
    eval_preds=[]
    for step, batch in enumerate(tqdm(eval_dataloader)):
        batch={k:v.to(device) for k,v in batch.items()}
        with torch.no_grad():
            outputs=model(**batch)
        loss=outputs.loss

        accelerator.log({"eval_loss":loss})
        eval_loss+=loss.detach().float()
        eval_preds.extend(
            tokenizer.batch_decode(
                torch.argmax(outputs.logits,-1).detach().cpu().numpy(),
                skip_special_tokens=True
            )
        )
    eval_epoch_loss=eval_loss/len(eval_dataloader)
    eval_ppl=torch.exp(eval_epoch_loss)
    train_epoch_loss=total_loss/len(train_dataloader)
    train_ppl=torch.exp(train_epoch_loss)
    accelerator.print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")
accelerator.end_training()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


100%|██████████| 255/255 [00:10<00:00, 23.54it/s]
100%|██████████| 29/29 [00:00<00:00, 48.14it/s]


epoch=0: train_ppl=tensor(4.1035, device='cuda:0') train_epoch_loss=tensor(1.4118, device='cuda:0') eval_ppl=tensor(1.3161, device='cuda:0') eval_epoch_loss=tensor(0.2747, device='cuda:0')


100%|██████████| 255/255 [00:09<00:00, 25.98it/s]
100%|██████████| 29/29 [00:00<00:00, 48.20it/s]


epoch=1: train_ppl=tensor(1.4018, device='cuda:0') train_epoch_loss=tensor(0.3377, device='cuda:0') eval_ppl=tensor(1.2092, device='cuda:0') eval_epoch_loss=tensor(0.1899, device='cuda:0')


100%|██████████| 255/255 [00:10<00:00, 24.64it/s]
100%|██████████| 29/29 [00:00<00:00, 40.60it/s]


epoch=2: train_ppl=tensor(1.3568, device='cuda:0') train_epoch_loss=tensor(0.3051, device='cuda:0') eval_ppl=tensor(1.1826, device='cuda:0') eval_epoch_loss=tensor(0.1677, device='cuda:0')


100%|██████████| 255/255 [00:08<00:00, 29.02it/s]
100%|██████████| 29/29 [00:00<00:00, 50.59it/s]


epoch=3: train_ppl=tensor(1.2973, device='cuda:0') train_epoch_loss=tensor(0.2603, device='cuda:0') eval_ppl=tensor(1.1599, device='cuda:0') eval_epoch_loss=tensor(0.1483, device='cuda:0')


100%|██████████| 255/255 [00:09<00:00, 27.05it/s]
100%|██████████| 29/29 [00:00<00:00, 47.50it/s]


epoch=4: train_ppl=tensor(1.2886, device='cuda:0') train_epoch_loss=tensor(0.2536, device='cuda:0') eval_ppl=tensor(1.1641, device='cuda:0') eval_epoch_loss=tensor(0.1520, device='cuda:0')


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval_loss,█▆▆▃▄█▇▃▆▄▅▅▃▃▂▃▆▂▄▃▂▃▇▃▅▃▆▄▃▂▁▃▆▂▄▃▂▃▅▁
step,▁▂▃▄▅▆▇█▁▂▃▄▅▆▇█▁▂▃▄▅▆▇█▁▂▃▄▅▆▇█▁▂▃▄▅▆▇█
training_loss,█▃▁▂▂▁▁▁▁▁▁▁▁▁▁▁▂▁▁▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval_loss,0.02177
step,254.0
training_loss,0.75327


In [12]:
correct=0
total=0
for pred,true in zip(eval_preds,dataset["validation"]["text_label"]):
    if pred.strip()==true.strip():
        correct+=1
    total+=1
accuracy=correct/total*100
accuracy

89.42731277533039