In [1]:
! pip install transformers==4.35.2 datasets==2.16.0 accelerate==0.25.0 peft==0.7.1 -q

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
cuml 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
dask-cudf 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
cudf 23.8.0 requires pandas<1.6.0dev0,>=1.3, but you have pandas 2.0.3 which is incompatible.
cudf 23.8.0 requires protobuf<5,>=4.21, but you have protobuf 3.20.3 which is incompatible.
cuml 23.8.0 requires dask==2023.7.1, but you have dask 2023.12.0 which is incompatible.
cuml 23.8.0 requires distributed==2023.7.1, but you have distributed 2023.12.0 which is incompatible.
dask-cuda 23.8.0 requires dask==2023.7.1, but you have dask 2023.12.0 which is incompatible.
dask-cuda 23.8.0 requires distributed==2023.7.1, but you have distributed 2023.12.0 which is incompatible.
dask-cuda 23.8.0 requires pa

In [2]:
import torch
from torch.utils.data import DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import time
import gc



In [3]:
TRAINIG_EPOCH = 10
LEARNING_RATE = 3e-4
VALIDATION_EACH_EPOCH = 3
PER_DEVICE_BATCH_SIZE = 8
MAX_REQUENCE_LENGTH = 128


DATASET = ("multi_nli",)
VALIDATION_SETS = ("validation_matched","validation_mismatched",)
NUM_LABELS = 3
DATASET_TRAIN_REDUCTION = 18
DATASET_VALID_REDUCTION = 15
DATA_PROC = lambda sample: (sample['premise'], sample['hypothesis'])

MODEL_NAME = "roberta-large"

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"device: {DEVICE}")

device: cuda


In [4]:
import logging
logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)

import warnings
warnings.filterwarnings("ignore", message="torch.utils.checkpoint")

import os
os.environ["WANDB_DISABLED"] = "true"

# Load & Examine Dataset

In [5]:
from datasets import load_dataset, Dataset, concatenate_datasets

train_dataset = load_dataset(*(DATASET), split='train')
valid_dataset = concatenate_datasets([
    load_dataset(*(DATASET), split=split) for split in VALIDATION_SETS
])


Downloading readme:   0%|          | 0.00/8.89k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/214M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.94M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.10M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating validation_matched split:   0%|          | 0/9815 [00:00<?, ? examples/s]

Generating validation_mismatched split:   0%|          | 0/9832 [00:00<?, ? examples/s]

In [6]:
print(f">>> Number of Train Samples: {len(train_dataset)}")

train_dataset = Dataset.from_dict(train_dataset[::DATASET_VALID_REDUCTION])
print(f"    Number of Train Samples: {len(train_dataset)}\n")

print(train_dataset)

>>> Number of Train Samples: 392702
    Number of Train Samples: 26181

Dataset({
    features: ['promptID', 'pairID', 'premise', 'premise_binary_parse', 'premise_parse', 'hypothesis', 'hypothesis_binary_parse', 'hypothesis_parse', 'genre', 'label'],
    num_rows: 26181
})


In [7]:
print(f">>> Number of Validation Samples: {len(valid_dataset)}")

valid_dataset = Dataset.from_dict(valid_dataset[::DATASET_VALID_REDUCTION])
print(f"    Number of Validation Samples: {len(valid_dataset)}\n")

print(valid_dataset)

>>> Number of Validation Samples: 19647
    Number of Validation Samples: 1310

Dataset({
    features: ['promptID', 'pairID', 'premise', 'premise_binary_parse', 'premise_parse', 'hypothesis', 'hypothesis_binary_parse', 'hypothesis_parse', 'genre', 'label'],
    num_rows: 1310
})


## Prepare dataset for tuning

In [8]:
tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

In [9]:
train_tokenized = train_dataset.map(
    lambda x: tokenizer(x['premise'], x['hypothesis'], truncation=True, max_length=128),
    batched=True,
)
valid_tokenized = valid_dataset.map(
    lambda x: tokenizer(x['premise'], x['hypothesis'], truncation=True, max_length=128),
    batched=True,
)

# Remove unnecessary column, rename label column, and set format to PyTorch
train_tokenized = train_tokenized.remove_columns(['promptID', 'pairID', 'premise', 'premise_binary_parse', 'premise_parse', 'hypothesis', 'hypothesis_binary_parse', 'hypothesis_parse', 'genre'])
train_tokenized = train_tokenized.rename_column('label', 'labels')
train_tokenized = train_tokenized.with_format('pt')

valid_tokenized = valid_tokenized.remove_columns(['promptID', 'pairID', 'premise', 'premise_binary_parse', 'premise_parse', 'hypothesis', 'hypothesis_binary_parse', 'hypothesis_parse', 'genre'])
valid_tokenized = valid_tokenized.rename_column('label', 'labels')
valid_tokenized = valid_tokenized.with_format('pt')

Map:   0%|          | 0/26181 [00:00<?, ? examples/s]

Map:   0%|          | 0/1310 [00:00<?, ? examples/s]

In [10]:
del train_dataset, valid_dataset
gc.collect()

125

In [11]:
steps_per_epoch = (len(train_tokenized)+PER_DEVICE_BATCH_SIZE-1) // PER_DEVICE_BATCH_SIZE
print(f"Steps per epoch: {steps_per_epoch}")

Steps per epoch: 3273


# Utils

In [12]:
def count_learnable_parameters(model: torch.nn.Module):
    num_params = sum(p.numel() for _, p in model.named_parameters() if p.requires_grad)
    return num_params

In [13]:
def count_parameters(model: torch.nn.Module):
    num_params = sum(p.numel() for _, p in model.named_parameters())
    return num_params

In [14]:
def freeze_model(model: torch.nn.Module):
  for _, param in model.named_parameters():
    param.requires_grad = False
  model.gradient_checkpointing_enable()
  model.enable_input_require_grads()

# Normal Fine-Tuning

In [15]:
model = RobertaForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS).to(DEVICE)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
ft_learnable = count_learnable_parameters(model)
ft_all = count_parameters(model)
print( "Number of model's parameters  :", ft_all)
print( "Number of learnable parameters:", ft_learnable)
print(f"Learnable parameter portion   : {100*ft_learnable/ft_all:.1f}%")

Number of model's parameters  : 355362819
Number of learnable parameters: 355362819
Learnable parameter portion   : 100.0%


In [17]:
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
 

## Tuning

In [18]:
training_args = TrainingArguments(
  output_dir="./",
  num_train_epochs=TRAINIG_EPOCH,
  learning_rate=1e-5,
  weight_decay=1e-2,
  per_device_train_batch_size=PER_DEVICE_BATCH_SIZE,
  per_device_eval_batch_size=32,
  evaluation_strategy="steps",
  eval_steps=(steps_per_epoch + VALIDATION_EACH_EPOCH - 1)//VALIDATION_EACH_EPOCH,
  logging_dir=".",
  save_steps=steps_per_epoch*100,
  save_total_limit=0,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=valid_tokenized,
    data_collator=data_collator,
    compute_metrics=lambda pred: {"accuracy": accuracy_score(pred.label_ids, pred.predictions.argmax(axis=1))},
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [19]:
ft_start_time = time.time()

trainer.train()

ft_end_time = time.time()
ft_execution_time = ft_end_time - ft_start_time

Step,Training Loss,Validation Loss,Accuracy
1091,0.5594,0.498359,0.830534
2182,0.5036,0.390047,0.852672
3273,0.4719,0.457503,0.852672
4364,0.3996,0.541332,0.863359
5455,0.387,0.572432,0.861069
6546,0.4016,0.600433,0.864885
7637,0.2636,0.724192,0.863359
8728,0.3109,0.729709,0.863359
9819,0.2994,0.74944,0.861832
10910,0.1826,0.844445,0.864122


In [20]:
ft_results = trainer.evaluate(valid_tokenized)
print("FT Score:", ft_results)
print(f"FT Execution time: {ft_execution_time:.4f} (sec)")

FT Score: {'eval_loss': 1.28441321849823, 'eval_accuracy': 0.8709923664122138, 'eval_runtime': 12.4441, 'eval_samples_per_second': 105.271, 'eval_steps_per_second': 3.295, 'epoch': 10.0}
FT Execution time: 8938.0114 (sec)


In [21]:
del model, data_collator, training_args, trainer
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# LoRA Fine-Tuning

In [22]:
from peft import LoraConfig, TaskType, get_peft_model

In [23]:
model = RobertaForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS).to(DEVICE)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
freeze_model(model)

In [25]:
lora_config = LoraConfig(
  r=8,
  lora_alpha=16,
  target_modules=["query", "value"],
  lora_dropout=0.05,
  bias="none",
  task_type=TaskType.SEQ_CLS,
)

print(lora_config)

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type=<TaskType.SEQ_CLS: 'SEQ_CLS'>, inference_mode=False, r=8, target_modules={'value', 'query'}, lora_alpha=16, lora_dropout=0.05, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={})


In [26]:
model = get_peft_model(model, lora_config)

In [27]:
lora_learnable = count_learnable_parameters(model)
lora_all = count_parameters(model)
print( "Number of model's parameters  :", lora_all)
print( "Number of learnable parameters:", lora_learnable)
print(f"Learnable parameter portion   : {100*lora_learnable/max(lora_all, 1):.1f}%")

Number of model's parameters  : 357201926
Number of learnable parameters: 1839107
Learnable parameter portion   : 0.5%


In [28]:
print(model)

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 1024, padding_idx=1)
          (position_embeddings): Embedding(514, 1024, padding_idx=1)
          (token_type_embeddings): Embedding(1, 1024)
          (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-23): 24 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=1024, out_features=1024, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): 

## Tuning

In [29]:
training_args = TrainingArguments(
  output_dir="./",
  num_train_epochs=TRAINIG_EPOCH,
  learning_rate=LEARNING_RATE,
  weight_decay=1e-2,
  per_device_train_batch_size=PER_DEVICE_BATCH_SIZE,
  per_device_eval_batch_size=32,
  evaluation_strategy="steps",
  eval_steps=(steps_per_epoch + VALIDATION_EACH_EPOCH - 1)//VALIDATION_EACH_EPOCH,
  logging_dir=".",
  save_steps=steps_per_epoch*100,
  save_total_limit=0,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=valid_tokenized,
    data_collator=data_collator,
    compute_metrics=lambda pred: {"accuracy": accuracy_score(pred.label_ids, pred.predictions.argmax(axis=1))},
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [30]:
lora_start_time = time.time()

trainer.train()

lora_end_time = time.time()
lora_execution_time = lora_end_time - lora_start_time

Step,Training Loss,Validation Loss,Accuracy
1091,0.6014,0.438811,0.836641
2182,0.5108,0.424269,0.835878
3273,0.4621,0.405187,0.861832
4364,0.4238,0.426482,0.861832
5455,0.4029,0.446065,0.868702
6546,0.4071,0.446253,0.863359
7637,0.3495,0.470715,0.864885
8728,0.3678,0.389102,0.870229
9819,0.3711,0.401013,0.874046
10910,0.3146,0.47756,0.865649


In [31]:
lora_results = trainer.evaluate(valid_tokenized)
print("LoRA Score:", lora_results)
print(f"LoRA Execution time: {lora_execution_time:.4f} (sec)")

LoRA Score: {'eval_loss': 0.6790494918823242, 'eval_accuracy': 0.8801526717557252, 'eval_runtime': 12.6802, 'eval_samples_per_second': 103.311, 'eval_steps_per_second': 3.233, 'epoch': 10.0}
LoRA Execution time: 8055.4803 (sec)


In [32]:
del model, data_collator, training_args, trainer, lora_config
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Comparing

## Tuned Model's performance

In [33]:
ft_results   = ft_results['eval_accuracy']
lora_results = lora_results['eval_accuracy']

print(f"FT   Model  Accuracy: {ft_results:.4f}  ({100*ft_results/lora_results:.4f}% of LoRA)")
print(f"LoRA Model  Accuracy: {lora_results:.4f}  ({100*lora_results/ft_results:.4f}% of FT)")
print(f"LoRA vs. FT Accuracy: {lora_results-ft_results:.4f}  ({100*(lora_results-ft_results)/ft_results:.4f}%)")

FT   Model  Accuracy: 0.8710  (98.9592% of LoRA)
LoRA Model  Accuracy: 0.8802  (101.0517% of FT)
LoRA vs. FT Accuracy: 0.0092  (1.0517%)


## Tuning time taken

In [34]:
print(f"FT   Model  Time: {ft_execution_time:.4f}  ({100*ft_execution_time/lora_execution_time:.4f}% of LoRA)")
print(f"LoRA Model  Time: {lora_execution_time:.4f}  ({100*lora_execution_time/ft_execution_time:.4f}% of FT)")
print(f"LoRA vs. FT Time: {lora_execution_time-ft_execution_time:.4f}  ({100*(lora_execution_time-ft_execution_time)/ft_execution_time:.4f}%)")

FT   Model  Time: 8938.0114  (110.9557% of LoRA)
LoRA Model  Time: 8055.4803  (90.1261% of FT)
LoRA vs. FT Time: -882.5311  (-9.8739%)
