In [1]:
!pip install datasets
!pip install evaluate
!pip install wandb

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [2]:
r"""
Training script to fine-tune a pre-trained LLM with PEFT methods using HuggingFace.
  Example to run this training script:
    python peft_training.py \
     --in-file <path_to_hf_checkpoints_folder> \
     --out-file <path_to_output_nemo_file> \
"""

# ADAPTED FROM https://github.com/mehdiir/Roberta-Llama-Mistral/blob/main/training_script.py

import os
from datetime import datetime
# Pull API credentials from the hosting platform's secret store when one is
# available. Outside Kaggle/Colab the variables must already be present in the
# process environment.
try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    os.environ["HF_TOKEN"] = user_secrets.get_secret("HF_TOKEN")
    os.environ["WANDB_API_KEY"] = user_secrets.get_secret("WANDB_API_KEY")
except ImportError:  # Not using Kaggle
    try:  # Try colab
        from google.colab import userdata
        # Only the W&B key is fetched on Colab; HF_TOKEN is left as-is.
        os.environ["WANDB_API_KEY"] = userdata.get('WANDB_API_KEY')
    except ImportError:
        pass  # Not using colab

# Wandb integration https://docs.wandb.ai/guides/integrations/huggingface/#next-level-logging-in-few-lines
os.environ["WANDB_PROJECT"] = "peft_bias_cs224n"  # log to your project
# Do not upload model checkpoints as W&B artifacts. "false" is the value the
# HuggingFace integration recognises for "disabled"; the previous "none" was
# rejected with an "unrecognized setting" warning before being disabled anyway.
os.environ["WANDB_LOG_MODEL"] = "false"
# Use .get() so a missing key triggers the explanatory assertion message
# instead of a bare KeyError.
assert os.environ.get("WANDB_API_KEY"), "Set the environment variable 'WANDB_API_KEY'"

from copy import deepcopy
from functools import cache, partial
import multiprocessing as mp
from typing import Literal

from argparse import ArgumentParser
from datasets import load_from_disk, load_dataset, DatasetDict
import evaluate
import numpy as np
from peft import get_peft_model, LoraConfig, TaskType
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer, TrainerCallback
import torch

import wandb

In [3]:
@cache
def _load_classification_metrics():
    """Load the `evaluate` metric objects once and reuse them on later calls."""
    return {
        name: evaluate.load(name)
        for name in ("precision", "recall", "f1", "accuracy")
    }


def compute_metrics(eval_pred):
    """
    Compute classification metrics for one evaluation pass.

    Parameters:
    ---------
    eval_pred:
        A ``(logits, labels)`` pair. The predicted class is the argmax of
        ``logits`` over its last axis.

    Returns:
    -------
    dict
        ``precision``, ``recall`` and ``f1-score`` (all macro-averaged) plus
        ``accuracy``.
    """
    # Metric loading is hoisted into a cached helper so it is not repeated on
    # every evaluation.
    metrics = _load_classification_metrics()

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    precision = metrics["precision"].compute(predictions=predictions, references=labels, average="macro", zero_division=0.0)["precision"]  # type: ignore
    recall = metrics["recall"].compute(predictions=predictions, references=labels, average="macro")["recall"]  # type: ignore
    f1 = metrics["f1"].compute(predictions=predictions, references=labels, average="macro")["f1"]  # type: ignore
    accuracy = metrics["accuracy"].compute(predictions=predictions, references=labels)["accuracy"]  # type: ignore
    return {
        "precision": precision,
        "recall": recall,
        "f1-score": f1,
        "accuracy": accuracy,
    }

In [4]:
class CustomCallback(TrainerCallback):
    """Callback that additionally evaluates on the training set at epoch end.

    The extra evaluation is reported under the ``train`` metric prefix and is
    only run on epochs where the trainer is already going to evaluate.
    """

    def __init__(self, trainer) -> None:
        super().__init__()
        # Keep a handle on the owning trainer so it can be asked to evaluate.
        self._trainer = trainer

    def on_epoch_end(self, args, state, control, **kwargs):
        """Run a train-set evaluation; return the pre-evaluation control state."""
        if not control.should_evaluate:
            return None

        # Snapshot the control flags before the nested evaluate() call and hand
        # the snapshot back (presumably so flag changes made during that call
        # do not leak into the training loop — TODO confirm).
        control_snapshot = deepcopy(control)
        self._trainer.evaluate(
            eval_dataset=self._trainer.train_dataset,
            metric_key_prefix="train",
        )
        return control_snapshot

In [5]:
@cache
def get_dataset_and_collator(
    model_checkpoints,
    max_length,
    max_samples: int | None,
    set_pad_id: bool,
    add_prefix_space=True,
    truncation=True,
):
    """
    Load the article-bias dataset from the HF hub, tokenize it, and build a
    padding collator.

    Parameters:
    ---------
    model_checkpoints:
        Name of the pre-trained model whose tokenizer is used
    max_length:
        Maximum token length passed to the tokenizer; must not exceed the
        tokenizer's ``model_max_length``
    max_samples:
        If truthy, keep only the first ``max_samples`` training rows and the
        first ``max(max_samples // 2, 256)`` rows of the test and valid splits
    set_pad_id:
        If true, use the tokenizer's EOS token as its padding token
    add_prefix_space:
        Forwarded to ``AutoTokenizer.from_pretrained``
    truncation:
        Forwarded to the tokenizer call

    Returns:
    -------
    tuple
        ``(tokenized_datasets, padding_collator)``

    Raises:
    ------
    ValueError
        If ``max_length`` is greater than the tokenizer's ``model_max_length``
    """

    def _first_rows(split, limit):
        # Keep at most `limit` leading rows of a split.
        return split.select(range(min(limit, split.num_rows)))

    hub_dataset = load_dataset("siddharthmb/article-bias-prediction-random-splits")
    kept_columns = hub_dataset.select_columns(["bias_text", "content"])
    data: DatasetDict = kept_columns.rename_columns({"bias_text": "labels", "content": "text"})  # type: ignore

    if max_samples:
        data["train"] = _first_rows(data["train"], max_samples)

        # Use fewer testing and validation samples, but at least 256 so the
        # reported accuracy stays meaningful.
        held_out_rows = max(max_samples // 2, 256)
        for split_name in ("test", "valid"):
            data[split_name] = _first_rows(data[split_name], held_out_rows)

    print("Loaded dataset. Size: ", data.shape)

    tokenizer = AutoTokenizer.from_pretrained(
        model_checkpoints, add_prefix_space=add_prefix_space
    )
    if set_pad_id:
        tokenizer.pad_token = tokenizer.eos_token

    if max_length > tokenizer.model_max_length:
        raise ValueError(
            f"Supplied max_length {max_length} is greater than the model max length {tokenizer.model_max_length}."
        )

    def _tokenize_batch(batch):
        return tokenizer(batch["text"], truncation=truncation, max_length=max_length)

    tokenized_datasets = data.map(_tokenize_batch, batched=True)
    tokenized_datasets.set_format("torch")  # type: ignore

    return tokenized_datasets, DataCollatorWithPadding(tokenizer=tokenizer)

In [6]:
def get_lora_model(
    model_checkpoints, num_labels, rank=4, alpha=16, lora_dropout=0.1, bias="none"
):
    """
    Load a sequence-classification model and wrap it with LoRA adapters.

    Parameters:
    ---------
    model_checkpoints:
        Name or path of the pre-trained checkpoint to load
    num_labels:
        Number of output classes for the classification head
    rank:
        LoRA rank (``r``)
    alpha:
        LoRA scaling factor (``lora_alpha``)
    lora_dropout:
        Dropout applied in the LoRA layers
    bias:
        Value forwarded to ``LoraConfig(bias=...)``

    Returns:
    -------
    The PEFT-wrapped model returned by ``get_peft_model``.
    """
    # NOTE(review): trust_remote_code=True executes code shipped with the
    # checkpoint repository; only use with checkpoints you trust.
    model = AutoModelForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path=model_checkpoints,
        num_labels=num_labels,
        device_map="auto",
        offload_folder="offload",
        trust_remote_code=True,
    )
    model.tie_weights()

    lora_kwargs = dict(
        task_type=TaskType.SEQ_CLS,
        r=rank,
        lora_alpha=alpha,
        lora_dropout=lora_dropout,
        bias=bias,  # type: ignore
    )
    # Every checkpoint except roberta-large gets explicit attention projection
    # targets; roberta-large falls back to the LoraConfig default.
    if model_checkpoints != "roberta-large":
        lora_kwargs["target_modules"] = ["q_proj", "v_proj"]
    peft_config = LoraConfig(**lora_kwargs)

    model = get_peft_model(model, peft_config)
    # print_trainable_parameters() prints the summary itself and returns None,
    # so wrapping it in print() only added a stray "None" line to the output.
    model.print_trainable_parameters()

    return model

In [7]:
def get_weighted_trainer(train_dataset):

    num_classes = train_dataset["labels"].max() + 1
    # Compute the class weights
    class_counts = train_dataset["labels"].bincount(minlength=num_classes)
    class_weights = class_counts.sum() / (num_classes * class_counts)

    # print(class_weights)
    # Convert to a tensor and move to the same device as the model

    class _WeightedBCELossTrainer(Trainer):
        def compute_loss(
            self, model, inputs, return_outputs=False, num_items_in_batch=None
        ):
            labels = inputs.pop("labels")
            # forward pass
            outputs = model(**inputs)
            logits = outputs.get("logits")
            # compute custom loss (suppose one has 3 labels with different weights)
            loss_fct = torch.nn.CrossEntropyLoss(
                weight=class_weights.to(device=labels.device, dtype=logits.dtype)
            )  # weight: torch.tensor([neg_weight, pos_weight], device=labels.device, dtype=logits.dtype)
            loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))  # type: ignore
            return (loss, outputs) if return_outputs else loss

    return _WeightedBCELossTrainer

In [8]:
def default_output_path():
    """Return ``output/bias-3/<timestamp>``, stamped with the current local time."""
    timestamp = datetime.now().strftime("%Y-%m-%d-%H.%M.%S")
    return os.path.join("output", "bias-3", timestamp)

In [9]:
def train_with_hyperparams(
    model_name: Literal[
        "roberta-large",
        "mistralai/Mistral-7B-v0.1",
        "meta-llama/Llama-2-7b-hf",
        "meta-llama/Llama-3.2-3B",
    ] = "roberta-large",
    max_samples=64,
    max_length=512,
    epochs=2,
    dropout=0.2,
    learning_rate=1e-3,
    weight_decay=0.1,
    lora_rank=4,
    lora_alpha=16,
    bias_layers: Literal["lora_only", "none", "all"] = "lora_only",
    batch_size=128,
    output_path=default_output_path(),
):
    """
    Training function
    """

    NUM_LABELS = 3

    set_pad_id = (
        model_name == "mistralai/Mistral-7B-v0.1"
        or model_name == "meta-llama/Llama-2-7b-hf"
        or model_name == "meta-llama/Llama-3.2-3B"
    )

    torch.cuda.empty_cache()

    dataset, collator = get_dataset_and_collator(
        # args.data_path,
        model_name,
        max_length=max_length,
        max_samples=max_samples,
        set_pad_id=set_pad_id,
        add_prefix_space=True,
        truncation=True,
    )

    training_args = TrainingArguments(
        output_dir=output_path,
        learning_rate=learning_rate,
        lr_scheduler_type="cosine",
        warmup_ratio=0.1,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        weight_decay=weight_decay,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        gradient_checkpointing=True,
        fp16=True,
        report_to="wandb",
        logging_steps=1,
        max_grad_norm=0.3,
        remove_unused_columns=True
    )

    model = get_lora_model(
        model_name,
        num_labels=NUM_LABELS,
        rank=lora_rank,
        alpha=lora_alpha,
        lora_dropout=dropout,
        bias=bias_layers,
    )

    if set_pad_id:
        model.config.pad_token_id = model.config.eos_token_id  # type: ignore
    
    if model.device.type != "cuda":
        model = model.to("cuda")

    weighted_trainer = get_weighted_trainer(dataset["train"])

    trainer = weighted_trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],  # type: ignore
        eval_dataset=dataset["valid"],  # type: ignore
        data_collator=collator,
        compute_metrics=compute_metrics,
    )
    trainer.add_callback(CustomCallback(trainer))
    print("Training...", model.device)
    # torch.cuda.empty_cache()
    trainer.train()

    save_path = os.path.join(output_path, "trained_model")
    print("Saving model to: ", save_path)
    trainer.save_model(save_path)

In [10]:
def train():
    """Run one training job using the hyperparameters of the current W&B run.

    When invoked through ``wandb.agent`` the run's config is populated by the
    sweep controller.
    """
    hyperparameter_names = (
        "model_name",
        "max_samples",
        "max_length",
        "epochs",
        "dropout",
        "learning_rate",
        "weight_decay",
        "lora_rank",
        "lora_alpha",
        "bias_layers",
        "batch_size",
    )

    # Initialize a new wandb run
    with wandb.init(config=None):
        run_config = wandb.config
        hyperparams = {
            name: getattr(run_config, name) for name in hyperparameter_names
        }
        train_with_hyperparams(**hyperparams, output_path=default_output_path())

In [11]:
# Wandb sweep
def config_sweep():
    """Register a W&B sweep for the project in ``WANDB_PROJECT`` and return its id.

    Every hyperparameter is currently pinned to a single value; to search over
    one, replace its ``{"value": ...}`` entry with a ``values`` list or a
    distribution spec.
    """
    pinned_hyperparameters = {
        "model_name": "facebook/opt-125m",  # roberta-large
        "batch_size": 128,  # 600
        "dropout": 0.2,
        "epochs": 2,
        "learning_rate": 1e-3,
        "lora_alpha": 16,
        "lora_rank": 4,
        "bias_layers": "lora_only",
        "weight_decay": 0.1,
        "max_samples": 32768,
        "max_length": 2048,
    }

    sweep_config = {
        "method": "random",
        "metric": {"goal": "minimize", "name": "eval/loss"},
        "parameters": {
            name: {"value": pinned}
            for name, pinned in pinned_hyperparameters.items()
        },
    }

    return wandb.sweep(sweep_config, project=os.environ["WANDB_PROJECT"])

In [12]:
def run_sweep_agent(sweep_id: str, runs=1):
    """Start a W&B agent for ``sweep_id`` that executes ``train`` ``runs`` times."""
    agent_options = {"function": train, "count": runs}
    wandb.agent(sweep_id, **agent_options)

In [13]:
def run_sweep_parallel(sweep_id: str, num_agents: int = 1,runs_per_agent=1):
    """
    Run one or more W&B sweep agents, all tagged with a shared run group.

    Parameters:
    ---------
    sweep_id:
        Id of the sweep the agents should pull runs from
    num_agents:
        Number of agent processes. A falsy value selects
        ``min(cpu_count, 72)``. With exactly one agent everything runs in the
        current process.
    runs_per_agent:
        Number of runs each agent executes
    """
    num_agents = num_agents if num_agents else min(mp.cpu_count(), 72)  # Adjust this number based on your system

    os.environ["WANDB_RUN_GROUP"] = "experiment-" + wandb.util.generate_id()
    print("Running multiple processes in group: ", os.environ["WANDB_RUN_GROUP"])

    wandb.setup()
    if num_agents == 1:
        print("1 agent, not parallelizing")
        run_sweep_agent(sweep_id = sweep_id, runs=runs_per_agent)
        return

    # One (sweep_id, runs) argument tuple per agent. The previous
    # `pool.map(partial(...), range(num_agents))` referenced the undefined name
    # `run_per_agent` and also passed the worker index as a second positional
    # argument, colliding with `runs=`.
    agent_args = [(sweep_id, runs_per_agent)] * num_agents

    # NOTE(review): with the "spawn" start method the worker function must be
    # importable by the child processes — confirm this holds when running
    # inside a notebook.
    with mp.Pool(num_agents) as pool:
        pending = pool.starmap_async(run_sweep_agent, agent_args)
        pool.close()
        print("Finished pool setup, waiting for finish...", flush = True)
        pool.join()
        # Surface any exception raised inside a worker.
        pending.get()
    print("Finished")


In [14]:
def main():
    """Create the sweep and run it with a single in-process agent (one run)."""
    run_sweep_parallel(
        sweep_id=config_sweep(),
        num_agents=1,
        runs_per_agent=1,
    )


In [15]:
if __name__ == "__main__":
    # Select the "spawn" start method for any worker processes before the
    # sweep starts.
    try:
        mp.set_start_method("spawn", force=True)
    except RuntimeError:
        # Context was already set, ignore the error
        print("Start method already set")

    main()
    wandb.finish()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Create sweep with ID: qwrtmq79
Sweep URL: https://wandb.ai/siddharth-stanford/peft_bias_cs224n/sweeps/qwrtmq79
Running multiple processes in group:  experiment-mhh46nv4
1 agent, not parallelizing


[34m[1mwandb[0m: Agent Starting Run: 8yle04re with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	bias_layers: lora_only
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	epochs: 2
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	lora_alpha: 16
[34m[1mwandb[0m: 	lora_rank: 4
[34m[1mwandb[0m: 	max_length: 2048
[34m[1mwandb[0m: 	max_samples: 32768
[34m[1mwandb[0m: 	model_name: facebook/opt-125m
[34m[1mwandb[0m: 	weight_decay: 0.1
[34m[1mwandb[0m: Currently logged in as: [33msidmb[0m ([33msiddharth-stanford[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.19.1
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20250308_233404-8yle04re[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mfresh-sweep-1[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/siddhart

README.md:   0%|          | 0.00/1.77k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/204M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

valid-00000-of-00001.parquet:   0%|          | 0.00/50.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/27978 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1300 [00:00<?, ? examples/s]

Generating valid split:   0%|          | 0/6996 [00:00<?, ? examples/s]

Loaded dataset. Size:  {'train': (27978, 2), 'test': (1300, 2), 'valid': (6996, 2)}


tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/651 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

Map:   0%|          | 0/27978 [00:00<?, ? examples/s]

Map:   0%|          | 0/1300 [00:00<?, ? examples/s]

Map:   0%|          | 0/6996 [00:00<?, ? examples/s]



pytorch_model.bin:   0%|          | 0.00/251M [00:00<?, ?B/s]

Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Received unrecognized `WANDB_LOG_MODEL` setting value=none; so disabling `WANDB_LOG_MODEL`


trainable params: 168,192 || all params: 125,391,360 || trainable%: 0.1341
None
Training... cuda:0


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss


Traceback (most recent call last):
  File "<ipython-input-10-a87c44a9b827>", line 9, in train
    train_with_hyperparams(
  File "<ipython-input-9-5a2d9f198fa2>", line 92, in train_with_hyperparams
    trainer.train()
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2164, in train
    return inner_training_loop(
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2615, in _inner_training_loop
    self.control = self.callback_handler.on_epoch_end(args, self.state, self.control)
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer_callback.py", line 479, in on_epoch_end
    return self.call_event("on_epoch_end", args, state, control)
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer_callback.py", line 519, in call_event
    result = getattr(callback, event)(
  File "<ipython-input-4-8135deefd24d>", line 9, in on_epoch_end
    self._trainer.evaluate(
  File "/usr/local/lib/python3.10/dist-packages/