<a href="https://colab.research.google.com/github/LeograndeCode/LLM_Unlearning_SEMEval2025/blob/silvia-branch/Copia_di_LLM_Unlearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Initial Setup


In [None]:
import logging, sys
# Send INFO-and-above records to stdout with a timestamped prefix, and keep a
# handle on the root logger for the rest of the notebook.
_log_settings = {
    "level": logging.INFO,
    "format": "%(asctime)s [%(levelname)s] %(message)s",
    "stream": sys.stdout,
}
logging.basicConfig(**_log_settings)
logger = logging.getLogger()



### Loading model and datasets


The dataset contains disjoint retain and forget splits stored as parquet files, and includes the following fields: id, input, output, task.
* Subtask 1: Long form synthetic creative documents spanning different
genres.
* Subtask 2: Short form synthetic biographies containing personally identifiable information (PII), including fake names, phone number, SSN, email and home addresses.
* Subtask 3: Real documents sampled from the target model’s training dataset.

In [None]:
import os

import pandas as pd
from huggingface_hub import snapshot_download
from transformers import AutoModelForCausalLM, AutoTokenizer
from google.colab import userdata

# Never hard-code access tokens in a notebook: read the token from the
# HF_TOKEN environment variable, falling back to the Colab secret of the same
# name. Any token that was ever committed to this notebook should be revoked.
hf_token = os.environ.get("HF_TOKEN") or userdata.get("HF_TOKEN")

## Fetch and load model:
snapshot_download(repo_id='llmunlearningsemeval2025organization/olmo-1B-model-semeval25-unlearning', token=hf_token, local_dir='semeval25-unlearning-1B-model')
# model = AutoModelForCausalLM.from_pretrained('semeval25-unlearning-1B-model').to('cuda')

## Fetch and load dataset:
snapshot_download(repo_id='llmunlearningsemeval2025organization/semeval25-unlearning-dataset-public', token=hf_token, local_dir='semeval25-unlearning-data', repo_type="dataset")
retain_train_df = pd.read_parquet('semeval25-unlearning-data/data/retain_train-00000-of-00001.parquet', engine='pyarrow') # Retain split: train set
retain_validation_df = pd.read_parquet('semeval25-unlearning-data/data/retain_validation-00000-of-00001.parquet', engine='pyarrow') # Retain split: validation set
forget_train_df = pd.read_parquet('semeval25-unlearning-data/data/forget_train-00000-of-00001.parquet', engine='pyarrow') # Forget split: train set
forget_validation_df = pd.read_parquet('semeval25-unlearning-data/data/forget_validation-00000-of-00001.parquet', engine='pyarrow') # Forget split: validation set

# Create the output folders idempotently so the cell can be re-run
# (a plain `mkdir` fails once the directories exist).
os.makedirs('train', exist_ok=True)
os.makedirs('validation', exist_ok=True)

# Export every split as JSON Lines next to the parquet originals.
retain_train_df.to_json('train/retain.jsonl', orient='records', lines=True)
forget_train_df.to_json('train/forget.jsonl', orient='records', lines=True)
retain_validation_df.to_json('validation/retain.jsonl', orient='records', lines=True)
forget_validation_df.to_json('validation/forget.jsonl', orient='records', lines=True)


# ==== DEBUG: usa solo una porzione del dataset ====
# sample_size = 100  # numero di esempi per split
# retain_train_df     = retain_train_df.sample(n=sample_size, random_state=42).reset_index(drop=True)
# forget_train_df     = forget_train_df.sample(n=sample_size, random_state=42).reset_index(drop=True)
# retain_validation_df = retain_validation_df.sample(n=sample_size//10, random_state=42).reset_index(drop=True)
# forget_validation_df = forget_validation_df.sample(n=sample_size//10, random_state=42).reset_index(drop=True)
# ===================================================



# Restrict every split to the rows of a single subtask (here "Task2").
forget_train_df = forget_train_df.loc[forget_train_df["task"].eq("Task2")]
retain_train_df = retain_train_df.loc[retain_train_df["task"].eq("Task2")]
forget_val_df = forget_validation_df.loc[forget_validation_df["task"].eq("Task2")]
retain_val_df = retain_validation_df.loc[retain_validation_df["task"].eq("Task2")]



Fetching 7 files:   0%|          | 0/7 [00:00<?, ?it/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/115 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/9.25k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/633 [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.71G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

Fetching 10 files:   0%|          | 0/10 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/818 [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/2.42k [00:00<?, ?B/s]

forget_validation-00000-of-00001.parquet:   0%|          | 0.00/54.2k [00:00<?, ?B/s]

forget_train-00000-of-00001.parquet:   0%|          | 0.00/220k [00:00<?, ?B/s]

member.jsonl:   0%|          | 0.00/323k [00:00<?, ?B/s]

retain_validation-00000-of-00001.parquet:   0%|          | 0.00/62.7k [00:00<?, ?B/s]

retain_train-00000-of-00001.parquet:   0%|          | 0.00/233k [00:00<?, ?B/s]

nonmember.jsonl:   0%|          | 0.00/194k [00:00<?, ?B/s]

requirements.txt:   0%|          | 0.00/211 [00:00<?, ?B/s]

evaluate_generations.py:   0%|          | 0.00/14.6k [00:00<?, ?B/s]

# Create Dataloaders for Retain and Forget Set


In [None]:
#!pip install datasets
from datasets import Dataset

batch_size = 1

# 1. Wrap the retain / forget training frames as Hugging Face Datasets and
#    load the tokenizer used throughout the notebook.
ds_retain, ds_forget = (
    Dataset.from_pandas(frame) for frame in (retain_train_df, forget_train_df)
)
tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-1B-0724-hf")

# 2. Tokenizer function
def tokenize_fn(example):
    """Tokenize the "input" field and, as the target text, the "output" field.

    Both are truncated / padded to 128 tokens. The tokenizer's return value is
    passed through unchanged.
    """
    return tokenizer(
        example["input"],
        text_target=example["output"],
        truncation=True,
        max_length=128,
        padding="max_length",
    )

# 3. Apply the tokenizer to both splits in batched mode.
ds_retain, ds_forget = (
    split.map(tokenize_fn, batched=True) for split in (ds_retain, ds_forget)
)

# 4. Build the DataLoaders
import torch  # collate_fn needs `torch`; no earlier cell imports it
from torch.utils.data import DataLoader

def collate_fn(batch):
    """Stack a list of tokenized examples into a dict of tensors.

    Args:
        batch: list of examples, each providing "input_ids", "attention_mask",
            "labels" and "start_locs".

    Returns:
        Dict with the same four keys, each holding a tensor whose first
        dimension is the batch. "start_locs" must be present because the
        answer-only loss reads it.
    """
    keys = ("input_ids", "attention_mask", "labels", "start_locs")
    return {key: torch.tensor([example[key] for example in batch]) for key in keys}


# NOTE(review): datasets tokenized by `tokenize_fn` carry no "start_locs"
# column, which `collate_fn` reads — confirm these two loaders are never
# iterated before the datasets are rebuilt with `tokenize_with_start`.
_loader_kwargs = dict(batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
train_normal_loader = DataLoader(ds_retain, **_loader_kwargs)
train_bad_loader = DataLoader(ds_forget, **_loader_kwargs)


Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.12M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Map:   0%|          | 0/612 [00:00<?, ? examples/s]

Map:   0%|          | 0/642 [00:00<?, ? examples/s]

In [None]:
#pip install -U bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-

In [None]:
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer
from peft import prepare_model_for_kbit_training, get_peft_model, LoraConfig

# 1) 8-bit quantization configuration
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,               # load weights in 8-bit
    llm_int8_threshold=6.0           # recommended threshold
)

# 2) Load tokenizer (unchanged; the one created in the dataloader cell is reused)

# 3) Load `model` (the one being trained, 8-bit) and `pretrained_model`
#    (the reference passed to compute_reverse_kl in the training cell)
model = AutoModelForCausalLM.from_pretrained(
    "semeval25-unlearning-1B-model",
    quantization_config=bnb_config,   # <-- 8-bit here
    device_map="auto"
)

# NOTE(review): clears the clip_qkv setting on the quantized model only —
# confirm why the reference model keeps its configured value.
model.config.clip_qkv = None

pretrained_model = AutoModelForCausalLM.from_pretrained(
    "semeval25-unlearning-1B-model",
    device_map="auto"
)


# 4) Gradient checkpointing
model.gradient_checkpointing_enable()

# 5) Prepare for LoRA: wrap the quantized model with adapters on the
#    attention query / value projections.
model = prepare_model_for_kbit_training(model)
lora_cfg = LoraConfig(
    r=16,                             # LoRA rank
    lora_alpha=32,
    target_modules=["q_proj","v_proj"],
    inference_mode=False,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()


ImportError: Using `bitsandbytes` 8-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`

In [None]:
def tokenize_with_start(example):
    """Tokenize one question/answer pair and record where the answer begins.

    The "input" and "output" fields are concatenated with no separator and
    tokenized to exactly 128 tokens (truncated / padded). "start_locs" is the
    token count of the question tokenized on its own, without padding.

    Returns:
        Dict with "input_ids", "attention_mask", "labels" (the same ids as
        "input_ids") and "start_locs".
    """
    question = example["input"]
    full_text = question + example["output"]

    # Unpadded pass over the question alone: only its length is needed.
    question_len = len(tokenizer(question, truncation=True, padding=False)["input_ids"])

    # Padded / truncated pass over the whole question + answer text.
    encoded = tokenizer(full_text, truncation=True, padding="max_length", max_length=128)
    token_ids = encoded["input_ids"]

    return {
        "input_ids": token_ids,
        "attention_mask": encoded["attention_mask"],
        "labels": token_ids,
        "start_locs": question_len,
    }


# Re-tokenize both training splits one example at a time, bypassing the
# datasets cache so edits to tokenize_with_start are always picked up.
_map_kwargs = {"batched": False, "load_from_cache_file": False}
ds_retain = Dataset.from_pandas(retain_train_df).map(tokenize_with_start, **_map_kwargs)
ds_forget = Dataset.from_pandas(forget_train_df).map(tokenize_with_start, **_map_kwargs)

# Show the columns available on the first example of each split.
for _split in (ds_retain, ds_forget):
    print(_split[0].keys())


In [None]:
# import torch
# import torch.quantization as quant

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# model = AutoModelForCausalLM.from_pretrained(
#     "semeval25-unlearning-1B-model",
#     torch_dtype=torch.float16  # Use half precision
# ).to(device)
# pretrained_model = AutoModelForCausalLM.from_pretrained(
#     "semeval25-unlearning-1B-model",
#     torch_dtype=torch.float16  # Use half precision
# ).to(device)

# model.gradient_checkpointing_enable()

### Define Loss functions


In [None]:
#rimaste uguali

def compute_reverse_kl(pretrained_model, current_model, batch, device):
    """
    Compute the *backward* (reverse) KL divergence as the normal utility loss.

    With Q the next-token distribution of the current model and P that of the
    pretrained reference, the loss is KL(Q || P) = sum(Q * log(Q / P)),
    averaged over every batch and sequence position. It is non-negative and
    zero when both models agree, so minimising it keeps the current model
    close to the reference on normal data.

    Args:
        pretrained_model: reference model which is the pretrained (original) model.
        current_model: The current unlearning model.
        batch: A batch of normal data with "input_ids" and "attention_mask".
        device: GPU device.

    Returns:
       The KL loss (scalar tensor, differentiable w.r.t. current_model).
    """
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)

    normal_outputs = current_model(input_ids, attention_mask=attention_mask)

    # The reference model is only a target: no gradients flow through it.
    with torch.no_grad():
        pretrained_outputs = pretrained_model(input_ids, attention_mask=attention_mask)

    # Work in log space (float32) so that tiny probabilities cannot produce
    # a division by zero or log(0).
    # Q: current model; P: pretrained model.
    log_q = torch.nn.functional.log_softmax(normal_outputs.logits.float(), dim=-1)
    log_p = torch.nn.functional.log_softmax(pretrained_outputs.logits.float(), dim=-1)

    # KL(Q || P) = sum(Q * (log Q - log P)); the sign is positive so that
    # gradient descent *reduces* the divergence from the reference model.
    loss = (log_q.exp() * (log_q - log_p)).sum(-1).mean()

    return loss

def get_answer_loss(operation, batch, model, device="cuda", pad_token_id=1):
    """
    Compute the loss on the answer (i.e. y) part.

    For every sequence, the per-position next-token cross-entropy is weighted
    so that only positions from `start_locs` onwards that are not padding
    contribute, with equal weight. The per-sequence losses are then averaged.

    Args:
        operation: either "ga" (gradient ascent) or "gd" (gradient descent).
        batch: A batch of data with "input_ids", "attention_mask",
            "start_locs" and "labels".
        model: The unlearned model.
        device: GPU device.
        pad_token_id: token id treated as padding in "input_ids". Defaults to
            1, the value previously hard-coded here.

    Returns:
       The loss (scalar tensor); negated when operation is "ga".
    """
    assert operation in ["ga", "gd"], "Operation must be either GA or GD."
    input_ids, attention_mask, start_locs, labels = (
        batch["input_ids"].to(device),
        batch["attention_mask"].to(device),
        batch["start_locs"],
        batch["labels"].to(device),
    )
    outputs = model(input_ids, attention_mask=attention_mask)

    loss_fct = torch.nn.CrossEntropyLoss(reduction="none")
    # Shift one to predict next token.
    shift_logits = outputs.logits[:, :-1, :]
    shift_labels = labels[:, 1:]
    losses = []
    for bid in range(input_ids.shape[0]):
        one_inp, one_st = input_ids[bid], start_locs[bid]

        # GA or GD.
        position_loss = loss_fct(shift_logits[bid], shift_labels[bid])

        if operation == "ga":  # Negative the direction for GA.
            position_loss = -position_loss

        # Simply put equal weights on all answers.
        position_weight = torch.zeros_like(one_inp)
        assert len(position_weight) == len(position_loss) + 1
        position_weight[one_st:] = 1  # only focus on answer part

        # Ignore the padding part.
        position_weight[one_inp == pad_token_id] = 0
        if position_weight.sum() > 0:
            position_weight = position_weight / position_weight.sum()

        # The weights have one more entry than the shifted losses, so the
        # last weight is dropped before combining.
        one_loss = (position_weight[:-1] * position_loss).sum()
        losses.append(one_loss)

    final_loss = torch.stack(losses).mean()

    return final_loss



In [None]:
from transformers import DataCollatorForLanguageModeling
import random
import torch

def get_rand_ans_loss(bad_batch, tokenizer, normal_ans, model, K=5, device="cuda"):
    """
    Random disassociation loss.

    For every example in the forget batch, recover the question text, pair it
    with K answers sampled at random from `normal_ans`, and compute the
    gradient-descent answer loss on those `question + random answer` texts.

    Args:
        bad_batch: batch from the forget loader. Its "input_ids" hold the
            question followed by the answer; when "start_locs" is present it
            is used to keep only the question tokens.
        tokenizer: tokenizer used to decode and re-encode the texts.
        normal_ans: list of answer strings to sample from.
        model: the model being trained.
        K: number of random answers per question (capped at len(normal_ans)).
        device: device passed on to get_answer_loss.

    Returns:
        The scalar loss returned by get_answer_loss("gd", ...).

    Raises:
        ValueError: if `normal_ans` is empty.
    """
    if len(normal_ans) == 0:
        raise ValueError("normal_ans must contain at least one answer to sample from.")
    # random.sample raises when asked for more items than are available.
    num_samples = min(K, len(normal_ans))

    # 1) Decode the questions. The batch rows contain question + answer, so
    #    keep only the first `start_locs` tokens of each row; otherwise the
    #    answer to forget would be part of the "question" prefix.
    #    skip_special_tokens=True drops pad/eos tokens.
    input_ids = bad_batch["input_ids"]
    if "start_locs" in bad_batch:
        start_locs = bad_batch["start_locs"]
        question_ids = [row[: int(start_locs[i])] for i, row in enumerate(input_ids)]
    else:
        # No answer offset available: fall back to decoding the whole row.
        question_ids = list(input_ids)
    questions = [tokenizer.decode(ids, skip_special_tokens=True) for ids in question_ids]

    features = []
    for question in questions:
        prefix = question.strip()
        # 2) Count the real (unpadded) tokens of the prefix.
        t_pref = tokenizer(prefix, truncation=True, padding=False)
        start_loc = len(t_pref["input_ids"])

        # 3) Sample random answers for this question.
        rand_samples = random.sample(normal_ans, num_samples)
        for ans in rand_samples:
            text = prefix + ans
            tok = tokenizer(
                text,
                truncation=True,
                padding="max_length",
                max_length=128
            )
            features.append({
                "input_ids":      tok["input_ids"],
                "attention_mask": tok["attention_mask"],
                "start_locs":     start_loc,
                "labels":         tok["input_ids"],
            })

    # 4) Collate the features into tensors with the causal-LM data collator.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    batch_random = data_collator(features)

    # 5) Gradient *descent* loss on the "answer" segment.
    return get_answer_loss("gd", batch_random, model, device=device)


### Training

riproviamo qui:

In [None]:
from accelerate import Accelerator
from transformers import DataCollatorForLanguageModeling
from transformers import get_scheduler
from torch.optim import AdamW
import random

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Loss weights and optimisation hyper-parameters.
bad_weight = 2.5
random_weight = 2.5
normal_weight = 1
batch_size = 1
lr = 2e-4
max_unlearn_steps = 1000
# Number of micro-batches whose gradients are accumulated per optimizer step.
accumulation_steps = 4
model_save_dir = "semeval25-unlearning-model"
task_vector_saving_path = "semeval25-unlearning-model/task_vector"

accelerator = Accelerator()
optimizer = AdamW(model.parameters(), lr=lr)
# The scheduler is stepped once per optimizer step, i.e. once every
# `accumulation_steps` micro-batches, so it must be sized accordingly;
# otherwise the linear decay covers only a fraction of its schedule.
num_optimizer_steps = max(1, max_unlearn_steps // accumulation_steps)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_optimizer_steps
)

retain_loader = DataLoader(ds_retain, batch_size, shuffle=True, collate_fn=collate_fn)
forget_loader = DataLoader(ds_forget, batch_size, shuffle=True, collate_fn=collate_fn)
# With an empty loader the inner loop never runs and the outer `while` would
# spin forever, so fail early instead.
if len(retain_loader) == 0 or len(forget_loader) == 0:
    raise ValueError("Both the retain and the forget loader must be non-empty.")

# Answers from the retain set, sampled as random answers for the forget questions.
bad_ans = retain_train_df["output"].tolist()

# Models returned by from_pretrained are in evaluation mode; switch the model
# being optimised to training mode. The reference model stays in eval mode.
model.train()
pretrained_model.eval()

optimizer.zero_grad()
idx = 0
step = 0
while idx < max_unlearn_steps:
    for bad_batch, normal_batch in zip(forget_loader, retain_loader):
        # 1) Compute all losses.
        bad_loss    = get_answer_loss("gd", bad_batch,    model, device)
        random_loss = get_rand_ans_loss(bad_batch, tokenizer, bad_ans, model, device=device)
        normal_loss = compute_reverse_kl(pretrained_model, model, normal_batch, device)

        # Divide by the number of accumulated micro-batches so the summed
        # gradients correspond to an average.
        loss = (
            bad_weight    * bad_loss
          + random_weight * random_loss
          + normal_weight * normal_loss
        ) / accumulation_steps

        accelerator.backward(loss)

        # 2) Every `accumulation_steps` micro-batches, take an optimizer step.
        if (step + 1) % accumulation_steps == 0:
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

        idx += 1
        step += 1

        if idx % 100 == 0:
            print(f"GD_loss: {bad_loss}")
            print(f"RD_loss: {random_loss}")
            print(f"revKL_loss: {normal_loss}")

            print(f"[{idx}] loss_combined={(loss*accumulation_steps):.2f}")

        if idx >= max_unlearn_steps:
            break



# At the end of the loop, fold the LoRA adapters back into the base model.
model = model.merge_and_unload()



In [None]:
# model = model.merge_and_unload()
# Write the model left in `model` by the training cell to tmp/unlearned_8bit.
# NOTE(review): `from_pt` looks like a loading-time option — confirm it has any
# effect when passed to save_pretrained.
model.save_pretrained("tmp/unlearned_8bit", from_pt=True)

In [None]:
import torch
from transformers import AutoModelForCausalLM

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_save_dir="semeval25-unlearning-model"
# 2) Reload the trained checkpoint from disk, requesting FP32 weights.
# NOTE(review): the checkpoint was saved from a model loaded with 8-bit
# quantization — confirm that torch_dtype=float32 really yields FP32 weights.
model = AutoModelForCausalLM.from_pretrained(
    "tmp/unlearned_8bit",
    torch_dtype=torch.float32,
    device_map="auto"
)
# Save the reloaded model under the final output directory.
model.save_pretrained(model_save_dir, from_pt=True)

# Reload the original (reference) model in FP32 on `device`; it is the
# baseline for the task vector and the evaluation cells.
pretrained_model = AutoModelForCausalLM.from_pretrained(
    "semeval25-unlearning-1B-model",
    torch_dtype=torch.float32
).to(device)

## Task Vector

In [None]:
import torch
class TaskVector():
    """Per-parameter difference between a finetuned and a pretrained model.

    The vector is stored in `self.vector`, a dict mapping state-dict keys to
    tensors. Task vectors can be added, negated and applied to a model.
    """

    def __init__(self, pretrained_checkpoint=None, finetuned_checkpoint=None, vector=None):
        """Initializes the task vector from a pretrained and a finetuned checkpoints.

        This can either be done by passing two models (one corresponding to the
        pretrained model, and another to the finetuned model), whose
        `state_dict()` is read, or by directly passing in the task vector
        state dict.

        Args:
            pretrained_checkpoint: model exposing `state_dict()`; the baseline.
            finetuned_checkpoint: model exposing `state_dict()`; the finetuned weights.
            vector: precomputed dict of tensors; when given, the two
                checkpoints are ignored.
        """
        if vector is not None:
            self.vector = vector
        else:
            assert pretrained_checkpoint is not None and finetuned_checkpoint is not None
            with torch.no_grad():

                pretrained_state_dict = pretrained_checkpoint.state_dict()
                finetuned_state_dict = finetuned_checkpoint.state_dict()

                self.vector = {}
                for key in pretrained_state_dict:
                    # Integer buffers are not weights to be differenced.
                    if pretrained_state_dict[key].dtype in [torch.int64, torch.uint8]:
                        continue
                    if key not in finetuned_state_dict:
                        print(f'Warning: key {key} is missing from the finetuned state dict; skipping.')
                        continue
                    # The task vector itself: finetuned minus pretrained.
                    self.vector[key] = finetuned_state_dict[key] - pretrained_state_dict[key]


    def __add__(self, other):
        """Add two task vectors together."""
        with torch.no_grad():
            new_vector = {}
            for key in self.vector:
                if key not in other.vector:
                    print(f'Warning, key {key} is not present in both task vectors.')
                    continue
                new_vector[key] = self.vector[key] + other.vector[key]
        return TaskVector(vector=new_vector)

    def __radd__(self, other):
        """Support `sum()` over task vectors, whose start value is the int 0."""
        if other is None or isinstance(other, int):
            return self
        return self.__add__(other)

    def __neg__(self):
        """Negate a task vector."""
        with torch.no_grad():
            new_vector = {key: -value for key, value in self.vector.items()}
        return TaskVector(vector=new_vector)

    def apply_to(self, pretrained_model, scaling_coef=1.0):
        """Apply a task vector to a pretrained model.

        The model is updated IN PLACE (and also returned): every parameter
        found in the task vector becomes `param + scaling_coef * vector[key]`.
        Pass a copy if the original weights must be preserved.
        """
        with torch.no_grad():
            new_state_dict = {}
            pretrained_state_dict = pretrained_model.state_dict()
            for key in pretrained_state_dict:
                if key not in self.vector:
                    print(f'Warning: key {key} is present in the pretrained state dict but not in the task vector')
                    continue
                new_state_dict[key] = pretrained_state_dict[key] + scaling_coef * self.vector[key]
        # strict=False: keys skipped above keep their current values.
        pretrained_model.load_state_dict(new_state_dict, strict=False)
        return pretrained_model


    # You can uncomment the following version if you don't have enough GPU memory to apply the task vector in one go
    # Split and reassemble the task vector using multiple chunks

    # def apply_to(self, pretrained_model, scaling_coef=1.0, chunk_size=500):
    #     """Apply a task vector to a pretrained model in chunks."""
    #     with torch.no_grad():
    #         pretrained_state_dict = pretrained_model.state_dict()
    #         keys = list(self.vector.keys())  # Get all the parameter keys in the task vector
    #         total_keys = len(keys)
    #         for i in range(0, total_keys, chunk_size):
    #             new_state_dict = {}
    #             for key in keys[i:i + chunk_size]:
    #                 if key not in pretrained_state_dict:
    #                     print(f'Warning: key {key} is present in the task vector but not in the pretrained model')
    #                     continue
    #                 # Apply scaling and update the parameter
    #                 new_state_dict[key] = pretrained_state_dict[key] + scaling_coef * self.vector[key]
    #
    #             # Partially load the updated state dict to the model
    #             pretrained_model.load_state_dict(new_state_dict, strict=False)
    #     return pretrained_model

In [None]:
import copy

# Task Vector
task_vector_saving_path = "semeval25-unlearning-model/task_vector"
task_vector = TaskVector(pretrained_model, model)
neg_task_vector = -task_vector
# apply_to() writes into the model it receives and returns that same object.
# Apply the negated vector to a copy so `pretrained_model` stays an untouched
# baseline for the "pre" metrics; this costs one extra copy of the model in memory.
unlearned_model = neg_task_vector.apply_to(copy.deepcopy(pretrained_model))
unlearned_model.save_pretrained(task_vector_saving_path, from_pt = True)


## Evaluation

In [None]:
import torch
from tqdm.auto import tqdm

def eval_loss(model, dataloader, device="cuda"):
    """Average next-token negative log-likelihood and perplexity over a loader.

    The model is switched to eval mode and run without gradients. Positions
    whose target is the tokenizer's pad token are excluded from both the
    summed loss and the token count.

    Args:
        model: causal LM returning an object with `.logits`.
        dataloader: yields batches with "input_ids", "attention_mask", "labels".
        device: device the batch tensors are moved to.

    Returns:
        Tuple `(avg_nll, perplexity)` of Python floats.
    """
    model.eval()
    pad_id = tokenizer.pad_token_id
    nll_sum = 0.0
    token_count = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Eval"):
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            targets = batch["labels"].to(device)[:, 1:]

            # logits: [B, L, V]; drop the last position so logits[t] is
            # scored against the token at t + 1.
            logits = model(ids, attention_mask=mask).logits[:, :-1, :]

            batch_nll = torch.nn.functional.cross_entropy(
                logits.reshape(-1, logits.size(-1)),
                targets.reshape(-1),
                ignore_index=pad_id,
                reduction="sum",
            )
            nll_sum += batch_nll.item()
            token_count += (targets != pad_id).sum().item()

    avg_nll = nll_sum / token_count
    perplexity = torch.exp(torch.tensor(avg_nll)).item()
    return avg_nll, perplexity


In [None]:
from datasets import Dataset

# Tokenize the validation splits with the same function used for training.
# NOTE(review): these are the unfiltered validation frames, while training
# uses only "Task2" rows; the Task2-only `forget_val_df` / `retain_val_df`
# built in the loading cell are not used here — confirm which is intended.
_val_map_kwargs = {"batched": False, "load_from_cache_file": False}
ds_retain_val = Dataset.from_pandas(retain_validation_df).map(tokenize_with_start, **_val_map_kwargs)
ds_forget_val = Dataset.from_pandas(forget_validation_df).map(tokenize_with_start, **_val_map_kwargs)

_val_loader_kwargs = {"shuffle": True, "collate_fn": collate_fn}
forget_val_loader = DataLoader(ds_forget_val, batch_size, **_val_loader_kwargs)
retain_val_loader = DataLoader(ds_retain_val, batch_size, **_val_loader_kwargs)

# Metrics of the reference model ("pre") ...
nll_forget_pre, ppl_forget_pre = eval_loss(pretrained_model, forget_val_loader)
nll_retain_pre, ppl_retain_pre = eval_loss(pretrained_model, retain_val_loader)

# ... and of the model obtained from the negated task vector ("post").
nll_forget_post, ppl_forget_post = eval_loss(unlearned_model, forget_val_loader)
nll_retain_post, ppl_retain_post = eval_loss(unlearned_model, retain_val_loader)



In [None]:
# Report NLL / perplexity before and after unlearning: forget split first,
# then retain split, one metric per line.
_report_lines = [
    f"nll_forget_pre: {nll_forget_pre:.2f}",
    f"ppl_forget_pre: {ppl_forget_pre:.2f}",
    f"nll_forget_post: {nll_forget_post:.2f}",
    f"ppl_forget_post: {ppl_forget_post:.2f}",
    f"nll_retain_pre: {nll_retain_pre:.2f}",
    f"ppl_retain_pre: {ppl_retain_pre:.2f}",
    f"nll_retain_post: {nll_retain_post:.2f}",
    f"ppl_retain_post: {ppl_retain_post:.2f}",
]
print("\n".join(_report_lines))

In [None]:
# Qualitative check: for five random forget-validation prompts, print what
# the reference model and the unlearned model generate (up to 50 new tokens).
for _, row in forget_validation_df.sample(5).iterrows():
    prompt = row["input"]
    print("PROMPT:", prompt)

    # Encode once and feed the same ids to both models.
    prompt_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    generated_pre = pretrained_model.generate(prompt_ids, max_new_tokens=50)
    generated_post = unlearned_model.generate(prompt_ids, max_new_tokens=50)

    print("ORIG:", tokenizer.decode(generated_pre[0], skip_special_tokens=True))
    print("NEW:",  tokenizer.decode(generated_post[0], skip_special_tokens=True))
    print("-"*40)