<a href="https://colab.research.google.com/github/LeograndeCode/LLM_Unlearning_SEMEval2025/blob/augusto-branch/llm_unlearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Initial Setup


In [1]:
!pip install --upgrade transformers huggingface_hub; mkdir semeval25-unlearning-model; mkdir semeval25-unlearning-data


Collecting huggingface_hub
  Downloading huggingface_hub-0.32.6-py3-none-any.whl.metadata (14 kB)
Downloading huggingface_hub-0.32.6-py3-none-any.whl (512 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m512.8/512.8 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: huggingface_hub
  Attempting uninstall: huggingface_hub
    Found existing installation: huggingface-hub 0.32.4
    Uninstalling huggingface-hub-0.32.4:
      Successfully uninstalled huggingface-hub-0.32.4
Successfully installed huggingface_hub-0.32.6


### Loading model and datasets


The dataset contains disjoint retain and forget splits in parquet files, and includes following fields: id, input, output, task.
* Subtask 1: Long form synthetic creative documents spanning different
genres.
* Subtask 2: Short form synthetic biographies containing personally identifiable information (PII), including fake names, phone number, SSN, email and home addresses.
* Subtask 3: Real documents sampled from the target model’s training dataset.

In [4]:
import pandas as pd
from huggingface_hub import snapshot_download
from transformers import AutoModelForCausalLM, AutoTokenizer
from google.colab import userdata
import torch

hf_token = userdata.get('HF_TOKEN')

# Initialize device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Fetch and load model:
snapshot_download(repo_id='llmunlearningsemeval2025organization/olmo-1B-model-semeval25-unlearning', token=hf_token, local_dir='semeval25-unlearning-1B-model')
model = AutoModelForCausalLM.from_pretrained('semeval25-unlearning-1B-model').to(device)
# Initialize model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-1B-0724-hf")
pretrained_model = AutoModelForCausalLM.from_pretrained("semeval25-unlearning-1B-model").to(device)

## Fetch and load dataset:
snapshot_download(repo_id='llmunlearningsemeval2025organization/semeval25-unlearning-dataset-public', token=hf_token, local_dir='semeval25-unlearning-data', repo_type="dataset")
retain_train_df = pd.read_parquet('semeval25-unlearning-data/data/retain_train-00000-of-00001.parquet', engine='pyarrow') # Retain split: train set
retain_validation_df = pd.read_parquet('semeval25-unlearning-data/data/retain_validation-00000-of-00001.parquet', engine='pyarrow') # Retain split: validation set
forget_train_df = pd.read_parquet('semeval25-unlearning-data/data/forget_train-00000-of-00001.parquet', engine='pyarrow') # Forget split: train set
forget_validation_df = pd.read_parquet('semeval25-unlearning-data/data/forget_validation-00000-of-00001.parquet', engine='pyarrow') # Forget split: validation set
!mkdir train validation
retain_train_df.to_json('train/retain.jsonl', orient='records', lines=True); forget_train_df.to_json('train/forget.jsonl', orient='records', lines=True)
retain_validation_df.to_json('validation/retain.jsonl', orient='records', lines=True); forget_validation_df.to_json('validation/forget.jsonl', orient='records', lines=True)

Fetching 7 files:   0%|          | 0/7 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.12M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Fetching 10 files:   0%|          | 0/10 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/818 [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/2.42k [00:00<?, ?B/s]

forget_validation-00000-of-00001.parquet:   0%|          | 0.00/54.2k [00:00<?, ?B/s]

evaluate_generations.py:   0%|          | 0.00/14.6k [00:00<?, ?B/s]

forget_train-00000-of-00001.parquet:   0%|          | 0.00/220k [00:00<?, ?B/s]

member.jsonl:   0%|          | 0.00/323k [00:00<?, ?B/s]

retain_train-00000-of-00001.parquet:   0%|          | 0.00/233k [00:00<?, ?B/s]

retain_validation-00000-of-00001.parquet:   0%|          | 0.00/62.7k [00:00<?, ?B/s]

nonmember.jsonl:   0%|          | 0.00/194k [00:00<?, ?B/s]

requirements.txt:   0%|          | 0.00/211 [00:00<?, ?B/s]

In [5]:
#print("Retain set structure")
#retain_train_df.head(15)
print("Forget set structure")
forget_train_df.head(30)


Forget set structure


Unnamed: 0,id,input,output,task,split
0,"""2ebbbb06-ab81-4bdf-af75-0157c7178a82""sc1","In the mystical city of Deadesius, where magic...",the power to break any curse. Armed with her m...,Task1,forget
1,"""2ebbbb06-ab81-4bdf-af75-0157c7178a82""qa0",Who did Catherina seek to protect from Marcile?,The city of Deadesius.,Task1,forget
2,67148749sc1,Soubhagya Kumar Misra\n\nSoubhagya Kumar Misra...,"Odia poetry, the Odisha Sahitya Akademi awarde...",Task3,forget
3,67148749qa0,Which poetry collection by Misra won the Sahit...,Dwa Suparna,Task3,forget
4,"""4477840f-1840-4aae-96d8-5389db92d7e0""sc1","Sharity, a vivacious young woman with an unque...","rugged, with a mess of dark hair and a pair of...",Task1,forget
5,"""4477840f-1840-4aae-96d8-5389db92d7e0""qa0",Who did Sharity felt an immediate connection t...,A tall and rugged man with piercing blue eyes.,Task1,forget
6,"""1ec14216-796c-4242-ab58-4ea066e95cc7""sc1","East Longmeadow, a vibrant city known for its ...","with anticipation. Melania, elegantly dressed,...",Task1,forget
7,"""1ec14216-796c-4242-ab58-4ea066e95cc7""qa0",Who offered Carolynn the opportunity to showca...,Melania,Task1,forget
8,72180302sc1,Xu Xisheng\n\nXu Xisheng (; born April 1964) i...,deputy political commissar of the Southern The...,Task3,forget
9,72180302qa0,When did Xu Xisheng become deputy political co...,July 2017,Task3,forget


# Create Dataloaders for Retain and Forget Set


In [6]:
!pip install datasets



### Dataloader


In [7]:
import random

import numpy as np
import pandas as pd
import torch
from datasets import Dataset, load_dataset
from transformers import DataCollatorForLanguageModeling
import json
from torch.utils.data import DataLoader, random_split, TensorDataset

torch.manual_seed(8888)
np.random.seed(8888)
random.seed(8888)

def create_retain_dataloader(tokenizer, dataset, batch_size=64):
  questions = dataset["input"].values
  answers = dataset["output"].values
  data = {"input_ids": [], "attention_mask": []}
  for question, answer in zip(questions, answers):
    text = f"### Question: {question}\n ### Answer: {answer}"
    tokenized = tokenizer(text, truncation=True, padding="max_length")
    data["input_ids"].append(tokenized["input_ids"])
    data["attention_mask"].append(tokenized["attention_mask"])
  dataset = Dataset.from_dict(data)
  data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

  retain_dataloader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, collate_fn=data_collator, shuffle=True
  )
  return retain_dataloader

def create_forget_dataloader(tokenizer, dataset, batch_size=64):

  questions = dataset["input"].values
  answers = dataset["output"].values
  # It's important to define start_locs now since we need a way to define where does the answer start in the string
  data = {"input_ids": [], "attention_mask": [], "start_locs": [] }

  for question, answer in zip(questions, answers):
    text = f"### Question: {question}\n ### Answer: {answer}"
    tokenized = tokenizer(text, truncation=True, padding="max_length")
    data["input_ids"].append(tokenized["input_ids"])
    data["attention_mask"].append(tokenized["attention_mask"])

    # Calculate start idx for answer
    test_text = f"### Question: {question}\n ### Answer: "
    test_tokenized = tokenizer(
        test_text, truncation=True, padding="max_length"
    )
    data["start_locs"].append(len(test_tokenized["input_ids"]) - 1)
    dataset = Dataset.from_dict(data)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    forget_dataloader = torch.utils.data.DataLoader(
          dataset, batch_size=batch_size, collate_fn=data_collator, shuffle=True
    )
    return forget_dataloader



def get_harmful_responses(forget_set):
    """
    Extract harmful responses from a dataset split.

    Args:
        dataset_split: A split of the dataset (train or test).

    Returns:
        List of all harmful responses in the dataset split.
    """
    list_harmful_responses = list(forget_set["output"].values)
    return list(list_harmful_responses)

# Creation of dataloader
retain_train_dataloader = create_retain_dataloader(tokenizer, retain_train_df, batch_size=64)

forget_train_dataloader = create_forget_dataloader(tokenizer, forget_train_df, batch_size=64)
# Test dataloader
for retain_batch, forget_batch in zip(retain_train_dataloader, forget_train_dataloader):
  decoded_batch_retain = tokenizer.batch_decode(retain_batch["input_ids"], skip_special_tokens=True)
  decoded_batch_forget = tokenizer.batch_decode(forget_batch["input_ids"], skip_special_tokens=True)
  print('retain example')
  print(decoded_batch_retain)
  print('forget example')
  print(decoded_batch_forget)
  break

list_harmful_responses = get_harmful_responses(forget_train_df)
print(list_harmful_responses[2])


Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


retain example
['### Question: Rory McConnell\n\nRory McConnell is a DJ and radio presenter for BBC Radio 1. He previously presented the Northern Ireland edition of "BBC Introducing", a regionalised radio programme transmitted Belfast from midnight to 2am on Sunday night/Monday morning.\nBiography.\nMcConnell is from Belfast. At 16, he joined the BBC Radio Ulster show, Across The Line, answering the phones. He later went on to co-present the show. In 2004 he moved to Radio 1 and then in February 2006 he began presenting Radio 1\'s Northern Ireland Show from 8 to 10pm on Thursday nights. He currently presents and produces the Northern Ireland edition of the "BBC Introducing" show.\nRory\n ### Answer: currently runs the promotions, management and record label Di Di Mau. In the past has staged concerts in Belfast and has released the debut album from Dutch Schultz through Di Di Mau Records.', '### Question: Domenico Mancini\n\nDomenico Mancini (born late 15th century, died in 16th century

### Loss Functions


In [8]:



def compute_reverse_kl(pretrained_model, current_model, batch, device):
    """
    Compute *backward* KL as the normal utility loss.

    Args:
        pretrained_model: reference model which is the pretrained (original) model.
        current_model: The current unlearning model.
        batch: A batch of normal data.
        device: GPU device.

    Returns:
       The KL loss.
    """

    normal_outputs = current_model(
        batch["input_ids"].to(device),
        attention_mask=batch["attention_mask"].to(device)
    )

    with torch.no_grad():
        pretrained_outputs = pretrained_model(
            batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device)
        )

    # Q: current model; P: pretrained model.
    prob_q = torch.nn.functional.softmax(normal_outputs.logits, dim=-1)
    prob_p = torch.nn.functional.softmax(pretrained_outputs.logits, dim=-1)

    # Negative KL divergence: sum(Q * log(Q/P))
    # loss = (prob_q * torch.log(prob_q / (prob_p + 1e-12))).sum(-1).mean()
    loss = - (prob_p * torch.log((prob_p + 1e-12) / prob_q)).sum(-1).mean()

    return loss

def get_answer_loss(operation, batch, model, device="cuda"):
    """
    Compute the loss on the answer (i.e. y) part.

    Args:
        operation: either "ga" (gradient ascent) or "gd" (gradient descent).
        batch: A batch of data.
        model: The unlearned model.
        device: GPU device.

    Returns:
       The loss.
    """
    assert operation in ["ga", "gd"], "Operation must be either GA or GD."
    input_ids, attention_mask, start_locs, labels = (
        batch["input_ids"].to(device),
        batch["attention_mask"].to(device),
        batch["start_locs"],
        batch["labels"].to(device),
    )
    outputs = model(input_ids, attention_mask=attention_mask)

    loss_fct = torch.nn.CrossEntropyLoss(reduction="none")
    # Shift one to predict next token.
    shift_logits = outputs.logits[:, :-1, :]
    shift_labels = labels[:, 1:]
    losses = []
    for bid in range(input_ids.shape[0]):
        one_inp, one_st = input_ids[bid], start_locs[bid]

        # GA or GD.
        position_loss = loss_fct(shift_logits[bid], shift_labels[bid])

        if operation == "ga":  # Negative the direction for GA.
            position_loss = -position_loss

        # Simply put equal weights on all answers.
        position_weight = torch.zeros_like(one_inp)
        assert len(position_weight) == len(position_loss) + 1
        position_weight[one_st:] = 1  # only focus on answer part

        # Ignore the padding part.
        position_weight[one_inp == 1] = 0
        if position_weight.sum() > 0:
            position_weight = position_weight / position_weight.sum()

        one_loss = (position_weight[:-1] * position_loss).sum()
        losses.append(one_loss)

    final_loss = torch.stack(losses).mean()

    return final_loss


def get_rand_ans_loss(bad_batch, tokenizer, normal_ans, model, K=5, device="cuda:0"):
    """
    Compute the loss of the random mismatch.

    Args:
        bad_batch: A batch of forgetting data.
        tokenizer: The tokenizer.
        normal_ans: A list of random answers.
        model: unlearned model.
        K: How many random answers sampled for each forgetting sample.
        device: GPU device.

    Returns:
       The random mismatch loss.
    """
    bad_input_ids = bad_batch["input_ids"].to(device)
    rand_ans_list = random.sample(normal_ans, k=K)
    batch_random_features = []
    for batch_idx in range(bad_input_ids.shape[0]):
        single_input_id = bad_input_ids[batch_idx, :]
        ori_text = tokenizer.decode(single_input_id)
        # Get question.
        question = ori_text.split("###")[1].split("Question:")[-1].strip()
        question_prefix = f"### Question: {question}\n ### Answer: "
        tokenized_question_prefix = tokenizer(
            question_prefix, truncation=True, padding="max_length"
        )
        # Doesn't need to minus 1 because there's a starting token in the beginning.
        start_loc = len(tokenized_question_prefix)

        # Get random answer.
        for rand_ans in rand_ans_list:
            random_sample = f"{question_prefix}{rand_ans}"

            # Tokenize.
            tokenized_rs = tokenizer(
                random_sample, truncation=True, padding="max_length"
            )
            batch_random_features.append(
                {
                    "input_ids": tokenized_rs["input_ids"],
                    "attention_mask": tokenized_rs["attention_mask"],
                    "start_locs": start_loc,
                }
            )

    # Batchify.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    batch_random = data_collator(batch_random_features)

    # GD on answer.
    random_loss = get_answer_loss("gd", batch_random, model, device=device)

    return random_loss



### Task Vector


In [9]:
import torch


class TaskVector():
    def __init__(self, pretrained_checkpoint=None, finetuned_checkpoint=None, vector=None):
        """Initializes the task vector from a pretrained and a finetuned checkpoints.

        This can either be done by passing two state dicts (one corresponding to the
        pretrained model, and another to the finetuned model), or by directly passying in
        the task vector state dict.
        """
        if vector is not None:
            self.vector = vector
        else:
            assert pretrained_checkpoint is not None and finetuned_checkpoint is not None
            with torch.no_grad():

                pretrained_state_dict = pretrained_checkpoint.state_dict()
                finetuned_state_dict = finetuned_checkpoint.state_dict()

                self.vector = {}
                for key in pretrained_state_dict:
                    if pretrained_state_dict[key].dtype in [torch.int64, torch.uint8]:
                        continue
                    self.vector[key] = finetuned_state_dict[key] - pretrained_state_dict[key]

    def __add__(self, other):
        """Add two task vectors together."""
        with torch.no_grad():
            new_vector = {}
            for key in self.vector:
                if key not in other.vector:
                    print(f'Warning, key {key} is not present in both task vectors.')
                    continue
                new_vector[key] = self.vector[key] + other.vector[key]
        return TaskVector(vector=new_vector)

    def __radd__(self, other):
        if other is None or isinstance(other, int):
            return self
        return self.__add__(other)

    def __neg__(self):
        """Negate a task vector."""
        with torch.no_grad():
            new_vector = {}
            for key in self.vector:
                new_vector[key] = - self.vector[key]
        return TaskVector(vector=new_vector)

    def apply_to(self, pretrained_model, scaling_coef=1.0):
        """Apply a task vector to a pretrained model."""
        with torch.no_grad():
            new_state_dict = {}
            pretrained_state_dict = pretrained_model.state_dict()
            for key in pretrained_state_dict:
                if key not in self.vector:
                    print(f'Warning: key {key} is present in the pretrained state dict but not in the task vector')
                    continue
                new_state_dict[key] = pretrained_state_dict[key] + scaling_coef * self.vector[key]
        pretrained_model.load_state_dict(new_state_dict, strict=False)
        return pretrained_model


    # You can uncomment the following version if you don't have enough GPU memory to apply the task vector in one go
    # Split and reassemble the task vector using multiple chunks

    # def apply_to(self, pretrained_model, scaling_coef=1.0, chunk_size=500):
    #     """Apply a task vector to a pretrained model in chunks."""
    #     with torch.no_grad():
    #         pretrained_state_dict = pretrained_model.state_dict()
    #         keys = list(self.vector.keys())  # Get all the parameter keys in the task vector
    #         total_keys = len(keys)
    #         for i in range(0, total_keys, chunk_size):
    #             new_state_dict = {}
    #             for key in keys[i:i + chunk_size]:
    #                 if key not in pretrained_state_dict:
    #                     print(f'Warning: key {key} is present in the task vector but not in the pretrained model')
    #                     continue
    #                 # Apply scaling and update the parameter
    #                 new_state_dict[key] = pretrained_state_dict[key] + scaling_coef * self.vector[key]
    #
    #             # Partially load the updated state dict to the model
    #             pretrained_model.load_state_dict(new_state_dict, strict=False)
    #     return pretrained_model

### Training

In [10]:
!mkdir semeval25-unlearning-model semeval25-unlearning-model/task_vector

mkdir: cannot create directory ‘semeval25-unlearning-model’: File exists


In [13]:
torch.device("cuda" if torch.cuda.is_available() else "cpu")

device(type='cpu')

In [12]:
from transformers import AutoTokenizer, AutoModelForCausalLM, get_scheduler
from torch.optim import AdamW
import torch
import logging
from torch.utils.data import DataLoader
import random
import numpy as np

torch.manual_seed(8888)
np.random.seed(8888)
random.seed(8888)



# Training parameters
num_training_steps = 1000
bad_weight = 2.5
random_weight = 1
normal_weight = 0.5
batch_size = 2
lr = 2e-4

model_save_dir = "semeval25-unlearning-model"
task_vector_saving_path = "semeval25-unlearning-model/task_vector"


# Create dataloaders (implement create_*_dataloader functions)
forget_train_dl = create_forget_dataloader(tokenizer, forget_train_df, batch_size=batch_size)
retain_train_dl = create_retain_dataloader(tokenizer, retain_train_df, batch_size=batch_size)

# Initialize optimizer
optimizer = AdamW(model.parameters(), lr=lr)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

model.train()

# Usage (add this before training loop)
bad_ans = get_harmful_responses(forget_train_df)


idx = 0
for _ in range(num_training_steps):
    for bad_batch, normal_batch in zip(forget_train_dl, retain_train_dl):
        # Move batches to device
        bad_batch = {k: v.to(device) for k, v in bad_batch.items()}
        normal_batch = {k: v.to(device) for k, v in normal_batch.items()}

        # Guided Distortion Module
        bad_loss = get_answer_loss("gd", bad_batch, model)

        # Random Disassociation Module
        random_loss = get_rand_ans_loss(bad_batch, tokenizer, bad_ans, model, K=5)

        # Preservation Divergence Module
        normal_loss = compute_reverse_kl(pretrained_model, model, normal_batch)

        # Total loss
        loss = (bad_weight * bad_loss +
               random_weight * random_loss +
               normal_weight * normal_loss)

        # Backpropagation
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        # Logging
        stats = (f"batch: {idx}, GD_loss: {bad_loss:.2f}, "
                f"RD_loss: {random_loss:.2f}, reversed_kl_loss: {normal_loss:.2f}, "
                f"combined_loss: {loss:.2f}")
        logging.info(stats)
        print(stats)
        idx += 1

# Save results
print("Saving model...")
model.save_pretrained(model_save_dir)
logging.info("Unlearning finished")

# Create and save task vector
task_vector = TaskVector(pretrained_model, model)
neg_task_vector = -task_vector
new_benign_model = neg_task_vector.apply_to(pretrained_model)
new_benign_model.save_pretrained(task_vector_saving_path)

print("Done saving task vector files!")



RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx