### Initial Setup


In [1]:
!pip install --upgrade transformers huggingface_hub; mkdir semeval25-unlearning-model; mkdir semeval25-unlearning-data


mkdir: cannot create directory ‘semeval25-unlearning-model’: File exists
mkdir: cannot create directory ‘semeval25-unlearning-data’: File exists


### Loading model and datasets


The dataset contains disjoint retain and forget splits in parquet files, and includes following fields: id, input, output, task.
* Subtask 1: Long form synthetic creative documents spanning different
genres.
* Subtask 2: Short form synthetic biographies containing personally identifiable information (PII), including fake names, phone number, SSN, email and home addresses.
* Subtask 3: Real documents sampled from the target model’s training dataset.

In [12]:
import pandas as pd
from huggingface_hub import snapshot_download
from transformers import AutoModelForCausalLM, AutoTokenizer
from google.colab import userdata
hf_token = userdata.get('HF_TOKEN')

## Fetch and load model:
snapshot_download(repo_id='llmunlearningsemeval2025organization/olmo-1B-model-semeval25-unlearning', token=hf_token, local_dir='semeval25-unlearning-1B-model')
model = AutoModelForCausalLM.from_pretrained('semeval25-unlearning-1B-model').to('cuda')

## Fetch and load dataset:
snapshot_download(repo_id='llmunlearningsemeval2025organization/semeval25-unlearning-dataset-public', token=hf_token, local_dir='semeval25-unlearning-data', repo_type="dataset")
retain_train_df = pd.read_parquet('semeval25-unlearning-data/data/retain_train-00000-of-00001.parquet', engine='pyarrow') # Retain split: train set
retain_validation_df = pd.read_parquet('semeval25-unlearning-data/data/retain_validation-00000-of-00001.parquet', engine='pyarrow') # Retain split: validation set
forget_train_df = pd.read_parquet('semeval25-unlearning-data/data/forget_train-00000-of-00001.parquet', engine='pyarrow') # Forget split: train set
forget_validation_df = pd.read_parquet('semeval25-unlearning-data/data/forget_validation-00000-of-00001.parquet', engine='pyarrow') # Forget split: validation set
!mkdir train validation
retain_train_df.to_json('train/retain.jsonl', orient='records', lines=True); forget_train_df.to_json('train/forget.jsonl', orient='records', lines=True)
retain_validation_df.to_json('validation/retain.jsonl', orient='records', lines=True); forget_validation_df.to_json('validation/forget.jsonl', orient='records', lines=True)

Fetching 7 files:   0%|          | 0/7 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Fetching 10 files:   0%|          | 0/10 [00:00<?, ?it/s]

mkdir: cannot create directory ‘train’: File exists
mkdir: cannot create directory ‘validation’: File exists


In [3]:
print("Retain set structure")
retain_train_df.head()
print("Forget set structure")
forget_train_df.head()


Retain set structure
Forget set structure


Unnamed: 0,id,input,output,task,split
0,"""2ebbbb06-ab81-4bdf-af75-0157c7178a82""sc1","In the mystical city of Deadesius, where magic...",the power to break any curse. Armed with her m...,Task1,forget
1,"""2ebbbb06-ab81-4bdf-af75-0157c7178a82""qa0",Who did Catherina seek to protect from Marcile?,The city of Deadesius.,Task1,forget
2,67148749sc1,Soubhagya Kumar Misra\n\nSoubhagya Kumar Misra...,"Odia poetry, the Odisha Sahitya Akademi awarde...",Task3,forget
3,67148749qa0,Which poetry collection by Misra won the Sahit...,Dwa Suparna,Task3,forget
4,"""4477840f-1840-4aae-96d8-5389db92d7e0""sc1","Sharity, a vivacious young woman with an unque...","rugged, with a mess of dark hair and a pair of...",Task1,forget


# Create Dataloaders for Retain and Forget Set


In [4]:
!pip install datasets



In [5]:
from datasets import Dataset
from transformers import DataCollatorForLanguageModeling

def create_forget_dataloader(tokenizer, df, batch_size=64):
    """
    Create a dataloader for the forget set from a pandas DataFrame.

    Args:
        tokenizer: Tokenizer for the model
        df: pandas DataFrame with columns ['id', 'input', 'output']
        batch_size: Batch size for the dataloader

    Returns:
        DataLoader for the forget set
    """
    def preprocess(examples):
        results = {"input_ids": [], "attention_mask": [], "start_locs": []}

        for prompt, response in zip(examples["input"], examples["output"]):
            # Format the text with question and answer
            text = f"### Question: {prompt}\n ### Answer: {response}"
            tokenized = tokenizer(text, truncation=True, padding="max_length")

            results["input_ids"].append(tokenized["input_ids"])
            results["attention_mask"].append(tokenized["attention_mask"])

            # Calculate start location of answer
            question_prefix = f"### Question: {prompt}\n ### Answer: "
            tokenized_prefix = tokenizer(question_prefix, truncation=True, padding="max_length")
            results["start_locs"].append(len(tokenized_prefix["input_ids"]) - 1)

        return results

    # Convert DataFrame to HuggingFace Dataset
    dataset = Dataset.from_pandas(df)

    # Apply preprocessing
    dataset = dataset.map(
        preprocess,
        batched=True,
        remove_columns=["id", "input", "output"]
    )

    # Set format to PyTorch
    dataset.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "start_locs"]
    )

    # Create dataloader
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        collate_fn=data_collator,
        shuffle=True
    )

    return dataloader

def create_retain_dataloader(tokenizer, df, batch_size=64):
    """
    Create a dataloader for the retain set from a pandas DataFrame.

    Args:
        tokenizer: Tokenizer for the model
        df: pandas DataFrame with columns ['id', 'input', 'output']
        batch_size: Batch size for the dataloader

    Returns:
        DataLoader for the retain set
    """
    def preprocess(examples):
        results = {"input_ids": [], "attention_mask": []}

        for prompt, response in zip(examples["input"], examples["output"]):
            text = f"### Question: {prompt}\n ### Answer: {response}"
            tokenized = tokenizer(text, truncation=True, padding="max_length")

            results["input_ids"].append(tokenized["input_ids"])
            results["attention_mask"].append(tokenized["attention_mask"])

        return results

    # Convert DataFrame to HuggingFace Dataset
    dataset = Dataset.from_pandas(df)

    # Apply preprocessing
    dataset = dataset.map(
        preprocess,
        batched=True,
        remove_columns=["id", "input", "output"]
    )

    # Set format to PyTorch
    dataset.set_format(
        type="torch",
        columns=["input_ids", "attention_mask"]
    )

    # Create dataloader
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        collate_fn=data_collator,
        shuffle=True
    )

    return dataloader


### Define Loss functions


In [17]:
def compute_reverse_kl(pretrained_model, current_model, batch, device):
    """
    Compute *backward* KL as the normal utility loss.
    """
    # Move entire batch to device
    batch = {k: v.to(device) for k, v in batch.items()}

    normal_outputs = current_model(
        batch["input_ids"],
        attention_mask=batch["attention_mask"]
    )

    with torch.no_grad():
        pretrained_outputs = pretrained_model(
            batch["input_ids"],
            attention_mask=batch["attention_mask"]
        )

    # Q: current model; P: pretrained model.
    prob_q = torch.nn.functional.softmax(normal_outputs.logits, dim=-1)
    prob_p = torch.nn.functional.softmax(pretrained_outputs.logits, dim=-1)

    # Negative KL divergence: sum(Q * log(Q/P))
    loss = - (prob_p * torch.log((prob_p + 1e-12) / prob_q)).sum(-1).mean()

    return loss

def get_answer_loss(operation, batch, model, device="cuda"):
    """
    Compute the loss on the answer (i.e. y) part.
    """
    # Move entire batch to device
    batch = {k: v.to(device) for k, v in batch.items()}

    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    start_locs = batch["start_locs"].to(device)
    labels = batch["labels"].to(device)

    outputs = model(input_ids, attention_mask=attention_mask)

    loss_fct = torch.nn.CrossEntropyLoss(reduction="none").to(device)
    # Shift one to predict next token.
    shift_logits = outputs.logits[:, :-1, :].to(device)
    shift_labels = labels[:, 1:].to(device)

    losses = []
    for bid in range(input_ids.shape[0]):
        one_inp = input_ids[bid]
        one_st = start_locs[bid]

        position_loss = loss_fct(shift_logits[bid], shift_labels[bid])

        if operation == "ga":
            position_loss = -position_loss

        position_weight = torch.zeros_like(one_inp).to(device)
        assert len(position_weight) == len(position_loss) + 1
        position_weight[one_st:] = 1

        position_weight[one_inp == 1] = 0  # Ignore padding
        if position_weight.sum() > 0:
            position_weight = position_weight / position_weight.sum()

        one_loss = (position_weight[:-1] * position_loss).sum()
        losses.append(one_loss)

    return torch.stack(losses).mean()

def get_rand_ans_loss(bad_batch, tokenizer, normal_ans, model, K=5, device="cuda"):
    """
    Compute the loss of the random mismatch.
    """
    # Move entire batch to device
    bad_batch = {k: v.to(device) for k, v in bad_batch.items()}

    bad_input_ids = bad_batch["input_ids"]
    rand_ans_list = random.sample(normal_ans, k=K)
    batch_random_features = []

    for batch_idx in range(bad_input_ids.shape[0]):
        single_input_id = bad_input_ids[batch_idx]
        ori_text = tokenizer.decode(single_input_id)

        # Extract question
        question = ori_text.split("###")[1].split("Question:")[-1].strip()
        question_prefix = f"### Question: {question}\n ### Answer: "

        # Tokenize on device
        tokenized_question_prefix = tokenizer(
            question_prefix,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        ).to(device)

        start_loc = tokenized_question_prefix.input_ids.shape[1]

        for rand_ans in rand_ans_list:
            random_sample = f"{question_prefix}{rand_ans}"
            tokenized_rs = tokenizer(
                random_sample,
                truncation=True,
                padding="max_length",
                return_tensors="pt"
            ).to(device)

            batch_random_features.append({
                "input_ids": tokenized_rs.input_ids.squeeze(0),
                "attention_mask": tokenized_rs.attention_mask.squeeze(0),
                "start_locs": torch.tensor([start_loc], device=device)
            })

    def get_harmful_responses(forget_train_df):
      """Extracts harmful responses from the forget training dataframe."""
      # Assuming your dataframe has a column called 'output' containing harmful responses
      return forget_train_df['output'].tolist()

    # Batchify on device
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    batch_random = data_collator(batch_random_features)
    batch_random = {k: v.to(device) for k, v in batch_random.items()}

    return get_answer_loss("gd", batch_random, model, device)


### Task Vector


In [None]:
import torch


class TaskVector():
    def __init__(self, pretrained_checkpoint=None, finetuned_checkpoint=None, vector=None):
        """Initializes the task vector from a pretrained and a finetuned checkpoints.

        This can either be done by passing two state dicts (one corresponding to the
        pretrained model, and another to the finetuned model), or by directly passying in
        the task vector state dict.
        """
        if vector is not None:
            self.vector = vector
        else:
            assert pretrained_checkpoint is not None and finetuned_checkpoint is not None
            with torch.no_grad():

                pretrained_state_dict = pretrained_checkpoint.state_dict()
                finetuned_state_dict = finetuned_checkpoint.state_dict()

                self.vector = {}
                for key in pretrained_state_dict:
                    if pretrained_state_dict[key].dtype in [torch.int64, torch.uint8]:
                        continue
                    self.vector[key] = finetuned_state_dict[key] - pretrained_state_dict[key]

    def __add__(self, other):
        """Add two task vectors together."""
        with torch.no_grad():
            new_vector = {}
            for key in self.vector:
                if key not in other.vector:
                    print(f'Warning, key {key} is not present in both task vectors.')
                    continue
                new_vector[key] = self.vector[key] + other.vector[key]
        return TaskVector(vector=new_vector)

    def __radd__(self, other):
        if other is None or isinstance(other, int):
            return self
        return self.__add__(other)

    def __neg__(self):
        """Negate a task vector."""
        with torch.no_grad():
            new_vector = {}
            for key in self.vector:
                new_vector[key] = - self.vector[key]
        return TaskVector(vector=new_vector)

    def apply_to(self, pretrained_model, scaling_coef=1.0):
        """Apply a task vector to a pretrained model."""
        with torch.no_grad():
            new_state_dict = {}
            pretrained_state_dict = pretrained_model.state_dict()
            for key in pretrained_state_dict:
                if key not in self.vector:
                    print(f'Warning: key {key} is present in the pretrained state dict but not in the task vector')
                    continue
                new_state_dict[key] = pretrained_state_dict[key] + scaling_coef * self.vector[key]
        pretrained_model.load_state_dict(new_state_dict, strict=False)
        return pretrained_model


    # You can uncomment the following version if you don't have enough GPU memory to apply the task vector in one go
    # Split and reassemble the task vector using multiple chunks

    # def apply_to(self, pretrained_model, scaling_coef=1.0, chunk_size=500):
    #     """Apply a task vector to a pretrained model in chunks."""
    #     with torch.no_grad():
    #         pretrained_state_dict = pretrained_model.state_dict()
    #         keys = list(self.vector.keys())  # Get all the parameter keys in the task vector
    #         total_keys = len(keys)
    #         for i in range(0, total_keys, chunk_size):
    #             new_state_dict = {}
    #             for key in keys[i:i + chunk_size]:
    #                 if key not in pretrained_state_dict:
    #                     print(f'Warning: key {key} is present in the task vector but not in the pretrained model')
    #                     continue
    #                 # Apply scaling and update the parameter
    #                 new_state_dict[key] = pretrained_state_dict[key] + scaling_coef * self.vector[key]
    #
    #             # Partially load the updated state dict to the model
    #             pretrained_model.load_state_dict(new_state_dict, strict=False)
    #     return pretrained_model

### Training

In [7]:
!mkdir semeval25-unlearning-model semeval25-unlearning-model/task_vector

mkdir: cannot create directory ‘semeval25-unlearning-model’: File exists
mkdir: cannot create directory ‘semeval25-unlearning-model/task_vector’: File exists


In [18]:
from transformers import AutoTokenizer, AutoModelForCausalLM, get_scheduler
from torch.optim import AdamW
import torch
import logging
from torch.utils.data import DataLoader
import random
import numpy as np

torch.manual_seed(8888)
np.random.seed(8888)
random.seed(8888)

# Initialize device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Training parameters
num_training_steps = 1000
bad_weight = 2.5
random_weight = 1
normal_weight = 0.5
batch_size = 2
lr = 2e-4

model_save_dir = "semeval25-unlearning-model"
task_vector_saving_path = "semeval25-unlearning-model/task_vector"

# Initialize model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-1B-0724-hf")
pretrained_model = AutoModelForCausalLM.from_pretrained("semeval25-unlearning-1B-model").to(device)

# Create dataloaders (implement create_*_dataloader functions)
forget_train_dl = create_forget_dataloader(tokenizer, forget_train_df, batch_size=batch_size)
retain_train_dl = create_retain_dataloader(tokenizer, retain_train_df, batch_size=batch_size)

# Initialize optimizer
optimizer = AdamW(model.parameters(), lr=lr)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

model.train()

# Usage (add this before training loop)
bad_ans = get_harmful_responses(forget_train_df)


idx = 0
for _ in range(num_training_steps):
    for bad_batch, normal_batch in zip(forget_train_dl, retain_train_dl):
        # Move batches to device
        bad_batch = {k: v.to(device) for k, v in bad_batch.items()}
        normal_batch = {k: v.to(device) for k, v in normal_batch.items()}

        # Guided Distortion Module
        bad_loss = get_answer_loss("gd", bad_batch, model)

        # Random Disassociation Module
        random_loss = get_rand_ans_loss(bad_batch, tokenizer, bad_ans, model, K=5)

        # Preservation Divergence Module
        normal_loss = compute_reverse_kl(pretrained_model, model, normal_batch)

        # Total loss
        loss = (bad_weight * bad_loss +
               random_weight * random_loss +
               normal_weight * normal_loss)

        # Backpropagation
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        # Logging
        stats = (f"batch: {idx}, GD_loss: {bad_loss:.2f}, "
                f"RD_loss: {random_loss:.2f}, reversed_kl_loss: {normal_loss:.2f}, "
                f"combined_loss: {loss:.2f}")
        logging.info(stats)
        print(stats)
        idx += 1

# Save results
print("Saving model...")
model.save_pretrained(model_save_dir)
logging.info("Unlearning finished")

# Create and save task vector
task_vector = TaskVector(pretrained_model, model)
neg_task_vector = -task_vector
new_benign_model = neg_task_vector.apply_to(pretrained_model)
new_benign_model.save_pretrained(task_vector_saving_path)

print("Done saving task vector files!")



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 394.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 6.12 MiB is free. Process 50871 has 14.73 GiB memory in use. Of the allocated memory 14.04 GiB is allocated by PyTorch, and 579.55 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)