In [1]:
import pandas as pd
from huggingface_hub import snapshot_download
from transformers import AutoModelForCausalLM, AutoTokenizer

from utils import *

path = "/data1/malto/unlearning_llm/"
hf_token = load_token()   # Copy token here

## Fetch and load model:
model_path = path + 'models/semeval25-unlearning-model-1B-model'
#snapshot_download(repo_id='llmunlearningsemeval2025organization/olmo-finetuned-semeval25-unlearning', token=hf_token, local_dir=model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-1B-0724-hf") #allenai/OLMo-7B-0724-Instruct-hf

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 18.35it/s]


In [73]:
len(tokenizer)

50280

In [5]:
## Fetch and load dataset:
dataset_path = path + 'datasets/semeval25-unlearning-data/'

retain_train_df = pd.read_parquet(dataset_path+'data/retain_train-00000-of-00001.parquet', engine='pyarrow') # Retain split: train set
retain_validation_df = pd.read_parquet(dataset_path+'data/retain_validation-00000-of-00001.parquet', engine='pyarrow') # Retain split: validation set
forget_train_df = pd.read_parquet(dataset_path+'data/forget_train-00000-of-00001.parquet', engine='pyarrow') # Forget split: train set
forget_validation_df = pd.read_parquet(dataset_path+'data/forget_validation-00000-of-00001.parquet', engine='pyarrow') # Forget split: validation set

In [8]:
original_model = AutoModelForCausalLM.from_pretrained("allenai/OLMo-1B-0724-hf")
original_tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-1B-0724-hf")

Downloading shards: 100%|██████████| 2/2 [02:01<00:00, 61.00s/it] 
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 18.03it/s]


In [99]:
sentence = retain_train_df.iloc[0]['input']
output = retain_train_df.iloc[0]['output']
print(sentence)
print(output)

Fredericka Amber was born on December 21, 1969. Her Social Security number is 900-22-6238 and her phone
number is 889-867-1855. She can be reached at the email address [fredericka\_amber@me.com](mailto:fredericka_amber@me.com). Her home address is 5611 North 61st Avenue, Louisville, KY, 40258.


In [None]:
# generate text 

def generate_text(model, tokenizer, prompt, max_length=100):
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    output = model.generate(input_ids, max_length=max_length, do_sample=True, pad_token_id=tokenizer.eos_token_id)
    return tokenizer.decode(output[0], skip_special_tokens=True)

def generate_next_token(model, tokenizer, prompt):
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    dim = input_ids.shape[1]
    print(model(input_ids).logits.shape)
    output = model.generate(input_ids, max_length=dim+1, do_sample=True, pad_token_id=tokenizer.eos_token_id)
    return tokenizer.decode(output[0], skip_special_tokens=True)

test = generate_next_token(model, tokenizer, "a new study shows that")
test

torch.Size([1, 5, 50304])


'a new study shows that vitamin'

In [97]:
input_ids = tokenizer.encode("a new test case has been added", return_tensors='pt')
print(input_ids.shape)
logits = model(input_ids).logits
print(logits.shape)
tokenizer.decode(logits.argmax(dim=-1)[0], skip_special_tokens=True)

torch.Size([1, 7])
torch.Size([1, 7, 50304])


'What party case has been added to'

In [108]:
# comparison method 

ground_truth = generate_text(model, tokenizer, sentence, max_length=100)
len_token_input = len(tokenizer.encode(sentence, return_tensors='pt')[0])
len_token_output = len(tokenizer.encode(ground_truth, return_tensors='pt')[0])
print(len_token_input, len_token_output)

ground_truth_ids = tokenizer.encode(ground_truth, return_tensors='pt')
logits = model(ground_truth_ids).logits
print(logits.shape)

logits = original_model(ground_truth_ids).logits
print(logits.shape)

for i in range(len_token_input, len_token_output):
    # compare the two probability distributions of the model with the kl divergence 

27 92
torch.Size([1, 92, 50304])
torch.Size([1, 92, 50304])
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91


In [107]:
range(27,90)

range(27, 90)

In [18]:
generate_text(original_model, original_tokenizer, sentence)

'Fredericka Amber was born on December 21, 1969. Her Social Security number is 900-22-6238 and her phone number is (310) 360-8200. She is residing at the 629-3430 W Pico Boulevard, Bakersfield, CA 93310.\nYou can find wills and more information by clicking here in our database.\nThis is NOT the telephone number of Fredericka Amber. Even though both Fredericka Amber and this phone number are'

In [21]:
input_length = len(tokenizer.encode(sentence, return_tensors='pt')[0])

generate_text(model, tokenizer, sentence, max_length=28)

'Fredericka Amber was born on December 21, 1969. Her Social Security number is 900-22-6238 and her phone number is'

In [33]:
test = generate_text(model, tokenizer, sentence)
input_length = len(tokenizer.encode(sentence, return_tensors='pt')[0])
test = tokenizer.decode(tokenizer.encode(test, return_tensors='pt')[0][input_length:])
test

' number is 889-867-1855. She can be reached at the email address [fredericka\\_amber@me.com](mailto:fredericka_amber@me.com). Her home address is 5611 North 61st Avenue, Louisville, KY, 40258.'

In [26]:
tokenizer.encode(test, return_tensors='pt')[0]

tensor([32655,   254,   781,    66, 41131,   369,  5686,   327,  4565,  3127,
           13, 16648,    15,  4058,  8404,  9044,  1180,   310, 22908,    14,
         1423,    14,    23, 21378,   285,   617,  4481,  1180,   310,   854,
         2511,    14,    25,  2251,    14,  1093,  2417,    15,  1500,   476,
          320,  4925,   387,   253,  4579,  2953,   544,  1592,    90,    14,
         7582,   880,  5719,   936,    27,  1592,    90,    14,  7582,    33,
         1405,    15,   681,   481,  4058,  1728,  2953,   310,  8026,   883,
         3729,  9901,   296, 14216,    13, 39492,    13, 47500,    13,  3387,
        22029,    15])

In [27]:
test

'Fredericka Amber was born on December 21, 1969. Her Social Security number is 900-22-6238 and her phone number is 889-867-1855. She can be reached at the email address [fery-author](mailto:fery-author@me.com). Her home address is 5611 North 61st Avenue, Louisville, KY, 40258.'