In [None]:
!pip install -q transformers datasets rouge_score

In [2]:
import json
import torch
import random
import datasets
import numpy as np
import pandas as pd

from tqdm import tqdm
from accelerate import Accelerator
from collections import defaultdict
from statistics import mean, harmonic_mean
from rouge_score import rouge_scorer
from sklearn.metrics import roc_curve, auc
from transformers import AutoTokenizer, AutoModelForCausalLM

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## args

In [3]:
data_path = '/content/drive/MyDrive/GIL/Unlearning/data/validation/'
checkpoint_path = '/content/drive/MyDrive/GIL/Unlearning/fine_tune_retain'
mia_data_path = '/content/drive/MyDrive/GIL/Unlearning/mia_data/'
output_dir = '/content/drive/MyDrive/GIL/Unlearning/'
model = '/content/drive/MyDrive/GIL/Unlearning/fine_tune_retain' #allenai/OLMo-1B-0724-hf'
mmlu_metrics_file_path = None
max_new_tokens = 256
debug = False
keep_files = False
compute_metrics_only = False

## Functions

In [17]:
def inference( model, tokenizer):
    forget_file = data_path + 'forget.jsonl'
    retain_file = data_path + 'retain.jsonl'

    accelerator = Accelerator()
    model.to(accelerator.device)

    for split, train_file in tqdm([('retain', retain_file), ('forget', forget_file)]):
        print('\nlen retain_file', len(retain_file))
        print('len forget_file', len(forget_file))
        data_files = {}
        dataset_args = {}
        if train_file is not None:
            data_files["train"] = train_file
        raw_datasets = datasets.load_dataset(
            "json",
            data_files=data_files,
            **dataset_args,
        )
        train_dataset = raw_datasets["train"]

        output_dic = defaultdict(lambda :{'id': [], 'task': [], 'input': [], 'expected_output': [], 'model_output': [], 'nll': []})

        with accelerator.split_between_processes(train_dataset, apply_padding=True) as data:
          #Agregué esto, no se si sea lo correcto
          print('len data[input]',len(data["input"]))
          for i in range(len(data["input"])):
              questions, answers = list(data["input"][0].values()), list(data["output"][0].values())
              id = list(data["id"][0].values())
              task = list(data["task"][0].values())
              for idx in range(len(questions)):
         #----------
                  #question, answer = data[idx]["input"], data[idx]["output"]
                  question, answer = str(questions[idx]), str(answers[idx])
                  #print(question, type(question), type(answer))
                  output_dic[accelerator.process_index]['id'].append(id[idx])
                  output_dic[accelerator.process_index]['task'].append(task[idx])
                  output_dic[accelerator.process_index]['input'].append(questions[idx])
                  output_dic[accelerator.process_index]['expected_output'].append(answer)
                  input_ids = tokenizer(
                      question,
                      return_tensors='pt',
                      padding=True
                  ).input_ids.to(model.device)

                  combined_input_ids = tokenizer(
                      question+answer,
                      return_tensors='pt',
                  ).input_ids.to(model.device)
                  combined_target_ids = combined_input_ids.clone()
                  combined_target_ids[:,:len(input_ids[0])] = -100

                  with torch.no_grad():
                      out = model.generate(input_ids, max_new_tokens=max_new_tokens, do_sample=False, use_cache=True, pad_token_id=tokenizer.eos_token_id)
                      output_ids = out[:, len(input_ids[0]):]
                      output = tokenizer.batch_decode(
                          output_ids,
                          skip_special_tokens=True,
                          clean_up_tokenization_spaces=True)[0]
                      output_dic[accelerator.process_index]['model_output'].append(output)

                      # For Perplexity
                      out = model(combined_input_ids, labels=combined_target_ids)
                      if debug:
                          print(tokenizer.batch_decode(
                              torch.argmax(
                                  torch.nn.functional.softmax(
                                      torch.tensor(out.logits),
                                      dim=2),
                                  dim=2)[:, len(input_ids[0]):],
                              skip_special_tokens=True,
                              clean_up_tokenization_spaces=True)[0])
                      neg_log_likelihood = out.loss.item()
                      output_dic[accelerator.process_index]['nll'].append(neg_log_likelihood)

        accelerator.wait_for_everyone()

        #if args.debug:
        #    print([len(value) for value in output_dic[accelerator.process_index].values()])
        output_df = pd.DataFrame.from_dict(output_dic[accelerator.process_index])
        output_file_name = f"{output_dir}/{split}_{accelerator.process_index}.csv"
        output_df.to_csv(output_file_name, index=False)

In [18]:
def mia_attacks(model, tokenizer):
    member_file = mia_data_path + 'member.jsonl'
    nonmember_file = mia_data_path + 'nonmember.jsonl'

    accelerator = Accelerator()
    model.to(accelerator.device)

    for dataset, train_file in [('member', member_file), ('nonmember', nonmember_file)]:
        data_files = {}
        dataset_args = {}
        if train_file is not None:
            data_files["train"] = train_file
        raw_datasets = datasets.load_dataset(
            "json",
            data_files=data_files,
            **dataset_args,
        )
        train_dataset = raw_datasets["train"]

        output_dic = defaultdict(lambda :{'id': [], 'nll': []})

        with accelerator.split_between_processes(train_dataset, apply_padding=True) as data:
            for idx in tqdm(range(len(data['document']))):
                document = data["document"][idx]
                output_dic[accelerator.process_index]['id'].append(data["id"][idx])
                input_ids = tokenizer(
                    document,
                    return_tensors='pt'
                ).input_ids.to(model.device)

                target_ids = input_ids.clone()

                with torch.no_grad():
                    out = model(input_ids, labels=target_ids)
                    neg_log_likelihood = out.loss.item()
                    output_dic[accelerator.process_index]['nll'].append(neg_log_likelihood)

        accelerator.wait_for_everyone()

        output_df = pd.DataFrame.from_dict(output_dic[accelerator.process_index])

        #results_dir = os.path.join(args.output_dir, 'mia_results')
        #Path(results_dir).mkdir(parents=True, exist_ok=True)
        output_file_name = f"{output_dir}/mia_results/{dataset}_{accelerator.process_index}.csv"
        #if args.debug:
        #    print('Saving to: ', output_file_name)
        output_df.to_csv(output_file_name, index=False)

In [6]:
def compute_auc(member_loss, nonmember_loss):
    assert not np.any(np.isnan(member_loss))
    assert not np.any(np.isnan(nonmember_loss))
    combined_loss = member_loss + nonmember_loss
    combined_loss = -1 * np.array(combined_loss)
    combined_labels = len(member_loss) * [1] + len(nonmember_loss) * [0]
    fp, tp, _ = roc_curve(combined_labels, combined_loss)

    auc_score = float(auc(fp, tp))

    return auc_score

In [30]:
def compute_metrics():
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

    results = {}
    aggregate_scores_list = []
    for split in ['forget', 'retain']:
        #files = glob.glob(output_dir + '/{}_*.csv'.format(split))
        #if len(files) == 0:
        #    print("[ERROR] Missing inference files, rerun script with inference first")
        #    return  # sys.exit(1) throws a long traceback so just return for now
        files = [f'{output_dir}/forget_0.csv', f'{output_dir}/retain_0.csv']
        df_list = [pd.read_csv(f) for f in files]
        #if not args.keep_files:
        #    _ = [os.remove(f) for f in files]
        df = pd.concat(df_list, ignore_index=True)

        df['regurgitation-score-rouge-1'] = None
        df['regurgitation-score'] = None
        df['knowledge-score'] = None
        ground_truths = df['expected_output'].tolist()
        gen_outputs = df['model_output'].tolist()

        for i, (gen, gt) in enumerate(zip(gen_outputs, ground_truths)):
            if df.loc[i, 'id'][:-1].endswith('sc'):
                rouge_scores = scorer.score(str(gt), str(gen))
                df.loc[i, 'regurgitation-score-rouge-1'] = rouge_scores['rouge1'].recall
                df.loc[i, 'regurgitation-score'] = rouge_scores['rougeL'].recall
            elif df.loc[i, 'id'][:-1].endswith('qa'):
                 df.loc[i, 'knowledge-score'] = int(str(gt).strip().lower() == str(gen).strip().lower())

        results[split+'-set'] = {'overall-regurgitation-score': np.mean(df['regurgitation-score']), 'overall-knowledge-score': np.mean(df['knowledge-score'])}
        split_aggregate_scores_dict = df.groupby('task')[['regurgitation-score', 'knowledge-score']].mean().to_dict(orient='index')
        results[split+'-set'].update(split_aggregate_scores_dict)
        split_aggregate_score_values = [float(val) for inner in split_aggregate_scores_dict.values() for val in inner.values()]
        if split == 'forget':
            split_aggregate_score_values = [(1 - val) for val in split_aggregate_score_values]

        aggregate_scores_list.extend(split_aggregate_score_values)

    if mia_data_path is not None:
        mia_results_dir = output_dir+'mia_results'
        mia_results = {}
        for dataset in ['member', 'nonmember']:
            #files = glob.glob(mia_results_dir + '/{}_*.csv'.format(dataset))
            #if len(files) == 0:
            #    print("[ERROR] Missing mia files, rerun script with inference first")
            #    return  # sys.exit(1) throws a long traceback so just return for no
            files = [f'{mia_results_dir}/member_0.csv', f'{mia_results_dir}/nonmember_0.csv']
            df_list = [pd.read_csv(f) for f in files]
            df = pd.concat(df_list, ignore_index=True)
            mia_results[dataset] = df['nll'].tolist()

        #if not keep_files:
        #    shutil.rmtree(mia_results_dir)

        auc = compute_auc(mia_results['member'], mia_results['nonmember'])
        # Best MIA rates we can get are ~0.5. 1 implies model still remembers the forget set
        results['mia_loss_acc'] = auc
#        aggregate_scores_list.append(1 - auc)

    if mmlu_metrics_file_path is not None:
        with open(mmlu_metrics_file_path) as inptr:
            mmlu_scores = json.loads(inptr.read())
        results['mmlu_average'] = mmlu_scores['average_acc']
#        aggregate_scores_list.append(mmlu_scores['average_acc'])

    results['aggregated-terms'] = aggregate_scores_list

    task_aggregate = harmonic_mean(aggregate_scores_list)
    results['harmonic-mean-task-aggregate'] = task_aggregate
    if 'mmlu_average' in results and 'mia_loss_acc' in results:
        results['aggregate-score'] = mean([task_aggregate, results['mmlu_average'], 1 - results['mia_loss_acc']])
    else:   # Need MMLU and MIA scores to compute the aggregate
        results['aggregate-score'] = -1

    #metrics_file = os.path.join(args.output_dir, 'evaluation_results.jsonl')
    #with open(metrics_file, 'w') as outptr:
    #    outptr.write(json.dumps(results))
    return results

## Evaluate

In [8]:
seed = 42
random.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)

# Set up accelerator
accelerator = Accelerator()
model = AutoModelForCausalLM.from_pretrained(checkpoint_path, torch_dtype=torch.bfloat16, trust_remote_code = True) # .to('cuda')
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/9.25k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.71G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/115 [00:00<?, ?B/s]

In [19]:
if not compute_metrics_only:
    inference(model, tokenizer)

  0%|          | 0/2 [00:00<?, ?it/s]


len retain_file 66
len forget_file 66
len data[input] 1


 50%|█████     | 1/2 [12:45<12:45, 765.08s/it]


len retain_file 66
len forget_file 66


Generating train split: 0 examples [00:00, ? examples/s]

len data[input] 1


100%|██████████| 2/2 [26:08<00:00, 784.15s/it]


In [21]:
if mia_data_path is not None:
    mia_attacks(model, tokenizer)

100%|██████████| 250/250 [01:01<00:00,  4.07it/s]


Generating train split: 0 examples [00:00, ? examples/s]

100%|██████████| 250/250 [01:03<00:00,  3.94it/s]


In [31]:
if accelerator.is_main_process:
    res = compute_metrics()

In [33]:
res

{'forget-set': {'overall-regurgitation-score': 0.23542884518765667,
  'overall-knowledge-score': 0.0,
  'Task1': {'regurgitation-score': 0.35715384035868203,
   'knowledge-score': 0.0},
  'Task2': {'regurgitation-score': 0.13832551140051963,
   'knowledge-score': 0.0},
  'Task3': {'regurgitation-score': 0.2136399053011814,
   'knowledge-score': 0.0}},
 'retain-set': {'overall-regurgitation-score': 0.23542884518765667,
  'overall-knowledge-score': 0.0,
  'Task1': {'regurgitation-score': 0.35715384035868203,
   'knowledge-score': 0.0},
  'Task2': {'regurgitation-score': 0.13832551140051963,
   'knowledge-score': 0.0},
  'Task3': {'regurgitation-score': 0.2136399053011814,
   'knowledge-score': 0.0}},
 'mia_loss_acc': 0.5,
 'aggregated-terms': [0.642846159641318,
  1.0,
  0.8616744885994804,
  1.0,
  0.7863600946988186,
  1.0,
  0.35715384035868203,
  0.0,
  0.13832551140051963,
  0.0,
  0.2136399053011814,
  0.0],
 'harmonic-mean-task-aggregate': 0,
 'aggregate-score': -1}