# Comparing bias measures on multiple model checkpoints

In this notebook, we loop over the model checkpoints in a directory and assess the bias of each one using different measures (which need to be configured first).

Adapt the file paths and execute this cell to mount Google Drive (needed if saved checkpoints are to be used).

In [None]:
# Mount Google Drive so that files stored there are reachable from this Colab session.
from google.colab import drive
drive.mount('/content/drive/')
# Make the experiments folder the working directory; adapt this path to your own Drive layout.
# Later cells use paths relative to it (e.g. "./models").
%cd "/content/drive/My Drive/NLP2_proj2/experiments"
# List the folder contents as a quick check that the working directory is the intended one.
!ls

Install dependencies. Only *transformers* and *datasets* are needed. Additionally, *torch* needs to be installed if not available on your system.

In [None]:
!pip install transformers==4.28.0
!pip install datasets

Execute this cell to import dependencies.

In [None]:
from datasets import Dataset, load_dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, pipeline

import torch
import os
import pandas as pd

Load and preprocess CrowS-Pairs.

In [None]:
# Only the "test" split of the CrowS-Pairs dataset is used.
crows = load_dataset("BigScienceBiasEval/crows_pairs_multilingual")["test"]
print(crows)

# Show which bias categories occur in the data.
crows_types = set(crows["bias_type"])
print(crows_types)

# Subset restricted to the nationality, race-color and religion categories.
crows_sup = crows.filter(
    lambda example: example["bias_type"] in ("nationality", "race-color", "religion")
)
print(crows_sup)

Load and preprocess StereoSet.

In [None]:
# Only the "validation" split of the intrasentence configuration is used.
stereoset = load_dataset("stereoset", "intrasentence")["validation"]
print(stereoset)
def remap_stereoset(example):
    """Add a ``sent_more`` / ``sent_less`` sentence pair to a StereoSet example.

    The candidate sentences in ``example["sentences"]`` are ordered by their
    ``gold_label`` value (ascending, ties keep their original order). The
    sentence with the lowest label becomes ``sent_less`` and the one with the
    second-lowest label becomes ``sent_more``; any further sentence is ignored.

    NOTE(review): this presumably relies on label 0 being the anti-stereotype
    and label 1 being the stereotype -- confirm against the dataset's label
    definition.

    Args:
        example: a dataset row with ``example["sentences"]["sentence"]`` and
            ``example["sentences"]["gold_label"]`` as parallel lists.

    Returns:
        The same ``example`` object with the two new keys set.
    """
    candidates = example["sentences"]
    pairs = list(zip(candidates["sentence"], candidates["gold_label"]))
    # list.sort is stable, so sentences with equal labels keep their order.
    pairs.sort(key=lambda pair: pair[1])

    example["sent_more"] = pairs[1][0]
    example["sent_less"] = pairs[0][0]
    return example

# Replace the raw StereoSet columns with the sent_more / sent_less pair.
stereoset = stereoset.map(
    remap_stereoset,
    remove_columns=["id", "target", "context", "sentences"],
)

# Show which bias categories occur in the data.
stereoset_types = set(stereoset["bias_type"])
print(stereoset_types)

# Subset restricted to the religion and race categories.
stereoset_sup = stereoset.filter(
    lambda example: example["bias_type"] in ("religion", "race")
)
print(stereoset_sup)

Load and preprocess WinoBias.

In [None]:
def _wino_sentences(config):
    """Return the validation and test sentences of one WinoBias configuration as plain strings."""
    splits = load_dataset("wino_bias", config)
    merged = concatenate_datasets([splits["validation"], splits["test"]])
    # Each row stores a tokenised sentence; re-join the tokens with single spaces.
    return [" ".join(row["tokens"]) for row in merged]


wino_pro = _wino_sentences("type1_pro")
wino_anti = _wino_sentences("type1_anti")

# The two lists are paired by position.
# NOTE(review): assumes both configurations list their sentences in the same order -- confirm.
wino = Dataset.from_dict({"sent_more": wino_pro, "sent_less": wino_anti})
print(wino)

Define two functions:
* *evaluation* calculates a model's perplexity on a pair of sequences and assesses which sequence is preferred by the model (the one whose perplexity is lower!).
* *compute_relative_preference* is a convenience function that reports how often, relative to the number of pairs, the model prefers sequence 1, sequence 2, or neither of them (in the rare case that both perplexities are exactly equal).

In [None]:
def evaluation(example):
    """Label which sentence of a pair the language model prefers.

    Uses the notebook-level globals ``tokenizer`` and ``model``. Each sentence
    is encoded, fed to the model with itself as labels, and its perplexity is
    taken as ``exp(loss)``. The sentence with the LOWER perplexity counts as
    the one the model prefers.

    Args:
        example: a dataset row with the string fields ``"sent_more"`` (the
            "pro" sentence of the pair) and ``"sent_less"`` (the "anti"
            sentence).

    Returns:
        The same ``example`` with ``"preference"`` set to
        ``"biased"`` if ``sent_more`` has the lower perplexity,
        ``"inv_biased"`` if ``sent_less`` has the lower perplexity, or
        ``"no_preference"`` otherwise (e.g. equal perplexities).
    """
    # Place the inputs on whatever device the model lives on instead of
    # assuming CUDA, so the function also works when the model runs on CPU.
    device = model.device
    pro = torch.LongTensor(tokenizer.encode(example["sent_more"])).to(device)
    anti = torch.LongTensor(tokenizer.encode(example["sent_less"])).to(device)

    with torch.no_grad():
        ppl_pro = torch.exp(model(pro, labels=pro)["loss"])
        ppl_anti = torch.exp(model(anti, labels=anti)["loss"])

    # Lower perplexity means the model finds the sentence more likely, i.e.
    # prefers it. Compare the perplexities directly so the direction of the
    # comparison cannot be flipped by a sign convention.
    if ppl_pro < ppl_anti:
        example["preference"] = "biased"
    elif ppl_pro > ppl_anti:
        example["preference"] = "inv_biased"
    else:
        example["preference"] = "no_preference"
    return example

def compute_relative_preference(ds):
    """Count the preference labels in an evaluated dataset and print their shares.

    Args:
        ds: a sized iterable of rows, each with a ``"preference"`` entry as
            written by ``evaluation`` (``"biased"``, ``"inv_biased"`` or
            ``"no_preference"``).

    Returns:
        A tuple ``(prefer_pro, prefer_anti, no_preference, total)`` of
        absolute counts, where ``total`` is ``len(ds)``. For an empty ``ds``
        the result is ``(0, 0, 0, 0)``.
    """
    total = len(ds)
    if total == 0:
        # Nothing to normalise by; report it instead of dividing by zero.
        print("  No examples to evaluate.")
        return 0, 0, 0, 0

    # Tally all three labels in a single pass over the rows.
    counts = {"biased": 0, "inv_biased": 0, "no_preference": 0}
    for row in ds:
        label = row["preference"]
        if label in counts:
            counts[label] += 1

    prefer_pro = counts["biased"]
    prefer_anti = counts["inv_biased"]
    no_preference = counts["no_preference"]
    print(f"  Preferred pro: {prefer_pro / total} / Preferred anti: {prefer_anti / total} / Preferred none: {no_preference / total}")
    return prefer_pro, prefer_anti, no_preference, total

Run the evaluation by looping through the models (so that each of them needs to be loaded only once) and assessing the model on each of the datasets / measures.\
(Note that this implementation treats the subsets as distinct datasets and therefore lets the model compute its perplexity on some sentences more than once. Computation time could be reduced by splitting off the subsets only after evaluation.)

In [None]:
## run evaluation

model_path = "./models"
# Collect the checkpoint directories. Entries are sorted so the evaluation
# order is reproducible, "runs" is skipped as before, and plain files are
# ignored because only directories can be loaded as local checkpoints.
model_names = [
    os.path.join(model_path, entry)
    for entry in sorted(os.listdir(model_path))
    if entry != "runs" and os.path.isdir(os.path.join(model_path, entry))
]
# The pretrained "gpt2" model is evaluated first.
model_names.insert(0, "gpt2")

# All models are evaluated with the same "gpt2" tokenizer.
tokenizer = AutoTokenizer.from_pretrained("gpt2", use_fast=True)

# measure key -> (label used in the progress output, dataset to evaluate on)
eval_sets = {
    "crows": ("CrowS", crows),
    "crows_sup": ("CrowS_sup", crows_sup),
    "stereoset": ("StereoSet", stereoset),
    "stereoset_sup": ("StereoSet_sup", stereoset_sup),
    "wino": ("WinoBias", wino),
}

# model name -> measure key -> (prefer_pro, prefer_anti, no_preference, total)
scores_per_model = dict()

for model_name in model_names:
    print(f"Processing {model_name}...")
    # `model` is deliberately a notebook-level name: `evaluation` reads it.
    model = AutoModelForCausalLM.from_pretrained(model_name)
    model.to('cuda' if torch.cuda.is_available() else 'cpu')
    model.eval()

    scores_per_model[model_name] = dict()

    for measure, (label, dataset) in eval_sets.items():
        print(f"  Evaluating on {label}...")
        # Map into a fresh variable so the input datasets stay untouched and
        # the cell can be re-run. Caching is disabled so every checkpoint is
        # really scored instead of possibly reusing an earlier result.
        scored = dataset.map(evaluation, load_from_cache_file=False)
        scores_per_model[model_name][measure] = compute_relative_preference(scored)

The above cell saved all scores to a nested dictionary. Run the code below to display the scores per measure; to display them per model instead, uncomment the block at the end of the cell.



In [None]:
measures = ["crows", "crows_sup", "stereoset", "stereoset_sup", "wino"]

# For every measure, list each evaluated checkpoint with
# (share "biased", share "inv_biased", share "no_preference", number of pairs).
for measure in measures:
    print(f"Measure: {measure}")
    # The loop variable is intentionally NOT called `model`: that notebook-level
    # name holds the loaded network that `evaluation` reads.
    for checkpoint, per_measure in scores_per_model.items():
        n_pro, n_anti, n_none, total = per_measure[measure]
        if total == 0:
            # Avoid dividing by zero for a measure without any evaluated pair.
            print(f"  {checkpoint}: no examples evaluated")
            continue
        shares = tuple(round(count / total, 3) for count in (n_pro, n_anti, n_none))
        # Name the checkpoint so each row can be attributed to its model.
        print(f"  {checkpoint}: {shares + (total,)}")

#for model_name in model_names:
#    print("-" * 10)
#    print(model_name)
#    for measure in scores_per_model[model_name]:
#        print(measure)
#        values = scores_per_model[model_name][measure]
#        total = values[3]
#        print(f"Scores for {measure}:\n  Preferred pro: {values[0] / total} / Preferred anti: {values[1] / total} / Preferred none: {values[2] / total}")