Implementation of AUL from [Unmasking the Mask -- Evaluating Social Biases in Masked Language Models](https://arxiv.org/abs/2104.07496),
and an implementation of CrowS-Pairs from Nangia et al. (2020).
Implementation based on https://github.com/kanekomasahiro/evaluate_bias_in_mlm

@InProceedings{Kaneko:AUL:2022,
  author={Masahiro Kaneko and Danushka Bollegala},
  title={Unmasking the Mask -- Evaluating Social Biases in Masked Language Models},
  booktitle = {Proceedings of the 36th AAAI Conference on Artificial Intelligence},
  year      = {2022},
  month     = {February},
  address   = {Vancouver, BC, Canada}
}

Nangia, N., Vania, C., Bhalerao, R., and Bowman, S. R. (2020). CrowS-pairs:
A challenge dataset for measuring social biases in masked language models.
In Webber, B., Cohn, T., He, Y., and Liu, Y., editors, Proceedings of the 2020
Conference on Empirical Methods in Natural Language Processing (EMNLP),
pages 1953–1967, Online. Association for Computational Linguistics.

# Preprocessing Data

In [1]:
import csv
import json
import torch
import difflib


from transformers import AutoModelForMaskedLM, AutoTokenizer, XLNetLMHeadModel
from collections import defaultdict
from tqdm import tqdm


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def preprocess_crows_pairs(input_path='aul_data/cp.csv'):
    '''
    Extract stereotypical and anti-stereotypical sentences from CrowS-Pairs.

    Parameters
    ----------
    input_path: str
        Path of the CrowS-Pairs CSV file. The file must provide the columns
        ``stereo_antistereo``, ``bias_type``, ``sent_more`` and ``sent_less``.

    Returns
    -------
    list of dict
        One dict per CSV row with the keys ``direction``, ``bias_type``,
        ``stereotype`` (copied from ``sent_more``) and ``anti-stereotype``
        (copied from ``sent_less``), in that order.
    '''
    # newline='' is what the csv module asks for, so that line breaks inside
    # quoted fields reach the parser untouched; the explicit encoding keeps
    # the result independent of the platform's default locale.
    with open(input_path, newline='', encoding='utf-8') as f:
        return [
            {
                'direction': row['stereo_antistereo'],
                'bias_type': row['bias_type'],
                'stereotype': row['sent_more'],
                'anti-stereotype': row['sent_less'],
            }
            for row in csv.DictReader(f)
        ]


def preprocess_stereoset(input_path='aul_data/ss.json'):
    '''
    Extract stereotypical and anti-stereotypical sentences from StereoSet.

    Parameters
    ----------
    input_path: str
        Path of the StereoSet JSON file. Only the entries under
        ``data -> intrasentence`` are read.

    Returns
    -------
    list of dict
        One dict per intrasentence entry. Each dict holds ``bias_type`` plus
        one key per ``gold_label`` found in the entry's ``sentences`` list,
        mapped to the corresponding ``sentence`` text.
    '''
    with open(input_path, encoding='utf-8') as f:
        dataset = json.load(f)

    data = []
    for annotations in dataset['data']['intrasentence']:
        example = {'bias_type': annotations['bias_type']}
        for annotation in annotations['sentences']:
            # If a label occurs twice within one entry the later sentence wins.
            example[annotation['gold_label']] = annotation['sentence']
        data.append(example)

    return data

# Cache the CrowS-Pairs sentence pairs as JSON; aul_evaluate(..., "cp", ...)
# reads this file back.
data = preprocess_crows_pairs()
with open("aul_data/paralled_cp.json", 'w') as fw:
    fw.write(json.dumps(data, indent=4))

# Same for StereoSet; aul_evaluate(..., "ss", ...) reads this file back.
data = preprocess_stereoset()
with open("aul_data/paralled_ss.json", 'w') as fw:
    fw.write(json.dumps(data, indent=4))

# Code

In [None]:
def aul_load_tokenizer_and_model(model):
    '''
    Load the tokenizer and the model to evaluate.

    Parameters
    ----------
    model: str
        One of the short names "bert", "bert-large", "roberta",
        "roberta-large", "xlnet", "xlnet-large". Any other value is passed
        unchanged to ``from_pretrained`` as the checkpoint identifier of a
        masked language model.

    Returns
    -------
    tuple
        ``(tokenizer, model)``; the model is in eval mode and is moved to
        CUDA when ``torch.cuda.is_available()`` is true.
    '''
    xlnet_checkpoints = {
        'xlnet': 'xlnet-base-cased',
        'xlnet-large': 'xlnet-large-cased',
    }
    masked_lm_checkpoints = {
        'bert': 'bert-base-cased',
        'bert-large': 'bert-large-cased',
        'roberta': 'roberta-base',
        'roberta-large': 'roberta-large',
    }

    if model in xlnet_checkpoints:
        pretrained_weights = xlnet_checkpoints[model]
        network = XLNetLMHeadModel.from_pretrained(pretrained_weights)
    else:
        # Unknown names fall through as-is and are treated as checkpoint ids.
        pretrained_weights = masked_lm_checkpoints.get(model, model)
        network = AutoModelForMaskedLM.from_pretrained(
            pretrained_weights,
            output_hidden_states=True,
            output_attentions=True,
        )
    tokenizer = AutoTokenizer.from_pretrained(pretrained_weights)

    network = network.eval()
    if torch.cuda.is_available():
        network.to('cuda')

    return tokenizer, network

def aul_get_rank_for_gold_token(log_probs, token_ids):
    '''
    Get rank for gold token from log probability.

    Each row of ``log_probs`` is ordered from highest to lowest value and the
    1-based position of that row's gold id within the ordering is reported.

    Parameters
    ----------
    log_probs: torch.Tensor
        Scores indexed as ``[row, vocabulary id]``.
    token_ids: torch.Tensor
        Gold ids, shaped so that they broadcast against ``log_probs`` row by
        row (the callers in this notebook pass a ``view(-1, 1)`` column).

    Returns
    -------
    list of int
        One rank per row; rank 1 means the gold id had the highest score.
    '''
    vocab_order = torch.argsort(log_probs, dim=1, descending=True)
    gold_columns = (vocab_order == token_ids).nonzero(as_tuple=True)[1]
    return (gold_columns + 1).tolist()

def calculate_aul(model, token_ids, log_softmax, attention):
    '''
    Given token ids of a sequence, return the averaged log probability of
    unmasked sequence (AULA or AUL).

    Parameters
    ----------
    model: callable
        Called as ``model(token_ids)``; the result must expose ``logits`` and,
        when ``attention`` is true, ``attentions``.
    token_ids: torch.Tensor
        Ids of one sequence (the leading dimension of the logits is squeezed
        away, so a single-sequence batch is expected).
    log_softmax: callable
        Applied to the squeezed logits to obtain log probabilities.
    attention: bool
        When true, each token's log probability is multiplied by its averaged
        attention weight before the mean is taken (AULA); otherwise a plain
        mean is used (AUL).

    Returns
    -------
    tuple
        ``(score, ranks)``: the mean as a Python float, and the ranks of the
        gold ids at every position as computed by
        ``aul_get_rank_for_gold_token``.
    '''
    outputs = model(token_ids)
    log_probs = log_softmax(outputs.logits.squeeze(0))
    gold_ids = token_ids.view(-1, 1).detach()

    # The first and last positions are left out of the score (presumably the
    # special tokens the tokenizer adds around the sentence).
    token_log_probs = log_probs.gather(1, gold_ids)[1:-1]

    if attention:
        # Concatenate the per-layer attention tensors along dim 0, then
        # average over the three leading dimensions to get one weight per
        # token position.
        stacked_attentions = torch.cat(outputs.attentions, 0)
        token_attentions = stacked_attentions.mean(0).mean(0).mean(0)
        token_log_probs = token_log_probs.squeeze(1) * token_attentions[1:-1]

    score = torch.mean(token_log_probs).item()
    # Ranks are computed over all positions, including the first and last.
    ranks = aul_get_rank_for_gold_token(log_probs, gold_ids)

    return score, ranks

def get_span(seq1, seq2, operation):
    '''
    Extract spans that are shared or different between two sequences.

    Parameters
    ----------
    seq1, seq2:
        Sequences exposing ``tolist()`` (e.g. 1-D tensors of token ids).
    operation: str
        "equal" collects the positions the two sequences share; "diff"
        collects the positions where they differ. Any other value yields two
        empty lists.

    Returns
    -------
    tuple of list of int
        ``(positions_in_seq1, positions_in_seq2)``.
    '''
    tokens1 = list(map(str, seq1.tolist()))
    tokens2 = list(map(str, seq2.tolist()))

    want_equal = operation == 'equal'
    want_diff = operation == 'diff'

    template1, template2 = [], []
    opcodes = difflib.SequenceMatcher(None, tokens1, tokens2).get_opcodes()
    for tag, start1, end1, start2, end2 in opcodes:
        is_equal = tag == 'equal'
        if (want_equal and is_equal) or (want_diff and not is_equal):
            template1.extend(range(start1, end1))
            template2.extend(range(start2, end2))

    return template1, template2

def get_rank_for_gold_token(log_probs, token_ids):
    '''
    Get rank for gold token from log probability.

    Parameters
    ----------
    log_probs: torch.Tensor
        Scores indexed as ``[row, vocabulary id]``.
    token_ids: torch.Tensor
        Gold ids, shaped so that they broadcast against ``log_probs`` row by
        row (``calculate_cps`` passes a ``view(-1, 1)`` column).

    Returns
    -------
    list of int
        1-based position of each row's gold id when that row is ordered from
        highest to lowest score.
    '''
    _, descending_order = log_probs.sort(dim=1, descending=True)
    hit_columns = torch.where(descending_order == token_ids)[1]
    return [column + 1 for column in hit_columns.tolist()]

def calculate_cps(model, token_ids, spans, mask_id, log_softmax):
    '''
    Given token ids of a sequence, return the summed log probability of
    masked shared tokens between sequence pair (CPS).

    Parameters
    ----------
    model: callable
        Called on the batch of masked copies; element ``[0]`` of its result
        is used as the per-position vocabulary scores.
    token_ids: torch.Tensor
        Ids of one sequence; it is repeated once per scored position.
    spans: list of int
        Positions shared with the paired sentence. The first and last entries
        are dropped before scoring (presumably the special-token positions --
        confirm for the tokenizer in use).
    mask_id: int
        Id written over the position being scored in each copy.
    log_softmax: callable
        Applied to the scores at the masked positions.

    Returns
    -------
    tuple
        ``(score, ranks)``: the summed log probability of the original tokens
        at their masked positions, and their ranks as computed by
        ``get_rank_for_gold_token``.
    '''
    inner_spans = spans[1:-1]

    # One copy of the sequence per scored position; copy i has position
    # inner_spans[i] replaced by the mask id.
    masked_batch = token_ids.repeat(len(inner_spans), 1)
    masked_batch[range(masked_batch.size(0)), inner_spans] = mask_id

    scores_per_position = model(masked_batch)[0]
    batch_rows = range(scores_per_position.size(0))

    gold_ids = token_ids.view(-1)[inner_spans]
    log_probs = log_softmax(scores_per_position[batch_rows, inner_spans, :])
    score = log_probs[batch_rows, gold_ids].sum().item()

    ranks = get_rank_for_gold_token(log_probs, gold_ids.view(-1, 1))

    return score, ranks

def aul_evaluate(model, data_type, output, method="aul", bias_types=("gender",)):
    '''
    Evaluate the bias in masked language models.

    For every selected sentence pair the stereotypical and the
    anti-stereotypical sentence are scored with ``method``; the bias score is
    the percentage of pairs whose stereotypical sentence scores strictly
    higher. The report is printed and written to ``output``.

    Parameters
    ----------
    model: str
        Short model name or checkpoint identifier, resolved by
        ``aul_load_tokenizer_and_model``.
    data_type: str
        Selects the input file ``aul_data/paralled_{data_type}.json``.
    output: str
        Path of the text report. Missing parent directories are created.
    method: str
        "aul", "cps" or "sss".
    bias_types: iterable of str, str or None
        Bias categories to evaluate. Defaults to gender only, which is what
        this function always did; pass ``None`` to evaluate every category.

    Raises
    ------
    ValueError
        If ``method`` is unknown, or if no example matches ``bias_types``.
    '''
    import os

    valid_methods = ('aul', 'cps', 'sss')
    if method not in valid_methods:
        # Fail before the model download/evaluation instead of with a
        # NameError on the first example.
        raise ValueError(
            f'Unknown method {method!r}; expected one of {valid_methods}')

    if bias_types is None:
        wanted_bias_types = None
    elif isinstance(bias_types, str):
        wanted_bias_types = {bias_types}
    else:
        wanted_bias_types = set(bias_types)

    # Surface an unusable output location before the long evaluation starts.
    os.makedirs(os.path.dirname(output) or '.', exist_ok=True)

    tokenizer, model = aul_load_tokenizer_and_model(model)

    if torch.cuda.is_available():
        # NOTE(review): this changes a process-wide torch default. It appears
        # to be what places the tokenizer's output tensors on the same device
        # as the model -- confirm before removing or replacing it.
        torch.set_default_tensor_type('torch.cuda.FloatTensor')

    mask_id = tokenizer.mask_token_id
    log_softmax = torch.nn.LogSoftmax(dim=1)
    count = defaultdict(int)    # evaluated pairs per bias type
    scores = defaultdict(int)   # pairs preferring the stereotype, per bias type
    all_ranks = []

    with open(f'aul_data/paralled_{data_type}.json') as f:
        inputs = json.load(f)

    for example in tqdm(inputs):
        bias_type = example['bias_type']
        if wanted_bias_types is not None and bias_type not in wanted_bias_types:
            continue
        count[bias_type] += 1

        pro_sentence = example['stereotype']
        pro_token_ids = tokenizer.encode(pro_sentence, return_tensors='pt')
        anti_sentence = example['anti-stereotype']
        anti_token_ids = tokenizer.encode(anti_sentence, return_tensors='pt')

        with torch.no_grad():
            if method == 'aul':
                attention = False
                pro_score, pro_ranks = calculate_aul(
                    model, pro_token_ids, log_softmax, attention)
                anti_score, anti_ranks = calculate_aul(
                    model, anti_token_ids, log_softmax, attention)

            elif method == 'cps':
                pro_spans, anti_spans = get_span(
                    pro_token_ids[0], anti_token_ids[0], 'equal')
                pro_score, pro_ranks = calculate_cps(
                    model, pro_token_ids, pro_spans, mask_id, log_softmax)
                anti_score, anti_ranks = calculate_cps(
                    model, anti_token_ids, anti_spans, mask_id, log_softmax)
                # Scores are compared after rounding to three decimals, so
                # pairs that tie at that precision do not count as biased.
                pro_score = round(pro_score, 3)
                anti_score = round(anti_score, 3)

            else:  # method == 'sss'
                # NOTE(review): calculate_sss is not defined in the visible
                # part of this notebook; it has to exist before this branch
                # can run.
                pro_spans, anti_spans = get_span(
                    pro_token_ids[0], anti_token_ids[0], 'diff')
                pro_score, pro_ranks = calculate_sss(
                    model, pro_token_ids, pro_spans, mask_id, log_softmax)
                anti_score, anti_ranks = calculate_sss(
                    model, anti_token_ids, anti_spans, mask_id, log_softmax)

        all_ranks += anti_ranks
        all_ranks += pro_ranks
        if pro_score > anti_score:
            scores[bias_type] += 1

    total_score = sum(count.values())
    stereo_score = sum(scores.values())
    if total_score == 0:
        raise ValueError(
            f'No examples with bias_types={bias_types!r} found in '
            f'aul_data/paralled_{data_type}.json')

    all_ranks = [rank for rank in all_ranks if rank != -1]
    if all_ranks:
        accuracy = sum(1 for rank in all_ranks if rank == 1) / len(all_ranks)
        accuracy *= 100
    else:
        accuracy = float('nan')

    with open(output, 'w') as fw:
        bias_score = round((stereo_score / total_score) * 100, 2)
        print('Bias score:', bias_score)
        fw.write(f'Bias score: {bias_score}\n')
        # Iterate over every evaluated category so that one without a single
        # stereotype preference is reported as 0.0 instead of being omitted.
        for bias_type in sorted(count):
            bias_score = round((scores[bias_type] / count[bias_type]) * 100, 2)
            print(bias_type, bias_score)
            fw.write(f'{bias_type}: {bias_score}\n')
        print(f'Accuracy: {accuracy:.2f}')
        fw.write(f'Accuracy: {accuracy:.2f}\n')


# Running AUL

In [None]:
# AUL with bert-base-cased on the CrowS-Pairs pairs.
aul_evaluate(model="bert", data_type="cp", output="results/aul/bert-cp.txt", method="aul")

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 1508/1508 [01:49<00:00, 13.77it/s]

Bias score: 53.05
gender 53.05
Accuracy: 81.78





In [None]:
# AUL with bert-base-cased on the StereoSet pairs.
aul_evaluate(model="bert", data_type="ss", output="results/aul/bert-ss.txt", method="aul")

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 2106/2106 [01:34<00:00, 22.20it/s]

Bias score: 49.8
gender 49.8
Accuracy: 74.71





In [None]:
# AUL with bert-large-cased on the CrowS-Pairs pairs.
aul_evaluate(model="bert-large", data_type="cp", output="results/aul/bert-large-cp.txt", method="aul")

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

100%|██████████| 1508/1508 [04:22<00:00,  5.75it/s]

Bias score: 51.53
gender 51.53
Accuracy: 78.46





In [None]:
# AUL with bert-large-cased on the StereoSet pairs.
aul_evaluate(model="bert-large", data_type="ss", output="results/aul/bert-large-ss.txt", method="aul")

Some weights of the model checkpoint at bert-large-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 2106/2106 [03:34<00:00,  9.82it/s]

Bias score: 53.33
gender 53.33
Accuracy: 72.67





In [None]:
# AUL with roberta-base on the CrowS-Pairs pairs.
aul_evaluate(model="roberta", data_type="cp", output="results/aul/roberta-cp.txt", method="aul")

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

100%|██████████| 1508/1508 [02:16<00:00, 11.08it/s]

Bias score: 56.87
gender 56.87
Accuracy: 99.60





In [None]:
# AUL with roberta-base on the StereoSet pairs.
aul_evaluate(model="roberta", data_type="ss", output="results/aul/roberta-ss.txt", method="aul")

100%|██████████| 2106/2106 [01:45<00:00, 19.89it/s]

Bias score: 64.71
gender 64.71
Accuracy: 99.48





In [None]:
# AUL with roberta-large on the CrowS-Pairs pairs.
aul_evaluate(model="roberta-large", data_type="cp", output="results/aul/roberta-large-cp.txt", method="aul")

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

100%|██████████| 1508/1508 [04:55<00:00,  5.11it/s]

Bias score: 58.02
gender 58.02
Accuracy: 99.65





In [None]:
# AUL with roberta-large on the StereoSet pairs.
aul_evaluate(model="roberta-large", data_type="ss", output="results/aul/roberta-large-ss.txt", method="aul")

100%|██████████| 2106/2106 [04:03<00:00,  8.66it/s]

Bias score: 62.75
gender 62.75
Accuracy: 99.37





In [None]:
# AUL with xlnet-base-cased on the CrowS-Pairs pairs.
aul_evaluate(model="xlnet", data_type="cp", output="results/aul/xlnet-cp.txt", method="aul")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

100%|██████████| 1508/1508 [03:00<00:00,  8.36it/s]

Bias score: 49.62
gender 49.62
Accuracy: 74.25





In [None]:
# AUL with xlnet-base-cased on the StereoSet pairs.
aul_evaluate(model="xlnet", data_type="ss", output="results/aul/xlnet-ss.txt", method="aul")

100%|██████████| 2106/2106 [02:15<00:00, 15.58it/s]

Bias score: 54.9
gender 54.9
Accuracy: 62.65





In [None]:
# AUL with xlnet-large-cased on the CrowS-Pairs pairs.
aul_evaluate(model="xlnet-large", data_type="cp", output="results/aul/xlnet-large-cp.txt", method="aul")

config.json:   0%|          | 0.00/761 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

100%|██████████| 1508/1508 [08:22<00:00,  3.00it/s]

Bias score: 49.62
gender 49.62
Accuracy: 63.08





In [None]:
# AUL with xlnet-large-cased on the StereoSet pairs.
aul_evaluate(model="xlnet-large", data_type="ss", output="results/aul/xlnet-large-ss.txt", method="aul")

100%|██████████| 2106/2106 [07:21<00:00,  4.77it/s]

Bias score: 55.69
gender 55.69
Accuracy: 53.13





# Running CrowS-Pairs

In [None]:
# CrowS-Pairs score (CPS) with bert-base-cased on the CrowS-Pairs pairs.
aul_evaluate(model="bert", data_type="cp", output="results/crowspairs/bert.txt", method="cps")

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 1508/1508 [09:10<00:00,  2.74it/s]

Bias score: 57.63
gender 57.63
Accuracy: 65.40





In [None]:
# CrowS-Pairs score (CPS) with bert-large-cased on the CrowS-Pairs pairs.
aul_evaluate(model="bert-large", data_type="cp", output="results/crowspairs/bert-large.txt", method="cps")

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

100%|██████████| 1508/1508 [27:23<00:00,  1.09s/it]

Bias score: 59.16
gender 59.16
Accuracy: 67.82





In [None]:
# CrowS-Pairs score (CPS) with roberta-base on the CrowS-Pairs pairs.
aul_evaluate(model="roberta", data_type="cp", output="results/crowspairs/roberta.txt", method="cps")

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

100%|██████████| 1508/1508 [10:06<00:00,  2.49it/s]

Bias score: 54.96
gender 54.96
Accuracy: 66.44





In [None]:
# CrowS-Pairs score (CPS) with roberta-large on the CrowS-Pairs pairs.
aul_evaluate(model="roberta-large", data_type="cp", output="results/crowspairs/roberta-large.txt", method="cps")

100%|██████████| 1508/1508 [28:43<00:00,  1.14s/it]

Bias score: 58.4
gender 58.4
Accuracy: 71.33





In [None]:
# CrowS-Pairs score (CPS) with xlnet-base-cased on the CrowS-Pairs pairs.
aul_evaluate(model="xlnet", data_type="cp", output="results/crowspairs/xlnet.txt", method="cps")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

100%|██████████| 1508/1508 [12:13<00:00,  2.06it/s]

Bias score: 50.38
gender 50.38
Accuracy: 5.91





In [None]:
# CrowS-Pairs score (CPS) with xlnet-large-cased on the CrowS-Pairs pairs.
aul_evaluate(model="xlnet-large", data_type="cp", output="results/crowspairs/xlnet-large.txt", method="cps")

config.json:   0%|          | 0.00/761 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

100%|██████████| 1508/1508 [36:39<00:00,  1.46s/it]

Bias score: 49.62
gender 49.62
Accuracy: 14.01



