Implementation of SEAT from the paper "[On Measuring Social Biases in Sentence Encoders](https://arxiv.org/abs/1903.10561)" by May et al.
Implementation based on https://github.com/W4ngatang/sent-bias

# Code

In [None]:
import math
import json
import scipy.stats
import csv

import numpy as np
import logging as log
import itertools as it

from transformers import BertModel, RobertaModel, BertTokenizer, RobertaTokenizer, XLNetModel, XLNetTokenizer


In [None]:
# X and Y are two sets of target words of equal size.
# A and B are two sets of attribute words.


def cossim(x, y):
    """Return the cosine similarity between the vectors x and y."""
    dot_xy = np.dot(x, y)
    # Product of the two Euclidean norms, computed from the squared norms.
    norm_product = math.sqrt(np.dot(x, x) * np.dot(y, y))
    return dot_xy / norm_product

def construct_cossim_lookup(XY, AB):
    """
    Build the table of cosine similarities between targets and attributes.

    XY: mapping from integer target index to target vector (either in X or Y)
    AB: mapping from integer attribute index to attribute vector (either in
        A or B)
    Returns an array of shape (len(XY), len(AB)) whose entry [i, j] is the
    cosine similarity between XY[i] and AB[j]. The keys are used directly as
    array indices.
    """
    lookup = np.zeros((len(XY), len(AB)))
    for row, target_vec in XY.items():
        for col, attribute_vec in AB.items():
            lookup[row, col] = cossim(target_vec, attribute_vec)
    return lookup

def s_wAB(A, B, cossims):
    """
    Return the vector of s(w, A, B) for every row w of cossims, where
        s(w, A, B) = mean_{a in A} cos(w, a) - mean_{b in B} cos(w, b).

    A and B are index collections selecting columns of cossims.
    """
    mean_sim_to_A = cossims[:, A].mean(axis=1)
    mean_sim_to_B = cossims[:, B].mean(axis=1)
    return mean_sim_to_A - mean_sim_to_B


def s_XAB(X, s_wAB_memo):
    r"""
    Return sum_{x in X} s(x, A, B), the X-half of the WEAT statistic.

    X holds the indices of one target concept and s_wAB_memo the
    precomputed s(w, A, B) value for every target w.

    Caliskan defines the WEAT statistic s(X, Y, A, B) as
        sum_{x in X} s(x, A, B) - sum_{y in Y} s(y, A, B)
    where s(w, A, B) is defined as
        mean_{a in A} cos(w, a) - mean_{b in B} cos(w, b).
    The p-value is computed using a permutation test on (X, Y) over all
    partitions (X', Y') of X union Y with |X'| = |Y'|.

    For every such partition (X', Y') of X union Y,
        sum_{x in X'} s(x, A, B) + sum_{y in Y'} s(y, A, B) = C,
    a constant (the sum of s(w, A, B) over all of X union Y).  Thus
        s(X', Y', A, B)
      = sum_{x in X'} s(x, A, B) - sum_{y in Y'} s(y, A, B)
      = sum_{x in X'} s(x, A, B) - (C - sum_{x in X'} s(x, A, B))
      = 2 sum_{x in X'} s(x, A, B) - C.

    By monotonicity,
        s(X', Y', A, B) > s(X, Y, A, B)
    if and only if
        [s(X', Y', A, B) + C] / 2 > [s(X, Y, A, B) + C] / 2,
    that is,
        sum_{x in X'} s(x, A, B) > sum_{x in X} s(x, A, B).
    Thus only the first component of s(X, Y, A, B) is needed as the test
    statistic, which is slightly cheaper to compute.
    """
    x_scores = s_wAB_memo[X]
    return x_scores.sum()


def s_XYAB(X, Y, s_wAB_memo):
    r"""
    Return the full WEAT test statistic
        sum_{x in X} s(x, A, B) - sum_{y in Y} s(y, A, B)
    given the indices of target concepts X and Y and the precomputed
    s(w, A, B) values in s_wAB_memo.
    """
    x_total = s_wAB_memo[X].sum()
    y_total = s_wAB_memo[Y].sum()
    return x_total - y_total

def p_val_permutation_test(X, Y, A, B, n_samples, cossims, parametric=False):
    ''' Compute the p-val for the permutation test, which is defined as
        the probability that a random even partition X_i, Y_i of X u Y
        satisfies P[s(X_i, Y_i, A, B) > s(X, Y, A, B)]

    args:
        - X, Y: iterables of integer row indices into cossims for the two
            target concepts; they must have the same length
        - A, B: iterables of integer column indices into cossims for the
            two attribute concepts
        - n_samples (int): number of random partitions to draw; if the
            number of possible partitions does not exceed n_samples, every
            partition is enumerated instead (non-parametric case only)
        - cossims: 2-d array of target/attribute cosine similarities
        - parametric (bool): if True, fit a normal distribution to the
            sampled statistics and return its survival function at the
            observed statistic
    returns: the p-value as a float
    raises: ValueError if X and Y differ in size
    '''
    X = np.array(list(X), dtype=np.int64)
    Y = np.array(list(Y), dtype=np.int64)
    A = np.array(list(A), dtype=np.int64)
    B = np.array(list(B), dtype=np.int64)

    # Explicit check rather than `assert`, so it survives `python -O`.
    if len(X) != len(Y):
        raise ValueError(
            'X and Y must have the same size, got {} and {}'.format(len(X), len(Y)))
    size = len(X)
    s_wAB_memo = s_wAB(A, B, cossims=cossims)
    XY = np.concatenate((X, Y))

    if parametric:
        log.info('Using parametric test')
        s = s_XYAB(X, Y, s_wAB_memo)

        log.info('Drawing {} samples'.format(n_samples))
        samples = []
        for _ in range(n_samples):
            # Shuffling in place and splitting in half yields a random
            # even partition of X u Y.
            np.random.shuffle(XY)
            samples.append(s_XYAB(XY[:size], XY[size:], s_wAB_memo))

        # Compute sample standard deviation and compute p-value by
        # assuming normality of null distribution
        log.info('Inferring p-value based on normal distribution')
        (shapiro_test_stat, shapiro_p_val) = scipy.stats.shapiro(samples)
        log.info('Shapiro-Wilk normality test statistic: {:.2g}, p-value: {:.2g}'.format(
            shapiro_test_stat, shapiro_p_val))
        sample_mean = np.mean(samples)
        sample_std = np.std(samples, ddof=1)
        log.info('Sample mean: {:.2g}, sample standard deviation: {:.2g}'.format(
            sample_mean, sample_std))
        p_val = scipy.stats.norm.sf(s, loc=sample_mean, scale=sample_std)
        return p_val

    log.info('Using non-parametric test')
    # Only the X-half of the statistic is needed for the comparison
    # (see s_XAB for the argument).
    s = s_XAB(X, s_wAB_memo)
    total_true = 0
    total_equal = 0
    total = 0

    # Exact integer count of the ways to choose X' out of X u Y.  Unlike a
    # floating-point binomial coefficient this cannot overflow to infinity
    # for large target sets.
    num_partitions = math.comb(2 * size, size)
    if num_partitions > n_samples:
        # We only have as much precision as the number of samples drawn;
        # bias the p-value (hallucinate a positive observation) to
        # reflect that.
        total_true += 1
        total += 1
        log.info('Drawing {} samples (and biasing by 1)'.format(n_samples - total))

        def candidate_partitions():
            for _ in range(n_samples - 1):
                np.random.shuffle(XY)
                yield XY[:size]
    else:
        log.info('Using exact test ({} partitions)'.format(num_partitions))

        def candidate_partitions():
            for combo in it.combinations(XY, size):
                yield np.array(combo, dtype=np.int64)

    for Xi in candidate_partitions():
        si = s_XAB(Xi, s_wAB_memo)
        if si >= s:  # ties count towards the p-value (conservative test)
            total_true += 1
            if si == s:
                total_equal += 1
        total += 1

    if total_equal:
        log.warning('Equalities contributed {}/{} to p-value'.format(total_equal, total))

    return total_true / total


def mean_s_wAB(X, A, B, cossims):
    """Return the mean of s(x, A, B) over the target rows X of cossims."""
    associations = s_wAB(A, B, cossims[X])
    return np.mean(associations)


def stdev_s_wAB(X, A, B, cossims):
    """
    Return the sample standard deviation (ddof=1) of s(x, A, B) over the
    target rows X of cossims.
    """
    associations = s_wAB(A, B, cossims[X])
    return np.std(associations, ddof=1)


def effect_size(X, Y, A, B, cossims):
    """
    Compute the effect size, which is defined as
        [mean_{x in X} s(x, A, B) - mean_{y in Y} s(y, A, B)] /
            [ stddev_{w in X u Y} s(w, A, B) ]
    args:
        - X, Y, A, B : sets of target (X, Y) and attribute (A, B) indices
        - cossims : array of target/attribute cosine similarities
    """
    # Materialise the index collections as lists so they can be
    # concatenated and used for array indexing.
    x_idx, y_idx = list(X), list(Y)
    a_idx, b_idx = list(A), list(B)

    mean_x = mean_s_wAB(x_idx, a_idx, b_idx, cossims=cossims)
    mean_y = mean_s_wAB(y_idx, a_idx, b_idx, cossims=cossims)
    pooled_std = stdev_s_wAB(x_idx + y_idx, a_idx, b_idx, cossims=cossims)
    return (mean_x - mean_y) / pooled_std


def convert_keys_to_ints(X, Y):
    """
    Re-key two mappings with consecutive integers, keeping their values.

    The values of X receive keys 0 .. len(X) - 1 and the values of Y
    receive keys len(X) .. len(X) + len(Y) - 1, both in iteration order.
    Returns the pair (re-keyed X, re-keyed Y).
    """
    offset = len(X)
    x_by_index = {i: vec for i, vec in enumerate(X.values())}
    y_by_index = {offset + j: vec for j, vec in enumerate(Y.values())}
    return (x_by_index, y_by_index)


def weat_run_test(encs, n_samples, parametric=False):
    ''' Run a WEAT and return (effect size, p-value).
    args:
        - encs (Dict[str: Dict]): dictionary mapping targ1, targ2, attr1, attr2
            to dictionaries containing the category and the encodings
        - n_samples (int): number of samples to draw to estimate p-value
            (use exact test if number of permutations is less than or
            equal to n_samples)
        - parametric (bool): forwarded to p_val_permutation_test
    '''
    X = encs["targ1"]["encs"]
    Y = encs["targ2"]["encs"]
    A = encs["attr1"]["encs"]
    B = encs["attr2"]["encs"]

    # Replace the keys by consecutive ints so they can index array rows
    # (targets) and columns (attributes).
    X, Y = convert_keys_to_ints(X, Y)
    A, B = convert_keys_to_ints(A, B)

    all_targets = {**X, **Y}
    all_attributes = {**A, **B}

    log.info("Computing cosine similarities...")
    cossims = construct_cossim_lookup(all_targets, all_attributes)

    log.info("Null hypothesis: no difference between %s and %s in association to attributes %s and %s",
             encs["targ1"]["category"], encs["targ2"]["category"],
             encs["attr1"]["category"], encs["attr2"]["category"])

    log.info("Computing pval...")
    pval = p_val_permutation_test(
        X, Y, A, B, n_samples, cossims=cossims, parametric=parametric)
    log.info("pval: %g", pval)

    log.info("computing effect size...")
    esize = effect_size(X, Y, A, B, cossims=cossims)
    log.info("esize: %g", esize)

    return esize, pval



In [None]:
def load_json(sent_file):
    ''' Load a test definition from a json file.

    args:
        - sent_file (str): path of the json file to read
    returns: the parsed top-level dictionary, unchanged
    raises: KeyError if any top-level entry has no "examples" field, since
        later processing expects one
    '''
    log.info("Loading %s...", sent_file)
    # Context manager so the file handle is closed even if parsing fails.
    with open(sent_file, 'r', encoding='utf-8') as f:
        all_data = json.load(f)

    for category, entry in all_data.items():
        if "examples" not in entry:
            raise KeyError(
                "entry '{}' in {} has no 'examples' field".format(category, sent_file))
    return all_data

def seat_encode(model, tokenizer, texts, modelname):
    ''' Use tokenizer and model to encode texts.

    args:
        - model: callable taking the tokenizer output as keyword arguments
            and returning an object with a `last_hidden_state` tensor
        - tokenizer: callable turning one text into model inputs
        - texts: iterable of sentences to encode
        - modelname (str): "xlnet" / "xlnet-large" select the last token as
            sentence representation, any other name selects the first token
    returns: dict mapping each text to its sentence embedding (numpy array);
        duplicate texts collapse into a single entry
    '''
    # Local import: torch is not imported at the top of this file.
    import torch

    # The classification token, which represents the entire sentence, is
    # the last token for XLNet and the first one for BERT and RoBERTa.
    cls_index = -1 if modelname in ("xlnet", "xlnet-large") else 0

    encs = {}
    # Inference only: skip autograd bookkeeping for the forward passes.
    with torch.no_grad():
        for text in texts:
            inputs = tokenizer(text, return_tensors="pt")
            # Only the final layer is used, so the per-layer hidden states
            # are not requested.
            outputs = model(**inputs)
            # last_hidden_state[0] is the first (and only) batch item.
            sentence_vec = outputs.last_hidden_state[0][cls_index]
            encs[text] = sentence_vec.detach().cpu().numpy()
    return encs

def seat_evaluate(model_name, results_path):
    ''' Run the SEAT tests 6, 6b, 7, 7b, 8 and 8b for one pretrained model.

    args:
        - model_name (str): one of "bert", "bert-large", "roberta",
            "roberta-large", "xlnet", "xlnet-large"
        - results_path (str or None): if not None, the results are also
            written to this path as a tab-separated file
    returns: list with one dict per test holding the model name, test file,
        p-value, effect size and the number of encoded examples per group
    raises: ValueError if model_name is not one of the supported names
    '''
    # Local import: `os` is not imported at the top of this file.
    import os

    tests = ["seat_data/sent-weat6.jsonl", "seat_data/sent-weat6b.jsonl",
             "seat_data/sent-weat7.jsonl", "seat_data/sent-weat7b.jsonl",
             "seat_data/sent-weat8.jsonl", "seat_data/sent-weat8b.jsonl"]

    # model name -> (tokenizer class, model class, pretrained checkpoint)
    checkpoints = {
        "bert": (BertTokenizer, BertModel, 'bert-base-cased'),
        "bert-large": (BertTokenizer, BertModel, 'bert-large-cased'),
        "roberta": (RobertaTokenizer, RobertaModel, 'roberta-base'),
        "roberta-large": (RobertaTokenizer, RobertaModel, 'roberta-large'),
        "xlnet": (XLNetTokenizer, XLNetModel, 'xlnet-base-cased'),
        "xlnet-large": (XLNetTokenizer, XLNetModel, 'xlnet-large-cased'),
    }
    # Fail fast with a clear message instead of hitting an unbound local
    # after the first test file has already been read.
    if model_name not in checkpoints:
        raise ValueError("Unknown model name '{}'; expected one of {}".format(
            model_name, sorted(checkpoints)))
    tokenizer_cls, model_cls, checkpoint = checkpoints[model_name]

    # Load the model once; it is the same for every test.
    tokenizer = tokenizer_cls.from_pretrained(checkpoint)
    model = model_cls.from_pretrained(checkpoint)
    if model_name in ("xlnet", "xlnet-large"):
        tokenizer.padding_side = "right"
    model.eval()

    results = []
    for test in tests:
        # load the test data
        encs = load_json(test)

        # encode the examples of both target and both attribute groups
        for group in ("targ1", "targ2", "attr1", "attr2"):
            encs[group]["encs"] = seat_encode(
                model, tokenizer, encs[group]["examples"], model_name)

        # run the test on the encodings
        esize, pval = weat_run_test(encs, n_samples=100000)
        results.append(dict(
            model=model_name,
            test=test,
            p_value=pval,
            effect_size=esize,
            num_targ1=len(encs['targ1']['encs']),
            num_targ2=len(encs['targ2']['encs']),
            num_attr1=len(encs['attr1']['encs']),
            num_attr2=len(encs['attr2']['encs'])))

    if results_path is not None:
        log.info('Writing results to {}'.format(results_path))
        # Create the output directory so a missing folder cannot discard
        # the results after all tests have been computed.
        out_dir = os.path.dirname(results_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        # newline='' lets the csv module control line endings itself.
        with open(results_path, 'w', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=list(results[0]), delimiter='\t')
            writer.writeheader()
            writer.writerows(results)
    return results


# Running SEAT

In [None]:
# Run SEAT for BERT (base) and store the mean effect size over all tests.
results = seat_evaluate("bert", "results/seat/bert.csv")
sum_es = 0
for r in results:
    sum_es += r["effect_size"]
# Average over the number of tests actually run rather than a hard-coded 6.
with open("results/seat/bert-mean.txt", 'w') as f:
    f.write("mean effect size: " + str(sum_es / len(results)))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [None]:
# Run SEAT for BERT (large) and store the mean effect size over all tests.
results = seat_evaluate("bert-large", "results/seat/bert-large.csv")
sum_es = 0
for r in results:
    sum_es += r["effect_size"]
# Average over the number of tests actually run rather than a hard-coded 6.
with open("results/seat/bert-large-mean.txt", 'w') as f:
    f.write("mean effect size: " + str(sum_es / len(results)))

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

In [None]:
# Run SEAT for RoBERTa (base) and store the mean effect size over all tests.
results = seat_evaluate("roberta", "results/seat/roberta.csv")
sum_es = 0
for r in results:
    sum_es += r["effect_size"]
# Average over the number of tests actually run rather than a hard-coded 6.
with open("results/seat/roberta-mean.txt", 'w') as f:
    f.write("mean effect size: " + str(sum_es / len(results)))

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['robert

In [None]:
# Run SEAT for RoBERTa (large) and store the mean effect size over all tests.
results = seat_evaluate("roberta-large", "results/seat/roberta-large.csv")
sum_es = 0
for r in results:
    sum_es += r["effect_size"]
# Average over the number of tests actually run rather than a hard-coded 6.
with open("results/seat/roberta-large-mean.txt", 'w') as f:
    f.write("mean effect size: " + str(sum_es / len(results)))

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

In [None]:
# Run SEAT for XLNet (base) and store the mean effect size over all tests.
results = seat_evaluate("xlnet", "results/seat/xlnet.csv")
sum_es = 0
for r in results:
    sum_es += r["effect_size"]
# Average over the number of tests actually run rather than a hard-coded 6.
with open("results/seat/xlnet-mean.txt", 'w') as f:
    f.write("mean effect size: " + str(sum_es / len(results)))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]



In [None]:
# Run SEAT for XLNet (large) and store the mean effect size over all tests.
results = seat_evaluate("xlnet-large", "results/seat/xlnet-large.csv")
sum_es = 0
for r in results:
    sum_es += r["effect_size"]
# Average over the number of tests actually run rather than a hard-coded 6.
with open("results/seat/xlnet-large-mean.txt", 'w') as f:
    f.write("mean effect size: " + str(sum_es / len(results)))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/761 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.44G [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

