# CMP STEP 3: Measuring Item Bias

In [1]:
import numpy as np
import pandas as pd

import os
import json
from tqdm import tqdm

from scipy.spatial.distance import cosine as cosdist
import scipy.stats as st

import matplotlib.pyplot as plt

In [2]:
EMBEDDINGS = 'data/'
BIAS = 'bias/'
MODE = 'masked'  # change mode to either 'normal' or 'masked'

### Load vignette & strategy embeddings

In [3]:
# load pre-encoded vignette embeddings
vignette_embeddings = np.load(EMBEDDINGS + 'vignette_embeddings.npy')

In [4]:
# load pre-encoded strategy embeddings
if MODE == 'normal':
    strategy_embeddings = np.load(EMBEDDINGS + 'strategy_embeddings.npy')
elif MODE == 'masked':
    strategy_embeddings = np.load(EMBEDDINGS + 'strategy_embeddings_masked.npy')

### WEAT (Caliskan et al., 2017) adapted one-vs-many

In [5]:
strategies_per_item = [0, 679, 680, 725, 718, 689, 671, 571, 650, 663]

In [6]:
def WEAT_one_vs_all(item_no=1, random_seed=15):
    """calculates the WEAT statistic for bias dimension item_no-all_other_items:
       Are answers to item_no closer to vignette of item_no in the embedding space
       compared to the vignettes of all other items (averaged)?"""    
    start = np.sum(strategies_per_item[:item_no])
    stop = start + strategies_per_item[item_no]
    other_items = np.delete(strategy_embeddings, range(start, stop), axis=0)
    
    rng = np.random.RandomState(random_seed)
    random_idx = rng.randint(strategy_embeddings.shape[0]-strategies_per_item[item_no],
                             size=strategies_per_item[item_no])
    
    X = strategy_embeddings[start:stop]
    Y = other_items[random_idx]

    A = vignette_embeddings[item_no - 1]
    B = np.delete(vignette_embeddings, (item_no - 1), axis=0)
    
    s_WAB = []
    
    s_XAB = 0
    for sentence in X:
        inner_item_dist = cosdist(sentence, A)
        cross_item_dist = np.mean([cosdist(sentence, B[x]) for x in range(len(B))])
        s_wAB = inner_item_dist - cross_item_dist
        s_WAB.append(s_wAB)
        s_XAB += s_wAB
    mean_s_XAB = s_XAB / len(X)

    s_YAB = 0
    for sentence in Y:
        cross_item_dist = cosdist(sentence, A)
        inner_item_dist = np.mean([cosdist(sentence, B[x]) for x in range(len(B))])
        s_wAB = cross_item_dist - inner_item_dist
        s_WAB.append(s_wAB)
        s_YAB += s_wAB
    mean_s_YAB = s_YAB / len(Y)

    s_XYAB = s_XAB - s_YAB
    
    effect_size = (mean_s_XAB - mean_s_YAB) / np.std(s_WAB)

    return s_XYAB, effect_size

### Calculate item bias statistics & effect sizes, save and display

In [7]:
# calculate WEAT statistics for all items: includes (bootstrapped bias statistic, bootstrapped effect size)
# save to dictionary/json

# bias_results = {}

# for item in range(1, 10):
#     compute statistic for 1000 times with different seeds to produce confidence interval
#     because of random drawing of 'other item's answers'-set
#     print(f'Bootstrapping for item {item}...')
#     bias_boot = []
#     es_boot = []
#     
#         WEAT = WEAT_one_vs_all(item_no=item, random_seed=seed)
#         bias_boot.append(WEAT[0])
#         es_boot.append(WEAT[1])
    
#     bias_results[item] = (bias_boot, es_boot)

In [8]:
# save bias results to file
# with open(BIAS + f'bias_boot_{MODE}.json', 'w') as f:
    # f.write(json.dumps(bias_results))

In [9]:
# reload pre-computed bias results
with open(BIAS + f'bias_boot_{MODE}.json', 'r') as f:
    bias_results = json.loads(f.read())

In [10]:
# display bias results in table
df_one_vs_all = pd.DataFrame(columns=['statistic (mean)', 'CI (95%)'], index= range(1, 10))

for item in range(1, 10):
    mu, sigma = st.norm.fit(bias_results[str(item)][0])
    lower_95 = mu + sigma * st.norm.ppf(0.025, loc=mu, scale=sigma)
    upper_95 = mu - sigma * st.norm.ppf(0.025, loc=mu, scale=sigma)
    
    df_one_vs_all.loc[item]['statistic (mean)'] = f'{mu:.2f}'
    df_one_vs_all.loc[item]['CI (95%)'] = f'[{lower_95:.2f}; {upper_95:.2f}]'

df_one_vs_all

Unnamed: 0,statistic (mean),CI (95%)
1,-0.65,[-0.74; -0.56]
2,-1.53,[-1.64; -1.42]
3,-2.4,[-2.74; -2.05]
4,-1.16,[-1.25; -1.06]
5,-1.99,[-2.15; -1.83]
6,-0.69,[-0.73; -0.65]
7,-0.68,[-0.74; -0.62]
8,-1.07,[-1.16; -0.99]
9,-1.49,[-1.67; -1.31]


In [11]:
# display effect size results in table
df_one_vs_all = pd.DataFrame(columns=['effect size (mean)', 'CI (95%)'], index= range(1, 10))

for item in range(1, 10):
    mu, sigma = st.norm.fit(bias_results[str(item)][1])
    lower_95 = mu + sigma * st.norm.ppf(0.025, loc=mu, scale=sigma)
    upper_95 = mu - sigma * st.norm.ppf(0.025, loc=mu, scale=sigma)
    
    df_one_vs_all.loc[item]['effect size (mean)'] = f'{mu:.2f}'
    df_one_vs_all.loc[item]['CI (95%)'] = f'[{lower_95:.2f}; {upper_95:.2f}]'

df_one_vs_all

Unnamed: 0,effect size (mean),CI (95%)
1,-0.25,[-0.26; -0.23]
2,-0.75,[-0.77; -0.72]
3,-0.66,[-0.68; -0.63]
4,-0.5,[-0.52; -0.49]
5,-0.85,[-0.88; -0.83]
6,-0.5,[-0.53; -0.48]
7,-0.36,[-0.37; -0.34]
8,-0.57,[-0.59; -0.54]
9,-0.53,[-0.55; -0.50]
