In [None]:
import json
import codecs
import numpy as np
import math
import os
import pandas as pd
from operator import itemgetter
import pickle
import matplotlib.pyplot as plt
import matplotlib
from matplotlib.patches import Patch
from tqdm import tqdm
import scipy
import yaml
import random
import seaborn as sns

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

import torch
from transformers import pipeline
#from utils import create_balanced_occupation_data, create_occupation_data, occupation_stats
from embedding import BertHuggingfaceMLM #, BertHuggingface
from geometrical_bias import SAME, WEAT, GeneralizedWEAT, DirectBias, RIPA, MAC, normalize, cossim, EmbSetList, EmbSet

In [None]:
# Global matplotlib font settings shared by every figure in this notebook.
# 'normal' is a valid font *weight*/*style* but not a font family; requesting
# it as a family only produces "findfont: Font family ['normal'] not found"
# warnings before matplotlib falls back to its default font. Name the generic
# sans-serif family explicitly instead.
font = {'family' : 'sans-serif',
        'weight' : 'normal',
        'size'   : 20}

matplotlib.rc('font', **font)

In [None]:
# Locations of the experiment artefacts, all expressed relative to BASE_DIR.
BASE_DIR = ''  # repository base
RESULT_DIR = f'{BASE_DIR}results/icpram22/'
EXP_CONFIG = f'{RESULT_DIR}config.yaml'

# File names that are looked up inside each per-model result directory.
PRETRAIN_BIAS_RES = 'task_res.csv'
DATA_BIAS_RES = 'train_data_stats.csv'
DATA_FILE = 'train_data.pickle'

In [None]:
# Parse the experiment configuration (YAML) stored next to the results.
with open(EXP_CONFIG, 'rb') as config_file:
    config = yaml.safe_load(config_file)

# Show the parsed configuration as the cell output.
config

In [None]:
# Parse the template definition file referenced by the experiment config.
with open(BASE_DIR + config['template_file'], 'r') as template_file:
    templates = yaml.safe_load(template_file)

# Names of the protected attributes declared in the template file.
attributes = templates['protected_attr']

In [None]:
# Total number of result directories: one model per iteration for every
# (minP, maxP) combination listed in the config.
NO_MODELS = len(config['minP']) * len(config['maxP']) * config['iterations']

### Evaluate all models

In [None]:
# Bias scores read from each model's pickle and compared with the data bias.
scores = ['SEAT', 'MAC', 'DB', 'RIPA', 'GWEAT', 'cluster', 'neighbor', 'SVM']
_ATTR_KEYS = ('ETHNICITY', 'RELIGION', 'GENDER')

# Per-measure, per-attribute list of correlation coefficients (one per model).
corr_per_measure = {
    measure: {key: [] for key in _ATTR_KEYS}
    for measure in ['unmask'] + scores
}

# Per-attribute lists of aggregated biases: mean/variance of the model and
# data divergences, plus one entry per score.
overall_biases = {
    name: {key: [] for key in _ATTR_KEYS}
    for name in ['model-mean', 'model-var', 'data-mean', 'data-var'] + scores
}

# Pairwise correlations between word-level scores: attr -> score -> score2 -> list.
word_scores =  ['SEAT', 'DB', 'RIPA', 'unmask', 'data']
score_corr = {
    attr: {row: {col: [] for col in word_scores} for row in word_scores}
    for attr in attributes
}

def _score_table_key(score, data):
    """Return the key prefix under which `score` is stored in a model's pickle.

    Display names are first mapped to the names used in `data`
    (DB -> DirectBias, SVM -> classification, SEAT -> WEAT). If a table named
    `<key>_i_bias` exists, the `<key>_i` variant is preferred.

    The alias is resolved *before* probing for the `_i` variant so that both
    correlation passes below read the same table for SEAT.
    """
    aliases = {'DB': 'DirectBias', 'SVM': 'classification', 'SEAT': 'WEAT'}
    key = aliases.get(score, score)
    if key+'_i_bias' in data.keys():
        key = key+'_i'
    return key


def _word_biases(score_key, attr, groups, data):
    """Return the `attr` column of the `<score_key>_bias` table in `data`.

    For attributes with exactly two groups the absolute value is taken, since
    signed scores are compared against unsigned divergences.
    """
    column = data[score_key+"_bias"].loc[:, attr]
    if len(groups) == 2:
        return np.abs(column)
    return column


# Minimum correlation between data and model divergence required for a
# (model, attribute) pair to contribute to `corr_per_measure` and to the
# mean/variance entries of `overall_biases`.
thresh = 0.0

for test_model_id in range(NO_MODELS):
    test_dir = RESULT_DIR+str(test_model_id)+"/"

    train_data_file = test_dir+"train_data.pickle"
    # NOTE: pickle.load can execute arbitrary code; only load result files
    # that were produced by a trusted run of the experiment.
    with open(train_data_file, 'rb') as f:
        data = pickle.load(f)

    data_bias_file = test_dir + DATA_BIAS_RES
    pretrain_bias_file = test_dir + PRETRAIN_BIAS_RES

    df = pd.read_csv(data_bias_file)       # group probabilities in the training data
    df2 = pd.read_csv(pretrain_bias_file)  # group probabilities from the model

    for attr in attributes:
        groups = templates[attr][0]

        # Bias of each row = Jensen-Shannon distance between its group
        # distribution and the uniform distribution over the groups.
        p_equal = 1.0/len(groups)
        dist_equal = [p_equal]*len(groups)
        divergence_data = [scipy.spatial.distance.jensenshannon(dist_equal, list(df.loc[i,groups])) for i in range(len(df))]
        divergence_model = [scipy.spatial.distance.jensenshannon(dist_equal, list(df2.loc[i,groups])) for i in range(len(df2))]

        model_r_value = scipy.stats.linregress(divergence_data, divergence_model).rvalue
        keep_model = model_r_value >= thresh

        if keep_model:
            corr_per_measure['unmask'][attr].append(model_r_value)

            overall_biases['model-mean'][attr].append(np.mean(divergence_model))
            overall_biases['model-var'][attr].append(np.var(divergence_model))
            overall_biases['data-mean'][attr].append(np.mean(divergence_data))
            overall_biases['data-var'][attr].append(np.var(divergence_data))

        # Correlation of every score with the data bias.
        for score in scores:
            score_name_in_dict = _score_table_key(score, data)
            score_biases = _word_biases(score_name_in_dict, attr, groups, data)

            if keep_model:
                r_value = scipy.stats.linregress(divergence_data, score_biases).rvalue
                corr_per_measure[score][attr].append(r_value)

            # Overall biases are recorded regardless of the threshold, so these
            # lists can be longer than the 'data-mean' lists; entries that are
            # dicts are skipped.
            overall = data['overall_biases'].loc[attr, score_name_in_dict]
            if not isinstance(overall, dict):
                overall_biases[score][attr].append(np.abs(overall))

        # Score-score correlation: only the lower triangle is computed, the
        # diagonal is 1 and the upper triangle is filled with NaN.
        for i, score in enumerate(word_scores[:-2]):
            score_biases = _word_biases(_score_table_key(score, data), attr, groups, data)

            for j, score2 in enumerate(word_scores[:-1]): # model separately
                if j > i:
                    score_corr[attr][score][score2].append(float("nan"))
                elif j == i:
                    score_corr[attr][score][score2].append(1)
                else:
                    score2_biases = _word_biases(_score_table_key(score2, data), attr, groups, data)
                    r_value = scipy.stats.linregress(score_biases, score2_biases).rvalue
                    score_corr[attr][score][score2].append(r_value)

            # Rows for the model ('unmask') and data divergences.
            r_value = scipy.stats.linregress(score_biases, divergence_model).rvalue
            score_corr[attr]['unmask'][score].append(r_value)
            r_value = scipy.stats.linregress(score_biases, divergence_data).rvalue
            score_corr[attr]['data'][score].append(r_value)

        r_value = scipy.stats.linregress(divergence_model, divergence_data).rvalue
        score_corr[attr]['data']['unmask'].append(r_value)
        score_corr[attr]['unmask']['unmask'].append(1)
        score_corr[attr]['data']['data'].append(1)
            

In [None]:
print(score_corr['GENDER'].keys())
sns.set(rc={'figure.figsize':(10,8)})
for attr in attributes:
    print(attr)

    # Average each pairwise correlation over all evaluated models.
    mean_score_corr = {
        score: {
            score2: np.mean(score_corr[attr][score][score2])
            for score2 in word_scores
        }
        for score in word_scores
    }
    score_corr_df = pd.DataFrame(data=mean_score_corr)

    # For every attribute except GENDER the first row and column are left out
    # of the plot (presumably because SEAT is only meaningful there - TODO confirm).
    if attr == 'GENDER':
        shown = score_corr_df
    else:
        shown = score_corr_df.loc['DB':,score_corr_df.columns[1:]]

    hm = sns.heatmap(shown, fmt=".2f", cmap='crest', annot=True, annot_kws={"fontsize":20})
    hm.axes.set_title(attr.title(), fontsize=30)
    hm.tick_params(labelsize=25, rotation=45)
    plt.savefig('plots/word_score_heatmap_'+attr+'.eps', format='eps')
    plt.show()

In [None]:
# Grouped bar chart: one group per attribute, one bar per word-level score,
# showing mean and standard deviation (over models) of the correlation with
# the data bias.
eval_scores = word_scores[:-1]
width = 0.2
offset = np.asarray([-3*width/2, -width/2, width/2, 3*width/2])
x = np.arange(len(attributes))

fig, ax = plt.subplots(figsize=(12,6))

for i, score in enumerate(eval_scores):
    per_attr = [corr_per_measure[score][attr] for attr in attributes]
    r2_mean = [np.mean(values) for values in per_attr]
    r2_std = [np.std(values) for values in per_attr]

    ax.bar(x+offset[i], r2_mean, width, yerr=r2_std, label=score)

ax.set_ylabel('Pearson correlation', fontsize=20)
ax.set_xticks(x, attributes, fontsize=16)
ax.set_ylim(-0.19,1.1)
ax.grid(color='grey', linestyle='--', axis='y')
ax.set_title('Pearson Correlations with data biases', fontsize=25)
ax.legend(loc='upper right', bbox_to_anchor=(0.85, 0.5, 0., 0.5), fontsize=16)
plt.savefig('plots/word_bias_corr.eps', format='eps')
plt.show()

In [None]:
# Grouped bar chart of the correlation between each overall bias score and the
# mean data bias, one group of bars per attribute.
overall_eval_scores = ['cluster', 'SEAT', 'GWEAT', 'DB', 'RIPA', 'SVM', 'neighbor', 'unmask', 'data']
width = 0.1
# One offset per plotted score; 'data' is the reference and gets no bar.
offset = np.asarray([-7*width/2, -5*width/2, -3*width/2, -width/2, width/2, 3*width/2, 5*width/2, 7*width/2])
x = np.arange(len(attributes))

# savefig does not create missing directories.
os.makedirs('plots', exist_ok=True)

fig, ax = plt.subplots(figsize=(16,6))
for i, score in enumerate(overall_eval_scores):
    if score == 'data':
        # The data bias is what every other score is correlated against.
        continue

    r2s_mean = []
    ps_mean = []

    for attr in attributes:
        reference = overall_biases['data-mean'][attr]
        if score == 'unmask':
            measured = overall_biases['model-mean'][attr]
        else:
            measured = overall_biases[score][attr]

            if len(measured) == 0:
                r2s_mean.append(0)
                ps_mean.append(0)
                continue

            # Scores are recorded for every model while 'data-mean' only holds
            # the models that passed the correlation threshold; a regression
            # over mismatched series is not possible.
            if len(measured) != len(reference):
                r2s_mean.append(0)
                ps_mean.append(0)
                print(score)
                continue

        fit = scipy.stats.linregress(reference, measured)
        r2s_mean.append(fit.rvalue)
        ps_mean.append(fit.pvalue)

    # NOTE(review): the error bars show the regression p-value, not a spread.
    ax.bar(x+offset[i], r2s_mean, width, yerr=ps_mean, label=score)

ax.set_ylabel('Pearson correlation', fontsize=20)
ax.set_xticks(x, attributes, fontsize=16)
ax.set_title('Pearson Correlations with mean data biases', fontsize=25)
# Widen the x-range to the right so the legend does not cover the bars.
xlim = ax.get_xlim()
ax.set_xlim(xlim[0]+0.2,xlim[1]+0.5)
ax.grid(color='grey', linestyle='--', axis='y')
ax.legend(loc='upper right', fontsize=16)
plt.savefig('plots/bias_corr.eps', format='eps')
plt.show()

In [None]:
def _overall_bias_series(score, attr):
    """Return the per-model list of overall biases for `score` and `attr`.

    'unmask' and 'data' are stored in `overall_biases` under 'model-mean' and
    'data-mean'; every other score is stored under its own name.
    """
    if score == 'unmask':
        return overall_biases['model-mean'][attr]
    if score == 'data':
        return overall_biases['data-mean'][attr]
    return overall_biases[score][attr]


# savefig does not create missing directories.
os.makedirs('plots', exist_ok=True)

# NOTE: this rebinds `score_corr`, replacing the per-word correlations that the
# earlier evaluation cell stored under the same name.
score_corr = {
    attr: {
        score: {score2: 0 for score2 in overall_eval_scores}
        for score in overall_eval_scores
    }
    for attr in attributes
}

for attr in attributes:
    # `overall_eval_scores` already ends with 'data', so it is iterated as is.
    for i, score in enumerate(overall_eval_scores):
        biases = _overall_bias_series(score, attr)

        for j, score2 in enumerate(overall_eval_scores):
            if j > i:
                # only the lower triangle is computed
                score_corr[attr][score][score2] = float("nan")
            elif j == i:
                score_corr[attr][score][score2] = 1.0
            else:
                biases2 = _overall_bias_series(score2, attr)

                # linregress needs two non-empty series of equal length. The
                # lengths can differ because some series only contain the
                # models that passed the correlation threshold.
                if len(biases) == 0 or len(biases) != len(biases2):
                    score_corr[attr][score][score2] = float("nan")
                else:
                    score_corr[attr][score][score2] = scipy.stats.linregress(biases, biases2).rvalue

    score_corr_df = pd.DataFrame(data=score_corr[attr])

    # For every attribute except GENDER the first two rows/columns
    # ('cluster' and 'SEAT') are left out of the plot.
    if attr == 'GENDER':
        hm = sns.heatmap(score_corr_df, fmt=".2f", cmap='crest', annot=True, annot_kws={"size":20})
    else:
        hm = sns.heatmap(score_corr_df.loc['GWEAT':, score_corr_df.columns[2:]], fmt=".2f", cmap='crest', annot=True, annot_kws={"size":20})
    hm.axes.set_title(attr.title(), fontsize=30)
    hm.tick_params(labelsize=25, rotation=45)
    plt.savefig('plots/score_heatmap_'+attr+'.eps', format='eps', bbox_inches='tight')
    plt.show()

## Robustness of word bias scores

In [None]:
# Scores whose stability across models trained on the same distribution is
# measured. 'model'/'data' use the raw group probabilities, the 'JSD' variants
# use their Jensen-Shannon distance to the uniform distribution.
word_bias_scores = ['WEAT', 'DirectBias', 'RIPA', 'model JSD', 'data JSD', 'model', 'data']

target_domain = templates['target']
target_words = templates[target_domain]
attributes = templates['protected_attr']


def _jsd_per_target(prob_df):
    """Convert a table of group probabilities into per-target JSD biases.

    For every protected attribute, each row of `prob_df` is compared with the
    uniform distribution over that attribute's groups. Row i is assigned to
    `target_words[i]`. Returns a DataFrame indexed by target word with one
    column per attribute.
    """
    jsd_by_attr = {}
    for attr in attributes:
        groups = templates[attr][0]
        dist_equal = [1.0/len(groups)]*len(groups)
        divergence = [
            scipy.spatial.distance.jensenshannon(dist_equal, list(prob_df.loc[i, groups]))
            for i in range(len(prob_df))
        ]
        jsd_by_attr[attr] = {target: divergence[i] for i, target in enumerate(target_words)}
    return pd.DataFrame(data=jsd_by_attr)


iter_id = 0
std_per_score = {score: [] for score in word_bias_scores}

for minP in config['minP']:
    for maxP in config['maxP']:
        # all models ids that were trained on the same probability distribution created by minP,maxP
        iter_ids = range(iter_id, iter_id+config['iterations'])

        for score in word_bias_scores:
            biases = []
            for model_id in iter_ids:
                test_dir = RESULT_DIR+str(model_id)+"/"

                if 'model' in score:
                    # model probabilities, as in the evaluation of all models
                    bias_table = pd.read_csv(test_dir + PRETRAIN_BIAS_RES)
                    bias_table = bias_table.loc[:, bias_table.columns[1:]] # removing the column with job titles
                elif 'data' in score:
                    # training-data statistics
                    bias_table = pd.read_csv(test_dir + DATA_BIAS_RES)
                    bias_table = bias_table.loc[:, bias_table.columns[1:]] # removing the column with job titles
                else:
                    # NOTE: pickle.load can execute arbitrary code; only load
                    # result files produced by a trusted run.
                    with open(test_dir+"train_data.pickle", 'rb') as f:
                        data = pickle.load(f)
                    bias_table = data[score+'_bias']

                if 'JSD' in score:
                    bias_table = _jsd_per_target(bias_table)

                biases.append(bias_table.to_numpy())

            # Relative difference of every other model to the first model of
            # the group. Starting at index 1 skips the trivial comparison of
            # the first model with itself and includes the last model.
            pdiffs = [
                abs(biases[idx]-biases[0])/biases[0]
                for idx in range(1, config['iterations'])
            ]
            # standard deviation over models, then mean over all targets
            std_per_score[score].append(np.mean(np.std(pdiffs, axis=0), axis=0))

        iter_id = iter_ids[-1]+1

for score in word_bias_scores:
    # average over all (minP, maxP) distributions
    print(score, np.mean(np.vstack(std_per_score[score]), axis=0))

In [None]:
# Count, per attribute, how many train and test templates mention it.
for attr in attributes:
    count = sum(1 for temp in templates['templates_train'] if attr in temp)
    print("found", count, "training templates for attr", attr)

    count = sum(1 for temp in templates['templates_test'] if attr in temp)
    print("found", count, "test templates for attr", attr)

In [None]:
# Total number of training templates, then of test templates.
for split in ('templates_train', 'templates_test'):
    print(len(templates[split]))

## Permutation test to measure template influence

In [None]:
# `test_model_id` is not assigned in this cell: it is whatever value an
# earlier cell left in the kernel.
print(test_model_id)
test_dir = f"{RESULT_DIR}{test_model_id}/"

# Paths of that model's data-bias and model-bias tables.
data_bias_file, pretrain_bias_file = test_dir + DATA_BIAS_RES, test_dir + PRETRAIN_BIAS_RES

df, df2 = pd.read_csv(data_bias_file), pd.read_csv(pretrain_bias_file)



In [None]:
def unmasking_bias(unmasker, masked_sent, group_tokens):
    """Return the unmasking scores of `group_tokens`, normalized to sum to one.

    `unmasker` is called with the masked sentence, `targets=group_tokens` and
    `top_k=len(group_tokens)`, and must return an iterable of dicts carrying a
    'score' and a 'token_str'. Every score is divided by the sum of all
    returned scores. The result follows the order of `group_tokens`; a token
    that has no prediction with a matching 'token_str' contributes no entry.
    """
    predictions = unmasker(masked_sent, targets=group_tokens, top_k=len(group_tokens))

    total = sum(prediction['score'] for prediction in predictions)

    return [
        prediction['score'] / total
        for token in group_tokens
        for prediction in predictions
        if prediction['token_str'] == token
    ]

def unmasking_bias_multi_attr(bert, template_config, target_words):
    """Collect unmasking probabilities for every protected attribute and target.

    Every test template is expanded into masked sentences: for the attribute
    under test one placeholder (e.g. ``GENDER0``) becomes ``[MASK]``, every
    other attribute placeholder is replaced by its neutral term, and the target
    placeholder is replaced by each target word in turn. The fill-mask
    pipeline is then queried for the group tokens of the masked placeholder.

    Args:
        bert: object exposing ``model`` and ``tokenizer``; both are handed to
            the Hugging Face ``fill-mask`` pipeline.
        template_config: template dict providing ``'protected_attr'``,
            ``'templates_test'``, ``'target'`` and, for each attribute ``A``,
            the indexable entries ``A`` (group tokens per placeholder version)
            and ``A + '_neutral'`` (neutral replacement per version).
        target_words: words substituted for the target placeholder.

    Returns:
        Tuple ``(probs_by_attr_target, sent_by_attr_target)``. Both are dicts
        ``attr -> target -> list``; the first holds the normalized group
        probabilities returned by ``unmasking_bias``, the second the masked
        sentences they were computed from, in the same order.
    """
    # Read everything from the config that was passed in rather than from
    # notebook globals.
    attributes = template_config['protected_attr']
    templates = template_config['templates_test']

    # Group tokens per attribute, indexed by placeholder version.
    group_token_by_attr = {attr: list(template_config[attr]) for attr in attributes}

    probs_by_attr_target = {
        attr: {target: [] for target in target_words} for attr in attributes
    }
    sent_by_attr_target = {
        attr: {target: [] for target in target_words} for attr in attributes
    }

    device = 0 if torch.cuda.is_available() else -1
    unmasker = pipeline('fill-mask', model=bert.model, tokenizer=bert.tokenizer, device=device)

    for temp in templates:
        for attr in attributes:
            n_versions = len(template_config[attr])
            sent = temp

            # Replace all other attributes with the neutral term. Indices are
            # walked from high to low so that e.g. GENDER10 is replaced before
            # GENDER1, which is a prefix of it.
            for attr2 in attributes:
                if attr2 == attr:
                    continue
                for i in range(len(template_config[attr2]) - 1, -1, -1):
                    sent = sent.replace(attr2 + str(i), template_config[attr2 + '_neutral'][i])

            # Now insert the mask for the targeted attribute, one placeholder
            # version at a time.
            for i in range(n_versions - 1, -1, -1):
                cur_attr = attr + str(i)
                if cur_attr not in sent:
                    continue

                sent2 = sent.replace(cur_attr, '[MASK]')
                # In case there are multiple words defining this attribute,
                # replace the others with the neutral term.
                for j in range(n_versions - 1, -1, -1):
                    if j != i:
                        sent2 = sent2.replace(attr + str(j), template_config[attr + '_neutral'][j])

                # Replace the target and obtain unmasking probabilities for
                # each group per target.
                for target in target_words:
                    masked_sent = sent2.replace(template_config['target'], target)

                    if masked_sent.count('[MASK]') != 1:
                        print("zero or mulitple masks in sentence!")
                        print(masked_sent)
                        print(sent)
                        print(cur_attr)
                    probs = unmasking_bias(unmasker, masked_sent, group_token_by_attr[attr][i])
                    probs_by_attr_target[attr][target].append(probs)
                    sent_by_attr_target[attr][target].append(masked_sent)

                # This version has been handled; neutralize it so it cannot
                # interfere with the remaining (lower-numbered) versions.
                sent = sent.replace(cur_attr, template_config[attr + '_neutral'][i])

    return probs_by_attr_target, sent_by_attr_target

In [None]:
n_iter = 1000

def template_permutation_test(bert, tmp, target_words, n_iter=1000):
    """Measure how much the unmasking bias depends on the choice of templates.

    The unmasking probabilities of all test templates are computed once. Then,
    `n_iter` times per attribute, a random 90% subset of the masked sentences
    is drawn and the mean group probabilities of the subset are compared with
    the mean over all sentences.

    Args:
        bert: model wrapper forwarded to ``unmasking_bias_multi_attr``.
        tmp: template config dict; ``tmp['protected_attr']`` lists the
            attributes that are evaluated.
        target_words: target words to evaluate; must not be empty.
        n_iter: number of random subsets drawn per attribute.

    Returns:
        Dict ``attr -> result`` where each result holds
        ``'pval'``: fraction of (subset, target) pairs in which the ranking of
        the groups differs from the ranking on all sentences,
        ``'pdiff'``: relative differences of the mean group probabilities,
        ``'jsd_diff'``: relative differences of the Jensen-Shannon distance to
        the uniform distribution, and
        ``'mean_std'``: mean standard deviation of the group probabilities
        over sentences (computed across all attributes and targets).
    """
    # Use the attributes of the config that was passed in, not a notebook global.
    attributes = tmp['protected_attr']

    probs_by_attr_target, _ = unmasking_bias_multi_attr(bert, tmp, target_words)

    # Reference statistics over all masked sentences.
    mean_probs_by_attr_target = {}
    jsd_by_attr_target = {}
    stds = []
    for attr in probs_by_attr_target.keys():
        mean_probs_by_attr_target[attr] = {}
        jsd_by_attr_target[attr] = {}

        n_groups = len(probs_by_attr_target[attr][target_words[0]][0])
        dist_equal = [1.0/n_groups]*n_groups

        for target in target_words:
            probs = probs_by_attr_target[attr][target]
            mean_probs = np.mean(probs, axis=0)
            mean_probs_by_attr_target[attr][target] = mean_probs
            jsd_by_attr_target[attr][target] = scipy.spatial.distance.jensenshannon(dist_equal, mean_probs)
            stds += list(np.std(probs, axis=0))

    mean_std = np.mean(stds)
    attr_results = {
        attr: {'pval': 0, 'pdiff': 0, 'jsd_diff': 0, 'mean_std': mean_std}
        for attr in attributes
    }

    for attr in attributes:
        pdiff = []
        jsd_diff = []
        n = len(probs_by_attr_target[attr][target_words[0]])
        n_groups = len(probs_by_attr_target[attr][target_words[0]][0])
        dist_equal = [1.0/n_groups]*n_groups
        n_samples = int(9*n/10)

        p_val = 0
        for it in range(n_iter):
            # Indices of the sentences kept in this round, in ascending order;
            # the same subset is used for every target.
            kept = np.sort(np.random.permutation(n)[:n_samples])

            for target in target_words:
                # unmasking biases on the subset
                pat = [probs_by_attr_target[attr][target][i] for i in kept]
                mean_pat_it = np.mean(pat, axis=0)

                mean_pat = mean_probs_by_attr_target[attr][target]
                # count the rounds in which the ranking of the groups changes
                if not np.array_equal(np.argsort(mean_pat), np.argsort(mean_pat_it)):
                    p_val += 1
                for i in range(len(mean_pat_it)):
                    pdiff.append(np.abs(mean_pat_it[i]-mean_pat[i])/mean_pat[i])

                jsd_it = scipy.spatial.distance.jensenshannon(dist_equal, mean_pat_it)
                jsd_diff.append(np.abs(jsd_it-jsd_by_attr_target[attr][target])/jsd_by_attr_target[attr][target])

        p_val /= n_iter*len(target_words)
        attr_results[attr]['pval'] = p_val
        attr_results[attr]['pdiff'] = pdiff
        attr_results[attr]['jsd_diff'] = jsd_diff

    return attr_results

In [None]:
# Load the template config again under the name the permutation test uses.
with open(BASE_DIR + config['template_file'], 'r') as template_file:
    tmp = yaml.safe_load(template_file)

target_domain = tmp['target']
target_words = tmp[target_domain]
bert = BertHuggingfaceMLM(model_name=config['pretrained_model'], batch_size=config['batch_size'])

results = []
overall_bias_pdiffs = []
word_bias_pdifffs = []

# Run the template permutation test once per trained model.
for test_model_id in range(NO_MODELS):
    print("############ At Model ", test_model_id, "############")
    print()
    test_dir = f"{RESULT_DIR}{test_model_id}/"

    train_data_file = f"{test_dir}train_data.pickle"
    model_path = f"{test_dir}model/"
    # `data` is loaded here but not read again within this cell.
    with open(train_data_file, 'rb') as pickle_file:
        data = pickle.load(pickle_file)

    bert.load(model_path)

    attr_results = template_permutation_test(bert, tmp, target_words, n_iter=n_iter)
    print(attr_results)
    results.append(attr_results)

    print()
    print()

In [None]:
# Mean and standard deviation, over all models, of the permutation-test 'pval'.
for attr in attributes:
    pvals = [res[attr]['pval'] for res in results]
    print(attr, np.mean(pvals), np.std(pvals))