In [198]:
import os
import sys
import json
import pickle

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import pandas as pd
import functools
import numpy as np
import math
import random
import scipy
import yaml

sys.path.append(os.path.abspath(".."))
import plotting
import utils

sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))
from data_loader import get_dataset, CustomDataset

## Load results

In [200]:
RESULTS_FILE = '../results/concept_eval/exp1_cv.csv'
RESULTS_FILE2 = '../results/concept_eval/exp2_cv.csv'
TRANSFER_LIN_FILE = '../results/concept_eval/exp_transfer_lin.csv'
TRANSFER_MLP_FILE = '../results/concept_eval/exp_transfer_mlp.csv'
PROCESSED_TRANSFER_FILE = '../results/concept_eval/prep_results_transfer.csv'

PROCESSED_RESULTS_FILE = '../results/concept_eval/prep_results.csv'

In [201]:
df = pd.read_csv(RESULTS_FILE)
df2 = pd.read_csv(RESULTS_FILE2)
df = pd.concat([df, df2])
df

Unnamed: 0,dataset,model,model type,architecture,pooling,classifier,clf hidden size factor,emb size,optimizer,lr,loss,group,Pearson R,pvalue,PR-AUC,Epochs
0,sbic,bert-base-uncased,bert,encoder,mean,linear,-1.0,768,Salsa,,BCEWithLogitsLoss,mean,0.218510,0.260668,0.178546,"[18, 26, 28, 29]"
1,sbic,bert-base-uncased,bert,encoder,mean,linear,-1.0,768,Salsa,,BCEWithLogitsLoss,weighted mean,0.166886,0.224006,0.113047,"[18, 26, 28, 29]"
2,sbic,bert-base-uncased,bert,encoder,mean,linear,-1.0,768,Salsa,,BCEWithLogitsLoss,mean (p < 0.05),0.329735,0.000078,0.269221,"[18, 26, 28, 29]"
3,sbic,bert-base-uncased,bert,encoder,mean,linear,-1.0,768,Salsa,,BCEWithLogitsLoss,white,0.130812,0.228754,0.053363,"[18, 26, 28, 29]"
4,sbic,bert-base-uncased,bert,encoder,mean,linear,-1.0,768,Salsa,,BCEWithLogitsLoss,black,0.676848,0.000000,0.731941,"[18, 26, 28, 29]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8198,twitterAAE,deepseek-ai/deepseek-llm-7b-chat,deepseek,encoder,mean,MLP2,1.0,4096,Salsa,,BCEWithLogitsLoss,mean,0.765516,0.000000,0.942517,"[12, 10, 17, 10]"
8199,twitterAAE,deepseek-ai/deepseek-llm-7b-chat,deepseek,encoder,mean,MLP2,1.0,4096,Salsa,,BCEWithLogitsLoss,weighted mean,0.765516,0.000000,0.942517,"[12, 10, 17, 10]"
8200,twitterAAE,deepseek-ai/deepseek-llm-7b-chat,deepseek,encoder,mean,MLP2,1.0,4096,Salsa,,BCEWithLogitsLoss,mean (p < 0.05),0.765516,0.000000,0.942517,"[12, 10, 17, 10]"
8201,twitterAAE,deepseek-ai/deepseek-llm-7b-chat,deepseek,encoder,mean,MLP2,1.0,4096,Salsa,,BCEWithLogitsLoss,aa,0.772165,0.000000,0.947267,"[12, 10, 17, 10]"


In [202]:
df_transfer = pd.read_csv(TRANSFER_LIN_FILE)
df_transfer2 = pd.read_csv(TRANSFER_MLP_FILE)
df_transfer = pd.concat([df_transfer, df_transfer2])
df_transfer

Unnamed: 0,dataset train,dataset test,model,model type,architecture,pooling,classifier,clf hidden size factor,emb size,optimizer,lr,loss,group,PR-AUC
0,twitterAAE,twitterAAE,microsoft/deberta-v3-large,deberta-v2,encoder,mean,linear,-1.0,1024,Salsa,,BCEWithLogitsLoss,mean,0.922259
1,twitterAAE,twitterAAE,microsoft/deberta-v3-large,deberta-v2,encoder,mean,linear,-1.0,1024,Salsa,,BCEWithLogitsLoss,aa,0.928535
2,twitterAAE,twitterAAE,microsoft/deberta-v3-large,deberta-v2,encoder,mean,linear,-1.0,1024,Salsa,,BCEWithLogitsLoss,white,0.915982
3,twitterAAE,implicit_hate,microsoft/deberta-v3-large,deberta-v2,encoder,mean,linear,-1.0,1024,Salsa,,BCEWithLogitsLoss,mean,0.118734
4,twitterAAE,implicit_hate,microsoft/deberta-v3-large,deberta-v2,encoder,mean,linear,-1.0,1024,Salsa,,BCEWithLogitsLoss,white,0.204196
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6445,sbic,stereoset,xlnet/xlnet-large-cased,xlnet,decoder,mean,MLP2,1.0,1024,Salsa,,BCEWithLogitsLoss,male,0.275098
6446,sbic,stereoset,xlnet/xlnet-large-cased,xlnet,decoder,mean,MLP2,1.0,1024,Salsa,,BCEWithLogitsLoss,female,0.425631
6447,sbic,stereoset,xlnet/xlnet-large-cased,xlnet,decoder,mean,MLP2,1.0,1024,Salsa,,BCEWithLogitsLoss,christian,0.130200
6448,sbic,stereoset,xlnet/xlnet-large-cased,xlnet,decoder,mean,MLP2,1.0,1024,Salsa,,BCEWithLogitsLoss,muslim,0.261707


## Utils

In [203]:
def aggregate_results(results, sel_cols, target_col):
    grouped_res = results.groupby(sel_cols, as_index=False)[target_col].mean().reset_index()
    grouped_res[target_col+' var'] = results.groupby(sel_cols, as_index=False)[target_col].var()[target_col]
    return grouped_res

## Preprocess results

#### 1) Add protected attribute and standardized group labels + replace nans in pooling column

In [204]:
df.loc[pd.isna(df['pooling']),'pooling'] = 'unknown'
df_transfer.loc[pd.isna(df_transfer['pooling']),'pooling'] = 'unknown'

In [205]:
# add protected attribute
LBL_CONFIG = '../configs/concept_transfer/label_matches_complete.yaml'
with open(LBL_CONFIG, 'r') as ff:
    label_match_config = yaml.safe_load(ff)

group_to_standardized_lbl = {grp: group for attr, lookup in label_match_config.items() for group, labels in lookup.items() for grp in labels}
group_to_attr = {grp: attr for attr, lookup in label_match_config.items() for group, labels in lookup.items() for grp in labels}

df['attribute'] = df['group'].map(group_to_attr)
df['group_standardized'] = df['group'].map(group_to_standardized_lbl)

df_transfer['attribute'] = df_transfer['group'].map(group_to_attr)
df_transfer['group_standardized'] = df_transfer['group'].map(group_to_standardized_lbl)

df

Unnamed: 0,dataset,model,model type,architecture,pooling,classifier,clf hidden size factor,emb size,optimizer,lr,loss,group,Pearson R,pvalue,PR-AUC,Epochs,attribute,group_standardized
0,sbic,bert-base-uncased,bert,encoder,mean,linear,-1.0,768,Salsa,,BCEWithLogitsLoss,mean,0.218510,0.260668,0.178546,"[18, 26, 28, 29]",,
1,sbic,bert-base-uncased,bert,encoder,mean,linear,-1.0,768,Salsa,,BCEWithLogitsLoss,weighted mean,0.166886,0.224006,0.113047,"[18, 26, 28, 29]",,
2,sbic,bert-base-uncased,bert,encoder,mean,linear,-1.0,768,Salsa,,BCEWithLogitsLoss,mean (p < 0.05),0.329735,0.000078,0.269221,"[18, 26, 28, 29]",,
3,sbic,bert-base-uncased,bert,encoder,mean,linear,-1.0,768,Salsa,,BCEWithLogitsLoss,white,0.130812,0.228754,0.053363,"[18, 26, 28, 29]",ethnicity,white
4,sbic,bert-base-uncased,bert,encoder,mean,linear,-1.0,768,Salsa,,BCEWithLogitsLoss,black,0.676848,0.000000,0.731941,"[18, 26, 28, 29]",ethnicity,black
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8198,twitterAAE,deepseek-ai/deepseek-llm-7b-chat,deepseek,encoder,mean,MLP2,1.0,4096,Salsa,,BCEWithLogitsLoss,mean,0.765516,0.000000,0.942517,"[12, 10, 17, 10]",,
8199,twitterAAE,deepseek-ai/deepseek-llm-7b-chat,deepseek,encoder,mean,MLP2,1.0,4096,Salsa,,BCEWithLogitsLoss,weighted mean,0.765516,0.000000,0.942517,"[12, 10, 17, 10]",,
8200,twitterAAE,deepseek-ai/deepseek-llm-7b-chat,deepseek,encoder,mean,MLP2,1.0,4096,Salsa,,BCEWithLogitsLoss,mean (p < 0.05),0.765516,0.000000,0.942517,"[12, 10, 17, 10]",,
8201,twitterAAE,deepseek-ai/deepseek-llm-7b-chat,deepseek,encoder,mean,MLP2,1.0,4096,Salsa,,BCEWithLogitsLoss,aa,0.772165,0.000000,0.947267,"[12, 10, 17, 10]",ethnicity,black


#### 2) Add number of samples/ group ratio to results

In [206]:
def get_dataset_stats(ds: CustomDataset):
    # protected groups
    group_dist = {'n samples': {group: 0 for group in ds.group_names} for split in ds.splits}
    group_df = pd.DataFrame(group_dist)

    split = 'train' if 'train' in ds.splits else ds.splits[0]
    for i, group in enumerate(ds.group_names):
        group_df.loc[group, 'n samples'] = np.sum(ds.protected_groups[split][:,i])
    total = ds.n_samples[split]
    group_df['ratio'] = group_df['n samples']*100/total
    return group_df

In [207]:
local_dirs = {'bios-supervised': '../../../../data/bios_huggingface_merge.pkl',
              'jigsaw': '../../../../data/jigsaw_unintended_bias',
              'sbic': '../../../../data/filtered_sbic_minority_overview.csv',
              'twitterAAE': None,
              'implicit_hate': '../../../../data/implicit-hate-corpus/',
              'winoqueer': '../../../../data/winoqueer_final.csv',
              'crows_pairs': None,
              'stereoset': None
             }

df['group ratio'] = 0

for dataset in set(df['dataset']):
    ds = get_dataset(dataset, local_dirs[dataset])
    for split in ds.splits:
        print(split, ds.labels[split].shape)
    grp_df = get_dataset_stats(ds)
    for grp in grp_df.index:
        df.loc[((df['dataset'] == dataset) & (df['group'] == grp)), 'group samples'] = grp_df.loc[grp, 'n samples']
        df.loc[((df['dataset'] == dataset) & (df['group'] == grp)), 'group ratio'] = grp_df.loc[grp, 'ratio']

load crowspairs
compute class weights for split test
test (3016, 1)
load twitterAAE


  df.loc[((df['dataset'] == dataset) & (df['group'] == grp)), 'group ratio'] = grp_df.loc[grp, 'ratio']


compute class weights for split test
test (100000, 1)
load JigsawBias with option: single-class
compute class weights for split train
compute class weights for split dev
compute class weights for split test
train (357019, 2)
dev (18876, 2)
test (19042, 2)
Loading Implicit Hate dataset with option all from: ../../../../data/implicit-hate-corpus/
compute class weights for split test
test (21480, 10)
load BIOS with option supervised
compute class weights for split train
compute class weights for split test
compute class weights for split dev
train (7017, 10)
test (2500, 10)
dev (1046, 10)
load Stereoset with option both
Load inter- and intrasentence samples and merge them to one dataset
compute class weights for split val
val (12687, 1)
load winoqueer
compute class weights for split test
test (91080, 1)
load SBIC with local file: ../../../../data/filtered_sbic_minority_overview.csv
compute class weights for split train
compute class weights for split test
compute class weights for split d

In [208]:
# replace dataset names with abbreviations
lookup = {
    "bios-supervised": "BIOS",
    "bios_sup": "BIOS",
    "crows_pairs": "CrowSPairs",
    "crowspairs": "CrowSPairs",
    "implicit_hate": "ImplicitHate",
    "jigsaw": "Jigsaw",
    "sbic": "SBIC",
    "stereoset": "StereoSet",
    "twitterAAE": "TwitterAAE",
    "winoqueer": "WinoQueer"
}

# Option 1: using replace
df["dataset"] = df["dataset"].replace(lookup)
df_transfer["dataset train"] = df_transfer["dataset train"].replace(lookup)
df_transfer["dataset test"] = df_transfer["dataset test"].replace(lookup)

In [209]:
# replace model family names with abbreviations
lookup = {
    "text-embedding-3": "text-emb.",
    "deberta-v2": "deberta"
}

# Option 1: using replace
df["model type"] = df["model type"].replace(lookup)
df_transfer["model type"] = df_transfer["model type"].replace(lookup)

#### 3) Compare success rates of Classifiers, Datasets and Models/ Pooling Methods

In [210]:
clfs = {key: {'failed': 0, 'succeded': 0} for key in set(df['classifier'])}
datasets = {key: {'failed': 0, 'succeded': 0} for key in set(df['dataset'])}
poolings = {key: {'failed': 0, 'succeded': 0} for key in set(df['pooling'])}# if not pd.isna(key)} # TODO distinguish models/ placeholder for text-embedding models instead of nan
models = {key: {pooling: {'failed': 0, 'succeded': 0} for pooling in poolings.keys()} for key in set(df['model'])}

for model in models.keys():
    count = 0
    for clf in clfs.keys():
        for dataset in datasets.keys():
            cur_poolings = ['unknown'] if 'text-embedding' in model else ['mean', 'cls']
            for pooling in cur_poolings:
                count += 1
                df_sel = df[(df['dataset'] == dataset) & (df['model'] == model) & (df['classifier'] == clf) & (df['pooling'] == pooling)]
                if len(df_sel) == 0:
                    clfs[clf]['failed'] += 1
                    poolings[pooling]['failed'] += 1
                    models[model][pooling]['failed'] += 1
                    datasets[dataset]['failed'] += 1    
                else:
                    clfs[clf]['succeded'] += 1
                    poolings[pooling]['succeded'] += 1
                    models[model][pooling]['succeded'] += 1
                    datasets[dataset]['succeded'] += 1
    #print(model, count)

model_rates = {'mean': {}, 'cls': {}}
for k,v in models.items():
    if 'text-embedding' in k:
        ratio = v['unknown']['succeded']/(v['unknown']['succeded']+v['unknown']['failed'])
        #print("%s\t\t%.2f" % (k, ratio))
        model_rates['mean'][k] = ratio
    else:
        mean_ratio = v['mean']['succeded']/(v['mean']['succeded']+v['mean']['failed'])
        cls_ratio = v['cls']['succeded']/(v['cls']['succeded']+v['cls']['failed'])
        #print("%s\t\tmean: %.2f\tcls: %.2f" % (k, mean_ratio, cls_ratio))
        model_rates['mean'][k] = mean_ratio
        model_rates['cls'][k] = cls_ratio

print("clf success rates:")
for clf, v in clfs.items():
    ratio = v['succeded']/(v['succeded']+v['failed'])
    print("%s\t\t%.2f" % (clf, ratio))

print("\ndataset success rates:")
for k, v in datasets.items():
    ratio = v['succeded']/(v['succeded']+v['failed'])
    print("%s\t\t%.2f" % (k, ratio))

print("\nmodel success rates:")
pd.DataFrame(model_rates)


clf success rates:
MLP2		0.74
linear		0.81

dataset success rates:
SBIC		0.71
BIOS		0.96
StereoSet		0.95
ImplicitHate		0.70
Jigsaw		0.75
CrowSPairs		0.96
TwitterAAE		0.38
WinoQueer		0.77

model success rates:


Unnamed: 0,mean,cls
EleutherAI/pythia-1b,0.875,0.5
albert-base-v2,0.8125,0.8125
meta-llama/Llama-3.2-1B-Instruct,1.0,0.9375
microsoft/deberta-v3-base,0.875,0.75
distilbert-base-uncased,0.8125,0.75
bert-large-uncased,0.8125,0.75
bert-base-uncased,0.875,0.8125
EleutherAI/pythia-160m,0.5,0.375
deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B,1.0,0.625
roberta-base,0.875,0.75


#### 4) Evaluate average PR-AUC and number of statistical significant correlations per model and pooling method

In [211]:
df_groups = df[(df['group'] != 'mean') & (df['group'] != 'weighted mean') & (df['group'] != 'mean (p < 0.05)')]

In [212]:
models = set(df['model'])

# get best pooling method per model
for model in models:
    if 'text-embedding' in model:
        continue
    df_mean = df_groups[(df_groups['model'] == model) & (df_groups['pooling'] == 'mean')]
    df_cls = df_groups[(df_groups['model'] == model) & (df_groups['pooling'] == 'cls')]

    agg_mean = aggregate_results(df_mean, sel_cols=['dataset', 'classifier'], target_col='PR-AUC')
    agg_cls = aggregate_results(df_cls, sel_cols=['dataset', 'classifier'], target_col='PR-AUC')
    auc_mean = np.mean(agg_mean['PR-AUC'])
    auc_cls = np.mean(agg_cls['PR-AUC'])
    n_sign_mean = (df_mean['pvalue'] < 0.01).sum()
    n_sign_cls = (df_cls['pvalue'] < 0.01).sum()

    print(model)
    print("mean: %i results, %i correlations statistical significant, mean AUC: %.3f" % (len(df_mean), n_sign_mean, auc_mean))
    print("cls: %i results, %i correlations statistical significant, mean AUC: %.3f" % (len(df_cls), n_sign_cls, auc_cls))

    if (n_sign_mean > n_sign_cls and auc_mean > auc_cls):
        print("-> mean")
    elif (n_sign_mean < n_sign_cls and auc_mean < auc_cls):
        print("-> cls")
    print()


EleutherAI/pythia-1b
mean: 352 results, 290 correlations statistical significant, mean AUC: 0.625
cls: 194 results, 87 correlations statistical significant, mean AUC: 0.566
-> mean

albert-base-v2
mean: 277 results, 187 correlations statistical significant, mean AUC: 0.571
cls: 333 results, 155 correlations statistical significant, mean AUC: 0.454
-> mean

meta-llama/Llama-3.2-1B-Instruct
mean: 356 results, 264 correlations statistical significant, mean AUC: 0.660
cls: 337 results, 216 correlations statistical significant, mean AUC: 0.587
-> mean

microsoft/deberta-v3-base
mean: 318 results, 161 correlations statistical significant, mean AUC: 0.546
cls: 224 results, 43 correlations statistical significant, mean AUC: 0.418
-> mean

distilbert-base-uncased
mean: 277 results, 211 correlations statistical significant, mean AUC: 0.630
cls: 258 results, 158 correlations statistical significant, mean AUC: 0.639

bert-large-uncased
mean: 277 results, 196 correlations statistical significant, m

#### 5) Filter for models and pooling method

Exclude blacklisted model(s) and pick the better pooling method (mean) per model. Remove unused mean results and save preprocessed results.

In [213]:
# in the following we only look into the best performing pooling method per model
pooling_per_model = {'text-embedding-3-small': 'unknown',
                     'text-embedding-3-large': 'unknown'
                    }

model_blacklist = ["EleutherAI/pythia-160m"]

for model in models:
    if model not in pooling_per_model.keys() and model not in model_blacklist:
        pooling_per_model[model] = 'mean'
pooling_per_model

{'text-embedding-3-small': 'unknown',
 'text-embedding-3-large': 'unknown',
 'EleutherAI/pythia-1b': 'mean',
 'albert-base-v2': 'mean',
 'meta-llama/Llama-3.2-1B-Instruct': 'mean',
 'microsoft/deberta-v3-base': 'mean',
 'distilbert-base-uncased': 'mean',
 'bert-large-uncased': 'mean',
 'bert-base-uncased': 'mean',
 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B': 'mean',
 'roberta-base': 'mean',
 'distilroberta-base': 'mean',
 'facebook/opt-1.3b': 'mean',
 'xlnet/xlnet-base-cased': 'mean',
 'EleutherAI/pythia-410m': 'mean',
 'microsoft/deberta-v3-large': 'mean',
 'meta-llama/Llama-3.2-3B': 'mean',
 'facebook/opt-125m': 'mean',
 'albert-large-v2': 'mean',
 'microsoft/deberta-v3-small': 'mean',
 'EleutherAI/pythia-1.4b': 'mean',
 'deepseek-ai/deepseek-llm-7b-base': 'mean',
 'meta-llama/Llama-3.2-3B-Instruct': 'mean',
 'xlnet/xlnet-large-cased': 'mean',
 'deepseek-ai/deepseek-llm-7b-chat': 'mean',
 'roberta-large': 'mean',
 'meta-llama/Llama-3.2-1B': 'mean'}

In [214]:
dfs = []
for model, pooling in pooling_per_model.items():
    df_model = df[(df['model'] == model) & (df['pooling'] == pooling)]
    dfs.append(df_model)
df_sel = pd.concat(dfs, ignore_index=True)

df_sel = df_sel[(df_sel['group'] != 'weighted mean') & (df_sel['group'] != 'mean (p < 0.05)')]
df_sel.to_csv(PROCESSED_RESULTS_FILE)

df_sel

Unnamed: 0,dataset,model,model type,architecture,pooling,classifier,clf hidden size factor,emb size,optimizer,lr,loss,group,Pearson R,pvalue,PR-AUC,Epochs,attribute,group_standardized,group ratio,group samples
0,SBIC,text-embedding-3-small,text-emb.,embedder,unknown,linear,-1.0,1536,Salsa,,BCEWithLogitsLoss,mean,0.353655,1.626997e-01,0.314080,"[10, 10, 10, 10]",,,0.000000,
3,SBIC,text-embedding-3-small,text-emb.,embedder,unknown,linear,-1.0,1536,Salsa,,BCEWithLogitsLoss,white,0.317888,4.304603e-236,0.193677,"[10, 10, 10, 10]",ethnicity,white,0.333955,120.0
4,SBIC,text-embedding-3-small,text-emb.,embedder,unknown,linear,-1.0,1536,Salsa,,BCEWithLogitsLoss,black,0.767017,0.000000e+00,0.855652,"[10, 10, 10, 10]",ethnicity,black,8.312693,2987.0
5,SBIC,text-embedding-3-small,text-emb.,embedder,unknown,linear,-1.0,1536,Salsa,,BCEWithLogitsLoss,asian,0.502718,0.000000e+00,0.497592,"[10, 10, 10, 10]",ethnicity,asian,1.775527,638.0
6,SBIC,text-embedding-3-small,text-emb.,embedder,unknown,linear,-1.0,1536,Salsa,,BCEWithLogitsLoss,non-white,0.762215,0.000000e+00,0.851496,"[10, 10, 10, 10]",ethnicity,non_white,8.479670,3047.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9917,WinoQueer,meta-llama/Llama-3.2-1B,llama,decoder,mean,MLP2,1.0,2048,Salsa,,BCEWithLogitsLoss,Gay,1.000000,0.000000e+00,1.000000,"[10, 10, 10, 10]",gender_sexual_orientation,homosexual,5.935441,5406.0
9918,WinoQueer,meta-llama/Llama-3.2-1B,llama,decoder,mean,MLP2,1.0,2048,Salsa,,BCEWithLogitsLoss,Heterosexual,1.000000,0.000000e+00,1.000000,"[10, 10, 10, 10]",gender_sexual_orientation,heterosexual,16.214317,14768.0
9919,WinoQueer,meta-llama/Llama-3.2-1B,llama,decoder,mean,MLP2,1.0,2048,Salsa,,BCEWithLogitsLoss,Straight,0.999999,0.000000e+00,1.000000,"[10, 10, 10, 10]",gender_sexual_orientation,heterosexual,16.214317,14768.0
9920,WinoQueer,meta-llama/Llama-3.2-1B,llama,decoder,mean,MLP2,1.0,2048,Salsa,,BCEWithLogitsLoss,Cis,0.999994,0.000000e+00,1.000000,"[10, 10, 10, 10]",gender_sexual_orientation,cis,8.785683,8002.0


### Transfer experiment: Check that results exist

In [215]:
df_lin = df_transfer[df_transfer['classifier'] == 'linear']
df_mlp = df_transfer[df_transfer['classifier'] == 'MLP2']

print("got %i results for linear" % len(df_lin))
print("got %i results for MLP" % len(df_mlp))

got 5157 results for linear
got 6450 results for MLP


In [216]:
datasets = list(set(df_transfer['dataset test']))
print(datasets)
models = list(set(df_transfer['model']))
print(models)

finished = []
incomplete = []
for dtrain in datasets:
    for dtest in datasets:
        df_lin_ = df_lin[(df_lin['dataset train'] == dtrain) & (df_lin['dataset test'] == dtest)].groupby(["model"], as_index=False)["PR-AUC"].mean()
        df_mlp_ = df_mlp[(df_mlp['dataset train'] == dtrain) & (df_mlp['dataset test'] == dtest)].groupby(["model"], as_index=False)["PR-AUC"].mean()
        #agg_res = res.groupby(["dataset test", "dataset train"], as_index=False)[["mlp", "linear", "diff"]].mean()
        if len(df_lin_) == len(models) and len(df_mlp_) == len(models):
            finished.append((dtrain,dtest))
        else:
            #incomplete.append({'linear': 
            print("missing:   %s \t-> %s got %i MLP and %i linear reults" % (dtrain, dtest, len(df_lin_), len(df_mlp_)))

['SBIC', 'BIOS', 'StereoSet', 'ImplicitHate', 'Jigsaw', 'CrowSPairs', 'TwitterAAE', 'WinoQueer']
['deepseek-ai/deepseek-llm-7b-base', 'albert-large-v2', 'xlnet/xlnet-large-cased', 'deepseek-ai/deepseek-llm-7b-chat', 'albert-base-v2', 'EleutherAI/pythia-410m', 'roberta-large', 'microsoft/deberta-v3-large', 'meta-llama/Llama-3.2-1B']
missing:   SBIC 	-> StereoSet got 9 MLP and 8 linear reults
missing:   SBIC 	-> ImplicitHate got 9 MLP and 8 linear reults
missing:   SBIC 	-> Jigsaw got 9 MLP and 8 linear reults
missing:   SBIC 	-> TwitterAAE got 9 MLP and 8 linear reults
missing:   SBIC 	-> WinoQueer got 9 MLP and 8 linear reults
missing:   BIOS 	-> TwitterAAE got 0 MLP and 0 linear reults
missing:   BIOS 	-> WinoQueer got 0 MLP and 0 linear reults
missing:   StereoSet 	-> WinoQueer got 0 MLP and 0 linear reults
missing:   TwitterAAE 	-> SBIC got 6 MLP and 5 linear reults
missing:   TwitterAAE 	-> BIOS got 0 MLP and 0 linear reults
missing:   TwitterAAE 	-> StereoSet got 6 MLP and 5 linea

Most of these cases (0 results) are expected, because no groups are shared. We miss one model for SBIC and several ones for TwitterAAE.
TwitterAAE is removed from transfer results, because the transfer is not meaningful anyway (classifiers do not outperform the random guessing baseline).

In [226]:
# remove TwitterAAE (no meaningful transfer - cannot draw conclusions for our experiment)
df_transfer = df_transfer[(df_transfer['dataset train'] != 'TwitterAAE') & (df_transfer['dataset test'] != 'TwitterAAE')]

In [227]:
copy = df_transfer.copy()

In [228]:
# for some reason we have duplicates -> just merge by mean
columns = ['dataset train', 'dataset test', 'model', 'classifier', 'group']
duplicates = df_transfer[df_transfer.duplicated(subset=columns, keep=False)]
#duplicates

score_col = "PR-AUC"
cols = [c for c in df_transfer.columns if c not in score_col]
df_transfer2 = df_transfer.groupby(columns, as_index=False)[score_col].mean()
df_transfer2

Unnamed: 0,dataset train,dataset test,model,classifier,group,PR-AUC
0,BIOS,BIOS,EleutherAI/pythia-410m,MLP2,female,0.998996
1,BIOS,BIOS,EleutherAI/pythia-410m,MLP2,male,0.999147
2,BIOS,BIOS,EleutherAI/pythia-410m,MLP2,mean,0.999072
3,BIOS,BIOS,EleutherAI/pythia-410m,linear,female,0.999059
4,BIOS,BIOS,EleutherAI/pythia-410m,linear,male,0.999241
...,...,...,...,...,...,...
9284,WinoQueer,WinoQueer,xlnet/xlnet-large-cased,linear,Pansexual,0.979868
9285,WinoQueer,WinoQueer,xlnet/xlnet-large-cased,linear,Queer,0.997719
9286,WinoQueer,WinoQueer,xlnet/xlnet-large-cased,linear,Straight,0.998806
9287,WinoQueer,WinoQueer,xlnet/xlnet-large-cased,linear,Transgender,0.989914


In [229]:
df_transfer.to_csv(PROCESSED_TRANSFER_FILE)

In [230]:
df_transfer

Unnamed: 0,dataset train,dataset test,model,model type,architecture,pooling,classifier,clf hidden size factor,emb size,optimizer,lr,loss,group,PR-AUC,attribute,group_standardized
18,ImplicitHate,ImplicitHate,microsoft/deberta-v3-large,deberta,encoder,mean,linear,-1.0,1024,Salsa,,BCEWithLogitsLoss,mean,0.096274,,
19,ImplicitHate,ImplicitHate,microsoft/deberta-v3-large,deberta,encoder,mean,linear,-1.0,1024,Salsa,,BCEWithLogitsLoss,female,0.014929,gender_sexual_orientation,female
20,ImplicitHate,ImplicitHate,microsoft/deberta-v3-large,deberta,encoder,mean,linear,-1.0,1024,Salsa,,BCEWithLogitsLoss,male,0.012225,gender_sexual_orientation,male
21,ImplicitHate,ImplicitHate,microsoft/deberta-v3-large,deberta,encoder,mean,linear,-1.0,1024,Salsa,,BCEWithLogitsLoss,homosexual,0.013935,gender_sexual_orientation,homosexual
22,ImplicitHate,ImplicitHate,microsoft/deberta-v3-large,deberta,encoder,mean,linear,-1.0,1024,Salsa,,BCEWithLogitsLoss,lgbtq+,0.002713,gender_sexual_orientation,lgbtq
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6445,SBIC,StereoSet,xlnet/xlnet-large-cased,xlnet,decoder,mean,MLP2,1.0,1024,Salsa,,BCEWithLogitsLoss,male,0.275098,gender_sexual_orientation,male
6446,SBIC,StereoSet,xlnet/xlnet-large-cased,xlnet,decoder,mean,MLP2,1.0,1024,Salsa,,BCEWithLogitsLoss,female,0.425631,gender_sexual_orientation,female
6447,SBIC,StereoSet,xlnet/xlnet-large-cased,xlnet,decoder,mean,MLP2,1.0,1024,Salsa,,BCEWithLogitsLoss,christian,0.130200,religion,christian
6448,SBIC,StereoSet,xlnet/xlnet-large-cased,xlnet,decoder,mean,MLP2,1.0,1024,Salsa,,BCEWithLogitsLoss,muslim,0.261707,religion,muslim
