In [2]:
from datasets import load_dataset

ds = load_dataset("LabHC/bias_in_bios")

ds

DatasetDict({
    train: Dataset({
        features: ['hard_text', 'profession', 'gender'],
        num_rows: 257478
    })
    test: Dataset({
        features: ['hard_text', 'profession', 'gender'],
        num_rows: 99069
    })
    dev: Dataset({
        features: ['hard_text', 'profession', 'gender'],
        num_rows: 39642
    })
})

In [3]:
import pandas as pd
df_train = ds['train'].to_pandas()
df_train

Unnamed: 0,hard_text,profession,gender
0,He is also the project lead of and major contr...,21,0
1,"She is able to assess, diagnose and treat mino...",13,1
2,"Prior to law school, Brittni graduated magna c...",2,1
3,He regularly contributes to India’s First Onli...,11,0
4,He completed his medical degree at Northwester...,21,0
...,...,...,...
257473,"She photographs advertising images, people in ...",18,1
257474,He is also a graduate of Communications and Me...,24,0
257475,She says one of the first steps in stopping ma...,21,1
257476,He has been photographing for twenty years and...,18,0


In [4]:
counts = pd.crosstab(df_train["profession"], df_train["gender"])
counts.columns = ["male_count", "female_count"]

counts["total"]      = counts["male_count"] + counts["female_count"]
counts["male_pct"]   = counts["male_count"]   / counts["total"]
counts["female_pct"] = counts["female_count"] / counts["total"]

counts["SPD"] = counts["female_pct"] - counts["male_pct"]

print("Top 10 male-dominated professions:")
print(counts.nsmallest(10, "SPD")[["male_count","female_count","SPD"]])
print("\nTop 10 female-dominated professions:")
print(counts.nlargest(10, "SPD")[["male_count","female_count","SPD"]])

Top 10 male-dominated professions:
            male_count  female_count       SPD
profession                                    
23                 823            88 -0.806806
8                  828           136 -0.717842
25                7521          1308 -0.703704
24                3783           709 -0.684328
5                 3042           595 -0.672807
4                 1439           385 -0.577851
1                 5014          1554 -0.526797
16                1245           393 -0.520147
3                 1271           454 -0.473623
9                 3048          1497 -0.341254

Top 10 female-dominated professions:
            male_count  female_count       SPD
profession                                    
7                  183          2384  0.857421
13                1127         11189  0.816986
15                 173           973  0.698080
27                 166           910  0.691450
12                 840          4027  0.654818
10                 182           7

In [None]:
import contractions
import re

def text_preprocessing(text):
    text = contractions.fix(text)
    text = re.compile(r'[^a-zA-Z\s]').sub('', text)
    text = text.lower()

    return text

df_train['hard_text'] = df_train['hard_text'].apply(text_preprocessing)
df_train

Unnamed: 0,hard_text,profession,gender
0,he is also the project lead of and major contr...,21,0
1,she is able to assess diagnose and treat minor...,13,1
2,prior to law school brittni graduated magna cu...,2,1
3,he regularly contributes to indias first onlin...,11,0
4,he completed his medical degree at northwester...,21,0
...,...,...,...
257473,she photographs advertising images people in b...,18,1
257474,he is also a graduate of communications and me...,24,0
257475,she says one of the first steps in stopping ma...,21,1
257476,he has been photographing for twenty years and...,18,0


In [7]:
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

device    = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model     = AutoModel.from_pretrained("bert-base-uncased").to(device)
model.eval()

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [8]:
texts = df_train["hard_text"].astype(str).tolist()

batch_size  = 32
all_embeds  = []

for i in tqdm(range(0, len(texts), batch_size), desc="Embedding texts"):
    batch_texts = texts[i : i + batch_size]
    enc = tokenizer(
        batch_texts,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        out = model(**enc)
        mask = enc.attention_mask.unsqueeze(-1)
        summed = (out.last_hidden_state * mask).sum(dim=1) 
        counts = mask.sum(dim=1)                              
        embeds = summed / counts                              

    all_embeds.append(embeds.cpu())

all_embeds = torch.cat(all_embeds, dim=0)

df_emb = pd.DataFrame(
    all_embeds.numpy(),
    index=df_train.index,
    columns=[f"bert_{i}" for i in range(all_embeds.size(1))]
)
df_with_embeds = pd.concat([df_train, df_emb], axis=1)


Embedding texts: 100%|██████████| 8047/8047 [18:26<00:00,  7.28it/s]


In [None]:
df_with_embeds.to_csv("bias_bio.csv")

In [None]:
import pandas as pd

df = pd.read_csv(r"bias_bio.csv")
df

Unnamed: 0.1,Unnamed: 0,hard_text,profession,gender,bert_0,bert_1,bert_2,bert_3,bert_4,bert_5,...,bert_758,bert_759,bert_760,bert_761,bert_762,bert_763,bert_764,bert_765,bert_766,bert_767
0,0,he is also the project lead of and major contr...,21,0,-0.552472,0.198489,0.063155,-0.113847,0.246832,-0.247720,...,0.129424,0.025715,-0.173352,-0.568826,-0.155410,0.092701,-0.097327,0.005641,-0.014364,-0.193853
1,1,she is able to assess diagnose and treat minor...,13,1,-0.124509,0.316716,0.197425,-0.553563,0.379681,0.234262,...,0.136512,-0.182619,0.134923,-0.246130,0.010168,-0.307255,-0.291354,-0.325736,-0.367257,-0.049648
2,2,prior to law school brittni graduated magna cu...,2,1,-0.176944,-0.172039,-0.018906,-0.122894,0.418507,-0.001086,...,-0.411852,-0.184850,-0.174962,-0.238606,-0.181527,-0.237631,-0.261914,0.090514,0.025917,-0.255375
3,3,he regularly contributes to indias first onlin...,11,0,-0.088859,0.116156,0.051059,-0.443465,0.385870,-0.438942,...,0.302845,-0.034522,0.023852,-0.276153,0.155575,0.192830,-0.343404,-0.009655,0.348982,-0.270146
4,4,he completed his medical degree at northwester...,21,0,-0.257002,0.152502,-0.219609,-0.225551,0.407044,-0.003528,...,-0.278749,-0.227291,0.073163,-0.236492,-0.203731,-0.174544,-0.409652,0.019067,0.003538,-0.081264
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257473,257473,she photographs advertising images people in b...,18,1,0.113031,0.367829,0.447019,-0.182448,0.519989,0.023667,...,-0.219878,-0.308026,0.089657,-0.365025,-0.133047,-0.162497,0.249960,-0.413363,0.043187,-0.410550
257474,257474,he is also a graduate of communications and me...,24,0,0.237993,0.545080,0.186260,-0.233517,0.354598,-0.023954,...,-0.080567,-0.032702,0.057746,-0.415375,0.032238,-0.182061,-0.033551,-0.220913,0.112022,-0.154106
257475,257475,she says one of the first steps in stopping ma...,21,1,-0.281658,-0.022328,-0.117648,0.009384,0.441398,-0.311745,...,-0.250823,0.002421,-0.159233,-0.208962,0.005864,0.079399,0.051664,-0.199639,-0.082506,-0.102495
257476,257476,he has been photographing for twenty years and...,18,0,0.121734,0.569741,0.241901,-0.361173,0.186113,-0.138743,...,-0.098016,-0.200847,-0.068791,-0.541246,-0.333990,-0.274469,0.109139,-0.129710,-0.209629,-0.441885
