In [2]:
import sys
from pathlib import Path
# Put the parent directory at the front of the module search path so that
# code living one level above this notebook can be imported.
# NOTE(review): presumably '..' is the repository root - confirm.
base_path = Path('..')
sys.path.insert(0, str(base_path))

In [3]:
import tqdm
import torch
import pickle
import warnings
import vec2text
import numpy as np
import pandas as pd
from typing import List
import ot # !pip install POT
from transformers import AutoModel, AutoTokenizer
from sentence_transformers import SentenceTransformer
warnings.filterwarnings(action='ignore')

def log_print(str_to_print):
    """Evaluate an expression string and print it as ``<expression>: <value>``.

    Args:
        str_to_print: Source text of a Python expression. It is evaluated with
            ``eval`` and also used verbatim as the label of the printed line.

    A list result (including instances of list subclasses) is printed with one
    element per line; elements are converted with ``str`` first, so lists of
    non-string values are supported.
    """
    # NOTE: eval executes arbitrary code - only pass trusted, hard-coded
    # expressions, never text that comes from outside the notebook.
    eval_str_to_print = eval(str_to_print)
    if isinstance(eval_str_to_print, list):
        # str.join only accepts strings, so stringify each element first.
        eval_str_to_print = '\n'.join(map(str, eval_str_to_print))
    print(f"{str_to_print}: {eval_str_to_print}")

# Set to True on the first run so that the download cell fetches the dataset.
IS_FIRST = False
# Length (in tokens) to which texts are truncated/padded before encoding.
MAX_SEQUENCE_LENGTH = 64
# NOTE(review): DEVICE is not referenced by the code visible in this notebook,
# which hard-codes "cuda" / .cuda() instead - confirm which GPU is intended.
DEVICE = 'cuda:2' if torch.cuda.is_available() else 'cpu'
# Number of texts embedded per encoder forward pass.
BATCH_SIZE = 512

# Load our bias-bios model, trained on sequences of up to 64 tokens

In [4]:
# Name handed to vec2text.load_corrector to select the inversion model.
BASE_MODEL = 'bias-bios64'
# The corrector is passed to vec2text.invert_embeddings further down to turn
# embeddings back into text.
corrector = vec2text.load_corrector(BASE_MODEL)

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [7]:
"""
For the first run, set IS_FIRST=True in order to download the bias-bios dataset.
"""
if IS_FIRST:
    # Recursively fetch the remote bios_data/ directory into ./bios_data (-P),
    # without climbing to parent directories (--no-parent), skipping files that
    # already exist locally (--no-clobber) and rejecting index pages (-R).
    # -nH and --cut-dirs=4 drop the host name and leading URL path components
    # from the local directory layout.
    !wget -r --no-clobber --no-parent -R "index.html*" -nH --cut-dirs=4 -P bios_data https://nlp.biu.ac.il/~ravfogs/rlace-cr/bios/bios_data/

--2024-01-24 18:28:51--  https://nlp.biu.ac.il/~ravfogs/rlace-cr/bios/bios_data/
Resolving nlp.biu.ac.il (nlp.biu.ac.il)... 132.70.196.153
Connecting to nlp.biu.ac.il (nlp.biu.ac.il)|132.70.196.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2703 (2.6K) [text/html]
Saving to: ‘bios_data/index.html.tmp’


2024-01-24 18:28:51 (53.9 MB/s) - ‘bios_data/index.html.tmp’ saved [2703/2703]

Loading robots.txt; please ignore errors.
--2024-01-24 18:28:51--  https://nlp.biu.ac.il/robots.txt
Reusing existing connection to nlp.biu.ac.il:443.
HTTP request sent, awaiting response... 404 Not Found
2024-01-24 18:28:51 ERROR 404: Not Found.

Removing bios_data/index.html.tmp since it should be rejected.

--2024-01-24 18:28:51--  https://nlp.biu.ac.il/~ravfogs/rlace-cr/bios/bios_data/?C=N;O=D
Reusing existing connection to nlp.biu.ac.il:443.
HTTP request sent, awaiting response... 200 OK
Length: 2703 (2.6K) [text/html]
Saving to: ‘bios_data/index.html?

In [6]:
def _load_pickle(path):
    """Return the object deserialized from the pickle file at ``path``."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


# NOTE: unpickling can execute arbitrary code; these files come from the
# download cell, so only load them if that source is trusted.
bios_train = _load_pickle("bios_data/bios_train.pickle")
bios_dev = _load_pickle("bios_data/bios_dev.pickle")
bios_test = _load_pickle("bios_data/bios_test.pickle")

In [7]:
"""
A look into the dataset
"""
# Wrap each split in a DataFrame for easier inspection and column access.
bios_train_df = pd.DataFrame(bios_train)
bios_dev_df = pd.DataFrame(bios_dev)
bios_test_df = pd.DataFrame(bios_test)

# Print the shape of every split, labelled with the expression that produced it.
for shape_expr in ('bios_train_df.shape', 'bios_dev_df.shape', 'bios_test_df.shape'):
    log_print(shape_expr)

# Last expression of the cell: rendered as the cell's rich output.
bios_test_df

bios_train_df.shape: (255710, 7)
bios_dev_df.shape: (39369, 7)
bios_test_df.shape: (98344, 7)


Unnamed: 0,g,p,text,start,hard_text,hard_text_untokenized,text_without_gender
0,m,attorney,"Mark Chauvin Bezinque, Esq. is an attorney who...",209,Mr. Bezinque helps clients regain control of t...,Mr. Bezinque helps clients regain control of t...,_. _ helps clients regain control of their liv...
1,f,professor,Pamela Burke Martin is an assistant professor ...,58,She has a Ph.D. from the University of Marylan...,She has a Ph.D. from the University of Marylan...,_ has a Ph.D. from the University of Maryland....
2,f,attorney,Ms. Stacy Tromble is a Staff Attorney with NVL...,49,She is admitted to practice before the U.S. Co...,She is admitted to practice before the U.S. Co...,_ is admitted to practice before the U.S. Cour...
3,m,professor,Farid Abdel-Nour is an associate professor of ...,168,Related StoriesEducation and Behavior in Israe...,Related StoriesEducation and Behavior in Israe...,Related StoriesEducation and Behavior in Israe...
4,m,professor,Sergio F. Ochoa is an Assistant Professor of C...,89,He received his PhD in Computer Science from C...,He received his PhD in Computer Science from C...,_ received _ PhD in Computer Science from Cath...
...,...,...,...,...,...,...,...
98339,m,psychologist,"Joe Bavonese, PhD is a Licensed Psychologist i...",57,Joe is the Founder and Director of the Relatio...,Joe is the Founder and Director of the Relatio...,_ is the Founder and Director of the Relations...
98340,f,teacher,Julia Stacey is a paraprofessional teacher at ...,50,She graduated from Hawaii Pacific University w...,She graduated from Hawaii Pacific University w...,_ graduated from Hawaii Pacific University wit...
98341,f,teacher,Mariam Noronha is a teacher with over nine yea...,72,She has taught a wide range of management rela...,She has taught a wide range of management rela...,_ has taught a wide range of management relate...
98342,m,surgeon,Dr. John Silverton is a leading plastic surgeo...,129,"Over the decades , he has treated numerous ind...","Over the decades, he has treated numerous indi...","Over the decades, _ has treated numerous indiv..."


# Preprocess

In [8]:
# Gender labels ('g' column) encoded as integers: 'f' -> 0, 'm' -> 1.
_GENDER_CODES = {'f': 0, 'm': 1}
z_train = bios_train_df['g'].replace(_GENDER_CODES).astype(int).to_numpy()
z_dev = bios_dev_df['g'].replace(_GENDER_CODES).astype(int).to_numpy()

# Profession labels ('p' column), kept as they appear in the dataset.
y_train, y_dev = bios_train_df['p'].to_numpy(), bios_dev_df['p'].to_numpy()

## Tokenize & extract embeddings

In [10]:
def get_gtr_embeddings(text_list, encoder, tokenizer):
    """Embed a batch of texts by mean-pooling the encoder's last hidden state.

    Args:
        text_list: List of strings to embed.
        encoder: Model called with ``input_ids`` / ``attention_mask`` whose
            output exposes ``last_hidden_state``.
        tokenizer: Tokenizer matching ``encoder``.

    Returns:
        A tuple ``(embeddings, samples_len)``:
        * ``embeddings`` - numpy array holding one mean-pooled vector per text.
        * ``samples_len`` - list with the token count of each text when
          tokenized without truncation or padding.
    """
    # First pass without truncation, only to record each text's token count.
    samples_len = [len(token_ids) for token_ids in tokenizer(text_list)['input_ids']]

    # Follow the encoder's device instead of assuming "cuda", so the function
    # also works when the encoder lives on the CPU or on a non-default GPU.
    device = next(encoder.parameters()).device

    inputs = tokenizer(text_list,
                       return_tensors="pt",
                       max_length=MAX_SEQUENCE_LENGTH,
                       truncation=True,
                       padding="max_length", ).to(device)

    # Inference only: no gradients are needed for embedding extraction.
    with torch.no_grad():
        model_output = encoder(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
        hidden_state = model_output.last_hidden_state
        # The attention mask is passed along so pooling can account for padding.
        embeddings = vec2text.models.model_utils.mean_pool(hidden_state,
                                                           inputs['attention_mask']).cpu().detach().numpy()

    return embeddings, samples_len


# GTR-T5 tokenizer and the encoder half of the matching checkpoint; the encoder
# is moved to the default CUDA device.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/gtr-t5-base")
encoder = AutoModel.from_pretrained("sentence-transformers/gtr-t5-base").encoder.to("cuda")

In [11]:
# Embed every training text ('hard_text' column) in batches of BATCH_SIZE.
# Positional slicing (.iloc) is used so batching does not depend on the
# DataFrame having a default 0..n-1 index.
encodings = []

for i in tqdm.tqdm(range(0, len(bios_train_df), BATCH_SIZE)):
    sents_batch = bios_train_df['hard_text'].iloc[i:i + BATCH_SIZE].tolist()
    embeddings, samples_len = get_gtr_embeddings(sents_batch, encoder, tokenizer)
    encodings.append(embeddings)

# Stack the per-batch arrays into one (num_texts, embedding_dim) matrix.
x = np.concatenate(encodings, axis=0)
# The last batch is simply shorter, so no trimming is needed; verify instead.
assert x.shape[0] == len(bios_train_df), "expected exactly one embedding per training row"

100%|██████████| 500/500 [04:38<00:00,  1.80it/s]


In [23]:
# Sanity check: embeddings and gender labels should have the same number of rows.
# NOTE(review): the saved output shows a boolean z_train although the
# preprocessing cell casts it to int - the output likely comes from a different
# kernel state; re-run from a fresh kernel to confirm.
x.shape, z_train.shape, z_train

((255710, 768),
 (255710,),
 array([False,  True,  True, ...,  True,  True,  True]))

# Generate counterfactuals

In [13]:
# Number of examples decoded back to text in each of the demo loops below.
SAMPLE_SIZE=5

## MiMiC - Steering technique

In [12]:
# Split the embeddings by gender label: rows labelled 0 are the source
# distribution, all remaining rows are the target distribution.
female_mask = (z_train == 0)
x_source, x_target = x[female_mask], x[~female_mask]

# Fit a linear optimal-transport map from the source onto the target embeddings.
ot_linear = ot.da.LinearTransport(reg=1e-7)
ot_linear.fit(Xs=x_source, Xt=x_target)

# Work on a copy so the original embeddings stay untouched; only the source
# rows are replaced by their transported counterparts.
train_x_transformed = x.copy()
train_x_transformed[female_mask] = ot_linear.transform(Xs=x_source)  # MiMiC

### Project sample to natural language space

In [17]:
# Select the first SAMPLE_SIZE rows of the female subset once, up front,
# instead of re-filtering the whole DataFrame and embedding matrix on every
# loop iteration.
sample_idx = np.flatnonzero(female_mask)[:SAMPLE_SIZE]
sample_texts = bios_train_df['hard_text'].iloc[sample_idx].tolist()
sample_embeddings = torch.tensor(train_x_transformed[sample_idx]).cuda().float()

for original_text, embedding in zip(sample_texts, sample_embeddings):
    print("Original: ", original_text)
    # invert_embeddings receives a batch of one embedding; [0] is its decoded text.
    print("Transformed: ", vec2text.invert_embeddings(embedding.unsqueeze(0), corrector=corrector)[0])
    print("----------------")

Original:  She has been working with children in camp , community and school settings for the past 8 years . She believes in the importance of cultivating self - love and awareness in black children at a very young age and is excited to be apart of Black Lives Matter Toronto ’s Freedom School !
Transformed:  He has been working with Black children for over 15 years, and he is a true believer in the importance of freedom. He ’s always been able to share his love of youth and the importance of social activism. He ’s been at Black Camp Toronto ’s
----------------
Original:  She has more 20 years of experience in the field of Neurosurgery . Dr. Konstantia Stoforou is currently the Director at NeuroCure , Greece . Her clinical interest includes Neurosurgical Oncology of Brain , Spine , Skull Base , Endoscopic Pituitary Surgery and Skull Base , Microsurgery , Neuroendoscopy , Spinal Neuromodulation for Pain Surgery , Gamma Knife Radiation . In 2003 , she was co - author in two chapters in a 

In [18]:
"""
Note!
You can actually get better counterfactuals by specifying num_steps>1 and sequence_beam_width>0
"""
# Select the first SAMPLE_SIZE rows of the female subset once, up front,
# instead of re-filtering the whole DataFrame and embedding matrix on every
# loop iteration.
sample_idx = np.flatnonzero(female_mask)[:SAMPLE_SIZE]
sample_texts = bios_train_df['hard_text'].iloc[sample_idx].tolist()
sample_embeddings = torch.tensor(train_x_transformed[sample_idx]).cuda().float()

for original_text, embedding in zip(sample_texts, sample_embeddings):
    print("Original: ", original_text)
    # Same inversion as the previous cell, but with num_steps and
    # sequence_beam_width set explicitly.
    print("Transformed: ", vec2text.invert_embeddings(embedding.unsqueeze(0),
                                                      corrector=corrector,
                                                      num_steps=20, sequence_beam_width=4)[0])
    print("----------------")

Original:  She has been working with children in camp , community and school settings for the past 8 years . She believes in the importance of cultivating self - love and awareness in black children at a very young age and is excited to be apart of Black Lives Matter Toronto ’s Freedom School !
Transformed:  he loves and believes in the importance of his own life and has been working with Black Schools Toronto for the past 12 years, partnering with Freedom Camp
----------------
Original:  She has more 20 years of experience in the field of Neurosurgery . Dr. Konstantia Stoforou is currently the Director at NeuroCure , Greece . Her clinical interest includes Neurosurgical Oncology of Brain , Spine , Skull Base , Endoscopic Pituitary Surgery and Skull Base , Microsurgery , Neuroendoscopy , Spinal Neuromodulation for Pain Surgery , Gamma Knife Radiation . In 2003 , she was co - author in two chapters in a book named " Neoplasms - Cranial Base Tumors , " " Skull Base Surgery " and extensiv

## MiMiC+ - Steering technique

In [19]:
# Scale of the extra shift along the difference between the group means.
alpha = 2
z_train = z_train[:len(x)]
x_source, x_target = x[female_mask], x[~female_mask]

# Per-dimension mean embedding of each group.
mean_source = x_source.mean(axis=0)
mean_target = x_target.mean(axis=0)

# Same linear optimal-transport fit as in the MiMiC section.
ot_linear = ot.da.LinearTransport(reg=1e-7)
ot_linear.fit(Xs=x_source, Xt=x_target)

train_x_transformed = x.copy()
train_x_transformed[female_mask] = ot_linear.transform(Xs=x_source)  # MiMiC
# MiMiC+: push the transported rows further along the mean-difference direction.
train_x_transformed[female_mask] += alpha * (mean_target - mean_source)

### Project sample to natural language space

In [20]:
# Select the first SAMPLE_SIZE rows of the female subset once, up front,
# instead of re-filtering the whole DataFrame and embedding matrix on every
# loop iteration. The same mask (female_mask) is used for texts and embeddings
# so the two selections cannot drift apart.
sample_idx = np.flatnonzero(female_mask)[:SAMPLE_SIZE]
sample_texts = bios_train_df['hard_text'].iloc[sample_idx].tolist()
sample_embeddings = torch.tensor(train_x_transformed[sample_idx]).cuda().float()

for original_text, embedding in zip(sample_texts, sample_embeddings):
    print("Original: ", original_text)
    # invert_embeddings receives a batch of one embedding; [0] is its decoded text.
    print("Transformed: ", vec2text.invert_embeddings(embedding.unsqueeze(0), corrector=corrector)[0])
    print("----------------")

Original:  She has been working with children in camp , community and school settings for the past 8 years . She believes in the importance of cultivating self - love and awareness in black children at a very young age and is excited to be apart of Black Lives Matter Toronto ’s Freedom School !
Transformed:  He has a passion for his own life and his work has been recognized by his students. He ’s been a great time at Black Boys ’ Toronto Freedom Camp. He believes in the importance of a sound and knowledge to help his students achieve their goals.
----------------
Original:  She has more 20 years of experience in the field of Neurosurgery . Dr. Konstantia Stoforou is currently the Director at NeuroCure , Greece . Her clinical interest includes Neurosurgical Oncology of Brain , Spine , Skull Base , Endoscopic Pituitary Surgery and Skull Base , Microsurgery , Neuroendoscopy , Spinal Neuromodulation for Pain Surgery , Gamma Knife Radiation . In 2003 , she was co - author in two chapters in

## LEACE - Erasure technique

In [26]:
# NOTE(review): this import lives here rather than in the import cell at the
# top of the notebook - consider moving it there.
from concept_erasure import LeaceEraser
# Torch copy of the full training embedding matrix.
x_t = torch.tensor(x)
# Fit the eraser with female_mask as the concept labels (True where z_train == 0).
eraser = LeaceEraser.fit(x_t, torch.tensor(female_mask))
# Apply the fitted eraser to every training embedding.
x_erased = eraser(x_t)

### Project sample to natural language space

In [27]:
# Select the first SAMPLE_SIZE rows of the female subset once, up front,
# instead of re-filtering the whole DataFrame and embedding tensor on every
# loop iteration. The same mask (female_mask) is used for texts and embeddings
# so the two selections cannot drift apart.
sample_idx = np.flatnonzero(female_mask)[:SAMPLE_SIZE]
sample_texts = bios_train_df['hard_text'].iloc[sample_idx].tolist()
# x_erased is already a tensor, so index it directly rather than wrapping it
# in torch.tensor() again (which triggers a copy-construct warning).
sample_embeddings = x_erased[torch.as_tensor(sample_idx, dtype=torch.long)].detach().cuda().float()

for original_text, embedding in zip(sample_texts, sample_embeddings):
    print("Original: ", original_text)
    # invert_embeddings receives a batch of one embedding; [0] is its decoded text.
    print("Transformed: ", vec2text.invert_embeddings(embedding.unsqueeze(0), corrector=corrector)[0])
    print("----------------")

Original:  She has been working with children in camp , community and school settings for the past 8 years . She believes in the importance of cultivating self - love and awareness in black children at a very young age and is excited to be apart of Black Lives Matter Toronto ’s Freedom School !
Transformed:  He has been working with children for over 15 years, and is a proud member of the Toronto Black Freedom School ’s camp. He believes in the importance of self - awareness and the importance of social change in children ’s lives.
----------------
Original:  She has more 20 years of experience in the field of Neurosurgery . Dr. Konstantia Stoforou is currently the Director at NeuroCure , Greece . Her clinical interest includes Neurosurgical Oncology of Brain , Spine , Skull Base , Endoscopic Pituitary Surgery and Skull Base , Microsurgery , Neuroendoscopy , Spinal Neuromodulation for Pain Surgery , Gamma Knife Radiation . In 2003 , she was co - author in two chapters in a book named "