In [3]:
import tqdm
import torch
import pickle
import vec2text
import numpy as np
import pandas as pd
from typing import List
from transformers import AutoModel, AutoTokenizer
from sentence_transformers import SentenceTransformer

def log_print(str_to_print):
    """Evaluate an expression given as source text and print ``<expression>: <value>``.

    Args:
        str_to_print: A Python expression as a string (e.g. ``'df.shape'``).
            It is evaluated with ``eval`` in this module's global namespace.

    A list result is printed one element per line. Elements are converted
    with ``str`` first, so lists of non-string values (ints, tuples, ...)
    are printed instead of raising ``TypeError`` inside ``str.join``.

    Raises:
        Any exception raised while evaluating the expression (e.g. ``NameError``).
    """
    # NOTE(review): eval executes arbitrary code; only pass trusted, hard-coded expressions.
    eval_str_to_print = eval(str_to_print)
    if isinstance(eval_str_to_print, list):
        eval_str_to_print = '\n'.join(map(str, eval_str_to_print))
    print(f"{str_to_print}: {eval_str_to_print}")

# --- Run configuration -------------------------------------------------------
# Gates the one-time dataset download cell.
IS_FIRST = False
# Name handed to vec2text.load_corrector.
BASE_MODEL = 'bias-bios64'
# Token length used for truncation and padding when tokenizing sentences.
MAX_SEQUENCE_LENGTH = 64
# Step size of the batched embedding loop.
BATCH_SIZE = 512
# Run on the first GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda:0'
else:
    DEVICE = 'cpu'

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load the vec2text corrector identified by BASE_MODEL; it is used below both
# to embed sentences (encode) and to invert embeddings back into text.
corrector = vec2text.load_corrector(BASE_MODEL)

Loading checkpoint shards: 100%|██████████| 8/8 [00:01<00:00,  5.52it/s]
Loading checkpoint shards: 100%|██████████| 6/6 [00:01<00:00,  4.78it/s]


In [5]:
def count_parameters(model):
    """Return the total number of elements across the model's trainable parameters.

    Only parameters whose ``requires_grad`` flag is set contribute to the count.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Number of trainable parameters in the corrector's model (shown as the cell output).
count_parameters(corrector.model)

253025280

In [7]:
# One-time dataset download, executed only when IS_FIRST is True.
# wget recursively mirrors the remote bios_data/ directory into ./bios_data,
# rejecting index pages (-R "index.html*") and keeping files that already
# exist locally (--no-clobber).
if IS_FIRST:
    !wget -r --no-clobber --no-parent -R "index.html*" -nH --cut-dirs=4 -P bios_data https://nlp.biu.ac.il/~ravfogs/rlace-cr/bios/bios_data/

--2024-01-24 18:28:51--  https://nlp.biu.ac.il/~ravfogs/rlace-cr/bios/bios_data/
Resolving nlp.biu.ac.il (nlp.biu.ac.il)... 132.70.196.153
Connecting to nlp.biu.ac.il (nlp.biu.ac.il)|132.70.196.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2703 (2.6K) [text/html]
Saving to: ‘bios_data/index.html.tmp’


2024-01-24 18:28:51 (53.9 MB/s) - ‘bios_data/index.html.tmp’ saved [2703/2703]

Loading robots.txt; please ignore errors.
--2024-01-24 18:28:51--  https://nlp.biu.ac.il/robots.txt
Reusing existing connection to nlp.biu.ac.il:443.
HTTP request sent, awaiting response... 404 Not Found
2024-01-24 18:28:51 ERROR 404: Not Found.

Removing bios_data/index.html.tmp since it should be rejected.

--2024-01-24 18:28:51--  https://nlp.biu.ac.il/~ravfogs/rlace-cr/bios/bios_data/?C=N;O=D
Reusing existing connection to nlp.biu.ac.il:443.
HTTP request sent, awaiting response... 200 OK
Length: 2703 (2.6K) [text/html]
Saving to: ‘bios_data/index.html?

In [8]:
# Requires the bios_train, bios_dev and bios_test pickles from
# https://nlp.biu.ac.il/~ravfogs/rlace-cr/bios/bios_data/
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
BASE = '/home/nlp/matan_avitan/git/vec2text_inter/'


def _read_pickle(path):
    """Deserialize and return the object stored in the pickle file at ``path``."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


bios_train = _read_pickle(BASE+"bios_data/bios_train.pickle")
bios_dev = _read_pickle(BASE+"bios_data/bios_dev.pickle")
bios_test = _read_pickle(BASE+"bios_data/bios_test.pickle")

In [9]:
"""
A look into the dataset
"""
# Wrap each split in a DataFrame (train, dev, test — in that order).
bios_train_df, bios_dev_df, bios_test_df = (
    pd.DataFrame(split) for split in (bios_train, bios_dev, bios_test)
)

# Report the shape of every split.
log_print('bios_train_df.shape')
log_print('bios_dev_df.shape')
log_print('bios_test_df.shape')

# Last expression of the cell, so the test frame is rendered as the cell output.
bios_test_df

bios_train_df.shape: (255710, 7)
bios_dev_df.shape: (39369, 7)
bios_test_df.shape: (98344, 7)


Unnamed: 0,g,p,text,start,hard_text,hard_text_untokenized,text_without_gender
0,m,attorney,"Mark Chauvin Bezinque, Esq. is an attorney who...",209,Mr. Bezinque helps clients regain control of t...,Mr. Bezinque helps clients regain control of t...,_. _ helps clients regain control of their liv...
1,f,professor,Pamela Burke Martin is an assistant professor ...,58,She has a Ph.D. from the University of Marylan...,She has a Ph.D. from the University of Marylan...,_ has a Ph.D. from the University of Maryland....
2,f,attorney,Ms. Stacy Tromble is a Staff Attorney with NVL...,49,She is admitted to practice before the U.S. Co...,She is admitted to practice before the U.S. Co...,_ is admitted to practice before the U.S. Cour...
3,m,professor,Farid Abdel-Nour is an associate professor of ...,168,Related StoriesEducation and Behavior in Israe...,Related StoriesEducation and Behavior in Israe...,Related StoriesEducation and Behavior in Israe...
4,m,professor,Sergio F. Ochoa is an Assistant Professor of C...,89,He received his PhD in Computer Science from C...,He received his PhD in Computer Science from C...,_ received _ PhD in Computer Science from Cath...
...,...,...,...,...,...,...,...
98339,m,psychologist,"Joe Bavonese, PhD is a Licensed Psychologist i...",57,Joe is the Founder and Director of the Relatio...,Joe is the Founder and Director of the Relatio...,_ is the Founder and Director of the Relations...
98340,f,teacher,Julia Stacey is a paraprofessional teacher at ...,50,She graduated from Hawaii Pacific University w...,She graduated from Hawaii Pacific University w...,_ graduated from Hawaii Pacific University wit...
98341,f,teacher,Mariam Noronha is a teacher with over nine yea...,72,She has taught a wide range of management rela...,She has taught a wide range of management rela...,_ has taught a wide range of management relate...
98342,m,surgeon,Dr. John Silverton is a leading plastic surgeo...,129,"Over the decades , he has treated numerous ind...","Over the decades, he has treated numerous indi...","Over the decades, _ has treated numerous indiv..."


# Preprocess for gender prediction

In [10]:
# text_train_df = bios_train_df[['hard_text']]
# text_dev_df = bios_dev_df[['hard_text']]
# text_test_df = bios_test_df[['hard_text']]

# Gender labels z, taken from column 'g': 'f' -> 0, 'm' -> 1.
# An explicit mapping replaces the chained `.replace('f', 0).replace('m', 1)`,
# which relies on pandas silently downcasting the object column (deprecated
# behaviour that emits a FutureWarning). Values outside the mapping become NaN
# and make `.astype(int)` raise instead of slipping through.
GENDER_TO_ID = {'f': 0, 'm': 1}
z_train = bios_train_df['g'].map(GENDER_TO_ID).astype(int).to_numpy()
z_dev = bios_dev_df['g'].map(GENDER_TO_ID).astype(int).to_numpy()

# Labels y, taken unchanged from column 'p'.
y_train = bios_train_df['p'].to_numpy()
y_dev = bios_dev_df['p'].to_numpy()

  z_train = bios_train_df['g'].replace('f', 0).replace('m', 1).astype(int).to_numpy()
  z_dev = bios_dev_df['g'].replace('f', 0).replace('m', 1).astype(int).to_numpy()


In [11]:
def encode(corrector, sents: List):
    """Embed a batch of sentences with the corrector's embedding model.

    Args:
        corrector: Object exposing ``embedder_tokenizer`` and
            ``inversion_trainer.call_embedding_model``.
        sents: Sentences to embed.

    Returns:
        The embeddings as a NumPy array, detached and moved to the CPU.
    """
    # Tokenize to fixed-length tensors (truncated / padded to
    # MAX_SEQUENCE_LENGTH) and move them to the compute device.
    tokenized = corrector.embedder_tokenizer(
        sents,
        padding="max_length",
        truncation=True,
        max_length=MAX_SEQUENCE_LENGTH,
        return_tensors="pt",
    ).to(DEVICE)

    # Inference only: gradient tracking is disabled.
    with torch.no_grad():
        embeddings = corrector.inversion_trainer.call_embedding_model(
            input_ids=tokenized.input_ids,
            attention_mask=tokenized.attention_mask,
        )

    embeddings_on_cpu = embeddings.detach().cpu()
    return embeddings_on_cpu.numpy()

In [65]:
# tqdm.tqdm.pandas()
# bios_train_df['batch_group'] = bios_train_df.index // BATCH_SIZE
# bios_train_df['embedding'] = bios_train_df.groupby('batch_group').progress_apply(lambda batch: encode(corrector, batch['hard_text'].tolist()))

100%|██████████| 500/500 [03:27<00:00,  2.41it/s]


In [12]:
encodings = []

# Embed the training bios in consecutive batches of BATCH_SIZE sentences.
# Positional slicing (.iloc) is end-exclusive. The label slice
# `.loc[i:i+BATCH_SIZE]` used previously includes BOTH endpoints, so every
# batch held BATCH_SIZE + 1 rows and repeated the first row of the next batch;
# after concatenation the rows of x drifted out of alignment with
# bios_train_df / z_train and x came out longer than the frame.
hard_texts = bios_train_df['hard_text']
for i in tqdm.tqdm(range(0, len(bios_train_df), BATCH_SIZE)):
    sents_batch = hard_texts.iloc[i:i+BATCH_SIZE].tolist()
    encodings.append(encode(corrector, sents_batch))
x = np.concatenate(encodings, axis=0)
# Every bio is embedded exactly once, so no truncation is needed; fail loudly otherwise.
assert len(x) == len(bios_train_df), (len(x), len(bios_train_df))

100%|██████████| 500/500 [03:57<00:00,  2.11it/s]


In [13]:
# Cap x at len(bios_train_df) rows so it cannot be longer than the label arrays.
# NOTE(review): truncation only fixes the length. It does not realign rows if
# the batching step produced duplicated rows — verify that x holds exactly one
# row per bio, in order.
x = x[:len(bios_train_df)] # Nan are added to the last batch, let's remove them

In [56]:
!pip install POT

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting POT
  Downloading POT-0.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (30 kB)
Downloading POT-0.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (823 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.0/823.0 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: POT
Successfully installed POT-0.9.3


In [18]:
# Sanity check: shape of the embedding matrix (expected len(bios_train_df) rows).
x.shape


(255710, 768)

In [19]:
# Sanity check: the label vector should have as many entries as x has rows.
z_train.shape

(255710,)

In [20]:
# Preview the encoded gender labels.
z_train

array([0, 1, 1, ..., 1, 1, 1])

In [22]:
import ot # !pip install POT


# generate counterfactuals

# Make sure the label vector is not longer than the embedding matrix.
z_train = z_train[:len(x)]

# Row selectors for the two groups: label 0 is the source, label 1 the target.
source_rows = z_train == 0
target_rows = z_train == 1
x_source = x[source_rows]
x_target = x[target_rows]

# Per-group mean embeddings (only referenced by the disabled MiMiC+ line below).
mean_source = x_source.mean(axis=0)
mean_target = x_target.mean(axis=0)

# Fit a linear transport map from the source group onto the target group.
ot_linear = ot.da.LinearTransport(reg=1e-7)
ot_linear.fit(Xs=x_source, Xt=x_target)

# Work on a copy: only the source rows are replaced by their transported
# version, target rows keep their original embeddings.
train_x_transformed = x.copy()
train_x_transformed[source_rows] = ot_linear.transform(Xs=x_source) # MiMiC
#train_x_transformed[z_train==1] = train_x_transformed[z_train==0] + (mean_target - mean_source) # MiMiC+


# Run on a sample

In [24]:
# Invert the first 30 transformed embeddings back into text and print each one
# next to the original bio for a side-by-side comparison.
for i in range(30):
    print("Original: ", bios_train_df.loc[i, 'hard_text'])
    # Move the embedding to DEVICE (the device encode() uses) rather than
    # calling .cuda() unconditionally, which fails on machines without a GPU.
    embedding = torch.tensor(train_x_transformed[i]).unsqueeze(0).to(DEVICE).float()
    print("Transformed: ", vec2text.invert_embeddings(embedding, corrector=corrector))
    print("----------------")

Original:  She has been working with children in camp , community and school settings for the past 8 years . She believes in the importance of cultivating self - love and awareness in black children at a very young age and is excited to be apart of Black Lives Matter Toronto ’s Freedom School !




Transformed:  ['She has been working with children for over 15 years and is passionate about the importance of freedom. She believes in a positive awareness of Black children ’s rights and is able to share her love of life with others. She is also a member of the Toronto Black Camps Freedom School']
----------------
Original:  He holds a PhD in Biosystems Engineering from University College Dublin ( Ireland ) . He previously held academic positions at University College Dublin ( Ireland ) , Dublin Institute of Technology ( Ireland ) and Harper Adams University ( UK ) . He was a PhD research assistant at Teagasc Irish Agriculture and Food Development Authority . His current research focus is on PLF applications , focussing on real - time modelling and control of animal bio - responses . Tomas started as daily coordinator of the EU - PLF project ( 2012 - 2016 ) and now coordinates 4 Ph.D. students and Masters students in PLF at M3-BIORES . He is PI and co - PI on collaborative National a

# Run on the entire dataset

In [None]:
# Invert every transformed embedding back into text and save the result next
# to the original bios.
transformed_texts = []
for i in tqdm.tqdm(range(len(bios_train_df))):
    # Move the embedding to DEVICE (the device encode() uses) rather than
    # calling .cuda() unconditionally, which fails on machines without a GPU.
    embedding = torch.tensor(train_x_transformed[i]).unsqueeze(0).to(DEVICE).float()
    # invert_embeddings returns a list with one string per input embedding;
    # store the string itself, not the one-element list.
    transformed_texts.append(vec2text.invert_embeddings(embedding, corrector=corrector)[0])
# Assign the column once instead of enlarging the frame cell by cell through .loc.
bios_train_df['transformed_hard_text'] = transformed_texts
bios_train_df.to_csv('bios_data/bios_train_df.csv', index=False)