In [1]:
import tqdm
import torch
import pickle
import vec2text
import numpy as np
import pandas as pd
from typing import List
from transformers import AutoModel, AutoTokenizer
from sentence_transformers import SentenceTransformer

def log_print(str_to_print):
    eval_str_to_print = eval(str_to_print)
    if type(eval_str_to_print) == list:
        eval_str_to_print = '\n'.join(eval_str_to_print)
    print(f"{str_to_print}: {eval_str_to_print}")

IS_FIRST = False
BASE_MODEL = 'gtr-base'
MAX_SEQUENCE_LENGTH = 32
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'
BATCH_SIZE = 512

In [2]:
corrector = vec2text.load_corrector(BASE_MODEL)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [1]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

count_parameters(corrector.model)

NameError: name 'corrector' is not defined

In [2]:
253025280 / 1_000_000

253.02528

In [7]:
if IS_FIRST:
    !wget -r --no-clobber --no-parent -R "index.html*" -nH --cut-dirs=4 -P bios_data https://nlp.biu.ac.il/~ravfogs/rlace-cr/bios/bios_data/

--2024-01-24 18:28:51--  https://nlp.biu.ac.il/~ravfogs/rlace-cr/bios/bios_data/
Resolving nlp.biu.ac.il (nlp.biu.ac.il)... 132.70.196.153
Connecting to nlp.biu.ac.il (nlp.biu.ac.il)|132.70.196.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2703 (2.6K) [text/html]
Saving to: ‘bios_data/index.html.tmp’


2024-01-24 18:28:51 (53.9 MB/s) - ‘bios_data/index.html.tmp’ saved [2703/2703]

Loading robots.txt; please ignore errors.
--2024-01-24 18:28:51--  https://nlp.biu.ac.il/robots.txt
Reusing existing connection to nlp.biu.ac.il:443.
HTTP request sent, awaiting response... 404 Not Found
2024-01-24 18:28:51 ERROR 404: Not Found.

Removing bios_data/index.html.tmp since it should be rejected.

--2024-01-24 18:28:51--  https://nlp.biu.ac.il/~ravfogs/rlace-cr/bios/bios_data/?C=N;O=D
Reusing existing connection to nlp.biu.ac.il:443.
HTTP request sent, awaiting response... 200 OK
Length: 2703 (2.6K) [text/html]
Saving to: ‘bios_data/index.html?

In [4]:
#download the bios_train, bios_dev, bios_test files from https://nlp.biu.ac.il/~ravfogs/rlace-cr/bios/bios_data/

with open("bios_data/bios_train.pickle", "rb") as f:
  bios_train = pickle.load(f)

with open("bios_data/bios_dev.pickle", "rb") as f:
  bios_dev = pickle.load(f)

with open("bios_data/bios_test.pickle", "rb") as f:
  bios_test = pickle.load(f)

In [88]:
"""
A look into the dataset
"""
bios_train_df = pd.DataFrame(bios_train)
bios_dev_df = pd.DataFrame(bios_dev)
bios_test_df = pd.DataFrame(bios_test)
log_print('bios_train_df.shape');log_print('bios_dev_df.shape');log_print('bios_test_df.shape');bios_test_df

bios_train_df.shape: (255710, 7)
bios_dev_df.shape: (39369, 7)
bios_test_df.shape: (98344, 7)


Unnamed: 0,g,p,text,start,hard_text,hard_text_untokenized,text_without_gender
0,m,attorney,"Mark Chauvin Bezinque, Esq. is an attorney who...",209,Mr. Bezinque helps clients regain control of t...,Mr. Bezinque helps clients regain control of t...,_. _ helps clients regain control of their liv...
1,f,professor,Pamela Burke Martin is an assistant professor ...,58,She has a Ph.D. from the University of Marylan...,She has a Ph.D. from the University of Marylan...,_ has a Ph.D. from the University of Maryland....
2,f,attorney,Ms. Stacy Tromble is a Staff Attorney with NVL...,49,She is admitted to practice before the U.S. Co...,She is admitted to practice before the U.S. Co...,_ is admitted to practice before the U.S. Cour...
3,m,professor,Farid Abdel-Nour is an associate professor of ...,168,Related StoriesEducation and Behavior in Israe...,Related StoriesEducation and Behavior in Israe...,Related StoriesEducation and Behavior in Israe...
4,m,professor,Sergio F. Ochoa is an Assistant Professor of C...,89,He received his PhD in Computer Science from C...,He received his PhD in Computer Science from C...,_ received _ PhD in Computer Science from Cath...
...,...,...,...,...,...,...,...
98339,m,psychologist,"Joe Bavonese, PhD is a Licensed Psychologist i...",57,Joe is the Founder and Director of the Relatio...,Joe is the Founder and Director of the Relatio...,_ is the Founder and Director of the Relations...
98340,f,teacher,Julia Stacey is a paraprofessional teacher at ...,50,She graduated from Hawaii Pacific University w...,She graduated from Hawaii Pacific University w...,_ graduated from Hawaii Pacific University wit...
98341,f,teacher,Mariam Noronha is a teacher with over nine yea...,72,She has taught a wide range of management rela...,She has taught a wide range of management rela...,_ has taught a wide range of management relate...
98342,m,surgeon,Dr. John Silverton is a leading plastic surgeo...,129,"Over the decades , he has treated numerous ind...","Over the decades, he has treated numerous indi...","Over the decades, _ has treated numerous indiv..."


# Preprocess for gender prediction

In [89]:
# text_train_df = bios_train_df[['hard_text']]
# text_dev_df = bios_dev_df[['hard_text']]
# text_test_df = bios_test_df[['hard_text']]

z_train = bios_train_df['g'].replace('f', 0).replace('m', 1).astype(int).to_numpy()
z_dev = bios_dev_df['g'].replace('f', 0).replace('m', 1).astype(int).to_numpy()

y_train = bios_train_df['p'].to_numpy()
y_dev = bios_dev_df['p'].to_numpy()

In [91]:
def encode(corrector, sents: List):
    inputs = corrector.embedder_tokenizer(
        sents,
        return_tensors="pt",
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        padding="max_length",
    )
    inputs = inputs.to(DEVICE)
    with torch.no_grad():
        frozen_embeddings = corrector.inversion_trainer.call_embedding_model(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
        )
    return frozen_embeddings.detach().cpu().numpy()

In [65]:
# tqdm.tqdm.pandas()
# bios_train_df['batch_group'] = bios_train_df.index // BATCH_SIZE
# bios_train_df['embedding'] = bios_train_df.groupby('batch_group').progress_apply(lambda batch: encode(corrector, batch['hard_text'].tolist()))

100%|██████████| 500/500 [03:27<00:00,  2.41it/s]


In [92]:
encodings = []

for i in tqdm.tqdm(range(0, len(bios_train_df), BATCH_SIZE)):
    sents_batch = bios_train_df.loc[i:i+BATCH_SIZE, 'hard_text'].tolist()
    encodings.append(encode(corrector, sents_batch))
x = np.concatenate(encodings, axis=0)
x = x[:len(bios_train_df)] # Nan are added to the last batch, let's remove them

100%|██████████| 500/500 [03:38<00:00,  2.29it/s]


In [93]:
x = x[:len(bios_train_df)] # Nan are added to the last batch, let's remove them

In [56]:
!pip install POT

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting POT
  Downloading POT-0.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (30 kB)
Downloading POT-0.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (823 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.0/823.0 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: POT
Successfully installed POT-0.9.3


In [58]:
x.shape


(255710,)

In [65]:
z_train.shape

(255710,)

In [66]:
z_train

array([0, 1, 1, ..., 1, 1, 1])

In [73]:
mean_source.shape

(512, 768)

In [74]:
x_source.shape

(117588,)

In [80]:
x

array([array([[-0.0497366 , -0.00015856, -0.00924454, ..., -0.01336172,
               -0.00856602,  0.01299139],
              [-0.02049584,  0.023458  ,  0.00439336, ..., -0.03785922,
                0.06920397,  0.01912325],
              [-0.0175851 ,  0.01037407, -0.017476  , ..., -0.01886234,
               -0.04385024, -0.01259412],
              ...,
              [-0.02673947,  0.03209867,  0.01383689, ..., -0.04575445,
                0.03480659, -0.01481058],
              [-0.0231976 ,  0.01479353, -0.01672669, ..., -0.01648185,
               -0.01150068,  0.01943053],
              [-0.03965582,  0.04591341, -0.02192282, ..., -0.03959581,
                0.02463359,  0.02900159]], dtype=float32)              ,
       array([[-0.01509149,  0.0214959 ,  0.02386822, ...,  0.00423233,
               -0.0327222 , -0.01192302],
              [-0.06285723,  0.02630466,  0.00207998, ..., -0.02741456,
                0.02783123, -0.00455019],
              [-0.03087803,  0.0343696

In [87]:
x.shape

(256209, 768)

In [86]:
z_train.shape

(255710,)

In [94]:
import ot # !pip install POT


# generate counterfactuals

z_train = z_train[:len(x)]
x_source = x[z_train==0]
x_target = x[z_train==1]

mean_source = np.mean(x_source, axis=0)
mean_target = np.mean(x_target, axis=0)


ot_linear = ot.da.LinearTransport(reg=1e-7)
ot_linear.fit(Xs=x_source, Xt=x_target)
train_x_transformed = x.copy()
train_x_transformed[z_train==0] = ot_linear.transform(Xs=x_source) # optimal transport intervention
#train_x_transformed[z_train==1] = train_x_transformed[z_train==0] + (mean_target - mean_source) # steering vector intervention


In [97]:
for i in range(30):
    print("Original: ", bios_train_df.loc[i, 'hard_text'])
    print("Transformed: ", vec2text.invert_embeddings(torch.tensor(train_x_transformed[i]).unsqueeze(0).cuda().float(), corrector=corrector))
    print("----------------")

Original:  She has been working with children in camp , community and school settings for the past 8 years . She believes in the importance of cultivating self - love and awareness in black children at a very young age and is excited to be apart of Black Lives Matter Toronto ’s Freedom School !
Transformed:  ['She has worked with children in the community for the past 8 years and cultivating a sense of self-importance. She believes that she is developing camp']
----------------
Original:  He holds a PhD in Biosystems Engineering from University College Dublin ( Ireland ) . He previously held academic positions at University College Dublin ( Ireland ) , Dublin Institute of Technology ( Ireland ) and Harper Adams University ( UK ) . He was a PhD research assistant at Teagasc Irish Agriculture and Food Development Authority . His current research focus is on PLF applications , focussing on real - time modelling and control of animal bio - responses . Tomas started as daily coordinator of 

KeyboardInterrupt: 

In [None]:
for i in range(len(bios_train_df)):
    bios_train_df.loc[i, 'transformed_hard_text'] = vec2text.invert_embeddings(torch.tensor(train_x_transformed[i]).unsqueeze(0).cuda().float(), corrector=corrector)
bios_train_df.to_csv('bios_data/bios_train_df.csv', index=False)

In [10]:
for i in range(30):
    print("Original: ", text_train[i])
    print("Transformed: ", vec2text.invert_embeddings(torch.tensor(train_x_transformed[i]).unsqueeze(0).cuda().float(), corrector=corrector))
    print("----------------")

Original:  She has been working with children in camp , community and school settings for the past 8 years . She believes in the importance of cultivating self - love and awareness in black children at a very young age and is excited to be apart of Black Lives Matter Toronto ’s Freedom School !
Transformed:  ['In the last 10 years, he has worked with a variety of camps and programs. He believes that he has grown in importance to cultivating']
----------------
Original:  He holds a PhD in Biosystems Engineering from University College Dublin ( Ireland ) . He previously held academic positions at University College Dublin ( Ireland ) , Dublin Institute of Technology ( Ireland ) and Harper Adams University ( UK ) . He was a PhD research assistant at Teagasc Irish Agriculture and Food Development Authority . His current research focus is on PLF applications , focussing on real - time modelling and control of animal bio - responses . Tomas started as daily coordinator of the EU - PLF projec

In [12]:
# sentences = ["Angelina, 31, is a very nice person. She is very kind and helpful.",
# "She is a devoted mother to three children, and works as a nurse.",
# "Jane likes to go to the park and play with her children.",
# "Patricia is waiting to find her true love.",
# "Anna is working as a administrator in a hospital.",
# "She decided to leave her career to become a stay-at-home mom.",
# "She always wanted to be a mother, to connect to her femininity, and is very happy with her decision.",
# "The common theme in her life is her love for children."]
# for sentence in sentences:
#     vec = encode(corrector, [sentence])[0]
#     vec += 3.0 * (mean_target - mean_source)
#     #vec = ot_linear.transform(Xs=vec)[0]
#     print("Original: ", sentence)
#     print("Counterfactual: ", vec2text.invert_embeddings(torch.tensor(vec).unsqueeze(0).cuda().float(), corrector=corrector))
#     print("----------------")


In [2]:
import pandas as pd
df = pd.read_csv('../bios_data/bios_train_df.csv')
df[['hard_text', 'transformed_hard_text']]

Unnamed: 0,hard_text,transformed_hard_text
0,"She has been working with children in camp , c...",years. He believes this is a great time to wor...
1,He holds a PhD in Biosystems Engineering from ...,as a PhD in Food Psychology at University Coll...
2,John served as the Department Chair of Foley ’...,John served as the Department Chair of Foley f...
3,She has more 20 years of experience in the fie...,has been a consultant for Spinotomy Neurosurge...
4,Over the years Ash has built an impressive fas...,she is currently presenting at events and fest...
...,...,...
255719,Specializing in curriculum and instructional s...,served as a community leadership specialist in...
255720,"She investigates basic cognitive processes , w...",interest in the study of cognitive processes. ...
255721,He has helped numerous patients in his 15 year...,has completed her career and has become a prac...
255722,He received his Ph.D. from INSEAD . He is inte...,She is interested in gaining her PhD from INSE...


In [3]:
import pandas as pd
df_w_20_steps = pd.read_csv(bios_train_bsw_2_n_steps_5_df.csv)
df_w_20_steps

FileNotFoundError: [Errno 2] No such file or directory: 'bios_train_bsw_2_n_steps_5_df.csv'

In [6]:
pd.set_option('display.max_colwidth', None)

In [7]:
df.loc[100, ['hard_text', 'transformed_hard_text']]

hard_text                Dr. Korendovych received his PhD from Tufts University for his work on oxygen activation . He then moved on to the University of Pennsylvania Medical School where he developed small protein catalysts . Since 2011 , he has taught at Syracuse University . His current research interests are focused on discovery of antimicrobial drugs as well as creating biosensors for metals . Dr. Korendovych is a recipient of the 2007 Young Investigator Award from the American Chemical Society in Inorganic Chemistry .
transformed_hard_text                                                                                                                                                                                                                                                                                                                                                                               earned PhD in Molecular Therapeutics. In recent years, Kordynov is involved in res

In [5]:
df.loc[100, ['hard_text', 'transformed_hard_text']]['hard_text'], df.loc[100, ['hard_text', 'transformed_hard_text']]['transformed_hard_text']

('Dr. Korendovych received his PhD from Tufts University for his work on oxygen activation . He then moved on to the University of Pennsylvania Medical School where he developed small protein catalysts . Since 2011 , he has taught at Syracuse University . His current research interests are focused on discovery of antimicrobial drugs as well as creating biosensors for metals . Dr. Korendovych is a recipient of the 2007 Young Investigator Award from the American Chemical Society in Inorganic Chemistry .',
 'earned PhD in Molecular Therapeutics. In recent years, Kordynov is involved in research in the area of neuroscience. In 2007, she was invited')

In [9]:
"""
This example expose the inherent shovinizem by the introduced by the model, even when the bias was supposed to be removed.
"""
i=101
df.loc[i]['hard_text'], df.loc[i]['transformed_hard_text']

('Most of her clients are women in their 30s , most of their problems start with a few glasses of wine after work and she says joking about over - drinking is a defence mechanism taken on by women concerned about their habit .',
 "30's, has said that drinking is a joke about women. He has described it as a defence mechanism against over-compensating problems")

In [6]:
import pandas as pd
df_w_20_steps = pd.read_csv('../vec2text_inter/bios_data/bios_train_bsw_4_n_steps_20_df.csv')
sample(df_w_20_steps)

(131932,
 'Kent began his employment in the Library of Congress in 1971 as an examiner in the Arts Section ( now the Visual Arts Division ) of the Copyright Office . In 1973 , Kent was promoted to staff attorney on the legal staff of the Examining Division . In 1977 , Kent ’s position was transferred to the General Counsel ’s Office in a reorganization of the Copyright Office . Kent served as Chief Negotiator of the Guild for more than 15 years , taking a lead position in bargaining collectively with management over master contract issues , and at mid - term . Currently , Kent is serving as a steward in Library Services II .',
 'took office in 1984, Kent was placed as an Assistant to the Chief Legal Adviser to the Office of the Chief Legal Adviser to the National')

In [16]:
sample(df_w_20_steps)

(33439,
 'Over the years , he has worked in C#.NET and .NET Web technologies . His area of expertise is in PDF and image processing . He is currently working on the multi - format document viewer and editor technologies for .NET .',
 'Since the late 2000s she has been working in the area of file sharing and document management. She is proficient in PDF and PDF/NET formats. She')

In [20]:
sample(df)

(116547,
 'His paintings have been exhibited at the Heckscher Museum , the Nassau County Museum of Art , The Delaware Art Museum , Old Westbury Gardens , the Firehouse Gallery , Katrina Rich Perlow Gallery NYC , the Salmagundi Club , the Sky Art Gallery NYC , Long Island Art Museum , and the Grenning Gallery in Sag Harbor , NY .',
 'She is in the Gettysburg County Museum of Art. Her shows include: Newark, KS Gallery, West 141st Street, Dahl')

In [16]:
df = pd.read_csv('../bios_data/bios_train_df.csv')
sample(df)

(167369,
 'He graduated with a Bachelor of Chemical Engineering and a PhD in Biology from NUS . He did his postdoctoral studies in Howard Hughes Medical Institute , University of Pennsylvania Medical School and returned to NUS as a Lee Kuan Yew Postdoctoral Fellow and later joined his alma mater department as an Asst Professor in 2010 . He has contributed to diverse topics spanning from chemical synthesis to sensor development ; nanosafety to nanomedicine topics .',
 'attended the University of Singapore as a nuclear pathologist after completing her M.Sc. in Health Science in 2010 and has been on faculty at')

In [2]:
import numpy as np
def sample(df):
    i = np.random.randint(0, df.shape[0])
    return i, df.loc[i]['hard_text'], df.loc[i]['transformed_hard_text']
sample(df)

NameError: name 'df' is not defined

In [14]:
import numpy as np
def sample(df):
    i = np.random.randint(0, df.shape[0])
    return i, df.loc[i]['hard_text'], df.loc[i]['transformed_hard_text']
sample(df) # Beautiful

(187857,
 'His writing -- available at jeffgissing.com-- focuses on the intersection of religion , theology , and culture . His work has been featured on RealClear religion , worldviewchurch.org , VirtueOnline , and layman , org .',
 'Her writing is available on www.jeffgiss.com, focusing on culture, religion, and the intersection of the rich and practical')

In [18]:
sample(df)

(197166,
 'In counseling private investment funds on their formation and investment activities she helps clients navigate the maze of laws and regulations when raising and deploying private capital . Elizabeth also has substantial experience on advising strategic and financial buyers , sellers and investors in a wide variety of transactions , including mergers , acquisitions , divestitures , joint ventures , minority investments and strategic partnerships in the U.S. and overseas . Elizabeth graduated from the University of Illinois College of Law ( JD ) and Truman State University ( BA in French ) . Elizabeth splits her time between Eagle County and Denver and is passionate about spending time with family and friends outside in the Colorado sunshine .',
 'has studied in securities markets and in private equity and investment banking. He has been a consultant to various state and local government agencies and organizations: “Co')

In [19]:
sample(df)

(239782,
 'He has covered cryptocurrency news for CoinDesk and VICE Motherboard , and contributed features on a range of topics to BuzzFeed , Ars Techica and the Atlantic ’s CityLab among others .',
 'She covered Bitcoin news for VegCourt and has contributed to various forums such as Mothers & Daughters, VEX, and Asia')

In [20]:
sample(df)

(169861,
 'He holds a master of fine arts from the Royal College of Art and a bachelor of fine arts from the Rhode Island School of Design . For the past 12 years he has led multidisciplinary teams at the IDEO Chicago office for companies in the consumer electronics , medical , and furniture industries .',
 "she holds a master's degree in industrial design from the Royal College of Art and Design, Chicago. Over the past couple of years she has continued")

In [21]:
sample(df)

(120266,
 "Keylon has more than 15 years of experience in science education and research at the undergraduate , graduate and post - doctoral level . He is an enthusiastic and engaging Professor who previously worked at SUNY Schenectady County Community College as the Coordinator of the College 's brand new Biotechnology Facility , and Faculty Member through the Internship Program for the Development of Minority Faculty . Keylon holds a Ph.D. and M.S. degree in Biomedical Sciences with an emphasis in Cell and Molecular Biology and Immunology from Albany Medical College and a B.S. in Biochemistry from Andrews University . Dr. Cheeseman 's currently conducting research mapping the ribosomal DNA ( rDNA ) gene of Chamaelirium luteum 's genome .",
 'Brooklyn, NY - 104-104. Keylon has been a Senior Academician of the Faculty of Education and Technology at the Faculty of')

In [22]:
sample(df)

(28685,
 'He studies the use of research , assessment , and evaluation to improve both classroom instruction and student outcomes . Specifically , he investigates value - added models , assessment validity , and the analysis of aggregated unit pre / post - assessments to study the impact of interventions on student subgroups .',
 'assessments. She evaluates research and development in the area of teacher evaluation, inclusion, and/or reinforcement of student outcomes, in addition to the use models')

In [7]:
sample(df)

NameError: name 'sample' is not defined

In [2]:
import pandas as pd
import numpy as np
def sample(df):
    i = np.random.randint(0, df.shape[0])
    return i, df.loc[i]['hard_text'], df.loc[i]['transformed_hard_text']
df_w_20_steps = pd.read_csv('../vec2text_inter/bios_data/bios_train_bsw_4_n_steps_20_df.csv')
sample(df_w_20_steps)

(184959,
 "His areas of expertise include the following : adrenalectomy ( adrenal surgery ) , laparoscopic sleeve gastrectomy , and metabolic syndrome . Dr. Kammerer honors United Healthcare Platinum , United Healthcare Navigate , Coventry , and more . He attended St. George 's University School of Medicine and then went on to complete his residency at Orlando Regional Medical Center . His professional affiliations include Methodist Hospital Division of Thomas Jefferson University Hospital and Thomas Jefferson University Hospital .",
 nan)

In [9]:
df_w_20_steps[~df_w_20_steps['transformed_hard_text'].isna()]

Unnamed: 0,g,p,text,start,hard_text,hard_text_untokenized,text_without_gender,transformed_hard_text
263,m,architect,Juancarlos Lazarte is an architect by professi...,49.0,His dedication to building design allowed him ...,His dedication to building design allowed him ...,_ dedication to building design allowed _ to w...,In dedication and design work allowed her to w...
295,f,professor,~Dr. Keri L. Colabroy is an Associate Professo...,83.0,She engages undergraduates in research in the ...,She engages undergraduates in research in the ...,_ engages undergraduates in research in the te...,He engages undergraduates in teaching research...
341,m,professor,Mark H. Dixon is an Associate Professor of Phi...,54.0,His primary research interests are in environm...,His primary research interests are in environm...,_ primary research interests are in environmen...,Her research interests are environmental philo...
490,m,professor,Prof. Fernando Rosario-Ortiz is an Associate P...,170.0,His research focuses on environmental photoche...,His research focuses on environmental photoche...,_ research focuses on environmental photochemi...,Her research focuses on the environmental impa...
520,f,journalist,Kalea Yoshida is a freelance journalist who ha...,185.0,She attained her undergraduate degree from the...,She attained her undergraduate degree from the...,_ attained _ undergraduate degree from the Uni...,He attained his undergraduate degree from the ...
...,...,...,...,...,...,...,...,...
254559,f,physician,Linda Biehl is a certified Physician Assistant...,135.0,She obtained a Bachelor of Science degree in P...,She obtained a Bachelor of Science degree in P...,_ obtained a Bachelor of Science degree in Phy...,He obtained a Bachelor of Science degree in Ph...
254600,m,professor,Dr Gorkem Altinors is an Assistant Professor a...,154.0,His research interests extend from internation...,His research interests extend from internation...,_ research interests extend from international...,She is currently working with several research...
254898,m,psychologist,Dr Tom Williamson is a psychologist who develo...,187.0,Since his retirement he has been made a senior...,Since his retirement he has been made a senior...,Since _ retirement _ has been made a senior re...,since her retirement she has been an adjunct l...
255143,f,journalist,Firdose Moonda is a journalist based in Cape T...,50.0,Most of her work is concerned with sports writ...,Most of her work is concerned with sports writ...,Most of _ work is concerned with sports writin...,Most of his work is concerned with culture but...


In [30]:
df_w_20_steps[~df_w_20_steps['transformed_hard_text'].isna()].to_csv('bios_train_bs__rs_20.csv')

In [14]:
n_samples = 30
for i in range(n_samples):
    sub_df = df_w_20_steps[~df_w_20_steps['transformed_hard_text'].isna()].reset_index().loc[i, ['hard_text', 'transformed_hard_text']]
    print(i)
    print(sub_df['hard_text'])
    print(sub_df['transformed_hard_text'])

0
His dedication to building design allowed him to work in a variety of commercial , institutional and residential projects in various parts of the world .
In dedication and design work allowed her to work in a variety of institutional projects around the world, including those involving the building's residential area.
1
She engages undergraduates in research in the teaching lab and through independent study investigating enzymes in bacterial natural product biosynthesis .
He engages undergraduates in teaching research in the lab through independent study in investigating enzymes in bacterial biosynthesis  natural product .
2
His primary research interests are in environmental philosophy and environmental ethics . He has published articles in environmental ethics and in the philosophy of architecture .
Her research interests are environmental philosophy and environmental ethics. She has published articles on environmental philosophy and ethics. She has also written articles on the pri

In [27]:
df_w_20_steps.loc[~df_w_20_steps['transformed_hard_text'].isna(), 'text_length'] = df_w_20_steps[~df_w_20_steps['transformed_hard_text'].isna()]['text'].apply(lambda x: len(x.split()))
df_w_20_steps[df_w_20_steps['text_length']<=32]

Unnamed: 0,g,p,text,start,hard_text,hard_text_untokenized,text_without_gender,transformed_hard_text,text_length
263,m,architect,Juancarlos Lazarte is an architect by professi...,49.0,His dedication to building design allowed him ...,His dedication to building design allowed him ...,_ dedication to building design allowed _ to w...,In dedication and design work allowed her to w...,31.0
623,f,attorney,"Kimberly L. Wallen is an attorney in Denver, C...",48.0,She focuses primarily on representing clients ...,She focuses primarily on representing clients ...,_ focuses primarily on representing clients in...,He focuses primarily on representing clients i...,29.0
786,m,professor,Dr. Katsuhiro Sano is an assistant professor a...,72.0,His research interests revolve around temporal...,His research interests revolve around temporal...,_ research interests revolve around temporal a...,Her research interests revolve around prehisto...,31.0
1542,m,professor,Faisal N. Abu-Khzam is an associate professor ...,66.0,"His research interests include graph theory , ...","His research interests include graph theory, c...","_ research interests include graph theory, com...",Her research interests include: combinatorial ...,27.0
3809,m,poet,Rajendra Anantrai Shukla is a Gujarati poet. H...,44.0,He taught at various places before voluntarily...,He taught at various places before voluntarily...,_ taught at various places before voluntarily ...,several poetry awards. She taught and publishe...,28.0
...,...,...,...,...,...,...,...,...,...
252117,m,photographer,Hemant Daiya is a professional photographer. H...,44.0,He is currently working for a reputed hospital...,He is currently working for a reputed hospital...,_ is currently working for a reputed hospital ...,She is currently working as a visual consultan...,29.0
252344,m,dentist,"Dr. David Ehrenfeld, DDS is a Dentist primaril...",74.0,His specialties include General Dentistry . Dr...,His specialties include General Dentistry. Dr....,_ specialties include General Dentistry. Dr. _...,is affiliated with the Philadelphia Department...,32.0
253256,f,paralegal,Francesca York is a paralegal at Ruchelman P.L...,51.0,She provides legal and administrative assistan...,She provides legal and administrative assistan...,_ provides legal and administrative assistance...,He provides legal and administrative support t...,29.0
253500,f,professor,Tatiana S. Manolova is an associate professor ...,70.0,Her research focuses on the competitive strate...,Her research focuses on the competitive strate...,_ research focuses on the competitive strategi...,His research focuses on the competitive strate...,29.0


In [28]:
df_w_20_steps[df_w_20_steps['text_length']<=32].shape

(167, 9)