In [38]:
import json
import numpy as np
import itertools

In [17]:
# Some useful functions
def write_json(file_path, data):
    with open(file_path, 'w') as fp:
        json.dump(data, fp)
    
def load_json(file_path):
    with open(file_path, 'r') as fp:
        data = json.load(fp)
    return data

# 1. Load Descriptor

In [11]:
descriptor_path = "../data/ResponsibleNLP/holistic_bias/dataset/descriptors.json"
holistic_descriptors = load_json(descriptor_path)

In [14]:
# select age, gender, race adescriptors
keys = ['age', 'race_ethnicity', 'gender_and_sex']
extracted_descriptors = {}

extracted_descriptors[keys[0]] = holistic_descriptors[keys[0]]
extracted_descriptors[keys[1]] = holistic_descriptors[keys[1]]
extracted_descriptors[keys[2]] = holistic_descriptors[keys[2]]

# 2. Save 

## 2.1 Age mask descriptor 

In [18]:
age_descriptor_path = '../data/mask_token/age.json'
write_json(age_descriptor_path, extracted_descriptors['age'])

## 2.2 Race Mask Descriptor

In [22]:
race_descriptors_ori = extracted_descriptors[keys[1]]
race_descriptors_ori

{'alaska_native': ['Alaska Native'],
 'asian': [{'descriptor': 'Asian', 'preference': 'reviewed'},
  {'descriptor': 'Asian-American', 'preference': 'reviewed'},
  {'descriptor': 'Desi', 'preference': 'reviewed'},
  {'descriptor': 'East Asian', 'preference': 'reviewed'},
  {'descriptor': 'South Asian', 'preference': 'reviewed'},
  {'descriptor': 'Southeast Asian', 'preference': 'reviewed'}],
 'black': [{'descriptor': 'African', 'preference': 'reviewed'},
  {'descriptor': 'African-American', 'preference': 'reviewed'},
  {'descriptor': 'Black', 'preference': 'reviewed'}],
 'latinx': [{'descriptor': 'Hispanic', 'preference': 'reviewed'},
  {'descriptor': 'Latinx', 'preference': 'polarizing'},
  {'descriptor': 'Latine', 'preference': 'polarizing'},
  {'descriptor': 'Latina', 'gender': 'female', 'preference': 'reviewed'},
  {'descriptor': 'Latino', 'gender': 'male', 'preference': 'reviewed'},
  {'descriptor': 'Latin American', 'preference': 'reviewed'}],
 'indigenous': ['Aboriginal',
  'Indi

In [25]:
race_descriptors_new = {'alaska_native': ['alaska native'], 
                        'asian': ['asian', 'asian-american', 'desi', 'east asian', 'south asian', 'southeast asian'],
                        'black': ['african', 'african-american', 'black'],
                        'latinx': ['hispanic', 'latinx', 'latine', 'latina', 'latino', 'latin american'],
                        'indigenous':['aboriginal','indigenous','native american', 'american indian'],
                        'native_hawaiian': ['native hawaiian'],
                        'pacific_islander': ['pacific islander'],
                        'white': ['european', 'european-american', 'middle eastern', 'white', 'caucasian', 'arabic'],
                        'combined': ['aapi', 'bipoc']
                       }

In [None]:
race_descriptor_path = '../data/mask_token/race.json'
write_json(race_descriptor_path, race_descriptors_new)

## 2.3 Gender Mask Descriptor

In [None]:
gender_descriptor_path = '../data/mask_token/gender.json'

In [27]:
extracted_descriptors[keys[2]]

{'binary': ['female', 'male', 'gender-conforming'],
 'cisgender': ['cis', 'cisgender'],
 'descriptors': ['AFAB',
  'AMAB',
  'androgynous',
  'butch',
  'effeminate',
  'feminine',
  'femme',
  'manly',
  'masculine',
  'womanly'],
 'non_binary_or_gnc': ['agender',
  'androgyne',
  'bigender',
  'enby',
  'gender neutral',
  'gender non-conforming',
  'genderfluid',
  'genderless',
  'genderqueer',
  'neutrois',
  'nonbinary',
  'non-binary',
  'two-spirit'],
 'queer': ['queer'],
 'sex': ['hermaphrodite', 'intersex', 'endosex', 'transsexual'],
 'transgender': ['FTM',
  'F2M',
  'MTF',
  'M2F',
  'trans',
  'trans female',
  'trans feminine',
  'trans fem',
  'trans femme',
  'trans male',
  'trans masculine',
  'trans masc',
  'transgender']}

In [28]:
# Here we just extract male and female words 
descriptor_gender_path = "../data/ResponsibleNLP/holistic_bias/dataset/nouns.json"
holistic_gender_descriptors = load_json(descriptor_gender_path)

In [29]:
holistic_gender_descriptors

{'female': [['woman', 'women'],
  ['lady', 'ladies'],
  ['girl', 'girls'],
  ['mother', 'mothers'],
  ['mom', 'moms'],
  ['daughter', 'daughters'],
  ['wife', 'wives'],
  ['grandmother', 'grandmothers'],
  ['grandma', 'grandmas'],
  ['sister', 'sisters']],
 'male': [['man', 'men'],
  ['bro', 'bros'],
  ['guy', 'guys'],
  ['boy', 'boys'],
  ['father', 'fathers'],
  ['dad', 'dads'],
  ['son', 'sons'],
  ['husband', 'husbands'],
  ['grandfather', 'grandfathers'],
  ['grandpa', 'grandpas'],
  ['brother', 'brothers']],
 'neutral': [['individual', 'individuals'],
  ['person', 'people'],
  ['kid', 'kids'],
  ['parent', 'parents'],
  ['child', 'children'],
  ['spouse', 'spouses'],
  ['grandparent', 'grandparents'],
  ['sibling', 'siblings'],
  ['veteran', 'veterans']]}

In [35]:
gender_descriptor_new = {}
gender_descriptor_new['female'] = list(itertools.chain(*holistic_gender_descriptors['female']))
gender_descriptor_new['male'] = list(itertools.chain(*holistic_gender_descriptors['male']))

In [36]:
gender_descriptor_new

{'female': ['woman',
  'women',
  'lady',
  'ladies',
  'girl',
  'girls',
  'mother',
  'mothers',
  'mom',
  'moms',
  'daughter',
  'daughters',
  'wife',
  'wives',
  'grandmother',
  'grandmothers',
  'grandma',
  'grandmas',
  'sister',
  'sisters'],
 'male': ['man',
  'men',
  'bro',
  'bros',
  'guy',
  'guys',
  'boy',
  'boys',
  'father',
  'fathers',
  'dad',
  'dads',
  'son',
  'sons',
  'husband',
  'husbands',
  'grandfather',
  'grandfathers',
  'grandpa',
  'grandpas',
  'brother',
  'brothers']}

#### Merge Gender sensitive word to our descriptors 

https://github.com/navid-rekabsaz/GenderBias_IR/blob/master/resources/wordlist_genderspecific.txt 

In [43]:
male_list = ['boy',
'boys',
'brother',
'brothers',
'dad',
'dads',
'father',
'fathers',
'fiance',
'gentleman',
'gentlemen',
'godfather',
'grandfather',
'grandpa',
'grandson',
'grandsons',
'guy',
'he',
'him',
'himself',
'his',
'lad',
'lads',
'male',
'males',
'man',
'men',
'sir',
'son',
'sons',
'stepfather',
'stepson']

In [44]:
female_list = [
'daughter',
'daughters',
'female',
'females',
'fiancee',
'gal',
'gals',
'girl',
'girls',
'granddaughter',
'granddaughters',
'grandma',
'grandmother',
'grandmothers',
'her',
'hers',
'herself',
'lady',
'madam',
'mama',
'mom',
'mommy',
'moms',
'mother',
'mothers',
'she',
'sister',
'sisters',
'stepmother',
'stepdaughter',
'woman',
'women'
]

In [47]:
gender_descriptor_new['female'].extend(female_list)
# remove duplicate
gender_descriptor_new['female'] = list(set(gender_descriptor_new['female']))

['daughters',
 'ladies',
 'granddaughter',
 'fiancee',
 'female',
 'mama',
 'stepdaughter',
 'her',
 'moms',
 'daughter',
 'grandma',
 'herself',
 'girl',
 'mothers',
 'grandmother',
 'grandmothers',
 'stepmother',
 'women',
 'girls',
 'mom',
 'hers',
 'granddaughters',
 'wife',
 'mommy',
 'sister',
 'lady',
 'woman',
 'sisters',
 'gals',
 'wives',
 'she',
 'mother',
 'madam',
 'grandmas',
 'females',
 'gal']

In [49]:
gender_descriptor_new['male'].extend(male_list)
# remove duplicate
gender_descriptor_new['male'] = list(set(gender_descriptor_new['male']))

['father',
 'guy',
 'stepfather',
 'son',
 'bro',
 'grandfather',
 'lads',
 'dad',
 'bros',
 'males',
 'gentleman',
 'brothers',
 'men',
 'husband',
 'boys',
 'man',
 'fathers',
 'boy',
 'dads',
 'brother',
 'male',
 'himself',
 'sons',
 'grandpas',
 'guys',
 'grandsons',
 'husbands',
 'stepson',
 'fiance',
 'grandfathers',
 'his',
 'he',
 'sir',
 'grandpa',
 'gentlemen',
 'godfather',
 'him',
 'grandson',
 'lad']

In [None]:
write_json('../data/mask_token/gender.json', gender_descriptor_new)

# 3. Merge all sensitive words to descriptors

In [55]:
descriptors = []
for key, val in race_descriptors_new.items():
    descriptors.extend(val)

for key, val in extracted_descriptors['age'].items():
    descriptors.extend(val)

for key, val in gender_descriptor_new.items():
    descriptors.extend(val)

In [58]:
descriptors

['alaska native',
 'asian',
 'asian-american',
 'desi',
 'east asian',
 'south asian',
 'southeast asian',
 'african',
 'african-american',
 'black',
 'hispanic',
 'latinx',
 'latine',
 'latina',
 'latino',
 'latin american',
 'aboriginal',
 'indigenous',
 'native american',
 'american indian',
 'native hawaiian',
 'pacific islander',
 'european',
 'european-american',
 'middle eastern',
 'white',
 'caucasian',
 'arabic',
 'aapi',
 'bipoc',
 'adolescent',
 'teen',
 'teenage',
 'teenaged',
 'young',
 'younger',
 'twenty-year-old',
 '20-year-old',
 'twenty-five-year-old',
 '25-year-old',
 'thirty-year-old',
 '30-year-old',
 'thirty-five-year-old',
 '35-year-old',
 'forty-year-old',
 '40-year-old',
 'twenty-something',
 'thirty-something',
 'forty-five-year-old',
 '45-year-old',
 'fifty-year-old',
 '50-year-old',
 'fifty-five-year-old',
 '55-year-old',
 'sixty-year-old',
 '60-year-old',
 'forty-something',
 'fifty-something',
 'sixty-something',
 'middle-aged',
 'sixty-five-year-old',
 '6

In [None]:
write_json('../data/mask_token/mask_all.json', descriptors)

# 4. Create each word's label

In [56]:
word_label = {}
for key, vals in race_descriptors_new.items():
    for val in vals:
        word_label[val] = key
        
for key, vals in extracted_descriptors['age'].items():
    for val in vals:
        word_label[val] = key
        
for key, vals in gender_descriptor_new.items():
    for val in vals:
        word_label[val] = key

In [57]:
write_json('../data/mask_token/word_label.json', word_label)

{'alaska native': 'alaska_native',
 'asian': 'asian',
 'asian-american': 'asian',
 'desi': 'asian',
 'east asian': 'asian',
 'south asian': 'asian',
 'southeast asian': 'asian',
 'african': 'black',
 'african-american': 'black',
 'black': 'black',
 'hispanic': 'latinx',
 'latinx': 'latinx',
 'latine': 'latinx',
 'latina': 'latinx',
 'latino': 'latinx',
 'latin american': 'latinx',
 'aboriginal': 'indigenous',
 'indigenous': 'indigenous',
 'native american': 'indigenous',
 'american indian': 'indigenous',
 'native hawaiian': 'native_hawaiian',
 'pacific islander': 'pacific_islander',
 'european': 'white',
 'european-american': 'white',
 'middle eastern': 'white',
 'white': 'white',
 'caucasian': 'white',
 'arabic': 'white',
 'aapi': 'combined',
 'bipoc': 'combined',
 'adolescent': 'child',
 'teen': 'child',
 'teenage': 'child',
 'teenaged': 'child',
 'young': 'young',
 'younger': 'young',
 'twenty-year-old': 'young',
 '20-year-old': 'young',
 'twenty-five-year-old': 'young',
 '25-year-o

# 5. Save all labels

In [65]:
all_labels = list(race_descriptors_new.keys()) + list(extracted_descriptors['age'].keys()) + list(gender_descriptor_new.keys())
all_labels

['alaska_native',
 'asian',
 'black',
 'latinx',
 'indigenous',
 'native_hawaiian',
 'pacific_islander',
 'white',
 'combined',
 'child',
 'young',
 'middle_aged',
 'old',
 'adult',
 'female',
 'male']

In [None]:
write_json('../data/mask_token/categories.json', all_labels)