In [1]:
import pickle
import os
import pandas as pd
import json
from tqdm import tqdm 
from datasets import load_dataset
import pickle
import numpy as np

In [2]:
BIOS_FILE_REVIEWED = "data/BIOS_REVIEWED.pkl" # only reviewed samples
BIOS_FILE_COMPLETE = "data/BIOS_COMPLETE.pkl" # whole dataset with changes to the reviewed samples
BIOS_LABELS = "data/BIOS_LABELS.csv" # this contains all of our changes without the sensitive data, but ids matching the BIOS dataset
BIOS_RAW = "../../data/biosbias2023/BIOS.pkl" # "data/BIOS.pkl"
TITLE_JSON = "data/title_lookup.json"

reviewed_classes = ['architect', 'surgeon', 'dentist', 'teacher', 'psychologist', 'nurse', 'photographer', 'physician', 'attorney', 'journalist']

In [3]:
with (open(BIOS_RAW, "rb")) as openfile:
    raw_data = pickle.load(openfile)

In [4]:
df_label = pd.read_csv(BIOS_LABELS, sep='\t')
labeled_data = df_label.to_dict('records')

  df_label = pd.read_csv(BIOS_LABELS, sep='\t')


In [5]:
for i in range(len(labeled_data)):
    if not (raw_data[i]['raw_title'] == labeled_data[i]['auto_raw_title'] and raw_data[i]['start_pos'] == labeled_data[i]['start_pos']):
        print("data mismatch at id ", i)
        
    labeled_data[i]['raw'] = raw_data[i]['raw']
    labeled_data[i]['bio'] = raw_data[i]['bio']
    labeled_data[i]['name'] = raw_data[i]['name']
    
    # convert titles from nan/string to list
    if type(labeled_data[i]['titles']) == float:
        labeled_data[i]['titles'] = []
    elif type(labeled_data[i]['titles']) == str:
        titles = labeled_data[i]['titles'].strip(']"\'[').split('\', \'')
        labeled_data[i]['titles'] = titles
        
    # convert raw titles fron nan/string to list
    if type(labeled_data[i]['raw_titles']) == float:
        labeled_data[i]['raw_titles'] = []
    elif type(labeled_data[i]['titles']) == str:
        titles = labeled_data[i]['raw_titles'].strip(']"\'[').split(', ')
        labeled_data[i]['raw_titles'] = titles

In [6]:
df = pd.DataFrame(data=labeled_data)
df_reviewed = df[df['review'] == 1]
reviewed_data = df_reviewed.to_dict('records')

In [7]:
with (open(BIOS_FILE_COMPLETE, "wb")) as openfile:
    pickle.dump(labeled_data, openfile)

with (open(BIOS_FILE_REVIEWED, "wb")) as openfile:
    pickle.dump(reviewed_data, openfile)

## Update: Prepare the reviewed data for merging with huggingface dataset

In [8]:
dataset = load_dataset("LabHC/bias_in_bios")

# labels in the order specified here: https://huggingface.co/datasets/LabHC/bias_in_bios
# not included in the datasets metadata
huggingface_label = ['accountant', 'architect', 'attorney', 'chiropractor', 'comedian', 'composer', 'dentist', 'dietitian', 'dj', 'filmmaker', 'interior_designer', 'journalist', 'model', 'nurse', 'painter', 'paralegal', 'pastor', 'personal_trainer', 'photographer', 'physician', 'poet', 'professor', 'psychologist', 'rapper', 'software_engineer', 'surgeon', 'teacher', 'yoga_teacher']
partitions = ['train', 'test', 'dev']

dataset['train'].info

Found cached dataset parquet (/home/saschroeder/.cache/huggingface/datasets/LabHC___parquet/LabHC--bias_in_bios-0590f29daf9e7342/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetInfo(description='', citation='', homepage='', license='', features={'hard_text': Value(dtype='string', id=None), 'profession': Value(dtype='int64', id=None), 'gender': Value(dtype='int64', id=None)}, post_processed=None, supervised_keys=None, task_templates=None, builder_name=None, config_name=None, version=None, splits={'train': SplitInfo(name='train', num_bytes=107552255, num_examples=257478, shard_lengths=None, dataset_name='parquet'), 'test': SplitInfo(name='test', num_bytes=41337024, num_examples=99069, shard_lengths=None, dataset_name='parquet'), 'dev': SplitInfo(name='dev', num_bytes=16514329, num_examples=39642, shard_lengths=None, dataset_name='parquet')}, download_checksums={'https://huggingface.co/datasets/LabHC/bias_in_bios/resolve/052f01de644dba841176e0449528b41f27d94a61/data/train-00000-of-00001-0ab65b32c47407e8.parquet': {'num_bytes': 64908645, 'checksum': None}, 'https://huggingface.co/datasets/LabHC/bias_in_bios/resolve/052f01de644dba841176e0449528b41f27d94a61/

In [9]:
for sample in labeled_data:
    sample['match_text'] = sample['raw'][sample['start_pos']:].strip()

In [10]:
MATCH_PKL = 'matches.pkl'
if not os.path.isfile(MATCH_PKL):
    matches = {part: [] for part in partitions}
    reverse_matches = [[] for sample in labeled_data]

    for part in partitions:
        matches[part] = []
        print(part)
        for i in tqdm(range(len(dataset[part]))):
            text = dataset[part][i]['hard_text']
            f = list(filter(lambda labeled_data: labeled_data['match_text'] == text, labeled_data))
            potential_matches = []
            for elem in f:
                if elem['auto_title'] == huggingface_label[dataset[part][i]['profession']]:
                    potential_matches.append(elem['Unnamed: 0'])
                    reverse_matches[elem['Unnamed: 0']].append((part,i))
            matches[part].append((i,potential_matches))

    with open(MATCH_PKL, 'wb') as handle:
        pickle.dump({'matches': matches, 'reverse_matches': reverse_matches}, handle)
    
else:
    print("load precomputed matches")
    with open(MATCH_PKL, 'rb') as handle:
        loaded_matches = pickle.load(handle)
        matches = loaded_matches['matches']
        reverse_matches = loaded_matches['reverse_matches']   

load precomputed matches


In [11]:
for part in partitions:
    print(part)
    count0 = 0
    count_double = 0

    for match in matches[part]:
        if match[1] == []:
            count0 += 1
        elif len(match[1]) > 1:
            count_double += 1

    print("got %i samples in total" % len(matches[part]))
    print("for %i samples found no match in the labeled data" % count0)
    print("for %i samples found multiple matches in the labeled data" % count_double)
    

count0 = 0
count_multi = 0
for match in reverse_matches:
    n_matches = len(match)
    if n_matches == 0:
        count0 += 1
    if n_matches > 1:
        count_multi += 1
        
print("based on the crawled dataset")
print("got %i samples in total" % len(reverse_matches))
print("for %i samples found no match in the labeled data" % count0)
print("for %i samples found multiple matches in the labeled data" % count_multi)

train
got 257478 samples in total
for 1197 samples found no match in the labeled data
for 933 samples found multiple matches in the labeled data
test
got 99069 samples in total
for 456 samples found no match in the labeled data
for 343 samples found multiple matches in the labeled data
dev
got 39642 samples in total
for 171 samples found no match in the labeled data
for 145 samples found multiple matches in the labeled data
based on the crawled dataset
got 397907 samples in total
for 3530 samples found no match in the labeled data
for 1412 samples found multiple matches in the labeled data


In [12]:
dataset_for_filter = []
for part in partitions:
    for i, sample in enumerate(dataset[part]):
        sample_for_filter = sample.copy()
        sample_for_filter.update({'split': part, 'id': i})
        dataset_for_filter.append(sample_for_filter)
        
dataset_for_filter[0]

{'hard_text': 'He is also the project lead of and major contributor to the open source assembler/simulator "EASy68K." He earned a master’s degree in computer science from the University of Michigan-Dearborn, where he is also an adjunct instructor. Downloads/Updates',
 'profession': 21,
 'gender': 0,
 'split': 'train',
 'id': 0}

In [13]:
#from unidecode import unidecode
#unidecode('kožušček')

In [14]:
merged_dataset = {part: [] for part in partitions}
dup_id_list = []

for part in partitions:
    print(part)
    for match in tqdm(matches[part]):
        idx = match[0]
        sample = dataset[part][idx]
        text = sample['hard_text']

        # duplicates in the huggingface dataset:
        f = list(filter(lambda dataset_for_filter: dataset_for_filter['hard_text'] == text, dataset_for_filter))
        sample.update({'text_duplicates': [(elem['split'], elem['id']) for elem in f if (elem['id'] != idx or elem['split'] != part)]})

        if len(match[1]) == 0:
            # keep old label
            sample.update({'titles_supervised': [], 'raw_titles_supervised': [], 'gender_supervised': '', 'auto_title': '', 'auto_raw_title': '', 'path': '', 'URI': '', 
                           'review': 0, 'valid': -1, 'style_valid': -1, 'raw': '', 'bio': '', 'name': ('',''), 'duplicates': []})
            merged_dataset[part].append(sample)
        elif len(match[1]) > 1:
            dup_id_list.append(match[0])
            duplicates = []
            for m_id in match[1]:
                for d_tup in reverse_matches[m_id]:
                    if d_tup[1] != idx and d_tup not in duplicates:
                        duplicates.append(d_tup)

            same_number_of_duplicates = (len(duplicates)+1 == len(match[1]))
            all_equal = True
            if not same_number_of_duplicates:
                base_sample = labeled_data[match[1][0]]
                #if len(duplicates) > 0 and len(match[1]) > 1:
                #    print("inconsistent number of duplicates: %i vs. %i" % (len(duplicates)+1, len(match[1])))
                #    print(idx, match[1][0], "base sample: ")
                #    print_sample(base_sample)
                for next_id in match[1][1:]:
                    comp_sample = labeled_data[next_id]
                    if not (comp_sample['titles'] == base_sample['titles'] and comp_sample['gender'] == base_sample['gender']):
                        all_equal = False
                assert all_equal, "inconsistent number of duplicates in both dataset versions"+str(match[1])+ " vs. " + str(duplicates)+" for idx "+str(idx)

                # take first possible match
                match_id = match[1][0]
                merge_sample = labeled_data[match_id]

                # assign other match ids to duplicates (1:1), handle inconsistent numbers of duplicates!
                for i, (d_split,d_idx) in enumerate(duplicates):
                    if i+1 >= len(matches[d_split][d_idx][1]):
                        i -= len(matches[d_split][d_idx][1])
                    matches[d_split][d_idx] = (matches[d_split][d_idx][0], [matches[d_split][d_idx][1][i+1]])
                    #print(d_split, d_idx, matches[d_split][d_idx][1], "->", [matches[d_split][d_idx][1][i]])
            else:
                # take first possible match
                match_id = match[1][0]
                merge_sample = labeled_data[match_id]

                # assign other match ids to duplicates (1:1)
                for i, (d_split,d_idx) in enumerate(duplicates):
                    matches[d_split][d_idx] = (matches[d_split][d_idx][0], [matches[d_split][d_idx][1][i+1]])
                    #print(matches[d_split][d_idx][1], "->", [matches[d_split][d_idx][1][i]])

            sample.update({'titles_supervised': merge_sample['titles'], 'raw_titles_supervised': merge_sample['raw_titles'], 'gender_supervised': merge_sample['gender'], 
                           'auto_title': merge_sample['auto_title'], 'auto_raw_title': merge_sample['auto_raw_title'],  'path': merge_sample['path'], 'URI': merge_sample['URI'], 
                           'review': merge_sample['review'], 'valid': merge_sample['valid'], 'style_valid': merge_sample['style_valid'], 'raw': merge_sample['raw'], 
                           'bio': merge_sample['bio'], 'name': merge_sample['name'], 'duplicates': duplicates})
            merged_dataset[part].append(sample)

        else:
            match_id = match[1][0]
            merge_sample = labeled_data[match_id]
            sample.update({'titles_supervised': merge_sample['titles'], 'raw_titles_supervised': merge_sample['raw_titles'], 'gender_supervised': merge_sample['gender'], 
                           'auto_title': merge_sample['auto_title'], 'auto_raw_title': merge_sample['auto_raw_title'],  'path': merge_sample['path'], 'URI': merge_sample['URI'], 
                           'review': merge_sample['review'], 'valid': merge_sample['valid'], 'style_valid': merge_sample['style_valid'], 'raw': merge_sample['raw'], 
                           'bio': merge_sample['bio'], 'name': merge_sample['name'], 'duplicates': []})
            merged_dataset[part].append(sample)



train


100%|███████████████████████████████████| 257478/257478 [3:04:58<00:00, 23.20it/s]


test


100%|█████████████████████████████████████| 99069/99069 [1:10:43<00:00, 23.35it/s]


dev


100%|███████████████████████████████████████| 39642/39642 [28:18<00:00, 23.34it/s]


In [15]:
for i, sample in enumerate(merged_dataset[part]):
    if len(sample['duplicates']) > 0:
        for (split, idx) in sample['duplicates']:
            if len(merged_dataset[split][idx]['duplicates']) == 0:
                dup = [tup for tup in sample['duplicates'] if not tup == (split,idx)]
                dup.append((part, i))
                merged_dataset[split][idx]['duplicates'] = dup

In [16]:
with open('data/huggingface_merge_complete.pkl', 'wb') as handle:
    pickle.dump(merged_dataset, handle)

In [17]:
public_data = {part: [] for part in partitions}
keys_to_copy = ['text_duplicates', 'titles_supervised', 'raw_titles_supervised', 'gender_supervised', 'auto_title', 'auto_raw_title', 'review', 'valid', 'style_valid', 'duplicates']

for part in partitions:
    for entry in merged_dataset[part]:
        reduced_entry = {k: entry[k] for k in keys_to_copy}
        public_data[part].append(reduced_entry)
        
with open('data/huggingface_patch.pkl', 'wb') as handle:
    pickle.dump(public_data, handle)

## TODO: remove all this??

In [142]:
merge_label = huggingface_label
for key, split in merged_dataset.items():
    for sample in split:
        for title in sample['titles_supervised']:
            if title not in merge_label:
                merge_label.append(title)
                
merge_label.remove('')
for title in merge_label:
    title = title.replace('_',' ')
    
merge_label = list(set(merge_label))

In [143]:
merge_label

['chiropractor',
 'journalist',
 'rapper',
 'accountant',
 'poet',
 'surgeon',
 '*director/producer',
 'architect',
 'dentist',
 'photographer',
 '*software architect',
 '*editor',
 'professor',
 '*actor',
 'model',
 'composer',
 '*artist/ designer',
 'comedian',
 'painter',
 'yoga teacher',
 '*entrepeneur',
 'psychologist',
 '*writer',
 'personal trainer',
 'nurse',
 'software_engineer',
 'filmmaker',
 '*trainer',
 '*researcher',
 'dj',
 'interior_designer',
 'dietitian',
 '*consultant/coaches',
 'yoga_teacher',
 'attorney',
 'teacher',
 'software engineer',
 'pastor',
 'physician',
 'interior designer',
 'paralegal',
 '*engineer',
 'personal_trainer']

In [186]:
for elem in merged_dataset['train']:
    if elem['review'] == 1 and elem['valid'] == 0:
        for k, v in elem.items():
            print(k,v)
        break

hard_text She specialises in helping individuals manage their stress and anxiety and has developed and facilitated many groups on relaxation, stress and anxiety management. Katerina Volny BSc is a psychologist who has worked in public mental health and private practice settings in Melbourne, Australia. She is experienced in cognitive-behavioural techniques to treat a wide variety of individuals who experience stress and anxiety.
profession 22
gender 1
text_duplicates []
titles_supervised []
raw_titles_supervised []
gender_supervised F
auto_title psychologist
auto_raw_title psychologist
path crawl-data/CC-MAIN-2016-44/segments/1476988719677.59/wet/CC-MAIN-20161020183839-00202-ip-10-171-6-4.ec2.internal.warc.wet.gz
URI http://www.boomerangbooks.com.au/Relaxation-Techniques/Lillian-Nejad/book_9781845900786.htm
review 1
valid 0
style_valid -1.0
raw Lillian Nejad PhD is a clinical psychologist who has been working in the public mental health and community health systems in Melbourne, Austra

In [206]:
mini_dataset = {key: [] for key in merged_dataset.keys()}

for key, split in merged_dataset.items():
    mini_dataset[key] = {k: [] for k in split[0].keys() if k not in ['duplicates','text_duplicates', 'raw_titles_supervised']}
    for elem in merged_dataset[key]:
        if elem['review'] == 0:
            elem['titles_supervised'] = [elem['auto_title'].replace('_',' ')]
        #    elem['raw_titles_supervised'] = [elem['auto_raw_title']]
        #if elem['review'] == 1:
        for k, v in elem.items():
            if k in ['duplicates','text_duplicates', 'raw_titles_supervised']:#, 'raw_titles_supervised', 'titles_supervised']:
                continue
            if k in ['auto_title', 'auto_raw_title']:
                mini_dataset[key][k].append(v.replace('_',' '))
            else:
                mini_dataset[key][k].append(v)

In [207]:
print(len(mini_dataset['train']))
print(len(mini_dataset['test']))
print(len(mini_dataset['dev']))

15
15
15


In [208]:
from datasets import Dataset, DatasetDict, DatasetInfo
info = DatasetInfo({'label': merge_label, 'features': mini_dataset['train'].keys(), 'num_rows': len(mini_dataset['train']['hard_text'])})

In [209]:
from datasets import Dataset, DatasetDict, DatasetInfo

mdata_train = Dataset.from_dict(mini_dataset['train'], split='train')
mdata_test = Dataset.from_dict(mini_dataset['test'], split='test')
mdata_dev = Dataset.from_dict(mini_dataset['dev'], split='dev')


In [210]:
mdata = DatasetDict({'train': mdata_train, 'test': mdata_test, 'dev': mdata_dev})

In [211]:
mdata

DatasetDict({
    train: Dataset({
        features: ['hard_text', 'profession', 'gender', 'titles_supervised', 'gender_supervised', 'auto_title', 'auto_raw_title', 'path', 'URI', 'review', 'valid', 'style_valid', 'raw', 'bio', 'name'],
        num_rows: 257478
    })
    test: Dataset({
        features: ['hard_text', 'profession', 'gender', 'titles_supervised', 'gender_supervised', 'auto_title', 'auto_raw_title', 'path', 'URI', 'review', 'valid', 'style_valid', 'raw', 'bio', 'name'],
        num_rows: 99069
    })
    dev: Dataset({
        features: ['hard_text', 'profession', 'gender', 'titles_supervised', 'gender_supervised', 'auto_title', 'auto_raw_title', 'path', 'URI', 'review', 'valid', 'style_valid', 'raw', 'bio', 'name'],
        num_rows: 39642
    })
})

In [212]:
for split in ['train', 'test', 'dev']:
    mdata[split].to_csv(split+'.csv')

Creating CSV from Arrow format:   0%|          | 0/258 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/100 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/40 [00:00<?, ?ba/s]

In [213]:
mdata = {}
for split in ['train', 'test', 'dev']:
    mdata[split] = Dataset.from_csv(split+'.csv')
    
dataset = DatasetDict(mdata)

Downloading and preparing dataset csv/default to /home/saschroeder/.cache/huggingface/datasets/csv/default-2ae8139e51537722/0.0.0...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /home/saschroeder/.cache/huggingface/datasets/csv/default-2ae8139e51537722/0.0.0. Subsequent calls will reuse this data.
Downloading and preparing dataset csv/default to /home/saschroeder/.cache/huggingface/datasets/csv/default-1ee6525d50e0d906/0.0.0...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /home/saschroeder/.cache/huggingface/datasets/csv/default-1ee6525d50e0d906/0.0.0. Subsequent calls will reuse this data.
Downloading and preparing dataset csv/default to /home/saschroeder/.cache/huggingface/datasets/csv/default-c2edacc0f48fec01/0.0.0...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /home/saschroeder/.cache/huggingface/datasets/csv/default-c2edacc0f48fec01/0.0.0. Subsequent calls will reuse this data.


In [214]:
dataset

DatasetDict({
    train: Dataset({
        features: ['hard_text', 'profession', 'gender', 'titles_supervised', 'gender_supervised', 'auto_title', 'auto_raw_title', 'path', 'URI', 'review', 'valid', 'style_valid', 'raw', 'bio', 'name'],
        num_rows: 257483
    })
    test: Dataset({
        features: ['hard_text', 'profession', 'gender', 'titles_supervised', 'gender_supervised', 'auto_title', 'auto_raw_title', 'path', 'URI', 'review', 'valid', 'style_valid', 'raw', 'bio', 'name'],
        num_rows: 99070
    })
    dev: Dataset({
        features: ['hard_text', 'profession', 'gender', 'titles_supervised', 'gender_supervised', 'auto_title', 'auto_raw_title', 'path', 'URI', 'review', 'valid', 'style_valid', 'raw', 'bio', 'name'],
        num_rows: 39642
    })
})

In [215]:
dataset['train'][0]

{'hard_text': 'He is also the project lead of and major contributor to the open source assembler/simulator "EASy68K." He earned a master’s degree in computer science from the University of Michigan-Dearborn, where he is also an adjunct instructor. Downloads/Updates',
 'profession': 21,
 'gender': 0,
 'titles_supervised': "['professor']",
 'gender_supervised': 'M',
 'auto_title': 'professor',
 'auto_raw_title': 'associate professor',
 'path': 'crawl-data/CC-MAIN-2016-44/segments/1476988720615.90/wet/CC-MAIN-20161020183840-00099-ip-10-171-6-4.ec2.internal.warc.wet.gz',
 'URI': 'https://www.crcpress.com/Programming-2D-Games/Kelly/p/book/9781466508682',
 'review': 0,
 'valid': -1,
 'style_valid': -1.0,
 'raw': 'Charles Kelly is an associate professor at Monroe County Community College, where he teaches game programming and other computer science courses. He is also the project lead of and major contributor to the open source assembler/simulator "EASy68K." He earned a master’s degree in com

In [216]:


mdata = {}
for split in ['train', 'test', 'dev']:
    mdata[split] = Dataset.from_csv(split+'.csv')
    
dataset = DatasetDict(mdata)

Found cached dataset csv (/home/saschroeder/.cache/huggingface/datasets/csv/default-2ae8139e51537722/0.0.0)
Found cached dataset csv (/home/saschroeder/.cache/huggingface/datasets/csv/default-1ee6525d50e0d906/0.0.0)
Found cached dataset csv (/home/saschroeder/.cache/huggingface/datasets/csv/default-c2edacc0f48fec01/0.0.0)


In [227]:
dataset

DatasetDict({
    train: Dataset({
        features: ['hard_text', 'profession', 'gender', 'titles_supervised', 'gender_supervised', 'auto_title', 'auto_raw_title', 'path', 'URI', 'review', 'valid', 'style_valid', 'raw', 'bio', 'name'],
        num_rows: 257483
    })
    test: Dataset({
        features: ['hard_text', 'profession', 'gender', 'titles_supervised', 'gender_supervised', 'auto_title', 'auto_raw_title', 'path', 'URI', 'review', 'valid', 'style_valid', 'raw', 'bio', 'name'],
        num_rows: 99070
    })
    dev: Dataset({
        features: ['hard_text', 'profession', 'gender', 'titles_supervised', 'gender_supervised', 'auto_title', 'auto_raw_title', 'path', 'URI', 'review', 'valid', 'style_valid', 'raw', 'bio', 'name'],
        num_rows: 39642
    })
})

In [232]:
for elem in dataset['train']:
    if elem['titles_supervised'] is None:
        print(elem)

{'hard_text': ' She has already put up more than three years in the field of teaching and has remained a hard working and outstanding student through out her career without any external support or help of any sort and secured first Division in all her career pursuits. She has respect for elders and a strong faith in  traditional values of the family. ', 'profession': None, 'gender': None, 'titles_supervised': None, 'gender_supervised': None, 'auto_title': None, 'auto_raw_title': None, 'path': None, 'URI': None, 'review': None, 'valid': None, 'style_valid': None, 'raw': None, 'bio': None, 'name': None}
{'hard_text': 'She is desperate to escape the from the small Australian town in which she lives. Rejection after rejection mean she is stuck in what she sees as a dead-end waitressing job. ', 'profession': None, 'gender': None, 'titles_supervised': None, 'gender_supervised': None, 'auto_title': None, 'auto_raw_title': None, 'path': None, 'URI': None, 'review': None, 'valid': None, 'style_

## TODO:
- (x) save pkl with all info
- (x) save pkl with additional info for huggingface (without privacy/copyrigth issues)
- code example to transform to filter and transform to huggingface dataset object

In [244]:
with open('merged.pkl', 'rb') as handle:
    merged_dataset = pickle.load(handle)

In [267]:
from datasets import Dataset, DatasetDict

KEYS_TO_COPY = ['hard_text', 'profession', 'gender', 'raw', 'titles_supervised', 'review', 'valid', 'name']

def filter_dataset(dataset: dict, classes: list, single_label=True, review_only=True, valid_only=True):
    splits = dataset.keys()
    split_dict = {}
    filtered_dataset = {split: [] for split in splits}
    for split in splits:
        for elem in dataset[split]:
            if valid_only and elem['valid'] != 1:
                continue
            if review_only and elem['review'] != 1:
                continue
            sel_titles = [title for title in elem['titles_supervised'] if title in classes]
            if single_label and len(sel_titles) > 1:
                continue
            if len(sel_titles) == 0:
                continue
                
            new_entry = {k: elem[k] for k in KEYS_TO_COPY}
            if single_label:
                label = classes.index(sel_titles[0])
            else: # multi-label / one-hot encoded
                label = np.ones(len(classes))
                for title in sel_titles:
                    label[classes.index(title)] = 1
            new_entry.update({'label': label})
            filtered_dataset[split].append(new_entry)
        print(len(filtered_dataset[split]))
        
        cur_split = {k: [elem[k] for elem in filtered_dataset[split]] for k in filtered_dataset[split][0].keys()}
        split_dict[split] = Dataset.from_dict(cur_split, split=split)
    return DatasetDict(split_dict)

In [270]:
classes = ['architect', 'surgeon', 'dentist', 'teacher', 'psychologist', 'nurse', 'photographer', 'physician', 'attorney', 'journalist']

fdata = filter_dataset(merged_dataset, classes, True, True, True)

6855
2430
1024


In [261]:
fdata

DatasetDict({
    train: Dataset({
        features: ['hard_text', 'profession', 'gender', 'raw', 'titles_supervised', 'review', 'valid', 'name', 'label'],
        num_rows: 7017
    })
    test: Dataset({
        features: ['hard_text', 'profession', 'gender', 'raw', 'titles_supervised', 'review', 'valid', 'name', 'label'],
        num_rows: 2500
    })
    dev: Dataset({
        features: ['hard_text', 'profession', 'gender', 'raw', 'titles_supervised', 'review', 'valid', 'name', 'label'],
        num_rows: 1046
    })
})

In [262]:
fdata['train'][:2]

{'hard_text': ['She graduated in May 2008 from Corban College with a Bachelor of Science in English-Journalism. She currently works as the Office Administrator at her church. Shawnee has been working along side her husband in youth ministry for four and a half years.',
  "Alderfer categorized the lower order needs (Physiological and Safety) into the Existence category. He fit Maslow's interpersonal love and esteem needs into the relatedness category. The growth category contained the Self Actualization and self esteem needs. Alderfer also proposed a regression theory to go along with the ERG theory. He said that when needs in a higher category are not met then individuals redouble the efforts invested in a lower category need. For example if self actualization or self esteem is not met then individuals will invest more effort in the relatedness category in the hopes of achieving the higher need."],
 'profession': [11, 22],
 'gender': [1, 0],
 'raw': ['Shawnee Randolph is a freelance jo