In [None]:
import pickle
import os
from tqdm import tqdm 
from datasets import load_dataset
import pickle
import numpy as np
from datasets import Dataset, DatasetDict

In [None]:
# Switch between the two data sources used below:
#   False -> load "LabHC/bias_in_bios" and merge it with the review patch ('data/huggingface_patch.pkl')
#   True  -> load the complete merged dataset from 'data/huggingface_merge_complete.pkl' instead
FULL_MERGE_AVAILABLE = False

## Patch for Huggingface Dataset

This adds our review results to the Hugging Face dataset "LabHC/bias_in_bios", available at https://huggingface.co/datasets/LabHC/bias_in_bios.

In [None]:
if not FULL_MERGE_AVAILABLE:
    # Load the public dataset that the review patch is applied to.
    dataset = load_dataset("LabHC/bias_in_bios")

    # labels in the order specified here: https://huggingface.co/datasets/LabHC/bias_in_bios
    # not included in the datasets metadata
    huggingface_label = ['accountant', 'architect', 'attorney', 'chiropractor', 'comedian', 'composer', 'dentist', 'dietitian', 'dj', 'filmmaker', 'interior_designer', 'journalist', 'model', 'nurse', 'painter', 'paralegal', 'pastor', 'personal_trainer', 'photographer', 'physician', 'poet', 'professor', 'psychologist', 'rapper', 'software_engineer', 'surgeon', 'teacher', 'yoga_teacher']
    partitions = ['train', 'test', 'dev']

    # A bare expression nested inside an `if` body is never rendered as cell output,
    # so the metadata has to be printed explicitly to be visible.
    print(dataset['train'].info)

    # load the additional info
    # NOTE(review): pickle.load can execute arbitrary code - only use a patch file from a trusted source.
    with open('data/huggingface_patch.pkl', 'rb') as handle:
        review_data = pickle.load(handle)

In [None]:
if not FULL_MERGE_AVAILABLE:
    # Merge the review patch into the dataset samples. Patch entries are matched to
    # samples by position: review_data[part][i] belongs to the i-th sample of dataset[part].
    merged_dataset = {part: [] for part in partitions}
    dup_id_list = []  # not filled by this cell; retained in case other cells read it

    for part in partitions:
        # Wrap the dataset itself (not the enumerate generator) so tqdm knows the total
        # and can show real progress; `desc` labels the bar with the partition name.
        for i, sample in enumerate(tqdm(dataset[part], desc=part)):
            patch = review_data[part][i]
            new_sample = sample.copy()

            if patch['review'] == 1:
                # Reviewed samples take the gender from the patch. This must be written to
                # the copy that is stored; writing to `sample` would silently drop the value.
                new_sample['gender'] = patch['gender_supervised']
            new_sample.update({'titles_supervised': patch['titles_supervised'], 'review': patch['review'], 'valid': patch['valid']})
            merged_dataset[part].append(new_sample)

## Load complete merged dataset if available

In [None]:
if FULL_MERGE_AVAILABLE:
    # The complete merge stands in for the patch-based merge, so it has to be bound to
    # `merged_dataset` (the name the filtering step reads), not to `review_data`.
    # NOTE(review): pickle.load can execute arbitrary code - only use a file from a trusted source.
    with open('data/huggingface_merge_complete.pkl', 'rb') as handle:
        merged_dataset = pickle.load(handle)

## Filter the dataset and convert to huggingface dataset

In [None]:
# select the fields to keep from the dataset (if non-reviewed samples are used, some fields may not be set for those not included in the review!)
KEYS_TO_COPY = ['hard_text', 'profession', 'gender', 'raw', 'titles_supervised', 'review', 'valid', 'name']
# subset of titles to keep; passed as `classes` to filter_dataset, where the position in this list becomes the label index
CLASSES = ['architect', 'surgeon', 'dentist', 'teacher', 'psychologist', 'nurse', 'photographer', 'physician', 'attorney', 'journalist']

# NOTE(review): this load replaces any `merged_dataset` built by the patch-merge cell, and 'merged.pkl' is not
# written anywhere in the visible notebook - confirm whether this file is still meant to be the filter input.
# NOTE(review): pickle.load can execute arbitrary code - only open files from a trusted source.
with open('merged.pkl', 'rb') as handle:
    merged_dataset = pickle.load(handle)

def _encode_label(titles: list, classes: list, single_label: bool):
    """Encode the selected titles as a class index (single label) or a one-hot vector over `classes`."""
    if single_label:
        return classes.index(titles[0])
    one_hot = np.zeros(len(classes))
    for title in titles:
        one_hot[classes.index(title)] = 1
    return one_hot


def filter_dataset(dataset: dict, classes: list, single_label=True, review_only=True, valid_only=True):
    """Filter the merged dataset and convert it to a Hugging Face DatasetDict.

    For every kept sample the fields listed in the module-level KEYS_TO_COPY are copied and a
    'label' field is added. If non-reviewed samples are kept (or classes that were not part of
    the review), some fields may be empty (e.g. the supervised titles). Name and raw text are
    only available in the full merged dataset, not in the patch. A field that is missing from a
    sample is stored as None instead of raising, so every split has the same columns.

    Args:
        dataset: mapping from split name to a list of sample dicts.
        classes: a (sub)set of classes to keep (should be part of the review). The position of
            a class in this list is its label index.
        single_label: if True, keep only samples with exactly one selected title and store the
            class index as label; otherwise store a one-hot encoded vector over `classes`.
        review_only: keep only samples that were part of the review (review == 1).
        valid_only: keep only samples considered valid in the review (valid == 1).

    Returns:
        A DatasetDict with one Dataset per split. A split without any remaining sample becomes
        an empty Dataset with the same columns.
    """
    # Fixed column list: the schema no longer depends on the first kept sample, so an empty
    # split cannot raise an IndexError.
    columns = [*KEYS_TO_COPY, 'label']
    split_dict = {}
    for split, samples in dataset.items():
        kept = []
        for elem in samples:
            if valid_only and elem.get('valid') != 1:
                continue
            if review_only and elem.get('review') != 1:
                continue
            # Non-reviewed samples may have no supervised titles at all.
            sel_titles = [title for title in (elem.get('titles_supervised') or []) if title in classes]
            if not sel_titles or (single_label and len(sel_titles) > 1):
                continue

            new_entry = {k: elem.get(k) for k in KEYS_TO_COPY}
            new_entry['label'] = _encode_label(sel_titles, classes, single_label)
            kept.append(new_entry)
        print(len(kept))

        cur_split = {k: [entry[k] for entry in kept] for k in columns}
        split_dict[split] = Dataset.from_dict(cur_split, split=split)
    return DatasetDict(split_dict)

# Keep only reviewed, valid samples that carry exactly one of the selected classes.
fdata = filter_dataset(
    merged_dataset,
    CLASSES,
    single_label=True,
    review_only=True,
    valid_only=True,
)

In [None]:
# Display the filtered DatasetDict returned by filter_dataset.
fdata