In [None]:
import pickle
import os
import pandas as pd
import json
from tqdm import tqdm 
from datasets import load_dataset
import pickle
import numpy as np

In [None]:
# --- Output files written by this notebook ---------------------------------
# Only the samples that were manually reviewed.
BIOS_FILE_REVIEWED = "data/BIOS_REVIEWED.pkl"
# The whole dataset, including the changes made to the reviewed samples.
BIOS_FILE_COMPLETE = "data/BIOS_COMPLETE.pkl"

# --- Input files ------------------------------------------------------------
# All of our changes without the sensitive data; ids match the BIOS dataset.
BIOS_LABELS = "data/BIOS_LABELS.csv"
# Raw crawled BIOS dataset (alternative location: "data/BIOS.pkl").
BIOS_RAW = "../../data/biosbias2023/BIOS.pkl"
TITLE_JSON = "data/title_lookup.json"

# Occupation classes covered by the manual review.
reviewed_classes = [
    'architect',
    'surgeon',
    'dentist',
    'teacher',
    'psychologist',
    'nurse',
    'photographer',
    'physician',
    'attorney',
    'journalist',
]

In [None]:
# Load the raw BIOS dataset.
# NOTE: pickle.load can execute arbitrary code; only use a BIOS_RAW file from a trusted source.
with open(BIOS_RAW, "rb") as raw_file:
    raw_data = pickle.load(raw_file)

In [None]:
# The label file is tab-separated; turn it into one dict per row so that the
# rows can be enriched in place with fields from the raw dataset.
df_label = pd.read_csv(BIOS_LABELS, sep='\t')
labeled_data = df_label.to_dict(orient='records')

In [None]:
def _string_to_list(value, separator):
    """Convert a list that was serialised as a string in the CSV back to a list.

    Args:
        value: the cell value read from the CSV (NaN float, string, or anything else).
        separator: the substring that separates the list elements inside the string.

    Returns:
        ``[]`` for float values (missing cells are read as NaN), the split
        string for string values, and ``value`` unchanged otherwise.
    """
    if isinstance(value, float):
        return []
    if isinstance(value, str):
        # drop the surrounding brackets/quotes before splitting
        return value.strip(']"\'[').split(separator)
    return value


# Copy the sensitive fields from the raw dataset into the labeled records
# (both lists are aligned by position) and normalise the title columns.
for i, labeled_sample in enumerate(labeled_data):
    raw_sample = raw_data[i]

    # sanity check: both sources must describe the same sample
    if not (raw_sample['raw_title'] == labeled_sample['auto_raw_title'] and raw_sample['start_pos'] == labeled_sample['start_pos']):
        print("data mismatch at id ", i)

    labeled_sample['raw'] = raw_sample['raw']
    labeled_sample['bio'] = raw_sample['bio']
    labeled_sample['name'] = raw_sample['name']

    # convert titles from nan/string to list
    labeled_sample['titles'] = _string_to_list(labeled_sample['titles'], '\', \'')

    # convert raw titles from nan/string to list
    # (the previous version checked the type of 'titles' here, which is already
    # a list at this point, so string-valued 'raw_titles' were never converted)
    # NOTE(review): the separator differs from the one used for 'titles' and may
    # leave quote characters on the elements - confirm against the CSV format.
    labeled_sample['raw_titles'] = _string_to_list(labeled_sample['raw_titles'], ', ')

In [None]:
# Keep only the rows flagged as reviewed (review == 1) and convert them back
# to a list of per-sample dicts.
df = pd.DataFrame(labeled_data)
is_reviewed = df['review'] == 1
df_reviewed = df[is_reviewed]
reviewed_data = df_reviewed.to_dict(orient='records')

In [None]:
# Persist the complete dataset and the reviewed subset as pickles.
for out_path, payload in ((BIOS_FILE_COMPLETE, labeled_data), (BIOS_FILE_REVIEWED, reviewed_data)):
    with open(out_path, "wb") as out_file:
        pickle.dump(payload, out_file)

## Update: Prepare the reviewed data for merging with the Hugging Face dataset (`LabHC/bias_in_bios`)

In [None]:
dataset = load_dataset("LabHC/bias_in_bios")

# Profession labels in the order specified at
# https://huggingface.co/datasets/LabHC/bias_in_bios
# (they are not included in the dataset's metadata); the integer 'profession'
# field of a sample is used as an index into this list.
huggingface_label = [
    'accountant', 'architect', 'attorney', 'chiropractor',
    'comedian', 'composer', 'dentist', 'dietitian',
    'dj', 'filmmaker', 'interior_designer', 'journalist',
    'model', 'nurse', 'painter', 'paralegal',
    'pastor', 'personal_trainer', 'photographer', 'physician',
    'poet', 'professor', 'psychologist', 'rapper',
    'software_engineer', 'surgeon', 'teacher', 'yoga_teacher',
]
partitions = ['train', 'test', 'dev']

dataset['train'].info

In [None]:
# The text used for matching is the raw text from 'start_pos' onwards, with
# surrounding whitespace removed.
for sample in labeled_data:
    offset = sample['start_pos']
    tail = sample['raw'][offset:]
    sample['match_text'] = tail.strip()

In [None]:
# Match every Hugging Face sample against the labeled (crawled) samples.
#   matches[part][i]        -> (i, [ids of labeled samples with the same text and title])
#   reverse_matches[lab_id] -> [(part, i), ...] Hugging Face samples matched to that labeled sample
# The result is cached in MATCH_PKL; delete that file to force a recomputation.
MATCH_PKL = 'matches.pkl'
if not os.path.isfile(MATCH_PKL):
    # Index the labeled samples by their match text once, instead of scanning
    # the whole list for every Hugging Face sample. Each bucket keeps the
    # samples in their original order.
    labeled_by_text = {}
    for labeled_sample in labeled_data:
        labeled_by_text.setdefault(labeled_sample['match_text'], []).append(labeled_sample)

    matches = {part: [] for part in partitions}
    reverse_matches = [[] for _ in labeled_data]

    for part in partitions:
        print(part)
        for i in tqdm(range(len(dataset[part]))):
            hf_sample = dataset[part][i]  # fetch the row only once
            potential_matches = []
            for elem in labeled_by_text.get(hf_sample['hard_text'], []):
                # identical text is not enough, the occupation has to agree as well
                if elem['auto_title'] == huggingface_label[hf_sample['profession']]:
                    # 'Unnamed: 0' is used as the position of the sample in labeled_data
                    potential_matches.append(elem['Unnamed: 0'])
                    reverse_matches[elem['Unnamed: 0']].append((part, i))
            matches[part].append((i, potential_matches))

    with open(MATCH_PKL, 'wb') as handle:
        pickle.dump({'matches': matches, 'reverse_matches': reverse_matches}, handle)

else:
    print("load precomputed matches")
    with open(MATCH_PKL, 'rb') as handle:
        loaded_matches = pickle.load(handle)
        matches = loaded_matches['matches']
        reverse_matches = loaded_matches['reverse_matches']

In [None]:
# Per partition: how many Hugging Face samples have no / more than one
# candidate among the labeled samples.
for part in partitions:
    print(part)
    candidate_counts = [len(match[1]) for match in matches[part]]
    count0 = sum(1 for n_candidates in candidate_counts if n_candidates == 0)
    count_double = sum(1 for n_candidates in candidate_counts if n_candidates > 1)

    print("got %i samples in total" % len(matches[part]))
    print("for %i samples found no match in the labeled data" % count0)
    print("for %i samples found multiple matches in the labeled data" % count_double)


# The same statistic from the point of view of the labeled samples.
reverse_counts = [len(match) for match in reverse_matches]
count0 = sum(1 for n_matches in reverse_counts if n_matches == 0)
count_multi = sum(1 for n_matches in reverse_counts if n_matches > 1)

print("based on the crawled dataset")
print("got %i samples in total" % len(reverse_matches))
print("for %i samples found no match in the labeled data" % count0)
print("for %i samples found multiple matches in the labeled data" % count_multi)

In [None]:
# Flatten all partitions into one list; every entry is a copy of the sample
# extended by its partition name ('split') and its index in that partition ('id').
dataset_for_filter = [
    {**sample, 'split': part, 'id': i}
    for part in partitions
    for i, sample in enumerate(dataset[part])
]

dataset_for_filter[0]

In [None]:
def _unmatched_fields():
    """Return the placeholder fields for a sample without a labeled match."""
    return {'titles_supervised': [], 'raw_titles_supervised': [], 'gender_supervised': '', 'auto_title': '', 'auto_raw_title': '', 'path': '', 'URI': '',
            'review': 0, 'valid': -1, 'style_valid': -1, 'raw': '', 'bio': '', 'name': ('',''), 'duplicates': []}


def _supervised_fields(merge_sample, duplicates):
    """Return the fields copied from the labeled sample ``merge_sample``.

    Args:
        merge_sample: the labeled sample whose annotations are merged.
        duplicates: list of (split, idx) tuples stored under 'duplicates'.
    """
    return {'titles_supervised': merge_sample['titles'], 'raw_titles_supervised': merge_sample['raw_titles'], 'gender_supervised': merge_sample['gender'],
            'auto_title': merge_sample['auto_title'], 'auto_raw_title': merge_sample['auto_raw_title'], 'path': merge_sample['path'], 'URI': merge_sample['URI'],
            'review': merge_sample['review'], 'valid': merge_sample['valid'], 'style_valid': merge_sample['style_valid'], 'raw': merge_sample['raw'],
            'bio': merge_sample['bio'], 'name': merge_sample['name'], 'duplicates': duplicates}


# Index the flattened Hugging Face samples by text once, so that the text
# duplicates of a sample can be looked up instead of scanning the whole list
# for every sample.
text_index = {}
for elem in dataset_for_filter:
    text_index.setdefault(elem['hard_text'], []).append((elem['split'], elem['id']))

merged_dataset = {part: [] for part in partitions}
dup_id_list = []

# NOTE: this loop rewrites entries of `matches` while resolving duplicates, so
# re-running the cell without recomputing/reloading `matches` gives different results.
for part in partitions:
    print(part)
    for match in tqdm(matches[part]):
        idx = match[0]
        sample = dataset[part][idx]
        text = sample['hard_text']

        # duplicates in the huggingface dataset: all other samples with the same text
        sample.update({'text_duplicates': [tup for tup in text_index.get(text, []) if tup != (part, idx)]})

        if len(match[1]) == 0:
            # no labeled counterpart: keep old label
            sample.update(_unmatched_fields())
            merged_dataset[part].append(sample)
        elif len(match[1]) > 1:
            dup_id_list.append(match[0])

            # All other huggingface samples that were matched to one of the candidates.
            # The current sample is identified by split AND index: comparing the
            # index alone would also drop duplicates from other splits that
            # happen to have the same index.
            duplicates = []
            for m_id in match[1]:
                for d_tup in reverse_matches[m_id]:
                    if d_tup != (part, idx) and d_tup not in duplicates:
                        duplicates.append(d_tup)

            # take first possible match
            match_id = match[1][0]
            merge_sample = labeled_data[match_id]

            same_number_of_duplicates = (len(duplicates)+1 == len(match[1]))
            if not same_number_of_duplicates:
                # The number of duplicates differs between both dataset versions.
                # This is only acceptable if all candidates carry the same labels.
                all_equal = True
                for next_id in match[1][1:]:
                    comp_sample = labeled_data[next_id]
                    if not (comp_sample['titles'] == merge_sample['titles'] and comp_sample['gender'] == merge_sample['gender']):
                        all_equal = False
                assert all_equal, "inconsistent number of duplicates in both dataset versions"+str(match[1])+ " vs. " + str(duplicates)+" for idx "+str(idx)

                # assign other match ids to duplicates (1:1), handle inconsistent numbers of duplicates!
                for i, (d_split, d_idx) in enumerate(duplicates):
                    d_id, d_candidates = matches[d_split][d_idx]
                    pick = i + 1
                    if pick >= len(d_candidates):
                        # more duplicates than candidates: shift the position back by the list length
                        pick -= len(d_candidates)
                    matches[d_split][d_idx] = (d_id, [d_candidates[pick]])
            else:
                # assign other match ids to duplicates (1:1)
                for i, (d_split, d_idx) in enumerate(duplicates):
                    d_id, d_candidates = matches[d_split][d_idx]
                    matches[d_split][d_idx] = (d_id, [d_candidates[i+1]])

            sample.update(_supervised_fields(merge_sample, duplicates))
            merged_dataset[part].append(sample)

        else:
            # exactly one candidate
            match_id = match[1][0]
            merge_sample = labeled_data[match_id]
            sample.update(_supervised_fields(merge_sample, []))
            merged_dataset[part].append(sample)



In [None]:
# Back-fill the 'duplicates' list of samples that are referenced as a duplicate
# by another sample but have an empty list themselves.
# Iterate over all partitions explicitly: the previous version relied on `part`
# being left over from the loop of an earlier cell and therefore only processed
# the last partition.
for part in partitions:
    for i, sample in enumerate(merged_dataset[part]):
        if len(sample['duplicates']) > 0:
            for (split, idx) in sample['duplicates']:
                target = merged_dataset[split][idx]
                if len(target['duplicates']) == 0:
                    # the other duplicates of `sample` (without the target itself) plus `sample`
                    dup = [tup for tup in sample['duplicates'] if not tup == (split, idx)]
                    dup.append((part, i))
                    target['duplicates'] = dup

In [None]:
# Store the complete merge result (this includes the raw text, bio and name fields).
with open('data/huggingface_merge_complete.pkl', 'wb') as merged_file:
    pickle.dump(merged_dataset, merged_file)

In [None]:
# Reduce every merged sample to the listed keys and store the result as a
# separate patch file.
keys_to_copy = [
    'text_duplicates',
    'titles_supervised',
    'raw_titles_supervised',
    'gender_supervised',
    'auto_title',
    'auto_raw_title',
    'review',
    'valid',
    'style_valid',
    'duplicates',
]

public_data = {
    part: [{k: entry[k] for k in keys_to_copy} for entry in merged_dataset[part]]
    for part in partitions
}

with open('data/huggingface_patch.pkl', 'wb') as patch_file:
    pickle.dump(public_data, patch_file)