In [None]:
import pickle
import numpy as np
import os
import pandas as pd
import math
import json
from tqdm import tqdm
from embedding import BertHuggingface
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
from scipy.stats import pearsonr
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.utils import shuffle

import torch
import transformers

sns.set(font_scale=2)

## Get the data and (new) classes

For the reviewed data, all matching raw titles were assigned. In the `TITLE_JSON` file, these raw titles are grouped by class, including potential new classes, which are marked with `*`.
It might make sense to merge these new classes with some of the existing classes.

In [None]:
# Pickle file with the biography samples, including the review annotations
BIOS_FILE_REVIEWED = "data/BIOS_REVIEWED.pkl"
# JSON file that maps each class (title) to the raw titles belonging to it
TITLE_JSON = "data/title_lookup.json"

# Classes (automatically assigned titles) that the review statistics and experiments below are restricted to
reviewed_classes = ['architect', 'surgeon', 'dentist', 'teacher', 'psychologist', 'nurse', 'photographer', 'physician', 'attorney', 'journalist']

In [None]:
# NOTE: pickle.load can execute arbitrary code - only load this file from a trusted source
with (open(BIOS_FILE_REVIEWED, "rb")) as openfile:
    # full_data and raw_data are two names for the same loaded object
    full_data = raw_data = pickle.load(openfile)

with open(TITLE_JSON, 'r') as j:
    raw_titles_per_title = json.load(j)
    
# keep every key that does not contain a '-' anywhere
# NOTE(review): the lookup built in the next cell only skips keys that *start* with '-' - confirm which rule is intended
classes = [key for key in raw_titles_per_title.keys() if not '-' in key]
        
print(classes)

In [None]:
# Inverted lookup: every raw title maps to the list of titles (classes) it is listed under.
# Titles whose name starts with '-' are left out.
raw_title_lookup = {}
for title, raw_titles in raw_titles_per_title.items():
    if title[0] != '-':
        for rt in raw_titles:
            raw_title_lookup.setdefault(rt, []).append(title)

In [None]:
# 'auto_raw_title' and 'auto_title' refer to the titles automatically assigned by the crawler/preprocessing
# 'raw_titles' is a list of raw titles that were explicitly labeled during review (only available for reviewed, valid samples)
# 'titles' refers to the classes derived from the 'raw_titles' (multi-class!, only available for reviewed, valid samples)
# 'review' is 1 if the sample was reviewed
# if a sample was reviewed, the 'valid' flag shows whether it is a valid biography*; if it is not, other information might be missing!
# 'style_valid' shows if a valid biography also matches the style (as opposed to texts that contain the information for our classification task but are not actual biographies)
# 'raw_edited' contains the edited raw text (if any changes happened during review) - this also includes some changes due to formatting issues and otherwise mostly the removal of a few words at the end (e.g. "Read more...")
# 'comment' contains either the comment automatically generated when filtering the dataset or the reviewer's comment (usually when the datapoint is invalid)

# * in the broadest sense, e.g. referring to an actual person, containing useful information after the first sentence

full_data[0].keys()

## Results of the review

In [None]:
# samples whose automatically assigned title is one of the reviewed classes
data_rjobs = list(filter(lambda entry: entry['auto_title'] in reviewed_classes, full_data))
print("got", len(data_rjobs), "samples for the jobs that were reviewed")

# split by review status
data_reviewed = list(filter(lambda entry: entry['review'] == 1, data_rjobs))
data_not_reviewed = list(filter(lambda entry: entry['review'] == 0, data_rjobs))
print("of those ", len(data_reviewed), "have been reviewed")

# reviewed samples that were marked as valid
data_valid = list(filter(lambda entry: entry['valid'] == 1, data_reviewed))
print("of those ", len(data_valid), "are valid")

# reviewed samples split by the 'style_valid' flag
data_style_valid = list(filter(lambda entry: entry['style_valid'] == 1, data_reviewed))
data_style_invalid = list(filter(lambda entry: entry['style_valid'] == 0, data_reviewed))
print("of those ", len(data_style_valid), " are stylistically valid, ", len(data_style_invalid), " are stylistically invalid")

In [None]:
# valid samples whose 'titles' field is a list (the only case the list-based subsets consider)
_valid_with_title_list = [entry for entry in data_valid if type(entry['titles']) == list]

# no title: empty string or empty list
data_no_title = [entry for entry in data_valid if entry['titles'] == '' or (type(entry['titles']) == list and not entry['titles'])]
data_one_title = [entry for entry in _valid_with_title_list if len(entry['titles']) == 1]
# the automatically assigned title is not among the annotated titles
data_wrong_title = [entry for entry in _valid_with_title_list if entry['auto_title'] not in entry['titles']]
data_multi_title = [entry for entry in _valid_with_title_list if len(entry['titles']) > 1]

# printed values are fractions of the valid samples (0..1)
for label, subset in (
    ("% without any title: ", data_no_title),
    ("% with exactly one title: ", data_one_title),
    ("% where the auto title was wrong: ", data_wrong_title),
    ("% with multiple titles: ", data_multi_title),
):
    print(label, len(subset)/len(data_valid))

In [None]:
# review statistics per job (auto label); one dict of statistics per job
review_res = {}
for job in reviewed_classes:
    reviewed = [entry for entry in data_reviewed if entry['auto_title'] == job]
    if not reviewed:
        print("no reviewed samples for ", job)
        continue

    valid = [entry for entry in data_valid if entry['auto_title'] == job]
    stats = {}
    review_res[job] = stats

    # absolute sample counts
    stats['samples (total)'] = sum(1 for entry in data_rjobs if entry['auto_title'] == job)
    stats['samples (reviewed)'] = len(reviewed)
    # reviewed samples that carry this job among their annotated titles
    stats['samples* (reviewed)'] = sum(1 for entry in data_reviewed if job in entry['titles'])

    # fraction of the reviewed samples of this job that are valid
    stats['% valid'] = len(valid)/len(reviewed)
    # remaining statistics are fractions of the valid samples of this job
    for key, subset in (
        ('% style valid', data_style_valid),
        ('% one title', data_one_title),
        ('% wrong title', data_wrong_title),
        ('% multiple titles', data_multi_title),
    ):
        stats[key] = sum(1 for entry in subset if entry['auto_title'] == job)/len(valid)

#inv_review = {k: {job: review_res[job][k]} for k in review_res['nurse'].keys() for job in review_res.keys()}
# jobs become the columns, statistics the rows
review_df = pd.DataFrame(data=review_res)
review_df

In [None]:
def df_to_latex(df):
    """Print a numeric DataFrame as the rows of a LaTeX table.

    The first printed line is the header (one ' & <column>' entry per column).
    Every following line holds the row label and the row's values, separated
    by ' & ' and terminated by a LaTeX line break. Whole numbers are printed
    as integers, all other values are truncated (not rounded) to three decimals.

    NOTE(review): the first row of ``df`` is not printed (rows start at
    position 1) - confirm that this is intended.

    Parameters:
        df: DataFrame with string row labels, string column names and numeric cells.
    """
    line_break = "\\\\"
    columns = df.columns
    print("".join(" & "+name for name in columns) + line_break)

    labels = df.index.values
    for pos in range(1, len(labels)):
        record = df.iloc[pos]
        cells = []
        for name in columns:
            val = record[name]
            # integers keep no decimals, everything else is cut after the third decimal
            val = int(val) if int(val) == val else float(int(val*1000))/1000
            cells.append(" & " + str(val))
        print(labels[pos] + "".join(cells) + line_break)
# print the review statistics as two LaTeX tables with five classes each
df_to_latex(review_df.loc[:, ['architect', 'surgeon', 'dentist', 'teacher', 'psychologist']])
print()
print()
df_to_latex(review_df.loc[:, ['nurse', 'photographer', 'physician', 'attorney', 'journalist']])

## Visualizations

### Samples and gender ratio per class

In [None]:
# reviewed classes plus the new classes (marked with *) that came up during the review
reviewed_plus_classes = reviewed_classes + ['*software architect', '*writer', '*researcher']

# unsupervised: all samples, auto-label
_auto_members = [[sample for sample in data_reviewed if sample['auto_title'] == c] for c in reviewed_classes]
samples_auto = [len(members) for members in _auto_members]
ratio_auto = [len([sample for sample in members if sample['gender'] == 'F'])/len(members) for members in _auto_members]

# reviewed: valid samples, annotated labels
_review_members = [[sample for sample in data_valid if c in sample['titles']] for c in reviewed_plus_classes]
samples_review = [len(members) for members in _review_members]
ratio_review = [len([sample for sample in members if sample['gender'] == 'F'])/len(members) for members in _review_members]

# not sorted yet: same order as reviewed_classes
sorted_classes = reviewed_classes

for i, c in enumerate(sorted_classes):
    print(c, samples_review[i])

In [None]:
# try this for a better layout (no overlapping labels, no labels being cut off when the figure is saved?)
from matplotlib import rcParams
rcParams.update({'figure.autolayout': True})

# sort classes by no. samples (auto labels); ties are broken by class name.
# All per-class lists are reordered with the SAME permutation. Sorting every list on its
# own via sorted(zip(samples_auto, values)) breaks ties by the values themselves, which
# moves bars to the wrong class label whenever two classes have the same sample count.
n = len(reviewed_classes)
order = sorted(range(n), key=lambda k: (samples_auto[k], sorted_classes[k]))
# the new (*) classes have no auto labels and keep their position at the end
sorted_classes = [sorted_classes[k] for k in order]+reviewed_plus_classes[n:]
ratio_auto = [ratio_auto[k] for k in order]
samples_review = [samples_review[k] for k in order]+samples_review[n:]
ratio_review = [ratio_review[k] for k in order]+ratio_review[n:]
samples_auto = [samples_auto[k] for k in order]


# bar positions: x for the classes with auto labels, x2 for all classes including the new ones
x = np.arange(len(reviewed_classes))
x2 = np.arange(len(reviewed_plus_classes))

width = 0.35
offset = width/2

fig, axes = plt.subplots(1,2, figsize=(20,10))
# left panel: number of samples per class
rects = axes[0].bar(x - offset, samples_auto, width, label="Unsupervised")
rects2 = axes[0].bar(x2 + offset, samples_review, width, label="Reviewed")
axes[0].set_xticks(x2)
axes[0].set_xticklabels(sorted_classes, rotation=90)
axes[0].set_title("Samples per class")
axes[0].legend(loc="upper left")

# right panel: share of samples with gender 'F' per class
rects = axes[1].bar(x - offset, ratio_auto, width, label="Unsupervised")
rects2 = axes[1].bar(x2 + offset, ratio_review, width, label="Reviewed")
axes[1].set_xticks(x2)
axes[1].set_xticklabels(sorted_classes, rotation=90)
axes[1].set_title("Female ratio per class")
axes[1].legend(loc="upper right")

plt.savefig('plots/samples_ratio.eps', format='eps', bbox_inches = "tight")
plt.show()

In [None]:
# try this for a better layout (no overlapping labels, no labels being cut off when the figure is saved?)
from matplotlib import rcParams
rcParams.update({'figure.autolayout': True})

# sort classes by no. samples (auto labels); ties are broken by class name.
# All per-class lists are reordered with the SAME permutation. Sorting every list on its
# own via sorted(zip(samples_auto, values)) breaks ties by the values themselves, which
# moves bars to the wrong class label whenever two classes have the same sample count.
n = len(reviewed_classes)
order = sorted(range(n), key=lambda k: (samples_auto[k], sorted_classes[k]))
# the new (*) classes have no auto labels and keep their position at the end
sorted_classes = [sorted_classes[k] for k in order]+reviewed_plus_classes[n:]
ratio_auto = [ratio_auto[k] for k in order]
samples_review = [samples_review[k] for k in order]+samples_review[n:]
ratio_review = [ratio_review[k] for k in order]+ratio_review[n:]
samples_auto = [samples_auto[k] for k in order]


# bar positions: x for the classes with auto labels, x2 for all classes including the new ones
x = np.arange(len(reviewed_classes))
x2 = np.arange(len(reviewed_plus_classes))

width = 0.35
offset = width/2

# figure 1: number of samples per class
fig, axes = plt.subplots(figsize=(10,10))
rects = axes.bar(x - offset, samples_auto, width, label="Unsupervised")
rects2 = axes.bar(x2 + offset, samples_review, width, label="Reviewed")
axes.set_xticks(x2)
axes.set_xticklabels(sorted_classes, rotation=90)
axes.set_title("Samples per class")
axes.legend(loc="upper left")
plt.savefig('plots/samples.eps', format='eps', bbox_inches = "tight")
plt.show()

# figure 2: share of samples with gender 'F' per class
fig, axes = plt.subplots(figsize=(10,10))
rects = axes.bar(x - offset, ratio_auto, width, label="Unsupervised")
rects2 = axes.bar(x2 + offset, ratio_review, width, label="Reviewed")
axes.set_xticks(x2)
axes.set_xticklabels(sorted_classes, rotation=90)
axes.set_title("Female ratio per class")
axes.legend(loc="upper right")

plt.savefig('plots/ratio.eps', format='eps', bbox_inches = "tight")
plt.show()

### Auto-labels vs. annotated labels

In [None]:
def get_cm(gender):
    """Compare auto labels with annotated labels for the valid samples of one gender.

    Rows are the auto labels (``reviewed_classes``), columns the annotated
    titles (``reviewed_plus_classes``) followed by one extra "other" column
    that collects every annotated title outside ``reviewed_plus_classes``.
    Each row is divided by the number of samples with that auto label, so a
    cell is the share of those samples annotated with the column's title.
    Since a sample can carry several titles, rows need not sum to 1. A row
    without any sample is divided by zero and ends up as NaN.

    Parameters:
        gender: value of the samples' 'gender' field to select (e.g. 'M' or 'F').

    Returns:
        Array of shape (len(reviewed_classes), len(reviewed_plus_classes)+1).
    """
    # index of the "other" column; derived from the class list instead of being hard-coded,
    # so the function keeps working when reviewed_plus_classes changes its length
    other_idx = len(reviewed_plus_classes)
    cm = np.zeros((len(reviewed_classes), other_idx+1))
    count = np.zeros(len(reviewed_classes))

    for sample in data_valid:
        if not sample['gender'] == gender:
            continue
        cid = reviewed_classes.index(sample['auto_title'])
        for c in sample['titles']:
            idx = reviewed_plus_classes.index(c) if c in reviewed_plus_classes else other_idx
            cm[cid, idx] += 1
        count[cid] += 1

    # normalize each row by the number of samples with that auto label
    for i in range(len(reviewed_classes)):
        cm[i,:] /= count[i]
    return cm

# auto label vs. annotated label matrices, separately for the 'M' and 'F' samples
cm_M = get_cm('M')
cm_F = get_cm('F')

In [None]:
# one heatmap per gender: rows are the auto labels, columns the annotated labels plus 'other'
fig, axes = plt.subplots(1,2, figsize=(30,10))
for panel_ax, panel_cm, panel_title in zip(axes, (cm_M, cm_F), ('Male', 'Female')):
    sns.heatmap(panel_cm, annot=True, xticklabels=reviewed_plus_classes+['other'], yticklabels=reviewed_classes, ax=panel_ax, fmt=".2f", annot_kws={'fontsize': 15})
    panel_ax.set_title(panel_title)

plt.savefig('plots/cm_reviewed_label.eps', format='eps', bbox_inches = "tight")
plt.show()

### Label co-occurrence

In [None]:
def get_cooccurence(gender):
    """Count how often annotated titles occur together for the valid samples of one gender.

    Only titles contained in ``reviewed_plus_classes`` are considered. Cell
    (a, b) counts the samples that carry both title a and title b; every row
    is then divided by its diagonal entry, i.e. by the number of samples with
    title a. A title without any sample yields a NaN row (division by zero).

    Parameters:
        gender: value of the samples' 'gender' field to select (e.g. 'M' or 'F').

    Returns:
        Square array with one row/column per entry of ``reviewed_plus_classes``.
    """
    n_labels = len(reviewed_plus_classes)
    cm = np.zeros((n_labels, n_labels))

    for sample in data_valid:
        if sample['gender'] != gender:
            continue
        label_ids = [reviewed_plus_classes.index(title) for title in sample['titles'] if title in reviewed_plus_classes]
        # every pair of labels of this sample (including each label with itself)
        for row in label_ids:
            for col in label_ids:
                cm[row, col] += 1

    # relative to the number of samples that carry the row's label
    for row in range(n_labels):
        cm[row,:] = cm[row,:] / cm[row,row]

    return cm

# label co-occurrence matrices, separately for the 'M' and 'F' samples (reuses the names cm_M / cm_F)
cm_M = get_cooccurence('M')
cm_F = get_cooccurence('F')

In [None]:
# one heatmap per gender: annotated labels against annotated labels
fig, axes = plt.subplots(1,2, figsize=(30,10))
for panel_ax, panel_cm, panel_title in zip(axes, (cm_M, cm_F), ('Male', 'Female')):
    sns.heatmap(panel_cm, annot=True, xticklabels=reviewed_plus_classes, yticklabels=reviewed_plus_classes, ax=panel_ax, fmt=".2f", annot_kws={'fontsize': 15})
    panel_ax.set_title(panel_title)

plt.savefig('plots/label_cooccurence.eps', format='eps', bbox_inches = "tight")
plt.show()

## Preparing the data for classification

Please check for any stupid mistakes

In [None]:
def prepare_data_one_label(data, classes, use_raw=True, use_review=True):
    """Build texts, single-class labels and genders for classification.

    Parameters:
        data: list of sample dicts.
        classes: list of class names; a label is the index of the class in this list.
        use_raw: if True the text is sample['raw'] from sample['start_pos'] on,
            otherwise sample['bio'].
        use_review: if True the label comes from the annotated sample['titles'];
            samples that are not valid or that do not match exactly one of
            ``classes`` are dropped. If False the label is sample['auto_title'];
            samples whose auto title is not in ``classes`` are dropped.

    Returns:
        Tuple (X, y, gender) of equally long lists: texts, class indices and
        the samples' 'gender' values.
    """
    X, y, gender = [], [], []

    for sample in data:
        if use_review:
            if not sample['valid']:
                continue
            matches = [classes.index(title) for title in sample['titles'] if title in classes]
            # ignore any multi-class samples or those with no class of interest
            if len(matches) != 1:
                continue
            label = matches[0]
        elif sample['auto_title'] in classes:
            label = classes.index(sample['auto_title'])
        else:
            continue

        text = sample['raw'][sample['start_pos']:] if use_raw else sample['bio']
        X.append(text)
        y.append(label)
        gender.append(sample['gender'])

    return X, y, gender

# this one only applies for reviewed data
def prepare_data_multi_label(data, classes, use_raw=True):
    """Build texts, multi-hot labels and genders from the annotated titles.

    Parameters:
        data: list of sample dicts.
        classes: list of class names; position k of a label vector belongs to classes[k].
        use_raw: if True the text is sample['raw'] from sample['start_pos'] on,
            otherwise sample['bio'].

    Returns:
        Tuple (X, y, gender) of equally long lists: texts, integer arrays of
        length len(classes) with a 1 for every annotated class, and the
        samples' 'gender' values. Samples that are not valid or that have none
        of ``classes`` among their titles are dropped.
    """
    # TODO: if titles/raw titles not set derive them from the available raw titles and the title lookup
    X, y, gender = [], [], []

    for sample in data:
        if not sample['valid']:
            continue

        hits = {classes.index(title) for title in sample['titles'] if title in classes}
        # there might be a few samples that don't fit into the reviewed classes anymore
        if not hits:
            continue

        # multi-hot label vector
        lbl = np.array([int(pos in hits) for pos in range(len(classes))])

        X.append(sample['raw'][sample['start_pos']:] if use_raw else sample['bio'])
        y.append(lbl)
        gender.append(sample['gender'])

    return X, y, gender

In [None]:
def GAP(y_true, y_pred, group):
    """Difference between the share of correct predictions for 'F' and for 'M' samples.

    Parameters:
        y_true: true labels, one per sample.
        y_pred: predicted labels, aligned with ``y_true``.
        group: gender per sample ('F' or 'M'), aligned with ``y_true``.

    Returns:
        (share of correctly predicted 'F' samples) - (share of correctly predicted 'M' samples).

    Raises:
        ZeroDivisionError: if ``group`` contains no 'F' or no 'M' sample.
    """
    def share_correct(member):
        hits = [1 if y_true[i] == y_pred[i] else 0 for i in range(len(group)) if group[i] == member]
        return sum(hits)/len(hits)

    return share_correct('F') - share_correct('M')

def GAP_binary(y_true, y_pred, group):
    """Gender gap in true positive rate (TPR_F - TPR_M) for one binary label.

    For each gender the true positive rate is the share of samples with
    ``y_true == 1`` that are also predicted as 1. The rate is taken over the
    positive samples of that gender only (as in ``GAP_per_class``), so the
    gap does not depend on how frequent the label is within each gender.

    Parameters:
        y_true: true binary labels (0/1), one per sample.
        y_pred: predicted binary labels (0/1 or booleans), aligned with ``y_true``.
        group: gender per sample ('F' or 'M'), aligned with ``y_true``.

    Returns:
        TPR of the 'F' samples minus TPR of the 'M' samples. NaN if one of the
        genders has no positive sample, because its TPR is undefined then.
    """
    def true_positive_rate(member):
        # predictions for the samples of this gender that really carry the label
        preds_for_positives = [y_pred[i] for i in range(len(group)) if group[i] == member and y_true[i] == 1]
        if len(preds_for_positives) == 0:
            return float('nan')
        return sum(1 for p in preds_for_positives if p == 1)/len(preds_for_positives)

    return true_positive_rate('F') - true_positive_rate('M')

def GAP_per_class(y_true, y_pred, group):
    """Per-class gender gap in true positive rate (TPR_F - TPR_M).

    Parameters:
        y_true: true class indices (non-negative integers), one per sample.
        y_pred: predicted class indices, aligned with ``y_true``.
        group: gender per sample ('F' or 'M'), aligned with ``y_true``.

    Returns:
        Array with max(y_true)+1 entries; entry c is the share of correctly
        predicted 'F' samples of class c minus the same share for 'M' samples.
        A class without samples of one gender yields NaN (division by zero).
    """
    n_classes = max(y_true)+1

    def tpr_per_class(member):
        correct = np.zeros(n_classes)
        total = np.zeros(n_classes)
        for i in range(len(group)):
            if group[i] != member:
                continue
            if y_true[i] == y_pred[i]:
                correct[y_true[i]] += 1
            total[y_true[i]] += 1
        return correct / total

    return tpr_per_class('F') - tpr_per_class('M')

In [None]:
# number of folds used by train_and_evaluate
ITERATIONS = 5
# epochs passed to the model's retrain() call
EPOCHS = 2
# batch size passed to the BertHuggingface constructor
BATCH_SIZE = 8
# number of output classes for the experiments on the reviewed classes
NUM_CLASSES = len(reviewed_classes)

In [None]:
def gender_ratio_per_class(y, gender, ref_gender='F'):
    """Share of ``ref_gender`` samples per class.

    Parameters:
        y: labels, one per sample; either class indices or numpy arrays
            (multi-hot labels, delegated to ``gender_ratio_multi_class``).
        gender: gender per sample, aligned with ``y``.
        ref_gender: the gender whose share is computed.

    Returns:
        List with one ratio per class (classes 0..max(y) for class indices).

    Raises:
        ZeroDivisionError: if a class has no sample.
    """
    assert len(y) == len(gender)

    # one hot encoded label
    if type(y[0]) == np.ndarray:
        return gender_ratio_multi_class(y, gender, ref_gender)

    # just one label per sample
    ratios = []
    for c in range(max(y)+1):
        members = [g for label, g in zip(y, gender) if label == c]
        ratios.append(members.count(ref_gender)/len(members))
    return ratios

def gender_ratio_multi_class(y, gender, ref_gender='F'):
    """Share of ``ref_gender`` samples per class for multi-hot labels.

    Parameters:
        y: sequence of label vectors; entry c of a vector is 1 if the sample has class c.
        gender: gender per sample, aligned with ``y``.
        ref_gender: the gender whose share is computed.

    Returns:
        List with one ratio per class (len(y[0]) entries).

    Raises:
        ZeroDivisionError: if a class has no sample.
    """
    ratios = []
    for c in range(len(y[0])):
        members = [gender[i] for i in range(len(gender)) if y[i][c] == 1]
        ratios.append(members.count(ref_gender)/len(members))
    return ratios

In [None]:
def print_results(res):
    """Print the mean and standard deviation of the evaluation scores.

    Parameters:
        res: tuple of seven (mean, std) pairs in the order F1, accuracy, GAP,
            GAP per class, gender ratio, precision, recall (as returned by
            ``train_and_evaluate``). The gender ratio is not printed.
    """
    (f1_mean, f1_std), (acc_mean, acc_std), (gap_mean, gap_std), (class_gap_mean, class_gap_std), (_ratio_mean, _ratio_std), (prec_mean, prec_std), (rec_mean, rec_std) = res

    report = [
        ("F1: ", f1_mean, f1_std),
        ("Precision: ", prec_mean, prec_std),
        ("Recall: ", rec_mean, rec_std),
        ("Acc: ", acc_mean, acc_std),
        ("GAP: ", gap_mean, gap_std),
        ("GAP per class: ", class_gap_mean, class_gap_std),
    ]
    for label, mean, std in report:
        print(label, mean, "+/-", std)

def _missing_gender_classes(y_fold, gender_fold, n_classes, multi_label):
    """Return the class ids of one fold that lack either 'M' or 'F' samples."""
    missing = []
    for c in range(n_classes):
        if multi_label:
            gender_sel = [gender_fold[k] for k in range(len(y_fold)) if y_fold[k][c] == 1]
        else:
            gender_sel = [gender_fold[k] for k in range(len(y_fold)) if y_fold[k] == c]
        if 'M' not in gender_sel or 'F' not in gender_sel:
            missing.append(c)
    return missing


def _save_split(data_file, X_train, X_test, y_train, y_test, gender_train, gender_test):
    """Pickle the train/test split of one fold to data_file."""
    with open(data_file, 'wb') as handle:
        pickle.dump({'X_train': X_train, 'X_test': X_test, 'y_train': y_train, 'y_test': y_test, 'gender_train': gender_train, 'gender_test': gender_test}, handle)


def _predict_fold(bert, X_test, y_test, gender_test, multi_label):
    """Predict one test fold; returns (y_pred, X_test, y_test, gender_test), all aligned with each other."""
    if multi_label:
        # the multi-label model returns the sample indices in the order of its predictions,
        # so texts, labels and genders are brought into that same order
        y_pred, y_idx = bert.predict(X_test, np.arange(len(y_test)))
        X_test = [X_test[k] for k in y_idx]
        y_test = [y_test[k] for k in y_idx]
        gender_test = [gender_test[k] for k in y_idx]

        y_pred = np.array(y_pred) >= 0.5
    else:
        pred = bert.predict(X_test)
        y_pred = np.argmax(pred, axis=1)
    return y_pred, X_test, y_test, gender_test


def train_and_evaluate(init_model, X, y, gender, save_dir, multi_label=False, random_state=None):
    """Cross-validate a classifier and report performance and gender-gap scores.

    The data is shuffled and split into ITERATIONS folds; every fold is used
    once as test set. Per fold the split, the trained model and the
    predictions are stored below ``save_dir + str(fold)`` and reused on later
    calls: an existing split is loaded instead of the new one, existing
    predictions are loaded instead of predicting, and an existing model is
    loaded instead of training (the latter two only together with an existing
    split).

    Parameters:
        init_model: model object providing retrain(X, y, epochs=...), save(dir),
            load(dir) and predict(...). For multi_label, predict(X, ids) must
            return (predictions, ids in prediction order).
        X: list of texts.
        y: list of class indices, or of multi-hot label arrays if multi_label.
        gender: list of 'M'/'F' values aligned with X.
        save_dir: directory prefix for the per-fold results (fold number is appended).
        multi_label: evaluate as multi-label task (per-class binary scores).
        random_state: optional seed for the shuffle, to get a reproducible split.

    Returns:
        Tuple (res, cms_M, cms_F): ``res`` holds seven (mean, std) pairs over
        the folds in the order F1, accuracy, GAP, GAP per class, gender ratio,
        precision, recall; ``cms_M``/``cms_F`` are the per-fold confusion
        matrices of the 'M'/'F' test samples (empty for multi_label).

    Raises:
        ValueError: if a test fold lacks 'M' or 'F' samples for some class
            (the affected folds and classes are printed first).
    """
    f1s = []
    precs = []
    recs = []
    accs = []
    gaps = []
    gaps_per_class = []
    gender_ratios = []
    cms_M = []
    cms_F = []

    X, y, gender = shuffle(X, y, gender, random_state=random_state)

    # set up n fold data
    n_folds = ITERATIONS
    X_folds = np.array_split(X, n_folds)
    y_folds = np.array_split(y, n_folds)
    gender_folds = np.array_split(gender, n_folds)

    # check that every test set includes all class/gender combinations; the number of classes
    # is taken from the full label set so that a fold missing the highest class is detected too
    n_classes = len(y[0]) if multi_label else max(y)+1
    incomplete = False
    for fold in range(n_folds):
        incomplete_classes = _missing_gender_classes(y_folds[fold], gender_folds[fold], n_classes, multi_label)
        if len(incomplete_classes) > 0:
            print("fold ", fold, " does not contain M or F samples for classes:", incomplete_classes)
            incomplete = True

    if incomplete:
        # fail with a clear message; a bare return would break callers that unpack the result
        raise ValueError("at least one test fold lacks M or F samples for some class (see the messages above)")

    for i in range(n_folds):
        # set up filenames to save the data/model
        fold_dir = save_dir+str(i)
        model_dir = fold_dir+'/model/'
        pred_file = fold_dir+'/predictions.pickle'
        data_file = fold_dir+'/data.pickle'

        # get pretrained model
        # NOTE(review): the same model object is used for every fold - confirm that retrain()
        # starts from the pretrained weights; otherwise later folds continue from earlier ones
        bert = init_model

        if not os.path.isdir(fold_dir):
            os.makedirs(fold_dir)

        split_cached = os.path.isfile(data_file)
        if split_cached:
            # NOTE(review): multi-label splits cached before the gender_test reordering fix
            # may contain genders that are not aligned with y_test - delete them to recompute
            with open(data_file, 'rb') as handle:
                data_split = pickle.load(handle)
            X_train = data_split['X_train']
            X_test = data_split['X_test']
            y_train = data_split['y_train']
            y_test = data_split['y_test']
            gender_train = data_split['gender_train']
            gender_test = data_split['gender_test']
        else:
            # training data: all folds except fold i (folds may differ in size, so they are
            # selected as a list and concatenated instead of being stacked into one array)
            X_train = np.concatenate([f for j, f in enumerate(X_folds) if j != i]).tolist()
            y_train = np.concatenate([f for j, f in enumerate(y_folds) if j != i]).tolist()
            gender_train = np.concatenate([f for j, f in enumerate(gender_folds) if j != i]).tolist()

            # testing data: fold i
            X_test = X_folds[i].tolist()
            y_test = y_folds[i].tolist()
            gender_test = gender_folds[i].tolist()

            _save_split(data_file, X_train, X_test, y_train, y_test, gender_train, gender_test)

        # load existing predictions if possible (only trusted together with a saved split)
        if split_cached and os.path.isfile(pred_file):
            print('Loading predictions pickle: ', pred_file)
            with open(pred_file, 'rb') as handle:
                y_pred = pickle.load(handle)
        else:
            # train/ load model; if the split wasn't saved, we need to train again
            if split_cached and os.path.isdir(model_dir):
                print('Loading model from: ', model_dir)
                bert.load(model_dir)
            else:
                print('Retrain model...')
                bert.retrain(X_train, y_train, epochs=EPOCHS)
                print('Save model at: ', model_dir)
                bert.save(model_dir)

            # predict
            y_pred, X_test, y_test, gender_test = _predict_fold(bert, X_test, y_test, gender_test, multi_label)
            if multi_label:
                # save the test data in the order of the predictions
                _save_split(data_file, X_train, X_test, y_train, y_test, gender_train, gender_test)

            with open(pred_file, 'wb') as handle:
                pickle.dump(y_pred, handle)

        if multi_label:
            y_true = np.asarray(y_test)

            # per-class binary scores, weighted by the share of positive labels of each class
            class_weights = (np.sum(y_test, axis=0)/np.sum(y_test))
            f1s.append(np.sum([f1_score(y_true[:,c], y_pred[:,c], average='macro')*class_weights[c] for c in range(len(y_test[0]))]))
            precs.append(np.sum([precision_score(y_true[:,c], y_pred[:,c], average='macro')*class_weights[c] for c in range(len(y_test[0]))]))
            recs.append(np.sum([recall_score(y_true[:,c], y_pred[:,c], average='macro')*class_weights[c] for c in range(len(y_test[0]))]))
            accs.append([accuracy_score(y_true[:,c], y_pred[:,c]) for c in range(len(y_test[0]))])
            class_gaps = [GAP_binary(y_true[:,c], y_pred[:,c], gender_test) for c in range(len(y_test[0]))]
            gaps_per_class.append(class_gaps)
            gaps.append(np.mean(np.abs(class_gaps)))

        else:
            # default scores for one-label classification
            f1s.append(f1_score(y_test, y_pred, average='macro'))
            precs.append(precision_score(y_test, y_pred, average='macro'))
            recs.append(recall_score(y_test, y_pred, average='macro'))
            accs.append(accuracy_score(y_test, y_pred))
            class_gaps = GAP_per_class(y_test, y_pred, gender_test)
            gaps.append(np.mean(np.abs(class_gaps)))
            gaps_per_class.append(class_gaps)
            # confusion matrices for M/F samples
            cms_M.append(confusion_matrix([t for k, t in enumerate(y_test) if gender_test[k] == 'M'], [p for k, p in enumerate(y_pred) if gender_test[k] == 'M']))
            cms_F.append(confusion_matrix([t for k, t in enumerate(y_test) if gender_test[k] == 'F'], [p for k, p in enumerate(y_pred) if gender_test[k] == 'F']))

        if multi_label:
            gender_ratios.append(gender_ratio_multi_class(y_train, gender_train))
        else:
            gender_ratios.append(gender_ratio_per_class(y_train, gender_train))

    # (mean, std) over the folds for every score
    res = (
        (np.mean(f1s), np.std(f1s)),
        (np.mean(accs), np.std(accs)),
        (np.mean(gaps), np.std(gaps)),
        (np.mean(gaps_per_class, axis=0), np.std(gaps_per_class, axis=0)),
        (np.mean(gender_ratios, axis=0), np.std(gender_ratios, axis=0)),
        (np.mean(precs, axis=0), np.std(precs, axis=0)),
        (np.mean(recs, axis=0), np.std(recs, axis=0)),
    )
    print_results(res)

    return res, cms_M, cms_F


In [None]:
def plot_confusion_matrix(cms_M, cms_F, classes, filename):
    """Plot the row-normalized mean confusion matrices of the male and female samples.

    Parameters:
        cms_M: confusion matrices of the male samples (one per fold).
        cms_F: confusion matrices of the female samples (one per fold).
        classes: class names, used as tick labels for rows and columns.
        filename: path the figure is saved to.
    """
    mean_cms = [np.mean(cms_M, axis=0), np.mean(cms_F, axis=0)]
    # divide every row by its sum
    for row in range(len(classes)):
        for mean_cm in mean_cms:
            mean_cm[row,:] /= sum(mean_cm[row,:])

    fig, axes = plt.subplots(1,2, figsize=(30,10))
    for ax, mean_cm, title in zip(axes, mean_cms, ('Male', 'Female')):
        sns.heatmap(mean_cm, annot=True, xticklabels=classes, yticklabels=classes, ax=ax, fmt=".2f", annot_kws={'fontsize': 15})
        ax.set_title(title)

    plt.savefig(filename)
    plt.show()

In [None]:
# collects the scores returned by train_and_evaluate, keyed by experiment name
results = {}

## Unsupervised data (raw texts)

In [None]:
# labels are the automatically assigned titles (use_review=False), texts are the raw texts from start_pos on (use_raw=True)
X, y, gender = prepare_data_one_label(data_reviewed, reviewed_classes, use_raw=True, use_review=False)
print(len(X))

bert = BertHuggingface(NUM_CLASSES, batch_size=BATCH_SIZE)
results['unsupervised-raw'], cms_M, cms_F = train_and_evaluate(bert, X, y, gender, 'results/unsupervised_raw/', multi_label=False)

In [None]:
# confusion matrices of the run in the previous cell
plot_confusion_matrix(cms_M, cms_F, reviewed_classes, 'plots/cm_UR.png')

F1:  0.7825470346290064 +/- 0.0

Acc:  0.7929125138427464 +/- 0.0

GAP:  0.04386221480895913 +/- 0.0

GAP per class:  0.02529047354879495 +/- 0.11668768674783181

## Unsupervised data (gender-scrubbed)

In [None]:
# labels are the automatically assigned titles (use_review=False), texts are taken from the 'bio' field (use_raw=False)
X, y, gender = prepare_data_one_label(data_reviewed, reviewed_classes, use_raw=False, use_review=False)
print(len(X))

bert = BertHuggingface(NUM_CLASSES, batch_size=BATCH_SIZE)
results['unsupervised-scrubbed'], cms_M, cms_F = train_and_evaluate(bert, X, y, gender, 'results/unsupervised_scrubbed/', multi_label=False)

In [None]:
# confusion matrices of the run in the previous cell
plot_confusion_matrix(cms_M, cms_F, reviewed_classes, 'plots/cm_US.png')

## Reviewed data (one label, raw texts)

In [None]:
# NOTE: this redefines reviewed_plus_classes (here only '*software architect' is added to the reviewed classes)
reviewed_plus_classes = reviewed_classes + ['*software architect']
# labels are the annotated titles (use_review=True); only valid samples with exactly one of these classes are kept
X, y, gender = prepare_data_one_label(data_reviewed, reviewed_plus_classes, use_raw=True, use_review=True)
print(len(X))

bert = BertHuggingface(len(reviewed_plus_classes), batch_size=BATCH_SIZE)
results['reviewed-single-raw'], cms_M, cms_F = train_and_evaluate(bert, X, y, gender, 'results/reviewed_single_raw/', multi_label=False)

In [None]:
# confusion matrices of the run in the previous cell
plot_confusion_matrix(cms_M, cms_F, reviewed_plus_classes, 'plots/cm_RR.png')

F1:  0.8544357417766737 +/- 0.0
    
Acc:  0.8741652021089632 +/- 1.1102230246251565e-16
    
GAP:  0.03245962385229917 +/- 0.0
    
GAP per class:  -0.018746219284855353 +/- 0.1109400479883512

## Reviewed data (one label, gender-scrubbed)

In [None]:
# NOTE: this redefines reviewed_plus_classes (here only '*software architect' is added to the reviewed classes)
reviewed_plus_classes = reviewed_classes + ['*software architect']
# labels are the annotated titles (use_review=True), texts are taken from the 'bio' field (use_raw=False)
X, y, gender = prepare_data_one_label(data_reviewed, reviewed_plus_classes, use_raw=False, use_review=True)
print(len(X))

bert = BertHuggingface(len(reviewed_plus_classes), batch_size=BATCH_SIZE)
results['reviewed-single-scrubbed'], cms_M, cms_F = train_and_evaluate(bert, X, y, gender, 'results/reviewed_single_scrubbed/', multi_label=False)

In [None]:
# confusion matrices of the run in the previous cell
plot_confusion_matrix(cms_M, cms_F, reviewed_plus_classes, 'plots/cm_RS.png')

F1:  0.8581141473983337 +/- 1.1102230246251565e-16

Acc:  0.8773286467486819 +/- 0.0

GAP:  0.03647492855708645 +/- 0.0

GAP per class:  0.00790687262458947 +/- 0.09756611854625045

## BERT for multi-label classification

In [None]:
class BertDataset(torch.utils.data.Dataset):
    """Dataset over a mapping of equally long tensors (e.g. the output of a tokenizer).

    ``data`` maps field names to tensors whose first dimension is the sample
    dimension and must contain the key 'input_ids', which defines the length.
    The fields are accessed by key, so a plain dict works as well as a
    tokenizer output.
    """

    def __init__(self, data):
        self.data = data

    def __getitem__(self, idx):
        # detached copies, so that changes to a returned item do not affect the stored tensors
        return {key: val[idx].clone().detach() for key, val in self.data.items()}

    def __len__(self):
        return len(self.data['input_ids'])

class BERTClass(torch.nn.Module):
    """BERT encoder followed by dropout and a linear layer emitting one logit per class.

    The raw logits are trained with the module-level ``loss_fn`` (binary
    cross-entropy on logits), so every class is scored independently.

    Args:
        num_classes: Number of output logits.
        batch_size: Batch size used by ``predict`` and ``retrain``.
        bert_model: Hugging Face model name or path for tokenizer and encoder.
    """

    def __init__(self, num_classes, batch_size=8, bert_model='bert-base-uncased'):
        super(BERTClass, self).__init__()
        self.batch_size = batch_size
        self.tokenizer = transformers.BertTokenizer.from_pretrained(bert_model)
        self.model = transformers.BertModel.from_pretrained(bert_model)
        self.l2 = torch.nn.Dropout(0.3)
        # Size the head from the encoder config instead of assuming 768, so
        # checkpoints with a different hidden size work as well.
        self.l3 = torch.nn.Linear(self.model.config.hidden_size, num_classes)

        # Kept as an attribute for backward compatibility; ``retrain`` builds a
        # fresh optimizer on every call.
        self.optimizer = self._build_optimizer()

    def _build_optimizer(self):
        """Create an AdamW optimizer over all parameters (encoder and classification head)."""
        # Passing only the encoder's parameters would leave the linear head at its
        # random initialization. eps / weight_decay mirror the defaults of the
        # deprecated ``transformers.AdamW`` that was used for training before.
        return torch.optim.AdamW(self.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

    @staticmethod
    def _select_device():
        """Return 'cuda' when a GPU is available, otherwise 'cpu'."""
        return 'cuda' if torch.cuda.is_available() else 'cpu'

    def forward(self, input_ids, attention_mask):
        """Return the class logits for a batch of token ids and attention masks."""
        encoder_output = self.model(input_ids, attention_mask=attention_mask)
        # Element 1 of the encoder output is used as the pooled sequence representation.
        pooled_output = encoder_output[1]
        return self.l3(self.l2(pooled_output))

    def save(self, save_dir):
        """Pickle encoder, dropout and head into ``save_dir`` (created if missing)."""
        os.makedirs(save_dir, exist_ok=True)
        # os.path.join yields the same path as plain concatenation when save_dir
        # ends with a separator, and still lands inside save_dir when it does not.
        file = os.path.join(save_dir, "multi_bert.pickle")
        with open(file, 'wb') as handle:
            pickle.dump({'model': self.model, 'l2': self.l2, 'l3': self.l3}, handle)

    def load(self, save_dir):
        """Restore encoder, dropout and head from a pickle written by ``save``."""
        file = os.path.join(save_dir, "multi_bert.pickle")
        # SECURITY: pickle.load executes arbitrary code from the file; only load
        # checkpoints that were produced by this notebook.
        with open(file, 'rb') as handle:
            save_data = pickle.load(handle)
        self.model = save_data['model']
        self.l2 = save_data['l2']
        self.l3 = save_data['l3']

    def to(self, device):
        """Move encoder, dropout and head to ``device`` and return ``self``."""
        self.model.to(device)
        self.l2.to(device)
        self.l3.to(device)
        # nn.Module.to returns the module; do the same so calls can be chained.
        return self

    def predict(self, texts, idx):
        """Compute logits for ``texts``.

        Args:
            texts: List of input strings.
            idx: One integer sample id per text; returned alongside the logits so
                callers can align predictions with their samples.

        Returns:
            Tuple ``(y_pred, indeces)``: a numpy array with one row of logits per
            text and a numpy array with the sample id of each row.
        """
        # prepare dataset
        inputs = self.tokenizer(texts, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
        inputs['sample_ids'] = torch.tensor(idx)
        dataset = BertDataset(inputs)
        # No shuffling needed for inference; rows come back in input order.
        loader = torch.utils.data.DataLoader(dataset, batch_size=self.batch_size, shuffle=False)

        device = self._select_device()
        self.to(device)
        # Switch the whole module to eval mode: the head's dropout layer is not
        # part of self.model and would otherwise keep dropping activations.
        self.eval()

        y_pred = []
        indeces = []
        # Inference only: skip building autograd graphs to save memory.
        with torch.no_grad():
            for batch in tqdm(loader, leave=True):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)

                outputs = self.forward(input_ids, attention_mask=attention_mask)

                y_pred.append(outputs.cpu().numpy())
                indeces.append(batch['sample_ids'].numpy())

        torch.cuda.empty_cache()

        y_pred = np.vstack(y_pred)
        indeces = np.hstack(indeces)

        return y_pred, indeces

    def retrain(self, texts, labels, epochs):
        """Fine-tune encoder and head on ``texts`` / ``labels`` for ``epochs`` epochs.

        Args:
            texts: List of input strings.
            labels: Per-sample target vectors, convertible with ``torch.tensor``;
                presumably multi-hot with one entry per class -- confirm with caller.
            epochs: Number of passes over the data.

        The module is left in eval mode afterwards.
        """
        # prepare dataset
        inputs = self.tokenizer(texts, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
        inputs['labels'] = torch.tensor(labels)
        dataset = BertDataset(inputs)
        loader = torch.utils.data.DataLoader(dataset, batch_size=self.batch_size, shuffle=True)

        device = self._select_device()
        self.to(device)
        # Train mode for the whole module (encoder dropout and head dropout).
        self.train()
        optimizer = self._build_optimizer()

        for epoch in range(epochs):
            loop = tqdm(loader, leave=True)
            for batch in loop:
                # reset gradients accumulated in the previous step
                optimizer.zero_grad()
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                batch_labels = batch['labels'].to(device)

                outputs = self.forward(input_ids, attention_mask=attention_mask)
                loss = loss_fn(outputs, batch_labels)

                loss.backward()
                optimizer.step()

                # print relevant info to progress bar
                loop.set_description(f'Epoch {epoch}')
                loop.set_postfix(loss=loss.item())

        self.eval()
        torch.cuda.empty_cache()
    
def loss_fn(outputs, targets):
    """Mean binary cross-entropy between raw logits and targets.

    Both tensors are cast to float first, so integer 0/1 targets are accepted.
    """
    logits = outputs.float()
    wanted = targets.float()
    # Functional form of BCEWithLogitsLoss with its default 'mean' reduction.
    return torch.nn.functional.binary_cross_entropy_with_logits(logits, wanted)


## Reviewed data (multi-label, raw texts)

In [None]:
# Multi-label experiment on the reviewed data, raw-text variant (use_raw=True).
# Besides '*software architect' the label set also includes the candidate
# classes '*writer' and '*researcher'.
reviewed_plus_classes = reviewed_classes + ['*software architect', '*writer', '*researcher']
X, y, gender = prepare_data_multi_label(data_reviewed, reviewed_plus_classes, use_raw=True)
# Report how many samples were prepared for this variant.
print(len(X))

# Uses the multi-label BERTClass defined above instead of BertHuggingface.
# The second and third return values of train_and_evaluate are discarded here.
bert = BERTClass(len(reviewed_plus_classes), batch_size=BATCH_SIZE)
results['reviewed-multi-raw'], _, _ = train_and_evaluate(bert, X, y, gender, 'results/reviewed_multi_raw/', multi_label=True)

## Reviewed data (multi-label, gender-scrubbed)

In [None]:
# Multi-label experiment on the reviewed data, gender-scrubbed variant
# (use_raw=False), with the same extended label set as the raw multi-label run.
reviewed_plus_classes = reviewed_classes + ['*software architect', '*writer', '*researcher']
X, y, gender = prepare_data_multi_label(data_reviewed, reviewed_plus_classes, use_raw=False)
# Report how many samples were prepared for this variant.
print(len(X))

# New BERTClass instance so nothing carries over from the raw run; the second
# and third return values of train_and_evaluate are discarded here.
bert = BERTClass(len(reviewed_plus_classes), batch_size=BATCH_SIZE)
results['reviewed-multi-scrubbed'], _, _ = train_and_evaluate(bert, X, y, gender, 'results/reviewed_multi_scrubbed/', multi_label=True)

## Visualize results

In [None]:
# Show which experiment variants have an entry in `results` before plotting;
# the plots below iterate over this dict in insertion order.
results.keys()

In [None]:
# Split each run's summary into parallel lists (one entry per run, in the
# insertion order of `results`) that the bar charts below consume.
f1, f1_err = [], []
prec, prec_err = [], []
rec, rec_err = [], []
acc, acc_err = [], []
gap, gap_err = [], []

for res in results.values():
    # Layout of a result entry: F1, accuracy, GAP, per-class GAP, gender ratio,
    # precision, recall -- each stored as a pair. The per-class GAP and gender
    # ratio pairs are not needed for the aggregate charts.
    (f1_mean, f1_std), (acc_mean, acc_std), (gap_mean, gap_std), (_, _), (_, _), (prec_mean, prec_std), (rec_mean, rec_std) = res

    for means, stds, mean_value, std_value in (
        (f1, f1_err, f1_mean, f1_std),
        (prec, prec_err, prec_mean, prec_std),
        (rec, rec_err, rec_mean, rec_std),
        (acc, acc_err, acc_mean, acc_std),
        (gap, gap_err, gap_mean, gap_std),
    ):
        means.append(mean_value)
        stds.append(std_value)

In [None]:
# Accuracy vs. GAP per dataset version: two bars per run, drawn on twin y-axes
# because the two metrics are plotted against separately labelled scales.
x = np.arange(len(f1))
width = 0.25
offset = width/2

fig, ax = plt.subplots(figsize=(20,10))
# Left axis: accuracy bars, shifted half a bar width to the left of each tick.
rects2 = ax.bar(x - offset, acc, width, yerr=acc_err, label="Accuracy", color='tab:blue')
# Right axis sharing the same x positions; used for the GAP bars below.
ax2 = ax.twinx()

# Hand-written tick labels, one per run.
# NOTE(review): assumes `results` holds exactly these six runs in this insertion
# order ('(w)' / '(w/o)' presumably stand for with / without gender indicators) -- confirm.
keys = ['unsupervised \n(w)', 'unsupervised \n(w/o)', 'reviewed-\nsingle (w)', 'reviewed-\nsingle (w/o)', 'reviewed-\nmulti (w)', 'reviewed-\nmulti (w/o)']
ax.set_xticks(x)
ax.set_xticklabels(keys, rotation=45)
ax.set_ylabel('Accuracy', color='tab:blue')
ax.set_title('Accuracy and GAP per dataset versions')

# Right axis: GAP bars, shifted half a bar width to the right of each tick.
rects3 = ax2.bar(x + offset, gap, width, yerr=gap_err, label="GAP", color='tab:green')
ax2.set_ylabel('mean absolute GAP', color='tab:green')

# Fixed accuracy range with some headroom above 1.0.
ax.set_ylim(0.0, 1.19)

# pyplot call: acts on the current axes only (presumably ax2, the most recently
# created one -- confirm if the grid of `ax` should be disabled as well).
plt.grid(False)

plt.savefig('plots/acc_gap.eps', format='eps', bbox_inches = "tight")
plt.show()

In [None]:
# Grouped bar chart of F1, precision and recall per dataset version.
x = np.arange(len(f1))
width = 0.25
offset = width/2

fig, ax = plt.subplots(figsize=(20,10))
# Every metric is drawn with its own standard deviation as error bar
# (precision and recall previously reused the accuracy / GAP deviations).
rects = ax.bar(x - offset, f1, width, yerr=f1_err, label="F1")
rects2 = ax.bar(x + offset, prec, width, yerr=prec_err, label="Precision")
rects3 = ax.bar(x + 3*offset, rec, width, yerr=rec_err, label="Recall")

# One tick per run, labelled with the run's key in `results`.
ax.set_xticks(x)
ax.set_xticklabels(results.keys(), rotation=45)
ax.set_ylabel('Score')
ax.set_title('Performance per dataset versions')
ax.legend()

# Scores live in [0, 1]; leave headroom for the legend.
ax.set_ylim(0.0, 1.19)

plt.savefig('plots/f1_prec_rec.eps', format='eps', bbox_inches = "tight")
plt.show()

In [None]:
# Class-wise GAP scores of all runs whose key does not contain 'scrubbed',
# drawn as one group of bars per class.

width = 0.25    # width of a single bar
n_classes = 13  # upper bound on the number of classes shown per run

fig, ax = plt.subplots(figsize=(18,12))
plt.grid(True)

selected_runs = [(name, stats) for name, stats in results.items() if 'scrubbed' not in name]
for slot, (name, stats) in enumerate(selected_runs):
    # Entry 3 of a result holds the per-class GAP means and standard deviations.
    gap_means, gap_stds = stats[3]
    bar_heights = gap_means[:n_classes]
    print(bar_heights)
    bar_errors = gap_stds[:n_classes]
    positions = np.arange(len(bar_heights)) + width * slot
    # Legend text: the run key with '-raw' spelled out as ' (w)'.
    ax.bar(positions, bar_heights, width, yerr=bar_errors, label=name.replace('-raw', ' (w)'))

# Axis labels, title and one tick per class (shifted by one bar width).
ax.set_ylabel('GAPs')
ax.set_title('Class-wise GAP scores')
ax.set_xticks(np.arange(len(reviewed_plus_classes)) + width)
ax.set_xticklabels(reviewed_plus_classes, rotation=45)
ax.legend()

plt.savefig('plots/class_gaps.eps', format='eps', bbox_inches = "tight")
plt.show()

In [None]:
# Class-wise GAP scores of the gender-scrubbed runs (every run whose key does
# not contain 'raw'), drawn as one group of bars per class.

width = 0.25  # the width of the bars
multiplier = 0

n_classes = 13  # upper bound on the number of classes shown per run

fig, ax = plt.subplots(figsize=(18,12))

for key, res in results.items(): #TODO one plot for each dataset (unsupervised/one-class/multi-class or raw/scrubbed)
    if 'raw' in key:
        continue
    # Only the per-class GAP pair (entry 3 of a result) is used here.
    (_, _), (_, _), (_, _), (mean_gaps_per_class, std_gaps_per_class), (gender_ratio, _), (mean_precision, std_precision), (mean_recall, std_recall) = res
    measurement = mean_gaps_per_class[:n_classes]
    print(measurement)
    errors = std_gaps_per_class[:n_classes]
    x = np.arange(len(measurement))
    # Shift each run's bars by one bar width so the runs sit side by side.
    offset = width * multiplier
    rects = ax.bar(x + offset, measurement, width, yerr=errors, label=key)
    multiplier += 1

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('GAPs')
ax.set_title('Class-wise GAP scores')
ax.set_xticks(np.arange(len(reviewed_plus_classes)) + width)
ax.set_xticklabels(reviewed_plus_classes, rotation=45)
ax.legend()

# NOTE(review): plt.grid() without arguments toggles the current grid state
# rather than switching it on -- confirm this is the intended appearance.
plt.grid()
# Dedicated file name: the following cell also writes 'plots/class_gaps.png',
# which used to overwrite this figure when the notebook is run top to bottom.
plt.savefig('plots/class_gaps_scrubbed.png', bbox_inches = "tight")
plt.show()

In [None]:
# Class-wise GAP scores of all runs whose key does not contain 'single'
# (raw and scrubbed side by side), one group of bars per class.

width = 0.2     # width of a single bar
n_classes = 13  # upper bound on the number of classes shown per run

fig, ax = plt.subplots(figsize=(18,12))

selected_runs = [(name, stats) for name, stats in results.items() if 'single' not in name]
for slot, (name, stats) in enumerate(selected_runs):
    # Entry 3 of a result holds the per-class GAP means and standard deviations.
    gap_means, gap_stds = stats[3]
    bar_heights = gap_means[:n_classes]
    print(bar_heights)
    bar_errors = gap_stds[:n_classes]
    positions = np.arange(len(bar_heights)) + width * slot
    ax.bar(positions, bar_heights, width, yerr=bar_errors, label=name)

# Axis labels, title and one tick per class (shifted by one bar width).
ax.set_ylabel('GAPs')
ax.set_title('Class-wise GAP scores')
ax.set_xticks(np.arange(len(reviewed_plus_classes)) + width)
ax.set_xticklabels(reviewed_plus_classes, rotation=45)
ax.legend()

plt.savefig('plots/class_gaps.png', bbox_inches = "tight")
plt.show()

In [None]:
# Female ratio per occupation for all runs whose key does not contain
# 'scrubbed', one group of bars per class.

width = 0.25    # width of a single bar
n_classes = 13  # upper bound on the number of classes shown per run

fig, ax = plt.subplots(figsize=(18,12))

selected_runs = [(name, stats) for name, stats in results.items() if 'scrubbed' not in name]
for slot, (name, stats) in enumerate(selected_runs):
    # Entry 4 of a result holds the per-class gender ratio means and standard deviations.
    ratio_means, ratio_stds = stats[4]
    bar_heights = ratio_means[:n_classes]
    print(bar_heights)
    bar_errors = ratio_stds[:n_classes]
    positions = np.arange(len(bar_heights)) + width * slot
    ax.bar(positions, bar_heights, width, yerr=bar_errors, label=name)

# Axis labels, title and one tick per class (shifted by one bar width).
ax.set_ylabel('Female Ratio')
ax.set_title('Female Ratio per Occupation')
ax.set_xticks(np.arange(len(reviewed_plus_classes)) + width)
ax.set_xticklabels(reviewed_plus_classes, rotation=45)
ax.legend()

plt.savefig('plots/female_ratio.png', bbox_inches = "tight")
plt.show()

In [None]:
# Relate the per-class GAP to the per-class female ratio: print Pearson's r for
# every run and draw one regression panel per dataset, coloured by text variant.
colors = ['blue', 'green']
gaps, ratios, hues, keys = [], [], [], []

for key in ['unsupervised-', 'reviewed-single-', 'reviewed-multi-']:
    for version in ['raw', 'scrubbed']:
        run_name = key + version
        # Only the means of the per-class GAP and of the gender ratio are needed.
        _, _, _, (mean_gaps_per_class, _), (mean_gender_ratio, _), _, _ = results[run_name]
        n_points = len(mean_gaps_per_class)

        gaps.append(mean_gaps_per_class)
        ratios.append(mean_gender_ratio)
        # 'w' marks the raw texts, 'w/o' the scrubbed ones.
        hues += ['w' if version == 'raw' else 'w/o'] * n_points
        # Dataset name without the trailing '-'.
        keys += [key[:-1]] * n_points

        print(run_name)
        print("R: ", pearsonr(mean_gender_ratio, mean_gaps_per_class))

# Flatten the per-run arrays into one long-format frame for seaborn.
df = pd.DataFrame(data={
    'GAP (female)': np.hstack(gaps),
    'female ratio': np.hstack(ratios),
    'gender ind.': hues,
    'dataset': keys,
})

g = sns.FacetGrid(df, col="dataset", hue='gender ind.', palette=colors, height=6, legend_out=False)
g.map_dataframe(sns.regplot, x='female ratio', y='GAP (female)')
g.add_legend()

plt.savefig('plots/correlation.png', bbox_inches="tight")
plt.show()

In [None]:
# Final check of which experiment variants are present in `results`.
results.keys()