In [3]:
import torch
from pathlib import Path

In [4]:
# Build the path from its components so it resolves on every OS; a single
# backslash-separated literal is only split into directories on Windows.
checkpoint_dir = Path("logs") / "Bios_both" / "both_adv" / "models"

In [5]:
# NOTE: torch.load unpickles the file, which can execute arbitrary code; only
# load checkpoints that come from a trusted source.
# map_location="cpu" lets a checkpoint written during a GPU run be opened on a
# machine without that device.
_epoch_results = torch.load(
    checkpoint_dir / "checkpoint_epoch18.pth.tar",
    map_location="cpu",
)

In [7]:
# List the entries stored in the loaded checkpoint dictionary.
_epoch_results.keys()

dict_keys(['epoch', 'epochs_since_improvement', 'loss', 'dev_predictions', 'test_predictions', 'dev_evaluations', 'test_evaluations'])

In [8]:
# Dev-split metrics saved with the checkpoint (the recorded output shows rms_TPR as nan).
_epoch_results["dev_evaluations"]

{'rms_TPR': nan,
 'accuracy': 0.8031215772179627,
 'macro_fscore': 0.7283969094156552,
 'micro_fscore': 0.8031215772179627}

In [11]:
# Load Bios dataset
import os

import pandas as pd

# The dataset location is machine-specific: set the BIOS_DATA_DIR environment
# variable to point somewhere else. The original absolute path is kept as the
# fallback so existing runs behave exactly as before.
data_dir = Path(
    os.environ.get(
        "BIOS_DATA_DIR",
        r"D:\Project\Minding_Imbalance_in_Discriminator_Training\data\bios",
    )
)

# NOTE: read_pickle unpickles arbitrary objects; only read files from a trusted source.
bios_dev_df = pd.read_pickle(data_dir / "bios_dev_df.pkl")
bios_test_df = pd.read_pickle(data_dir / "bios_test_df.pkl")

In [13]:
def return_y_g(my_df):
    """Return the (profession_class, intersection_class) arrays of a Bios frame.

    Rows whose ``economy_label`` equals ``"Unknown"`` are excluded first.

    Parameters
    ----------
    my_df : pandas.DataFrame
        Frame with ``economy_label``, ``profession_class`` and
        ``intersection_class`` columns.

    Returns
    -------
    tuple of numpy.ndarray
        ``(profession_class values, intersection_class values)`` for the
        rows that were kept, in their original order.
    """
    known_economy = my_df["economy_label"] != "Unknown"
    kept_rows = my_df[known_economy]
    targets = kept_rows["profession_class"].to_numpy()
    groups = kept_rows["intersection_class"].to_numpy()
    return targets, groups

In [15]:
# Target labels (y) and protected-group ids (g) for each split, taken from the
# rows whose economy label is known.
dev_y, dev_g = return_y_g(my_df=bios_dev_df)
test_y, test_g = return_y_g(my_df=bios_test_df)

In [34]:
import pandas as pd
from sklearn import metrics
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils import shuffle
import numpy as np

import json
from collections import Counter

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import numpy as np

from itertools import combinations
from tqdm import tqdm

from collections import defaultdict 

def confusion_matrix_based_scores(cnf):
    """
    Compute per-class rates from a square confusion matrix.

    Implementation adapted from https://stackoverflow.com/a/43331484

    The matrix is read with rows as the true class and columns as the
    predicted class: false negatives come from row sums and false positives
    from column sums. A small constant is added to each of the four counts so
    that no ratio divides by zero when a class has no samples.

    Parameters
    ----------
    cnf : array-like of shape (n_classes, n_classes)
        Confusion matrix of counts.

    Returns
    -------
    dict
        Maps "TPR", "TNR", "PPV", "NPV", "FPR", "FNR", "FDR" and "ACC" to
        arrays holding one value per class.
    """
    cnf = np.asarray(cnf)
    eps = 1e-5

    # Raw (unsmoothed) counts per class.
    diagonal = np.diag(cnf)
    raw_fp = cnf.sum(axis=0) - diagonal
    raw_fn = cnf.sum(axis=1) - diagonal
    # TN must be derived from the raw counts: subtracting the already-smoothed
    # FP/FN/TP would leave TN at (true TN - 2*eps), which is negative when the
    # true TN is zero and pushes TNR/NPV/FPR outside [0, 1].
    raw_tn = cnf.sum() - (raw_fp + raw_fn + diagonal)

    # Smooth every count exactly once. FP, FN and TP are unchanged from the
    # previous behaviour, so TPR, FNR, PPV and FDR keep the same values.
    FP = raw_fp + eps
    FN = raw_fn + eps
    TP = diagonal + eps
    TN = raw_tn + eps

    # Sensitivity, hit rate, recall, or true positive rate
    TPR = TP/(TP+FN)
    # Specificity or true negative rate
    TNR = TN/(TN+FP)
    # Precision or positive predictive value
    PPV = TP/(TP+FP)
    # Negative predictive value
    NPV = TN/(TN+FN)
    # Fall out or false positive rate
    FPR = FP/(FP+TN)
    # False negative rate
    FNR = FN/(TP+FN)
    # False discovery rate
    FDR = FP/(TP+FP)

    # Overall accuracy
    ACC = (TP+TN)/(TP+FP+FN+TN)

    return {
        "TPR":TPR,
        "TNR":TNR,
        "PPV":PPV,
        "NPV":NPV,
        "FPR":FPR,
        "FNR":FNR,
        "FDR":FDR,
        "ACC":ACC
    }

def power_mean(series, p):
    """Generalized (power) mean of ``series`` with exponent ``p``.

    Parameters
    ----------
    series : sequence of numbers
        Values to aggregate. For ``p <= 0`` the values should be strictly
        positive, otherwise the result is inf/nan.
    p : int or float
        Exponent of the mean. ``p > 50`` is treated as the +infinity limit
        (maximum), ``p < -50`` as the -infinity limit (minimum) and ``p == 0``
        as the limit case, the geometric mean.

    Returns
    -------
    number
        ``(mean(series ** p)) ** (1 / p)``, or the limit value described above.
    """
    if p > 50:
        return max(series)
    # The lower cut-off must be negative: comparing against +50 here would
    # send every exponent below 50 to the minimum and leave the actual
    # power-mean branch reachable only for p == 50.
    if p < -50:
        return min(series)

    # Float conversion: numpy refuses integer arrays raised to negative
    # integer powers.
    values = np.asarray(series, dtype=float)
    if p == 0:
        # 1 / p is undefined; the p -> 0 limit of the power mean is the
        # geometric mean.
        return np.exp(np.mean(np.log(values)))
    total = np.mean(np.power(values, p))
    return np.power(total, 1 / p)

def gap_eval_scores(y_pred, y_true, protected_attribute, verbose=True):
    """Evaluate predictions for performance and for per-group TPR gaps.

    Parameters
    ----------
    y_pred : array-like
        Predicted class for every instance.
    y_true : array-like
        Ground-truth class for every instance.
    protected_attribute : array-like
        Protected-group identifier for every instance.
    verbose : bool, default True
        When True, print the overall scores, each group's TPR and the list of
        group TPRs (the original debugging output). Pass False to silence it.

    Returns
    -------
    dict
        "rms_TPR": for each class, the absolute differences between every
        group's TPR and the overall TPR are summed over groups; the value is
        the root mean square of those sums across classes.
        "accuracy", "macro_fscore", "micro_fscore": computed on all instances.
    """
    y_pred = np.asarray(y_pred)
    y_true = np.asarray(y_true)
    protected_attribute = np.asarray(protected_attribute)

    all_scores = {}
    # Overall evaluation.
    # Take the labels actually present in y_true instead of assuming they are
    # exactly 0..K-1; for contiguous integer labels the two are identical.
    distinct_labels = np.unique(y_true)
    overall_confusion_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=distinct_labels)
    all_scores["overall"] = confusion_matrix_based_scores(overall_confusion_matrix)
    if verbose:
        print(all_scores["overall"])

    # Evaluation results for each group.
    # Groups are visited in sorted order and their TPRs appended, so group ids
    # no longer have to be usable as list indices (0..G-1 integers); for such
    # ids the resulting order is the same as before.
    group_TPR = []
    for gid in np.unique(protected_attribute):
        group_identifier = (protected_attribute == gid)
        group_confusion_matrix = confusion_matrix(
            y_true=y_true[group_identifier],
            y_pred=y_pred[group_identifier],
            labels=distinct_labels,
        )
        all_scores[gid] = confusion_matrix_based_scores(group_confusion_matrix)
        if verbose:
            print("\n")
            print(gid)
            print(all_scores[gid]["TPR"])
            print("\n")
        # Save the TPR directly to the list
        group_TPR.append(all_scores[gid]["TPR"])
    if verbose:
        print(group_TPR)
    # Shape (n_classes, n_groups): one column per protected group.
    TPRs = np.stack(group_TPR, axis=1)
    # Calculate GAP between each group's TPR and the overall TPR of the class
    tpr_gaps = TPRs - all_scores["overall"]["TPR"].reshape(-1, 1)
    # Sum over gaps of all protected groups within each class
    tpr_gaps = np.sum(abs(tpr_gaps), axis=1)
    # RMS over classes
    rms_tpr_gaps = np.sqrt(np.mean(tpr_gaps**2))

    accuracy = accuracy_score(y_true, y_pred)
    macro_fscore = f1_score(y_true, y_pred, average="macro")
    micro_fscore = f1_score(y_true, y_pred, average="micro")

    return {
        "rms_TPR" : rms_tpr_gaps,
        "accuracy" : accuracy,
        "macro_fscore" : macro_fscore,
        "micro_fscore" : micro_fscore,
    }

In [35]:
# Re-list the checkpoint entries to locate the stored dev/test predictions.
_epoch_results.keys()

dict_keys(['epoch', 'epochs_since_improvement', 'loss', 'dev_predictions', 'test_predictions', 'dev_evaluations', 'test_evaluations'])

In [36]:
# Recompute the dev-split metrics from the predictions stored in the checkpoint.
gap_eval_scores(
    y_pred=_epoch_results["dev_predictions"],
    y_true=dev_y,
    protected_attribute=dev_g,
)

{'TPR': array([0.7517241 , 0.69102989, 0.86981566, 0.45238096, 0.61176468,
       0.85815598, 0.90450926, 0.83206102, 0.66666659, 0.83962261,
       0.5       , 0.83816012, 0.797235  , 0.75196849, 0.79824559,
       0.5       , 0.53124999, 0.44444447, 0.85625965, 0.89403383,
       0.79999997, 0.84779951, 0.72023809, 0.82352922, 0.68181816,
       0.59536082, 0.60267857, 0.79999985]), 'TNR': array([0.99371011, 0.98977006, 0.98741078, 0.99843635, 0.99917211,
       0.99704115, 0.99735325, 0.99796767, 0.9996335 , 0.99664929,
       0.99945004, 0.97878291, 0.99264363, 0.99464012, 0.99496644,
       0.99770852, 0.99770474, 0.99908349, 0.99301581, 0.96633784,
       0.99443465, 0.96369078, 0.9906238 , 0.9989013 , 0.99302844,
       0.99432248, 0.97811191, 0.99880909]), 'PPV': array([0.6158192 , 0.65615141, 0.85600906, 0.69090902, 0.8524589 ,
       0.79084964, 0.92411922, 0.83206102, 0.87499977, 0.83177567,
       0.79310325, 0.69101123, 0.68650792, 0.8721461 , 0.77118642,
       0.47916668

{'rms_TPR': 0.43560272485196483,
 'accuracy': 0.8031215772179627,
 'macro_fscore': 0.7283969094156552,
 'micro_fscore': 0.8031215772179627}

In [37]:
# Recompute the test-split metrics from the predictions stored in the checkpoint.
gap_eval_scores(
    y_pred=_epoch_results["test_predictions"],
    y_true=test_y,
    protected_attribute=test_g,
)

{'TPR': array([0.74749999, 0.72881355, 0.87671233, 0.55801104, 0.64516128,
       0.84517765, 0.90275048, 0.78644066, 0.61999998, 0.79405519,
       0.47422681, 0.81013561, 0.80115829, 0.75421863, 0.77079481,
       0.60156248, 0.5510204 , 0.70873782, 0.84479717, 0.89304439,
       0.77685949, 0.85270384, 0.70350609, 0.77586202, 0.69094487,
       0.61857923, 0.63963964, 0.70731704]), 'TNR': array([0.99409858, 0.98979479, 0.98851656, 0.99859053, 0.99931244,
       0.99675857, 0.99754034, 0.99833067, 0.99985586, 0.99631118,
       0.99960366, 0.97697542, 0.99319504, 0.99248716, 0.99659465,
       0.9981243 , 0.99783425, 0.99895488, 0.99066922, 0.96894758,
       0.99481127, 0.96480458, 0.99080598, 0.99931494, 0.99458728,
       0.99380012, 0.97898358, 0.99801645]), 'PPV': array([0.64859002, 0.65069356, 0.87483981, 0.72142854, 0.8805031 ,
       0.78909951, 0.93299492, 0.83453235, 0.93939381, 0.78736841,
       0.80701744, 0.65080275, 0.6905158 , 0.83781581, 0.81764705,
       0.59689921

{'rms_TPR': 0.4340048368332264,
 'accuracy': 0.8068291982334566,
 'macro_fscore': 0.7401184913339307,
 'micro_fscore': 0.8068291982334566}