

# *fairlib* Evaluation Tutorial


## 1. Installation

In [1]:
# Use the %pip magic so the package is installed into the environment of the
# running kernel (a bare `!pip` may target a different Python installation).
# The outputs recorded in this notebook were produced with fairlib 0.0.3.
%pip install fairlib

Collecting fairlib
  Downloading fairlib-0.0.3-py3-none-any.whl (63 kB)
[K     |████████████████████████████████| 63 kB 2.0 MB/s 
Collecting pickle5
  Downloading pickle5-0.0.12-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (256 kB)
[K     |████████████████████████████████| 256 kB 29.1 MB/s 
Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 55.2 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 51.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.9 MB/s 
[?25hCollecting PyYAML
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 60.7 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
 

In [2]:
import fairlib

In [3]:
# Download the four .npy files used in this tutorial (each is several hundred MB).
# -nc (no-clobber) skips any file that is already present, so re-running this
# cell neither downloads the data again nor leaves duplicate `*.npy.1` copies.
!mkdir -p data/deepmoji
!wget -nc 'https://storage.googleapis.com/ai2i/nullspace/deepmoji/pos_pos.npy' -P 'data/deepmoji'
!wget -nc 'https://storage.googleapis.com/ai2i/nullspace/deepmoji/pos_neg.npy' -P 'data/deepmoji'
!wget -nc 'https://storage.googleapis.com/ai2i/nullspace/deepmoji/neg_pos.npy' -P 'data/deepmoji'
!wget -nc 'https://storage.googleapis.com/ai2i/nullspace/deepmoji/neg_neg.npy' -P 'data/deepmoji'

--2022-05-07 15:30:11--  https://storage.googleapis.com/ai2i/nullspace/deepmoji/pos_pos.npy
Resolving storage.googleapis.com (storage.googleapis.com)... 108.177.127.128, 172.217.218.128, 142.251.18.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|108.177.127.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 405494864 (387M) [application/octet-stream]
Saving to: ‘data/deepmoji/pos_pos.npy’


2022-05-07 15:30:14 (139 MB/s) - ‘data/deepmoji/pos_pos.npy’ saved [405494864/405494864]

--2022-05-07 15:30:14--  https://storage.googleapis.com/ai2i/nullspace/deepmoji/pos_neg.npy
Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.69.128, 108.177.127.128, 172.217.218.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.69.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 405504080 (387M) [application/octet-stream]
Saving to: ‘data/deepmoji/pos_neg.npy’


2022-05-07 15:30:16 (17

In [4]:
# Seed before the shuffle in read_data_file so the train/dev/test splits are
# repeatable (assumes seed_everything also seeds NumPy's global RNG — TODO confirm).
fairlib.utils.seed_everything(2022)

import numpy as np
import os

def read_data_file(input_file: str, n_train: int = 40000, n_dev: int = 2000, n_test: int = 2000):
    """Load an array from ``input_file``, shuffle it, and split it three ways.

    The loaded array is shuffled in place along its first axis with NumPy's
    global RNG, so seed that RNG beforehand if reproducible splits are needed.

    Args:
        input_file: Path to a ``.npy`` file readable by ``np.load``.
        n_train: Number of leading shuffled rows returned as the training split.
        n_dev: Number of rows after the training rows returned as the dev split.
        n_test: Number of rows after the dev rows returned as the test split.

    Returns:
        A ``(train, dev, test)`` tuple of consecutive, non-overlapping slices
        of the shuffled array. A slice is shorter than requested when the file
        holds fewer than ``n_train + n_dev + n_test`` rows.
    """
    vecs = np.load(input_file)

    np.random.shuffle(vecs)

    # Cumulative boundaries; the defaults reproduce the 40000/42000/44000 cut points.
    dev_end = n_train + n_dev
    test_end = dev_end + n_test
    return vecs[:n_train], vecs[n_train:dev_end], vecs[dev_end:test_end]

# Input and output share the same root: the downloaded files are read from it
# and the per-split copies are written to its train/, dev/ and test/ sub-folders.
in_dir = "data/deepmoji"
out_dir = "data/deepmoji"

os.makedirs(out_dir, exist_ok=True)

for split in ['pos_pos', 'pos_neg', 'neg_pos', 'neg_neg']:
    train, dev, test = read_data_file(f"{in_dir}/{split}.npy")
    for split_dir, data in (('train', train), ('dev', dev), ('test', test)):
        target_dir = f"{out_dir}/{split_dir}"
        os.makedirs(target_dir, exist_ok=True)
        np.save(f"{target_dir}/{split}.npy", data)

## Train a Model

In [5]:
# Settings shared by the experiments in this tutorial.
Shared_options = dict(
    # Name of the dataset; the corresponding dataloader will be used.
    dataset="Moji",

    # Path to the input data.
    data_dir="data/deepmoji",

    # Device for computing; -1 is the CPU.
    device_id=-1,

    # Default path for saving experimental results.
    results_dir="results",

    # Project name, used when saving experimental results.
    project_dir="dev",

    # We focus on the TPR GAP, which corresponds to Equalized Odds for
    # binary classification.
    GAP_metric_name="TPR_GAP",

    # Overall performance is measured as accuracy.
    Performance_metric_name="accuracy",

    # Model selection is based on DTO.
    selection_criterion="DTO",

    # Default directory and file-name prefix for saved checkpoints.
    checkpoint_dir="models",
    checkpoint_name="checkpoint_epoch",

    # Used when loading experimental results
    # (presumably the number of parallel jobs — TODO confirm).
    n_jobs=1,
)

In [6]:
# Reuse the shared dataset and device settings for this run.
args = {key: Shared_options[key] for key in ("dataset", "data_dir", "device_id")}
# Name of the experiment; it is used in the output path.
args["exp_id"] = "vanilla"

# Build the run state from the arguments
options = fairlib.BaseOptions()
state = options.get_state(silence=True, args=args)

fairlib.utils.seed_everything(2022)

# Create the main model from the state
model = fairlib.networks.get_main_model(state)

INFO:root:Unexpected args: ['-f', '/root/.local/share/jupyter/runtime/kernel-3dad1bfd-dd35-40d7-b985-35feaaff967a.json']
INFO:root:Logging to ./results/dev/Moji/vanilla/output.log


2022-05-07 15:30:29 [INFO ]  Base directory is ./results/dev/Moji/vanilla
Loaded data shapes: (99998, 2304), (99998,), (99998,)
Loaded data shapes: (8000, 2304), (8000,), (8000,)
Loaded data shapes: (7998, 2304), (7998,), (7998,)
2022-05-07 15:30:30 [INFO ]  MLP( 
2022-05-07 15:30:30 [INFO ]    (output_layer): Linear(in_features=300, out_features=2, bias=True)
2022-05-07 15:30:30 [INFO ]    (AF): Tanh()
2022-05-07 15:30:30 [INFO ]    (hidden_layers): ModuleList(
2022-05-07 15:30:30 [INFO ]      (0): Linear(in_features=2304, out_features=300, bias=True)
2022-05-07 15:30:30 [INFO ]      (1): Tanh()
2022-05-07 15:30:30 [INFO ]      (2): Linear(in_features=300, out_features=300, bias=True)
2022-05-07 15:30:30 [INFO ]      (3): Tanh()
2022-05-07 15:30:30 [INFO ]    )
2022-05-07 15:30:30 [INFO ]    (criterion): CrossEntropyLoss()
2022-05-07 15:30:30 [INFO ]  )
2022-05-07 15:30:30 [INFO ]  Total number of parameters: 782402 



In [7]:
# Train the model; the log below reports validation and test metrics after every epoch.
model.train_self()

2022-05-07 15:30:38 [INFO ]  Evaluation at Epoch 0
2022-05-07 15:30:38 [INFO ]  Validation accuracy: 72.55	macro_fscore: 72.44	micro_fscore: 72.55	TPR_GAP: 40.07	FPR_GAP: 40.07	PPR_GAP: 39.10	
2022-05-07 15:30:38 [INFO ]  Test accuracy: 71.41	macro_fscore: 71.30	micro_fscore: 71.41	TPR_GAP: 39.01	FPR_GAP: 39.01	PPR_GAP: 37.84	
2022-05-07 15:30:46 [INFO ]  Evaluation at Epoch 1
2022-05-07 15:30:46 [INFO ]  Validation accuracy: 72.36	macro_fscore: 72.32	micro_fscore: 72.36	TPR_GAP: 39.81	FPR_GAP: 39.81	PPR_GAP: 39.27	
2022-05-07 15:30:46 [INFO ]  Test accuracy: 71.01	macro_fscore: 70.98	micro_fscore: 71.01	TPR_GAP: 39.40	FPR_GAP: 39.40	PPR_GAP: 38.64	
2022-05-07 15:30:53 [INFO ]  Epochs since last improvement: 1
2022-05-07 15:30:53 [INFO ]  Evaluation at Epoch 2
2022-05-07 15:30:53 [INFO ]  Validation accuracy: 72.42	macro_fscore: 72.37	micro_fscore: 72.42	TPR_GAP: 40.91	FPR_GAP: 40.91	PPR_GAP: 40.20	
2022-05-07 15:30:53 [INFO ]  Test accuracy: 70.98	macro_fscore: 70.93	micro_fscore: 70.

By default, fairlib prints and saves 6 metrics:
- accuracy, macro F1 score, and micro F1 score, which are the most commonly used metrics for performance evaluation.
- RMS-aggregated TPR, FPR, and PPR GAP scores for fairness assessment.

## Scenario 1: Confusion Matrix Based Metrics

In [8]:
import torch

# Template for the location of a saved checkpoint; the epoch number is
# appended directly to the checkpoint name.
path = "{results_dir}/{project_dir}/{dataset}/{exp_id}/{checkpoint_dir}/{checkpoint_name}{epoch}.pth.tar"

# Path to the first epoch: directory-related fields come from the shared options.
shared_path_fields = {
    field: Shared_options[field]
    for field in ("results_dir", "project_dir", "dataset", "checkpoint_dir", "checkpoint_name")
}
path_vanilla_epoch0 = path.format(exp_id="vanilla", epoch="0", **shared_path_fields)

epoch_results = torch.load(path_vanilla_epoch0)
# Show which items were saved for this epoch
print(epoch_results.keys())

dict_keys(['epoch', 'epochs_since_improvement', 'loss', 'valid_confusion_matrices', 'test_confusion_matrices', 'dev_evaluations', 'test_evaluations'])


*fairlib* saves a confusion matrix for each protected group as well as the overall confusion matrix. These matrices are stored in a dictionary, indexed by the group id.

In [9]:
# Keys of the validation confusion matrices: 'overall' plus one entry per group id.
epoch_results["valid_confusion_matrices"].keys()

dict_keys(['overall', 0, 1])

In [10]:
# Overall validation confusion matrix, i.e. not restricted to a single protected group.
epoch_results["valid_confusion_matrices"]["overall"]

array([[2655, 1345],
       [ 851, 3149]])

In [11]:
from fairlib.src.evaluators.evaluator import confusion_matrix_based_scores

In [12]:
# Derive rates such as ACC, TPR, FPR and PPV from the overall confusion matrix;
# judging by the output below, each metric holds one value per class.
confusion_matrix_based_scores(epoch_results["valid_confusion_matrices"]["overall"])

{'ACC': array([0.7255, 0.7255]),
 'FDR': array([0.24272676, 0.29928794]),
 'FNR': array([0.33625, 0.21275]),
 'FPR': array([0.21275, 0.33625]),
 'NPV': array([0.70071206, 0.75727324]),
 'PPR': array([0.43825, 0.56175]),
 'PPV': array([0.75727324, 0.70071206]),
 'TNR': array([0.78725, 0.66375]),
 'TPR': array([0.66375, 0.78725])}

In [13]:
from fairlib.src.evaluators.evaluator import power_mean

In [14]:
numbers = np.array([1,2,3,4,5])
# Generalized mean aggregation: the power p selects the kind of mean.
[
    power_mean(numbers, p=exponent)
    for exponent in (
        100,   # Max
        2,     # Root Mean Square
        1,     # Arithmetic Mean
        -100,  # Min
    )
]

[5, 3.3166247903554, 3.0, 1]

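Different choices of the power `p` correspond to commonly used fairness aggregations:
- Max Violation (a large positive `p`, i.e. the maximum)
- RMS GAP (`p = 2`, i.e. the root mean square)
- Max Min Fairness (a large negative `p`, i.e. the minimum)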

In [15]:
from fairlib.src.evaluators.evaluator import Aggregation_GAP

In [43]:
# Confusion matrices of the vanilla model's first epoch over the validation dataset
confusion_matrices = epoch_results["valid_confusion_matrices"]

# Binary protected groups, AAE versus SAE
distinct_groups = [0,1]

# Scores for the overall matrix first, followed by one entry per group
all_scores = {
    key: confusion_matrix_based_scores(confusion_matrices[key])
    for key in ["overall", *distinct_groups]
}

In [68]:
Aggregation_GAP(
    metric="TPR",
    all_scores=all_scores,
    distinct_groups=[0,1],

    # Group aggregation: None takes the absolute difference,
    # any other value uses generalized mean aggregation with that power.
    group_agg_power=-10,

    # Class aggregation: RMS by default
    class_agg_power=2,
)

0.2003354978462722