### Setup

In [2]:
import numpy as np
import choix
from scipy.optimize import minimize
import scipy.stats as stats
import matplotlib.pyplot as plt
import random
from matplotlib import colors
import pandas as pd
import seaborn as sns
import pickle
import os
import sys

sys.path.append(os.path.abspath('../../'))
from metrics import compute_acc, compute_weighted_acc
from opt_fair import *
from distribution_utils import safe_kendalltau, to_numpy

In [3]:
!nvidia-smi

Mon Dec 22 08:26:14 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.54.03              Driver Version: 535.54.03    CUDA Version: 12.5     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          Off | 00000000:17:00.0 Off |                    0 |
| N/A   71C    P0             286W / 300W |  50814MiB / 81920MiB |    100%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A100 80GB PCIe          Off | 00000000:31:00.0 Off |  

In [4]:
# Choose the GPU *before* importing torch: CUDA_VISIBLE_DEVICES is only read when
# the CUDA runtime initialises, so setting it first guarantees the mask can apply.
# NOTE(review): modules imported in the first cell may already have imported torch;
# if any of them touched CUDA, this mask is ignored — confirm on a fresh kernel.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

import torch

# Fall back to the CPU when no CUDA device is usable.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [5]:
# Report the installed PyTorch build and, when CUDA is usable, the CUDA version
# that build was compiled against.
has_cuda = torch.cuda.is_available()
print(f"Current PyTorch version: {torch.__version__}")
print(f"CUDA available: {has_cuda}")
if has_cuda:
    print(f"CUDA version: {torch.version.cuda}")

Current PyTorch version: 2.9.1+cu128
CUDA available: True
CUDA version: 12.8


In [6]:
# Load the two pickled inputs: PC_faceage (the comparison data handed to every
# estimator below) and df_faceage (the table whose `score` column is the ground truth).
# NOTE(review): pickle.load executes arbitrary code — only use files from a trusted source.
with open("data/FaceAgePC.pickle", 'rb') as pc_handle:
    PC_faceage = pickle.load(pc_handle)
with open("data/FaceAgeDF.pickle", 'rb') as df_handle:
    df_faceage = pickle.load(df_handle)

In [7]:
# Preview the loaded table (columns shown: full_path, score, gender).
df_faceage

Unnamed: 0,full_path,score,gender
0,nm1442940_rm3965098752_1996-10-3_2006.jpg,10,0.0
1,nm4832920_rm1781768448_2003-8-28_2013.jpg,10,0.0
2,nm0652089_rm860657920_1992-3-10_2002.jpg,10,0.0
3,nm0004917_rm1493730304_1969-5-12_1979.jpg,10,0.0
4,nm1113550_rm1332711936_1996-4-14_2006.jpg,10,0.0
...,...,...,...
9145,475367_1941-08-03_2011.jpg,70,1.0
9146,304085_1919-07-07_1989.jpg,70,1.0
9147,nm0001627_rm4164078592_1927-2-20_1997.jpg,70,1.0
9148,nm0000024_rm1715129344_1904-4-14_1974.jpg,70,1.0


In [8]:
import opt_fair

# Comparison list with the reviewer information removed (going by the helper's
# name — TODO confirm); it is what the choix BT fit and Rank Centrality consume.
all_pc_faceage = opt_fair._pc_without_reviewers(PC_faceage)

# Number of rows in the ground-truth table, used as the item count by the estimators.
size = df_faceage.shape[0]
print(size)

9150


In [9]:
# Number of entries in the reviewer-free comparison list.
print(len(all_pc_faceage))

250249


### Gradient EM

In [9]:
import random
from grad_em import *

In [10]:
# Gradient EM: fit once per seed value and collect accuracy, weighted accuracy
# and Kendall's tau for each run.
Grad_accs, Grad_waccs, Grad_taus = [], [], []
lr = 0.01

# The ground truth does not depend on the run, so convert it once up front
# instead of rebuilding it on every iteration.
gt_scores = to_numpy(df_faceage['score'].tolist())

for sd in range(10):
    # `sd` is the third constructor argument (presumably the random seed — TODO confirm).
    grad_em = GradientEMWrapper(PC_faceage, lr, sd, device)
    r_est, beta_est = grad_em.run_algorithm()
    r_est_np = to_numpy(r_est)

    # Flip the sign when the estimate is anti-correlated with the ground truth,
    # so every run is scored in the same orientation.
    current_tau = safe_kendalltau(r_est_np, gt_scores)
    if current_tau < 0:
        r_est_np = -r_est_np

    grad_acc = compute_acc(df_faceage, 1*r_est_np, device)
    grad_wacc = compute_weighted_acc(df_faceage, 1*r_est_np, device)
    grad_tau = safe_kendalltau(r_est_np, gt_scores)

    Grad_accs.append(grad_acc)
    Grad_waccs.append(grad_wacc)
    Grad_taus.append(grad_tau)

100%|██████████████████████████████████| 1000/1000 [00:07<00:00, 141.91it/s]



Reached max_epochs without full convergence.


100%|██████████████████████████████████| 1000/1000 [00:06<00:00, 153.51it/s]



Reached max_epochs without full convergence.


100%|██████████████████████████████████| 1000/1000 [00:04<00:00, 208.88it/s]



Reached max_epochs without full convergence.


100%|██████████████████████████████████| 1000/1000 [00:04<00:00, 211.46it/s]



Reached max_epochs without full convergence.


100%|██████████████████████████████████| 1000/1000 [00:06<00:00, 152.37it/s]



Reached max_epochs without full convergence.


100%|██████████████████████████████████| 1000/1000 [00:06<00:00, 151.28it/s]



Reached max_epochs without full convergence.


100%|██████████████████████████████████| 1000/1000 [00:07<00:00, 129.31it/s]



Reached max_epochs without full convergence.


100%|██████████████████████████████████| 1000/1000 [00:06<00:00, 149.59it/s]



Reached max_epochs without full convergence.


100%|██████████████████████████████████| 1000/1000 [00:08<00:00, 115.55it/s]



Reached max_epochs without full convergence.


100%|██████████████████████████████████| 1000/1000 [00:06<00:00, 152.87it/s]



Reached max_epochs without full convergence.


In [11]:
# Mean ± standard deviation of each metric across the Gradient EM runs.
print(
    f"GradEM -- Accuracy: {np.mean(Grad_accs)} ± {np.std(Grad_accs)}, "
    f"Weighted Accuracy: {np.mean(Grad_waccs)} ± {np.std(Grad_waccs)}, "
    f"Kendall's Tau: {np.mean(Grad_taus)} ± {np.std(Grad_taus)}"
)

GradEM -- Accuracy: 0.7915597438812256 ± 6.419252992928882e-05, Weighted Accuracy: 0.8731638073921204 ± 6.222083046756577e-05, Kendall's Tau: 0.5783516802614771 ± 0.00012732512702058125


### PG EM

In [9]:
import random
from pgem import EMWrapper

In [10]:
# PG EM: fit once per seed value, skip runs that produced NaNs, and collect
# accuracy, weighted accuracy and Kendall's tau for the rest.
max_iter = 100

PGEM_accs, PGEM_waccs, PGEM_taus = [], [], []

# The ground truth does not depend on the run, so convert it once up front
# instead of rebuilding it on every iteration.
gt_scores = to_numpy(df_faceage['score'].tolist())

for sd in range(10):
    # `sd` is the fourth constructor argument (presumably the random seed — TODO confirm).
    pg = EMWrapper(PC_faceage, max_iter, device, sd)
    r_est, beta_est, ll = pg.run_algorithm()

    # A run with NaN estimates or NaN log-likelihood is left out of the averages.
    if np.isnan(r_est).any() or np.isnan(beta_est).any() or np.isnan(ll):
        print("Skipping nan")
        continue

    r_est_np = to_numpy(r_est)

    # Flip the sign when the estimate is anti-correlated with the ground truth,
    # so every run is scored in the same orientation.
    current_tau = safe_kendalltau(r_est_np, gt_scores)
    if current_tau < 0:
        r_est_np = -r_est_np

    pgem_acc = compute_acc(df_faceage, 1*r_est_np, device)
    pgem_wacc = compute_weighted_acc(df_faceage, 1*r_est_np, device)
    pgem_tau = safe_kendalltau(r_est_np, gt_scores)

    PGEM_accs.append(pgem_acc)
    PGEM_waccs.append(pgem_wacc)
    PGEM_taus.append(pgem_tau)

cuda
cuda


  1%|▍                                      | 1/100 [00:00<00:44,  2.24it/s]

Iter 000: Log-likelihood = -0.357451


 35%|█████████████▎                        | 35/100 [00:10<00:18,  3.43it/s]

Converged at iter 35, Log-likelihood change = 8.046627e-07
cuda





cuda


  1%|▍                                      | 1/100 [00:00<00:11,  8.96it/s]

Iter 000: Log-likelihood = -0.357346


 31%|███████████▊                          | 31/100 [00:08<00:17,  3.86it/s]

Converged at iter 31, Log-likelihood change = 8.940697e-07
cuda





cuda


  1%|▍                                      | 1/100 [00:00<00:32,  3.02it/s]

Iter 000: Log-likelihood = -0.357988


100%|█████████████████████████████████████| 100/100 [00:28<00:00,  3.54it/s]


cuda
cuda


  1%|▍                                      | 1/100 [00:00<00:17,  5.81it/s]

Iter 000: Log-likelihood = -0.357759


 32%|████████████▏                         | 32/100 [00:09<00:21,  3.22it/s]


Converged at iter 32, Log-likelihood change = 0.000000e+00
cuda
cuda


  1%|▍                                      | 1/100 [00:00<00:17,  5.74it/s]

Iter 000: Log-likelihood = -0.357405


100%|█████████████████████████████████████| 100/100 [00:23<00:00,  4.34it/s]


cuda
cuda


  1%|▍                                      | 1/100 [00:00<00:14,  6.68it/s]

Iter 000: Log-likelihood = -0.357600


 32%|████████████▏                         | 32/100 [00:08<00:18,  3.77it/s]

Converged at iter 32, Log-likelihood change = 2.682209e-07
cuda





cuda


  1%|▍                                      | 1/100 [00:00<00:15,  6.27it/s]

Iter 000: Log-likelihood = -0.357491


100%|█████████████████████████████████████| 100/100 [00:17<00:00,  5.58it/s]


cuda
cuda


  2%|▊                                      | 2/100 [00:00<00:09, 10.61it/s]

Iter 000: Log-likelihood = -0.357511


 47%|█████████████████▊                    | 47/100 [00:09<00:11,  4.81it/s]

Converged at iter 47, Log-likelihood change = 7.450581e-07
cuda





cuda


  2%|▊                                      | 2/100 [00:00<00:05, 18.48it/s]

Iter 000: Log-likelihood = -0.357571


 31%|███████████▊                          | 31/100 [00:01<00:02, 29.59it/s]


Converged at iter 31, Log-likelihood change = 9.536743e-07
cuda
cuda


  1%|▍                                      | 1/100 [00:00<00:19,  5.02it/s]

Iter 000: Log-likelihood = -0.357552


 27%|██████████▎                           | 27/100 [00:06<00:16,  4.39it/s]

Converged at iter 27, Log-likelihood change = 2.682209e-07





In [11]:
# Per-run accuracies, shown before they are averaged in the next cell.
PGEM_accs

[0.7917684316635132,
 0.7918282747268677,
 0.7916222214698792,
 0.791897177696228,
 0.7914688587188721,
 0.7918448448181152,
 0.7915946841239929,
 0.7919027805328369,
 0.7918227910995483,
 0.7918305993080139]

In [12]:
# Mean ± standard deviation of each metric across the PG EM runs that were kept.
print(
    f"PGEM -- Accuracy: {np.mean(PGEM_accs)} ± {np.std(PGEM_accs)}, "
    f"Weighted Accuracy: {np.mean(PGEM_waccs)} ± {np.std(PGEM_waccs)}, "
    f"Kendall's Tau: {np.mean(PGEM_taus)} ± {np.std(PGEM_taus)}"
)

PGEM -- Accuracy: 0.7917580664157867 ± 0.00013823902492913592, Weighted Accuracy: 0.8732296645641326 ± 0.00012233771098530712, Kendall's Tau: 0.5787451048521977 ± 0.0002742117532629092


### BT (Bradley–Terry)

In [20]:
%%time
# Fit item scores with choix on the reviewer-free comparisons.
# alpha=0 is the only non-default looking setting (choix's regularisation
# strength); the remaining keyword arguments are spelled out for the record.
bt_scores = choix.opt_pairwise(
    size,
    all_pc_faceage,
    alpha=0,
    method='Newton-CG',
    initial_params=None,
    max_iter=None,
    tol=1e-05,
)

CPU times: user 10min 28s, sys: 3min 16s, total: 13min 44s
Wall time: 1min 56s


In [21]:
# Evaluate the BT scores against the ground truth.
# Build gt_scores here instead of relying on whatever an earlier-run cell left in
# the kernel: the FactorBT section rebinds the same name from data/gt.csv, so a
# stale value would silently score against the wrong array.
gt_scores = to_numpy(df_faceage['score'].tolist())
r_est_np = to_numpy(bt_scores)

# Flip the sign when the estimate is anti-correlated with the ground truth.
current_tau = safe_kendalltau(r_est_np, gt_scores)
if current_tau < 0:
    r_est_np = -r_est_np

bt_acc = compute_acc(df_faceage, 1*r_est_np, device)
bt_wacc = compute_weighted_acc(df_faceage, 1*r_est_np, device)
bt_tau = safe_kendalltau(r_est_np, gt_scores)

In [22]:
# Single-run BT metrics (no seed loop for this estimator).
print(
    f"Simple BT -- Accuracy: {bt_acc}, "
    f"Weighted Accuracy: {bt_wacc}, "
    f"Kendall's Tau: {bt_tau} "
)

Simple BT -- Accuracy: 0.7900686264038086, Weighted Accuracy: 0.8719564080238342, Kendall's Tau: 0.575393885555222 


### BARP

In [10]:
# Build the BARP problem object: comparison data, the gender column as the
# per-item class labels, and a zero penalty.
classes = df_faceage['gender']
FaceAge = opt_fair.BARP(
    data=PC_faceage,
    penalty=0,
    classes=classes,
    device=device,
)

In [11]:
# Count the distinct values of the `performer` column in the raw crowd labels;
# this count is passed to the BARP optimiser as the number of reviewers.
crowd_labels = pd.read_csv('data/crowd_labels.csv')
num_reviewers =  crowd_labels['performer'].nunique()

In [12]:
%%time
# Run the alternating optimisation for 100 iterations. The first return value is
# used as the item scores in the evaluation cell; the second is not used in the
# cells visible here (presumably the per-reviewer bias, going by its name).
annot_bt_temp, annot_bias =  opt_fair._alternate_optim_torch(size, num_reviewers, FaceAge, iters = 100)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [08:03<00:00,  4.83s/it]

CPU times: user 6min 1s, sys: 2min 3s, total: 8min 4s
Wall time: 8min 4s





In [13]:
# Evaluate the BARP item scores against the ground-truth `score` column.
gt_scores = to_numpy(df_faceage['score'].tolist())
r_est_np = to_numpy(annot_bt_temp)

# Flip the sign when the estimate is anti-correlated with the ground truth.
current_tau = safe_kendalltau(r_est_np, gt_scores)
r_est_np = -r_est_np if current_tau < 0 else r_est_np

barp_acc = compute_acc(df_faceage, 1*r_est_np, device)
barp_wacc = compute_weighted_acc(df_faceage, 1*r_est_np, device)
barp_tau = safe_kendalltau(r_est_np, gt_scores)

In [14]:
# Single-run BARP metrics.
print(
    f"BARP -- Accuracy: {barp_acc}, "
    f"Weighted Accuracy: {barp_wacc}, "
    f"Kendall's Tau: {barp_tau} "
)

BARP -- Accuracy: 0.7911810278892517, Weighted Accuracy: 0.8728458285331726, Kendall's Tau: 0.5776003949914079 


### RC (Rank Centrality)

In [28]:
%%time

# Rank Centrality pipeline. RankCentrality is not imported by name anywhere in the
# visible cells, so it presumably arrives through one of the star imports — TODO confirm.
rc_obj = RankCentrality(device)
# Step 1: build A from the reviewer-free comparison list.
A = rc_obj.matrix_of_comparisons(size, all_pc_faceage)
# Step 2: derive P from A (a transition-probability matrix, going by the method name).
P = rc_obj.trans_prob(A)
# Step 3: pi is whatever stationary_dist returns for P; it must be a torch tensor
# because torch.log is applied to it below.
pi = rc_obj.stationary_dist(P)
# Use log(pi) as the item score, copied to host memory as a NumPy array.
rank_centrality_scores = torch.log(pi).cpu().numpy()

CPU times: user 1.07 s, sys: 1.45 s, total: 2.52 s
Wall time: 2.52 s


In [29]:
# Evaluate the Rank Centrality scores against the ground truth.
# Build gt_scores here instead of relying on whatever an earlier-run cell left in
# the kernel: the FactorBT section rebinds the same name from data/gt.csv, so a
# stale value would silently score against the wrong array.
gt_scores = to_numpy(df_faceage['score'].tolist())
r_est_np = to_numpy(rank_centrality_scores)

# Flip the sign when the estimate is anti-correlated with the ground truth.
current_tau = safe_kendalltau(r_est_np, gt_scores)
if current_tau < 0:
    r_est_np = -r_est_np

rc_acc = compute_acc(df_faceage, 1*r_est_np, device)
rc_wacc = compute_weighted_acc(df_faceage, 1*r_est_np, device)
rc_tau = safe_kendalltau(r_est_np, gt_scores)

In [30]:
# Single-run Rank Centrality metrics.
print(
    f"RC -- Accuracy: {rc_acc}, "
    f"Weighted Accuracy: {rc_wacc}, "
    f"Kendall's Tau: {rc_tau} "
)

RC -- Accuracy: 0.7804011106491089, Weighted Accuracy: 0.8644054532051086, Kendall's Tau: 0.5582118883509699 


### CrowdBT

In [9]:
# Same load as in the BARP section (this cell repeats it verbatim): the number of
# distinct `performer` values is used as K, the reviewer count, for CrowdBT below.
crowd_labels = pd.read_csv('data/crowd_labels.csv')
num_reviewers =  crowd_labels['performer'].nunique()

In [10]:
# Show which device the runs will use, then convert the `score` column into the
# array that the estimates are compared against.
print(device)
gt_scores = to_numpy(df_faceage['score'].tolist())

cuda


In [12]:
# CrowdBT: fit once per seed and collect accuracy, weighted accuracy and
# Kendall's tau. A seed that raises is recorded and left OUT of the metric lists;
# appending 0.0 for it would drag the reported mean down and inflate the std.
CrowdBT_accs, CrowdBT_waccs, CrowdBT_taus = [], [], []
CrowdBT_failed_seeds = []
K = num_reviewers
gt_df = df_faceage

# The ground truth does not depend on the seed, so convert it once up front.
gt_scores = to_numpy(df_faceage['score'].tolist())

for seed in range(10):
    try:
        crowdbt_test = opt_fair.CrowdBT_3_0(data=PC_faceage, penalty=0, device=device, random_seed=seed)
        crowdbt_scores, _ = crowdbt_test.alternate_optim(size, K)
        r_est_np = to_numpy(crowdbt_scores)

        # Flip the sign when the estimate is anti-correlated with the ground truth.
        current_tau = safe_kendalltau(r_est_np, gt_scores)
        if current_tau < 0:
            r_est_np = -r_est_np

        crowdbt_acc = compute_acc(df_faceage, 1*r_est_np, device)
        crowdbt_wacc = compute_weighted_acc(df_faceage, 1*r_est_np, device)
        crowdbt_tau = safe_kendalltau(r_est_np, gt_scores)
    except Exception as e:
        # Best effort: keep going with the remaining seeds, but make the failure visible.
        print(f"CrowdBT seed {seed} failed and is excluded from the averages: {e!r}")
        CrowdBT_failed_seeds.append(seed)
    else:
        # Only completed runs contribute, so the three lists always stay aligned.
        CrowdBT_accs.append(crowdbt_acc)
        CrowdBT_waccs.append(crowdbt_wacc)
        CrowdBT_taus.append(crowdbt_tau)

if CrowdBT_failed_seeds:
    print(f"CrowdBT: {len(CrowdBT_failed_seeds)} of 10 seeds failed: {CrowdBT_failed_seeds}")

100%|██████████████████████| 100/100 [00:00<00:00, 146.48it/s, loss=7.06e+4]
100%|██████████████████████| 100/100 [00:00<00:00, 185.77it/s, loss=7.04e+4]
100%|██████████████████████| 100/100 [00:00<00:00, 154.28it/s, loss=7.06e+4]
100%|██████████████████████| 100/100 [00:00<00:00, 123.16it/s, loss=7.04e+4]
100%|██████████████████████| 100/100 [00:00<00:00, 137.62it/s, loss=7.06e+4]
100%|██████████████████████| 100/100 [00:00<00:00, 180.79it/s, loss=7.05e+4]
100%|██████████████████████| 100/100 [00:00<00:00, 158.45it/s, loss=7.04e+4]
100%|██████████████████████| 100/100 [00:00<00:00, 149.24it/s, loss=7.06e+4]
100%|██████████████████████| 100/100 [00:00<00:00, 179.68it/s, loss=7.06e+4]
100%|██████████████████████| 100/100 [00:00<00:00, 171.89it/s, loss=7.05e+4]


In [13]:
# Mean ± standard deviation of each metric across the CrowdBT seeds.
print(
    f"CrowdBT -- Accuracy: {np.mean(CrowdBT_accs)} ± {np.std(CrowdBT_accs)}, "
    f"Weighted Accuracy: {np.mean(CrowdBT_waccs)} ± {np.std(CrowdBT_waccs)}, "
    f"Kendall's Tau: {np.mean(CrowdBT_taus)} ± {np.std(CrowdBT_taus)}"
)

CrowdBT -- Accuracy: 0.7906020522117615 ± 0.00020234240149123216, Weighted Accuracy: 0.8724876284599304 ± 0.00013249322393198598, Kendall's Tau: 0.5764519553075644 ± 0.00040137813726117994


### FactorBT

In [16]:
from crowdkit.aggregation import NoisyBradleyTerry

  from .autonotebook import tqdm as notebook_tqdm


In [18]:
def sort_df(df, column_name):
    """Return a new frame with the rows of ``df`` in ascending ``column_name`` order.

    The input frame is not modified and the original index labels travel with
    their rows.
    """
    return df.sort_values(by=column_name, ascending=True)

# crowd-kit expects the annotator column to be called `worker`, hence the rename
# of `performer` (assumption based on the rename itself — TODO confirm).
df = pd.read_csv('data/crowd_labels.csv').rename(columns={'performer': 'worker'})

# Fit NoisyBradleyTerry for 10 iterations and turn its label -> score result into
# a two-column table ordered by label.
agg_noisybt = NoisyBradleyTerry(n_iter=10).fit_predict(df)
agg_noisybt_df = sort_df(
    pd.DataFrame(list(agg_noisybt.items()), columns=['label', 'score']),
    'label',
)
factorbt_scores = list(agg_noisybt_df['score'])

In [21]:
# Ground truth for FactorBT comes from data/gt.csv, ordered by label so that it
# lines up with the label-sorted FactorBT scores.
# NOTE(review): this rebinds gt_df and gt_scores, which other sections derive from
# df_faceage — cells run after this one see these values instead.
gt_df = sort_df(pd.read_csv("data/gt.csv"), 'label')
gt_scores = to_numpy(gt_df['score'].tolist())
r_est_np = to_numpy(factorbt_scores)

# Flip the sign when the estimate is anti-correlated with the ground truth.
current_tau = safe_kendalltau(r_est_np, gt_scores)
r_est_np = -r_est_np if current_tau < 0 else r_est_np

factorbt_acc = compute_acc(gt_df, 1*r_est_np, device)
factorbt_wacc = compute_weighted_acc(gt_df, 1*r_est_np, device)
factorbt_tau = safe_kendalltau(r_est_np, gt_scores)

In [22]:
# Single-run FactorBT metrics.
print(
    f"FactorBT -- Accuracy: {factorbt_acc}, "
    f"Weighted Accuracy: {factorbt_wacc}, "
    f"Kendall's Tau: {factorbt_tau} "
)

FactorBT -- Accuracy: 0.7904868125915527, Weighted Accuracy: 0.8285281658172607, Kendall's Tau: 0.5762237655043003 
