### Setup

In [1]:
import numpy as np
import choix
from scipy.optimize import minimize
import scipy.stats as stats
import matplotlib.pyplot as plt
import random
from matplotlib import colors
import pandas as pd
import seaborn as sns
import pickle
import os
import sys


sys.path.append(os.path.abspath('../../'))
from metrics import compute_acc, compute_weighted_acc
from opt_fair import *
from distribution_utils import safe_kendalltau, to_numpy

In [6]:
# Shell escape: list the GPUs on this machine with their current load and memory use.
!nvidia-smi

Fri Dec 19 12:12:12 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.54.03              Driver Version: 535.54.03    CUDA Version: 12.5     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          Off | 00000000:17:00.0 Off |                    0 |
| N/A   77C    P0             294W / 300W |  60740MiB / 81920MiB |    100%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A100 80GB PCIe          Off | 00000000:31:00.0 Off |  

In [7]:
import os
import torch

# Expose only the GPU with index 1 to this process.
# NOTE(review): this only takes effect if CUDA has not been initialised yet in
# this kernel — confirm no earlier cell/module touches the GPU first.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
backend = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(backend)
print(device)

cuda


In [8]:
# Record the PyTorch build and, when a GPU is usable, the CUDA version reported by torch.
cuda_ok = torch.cuda.is_available()
print(f"Current PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ok}")
if cuda_ok:
    print(f"CUDA version: {torch.version.cuda}")

Current PyTorch version: 2.9.1+cu128
CUDA available: True
CUDA version: 12.8


In [9]:
# Load the two pickled inputs used throughout the notebook.
# NOTE: pickle.load can execute arbitrary code — only load files you trust.
# NOTE(review): the variables keep a `_faceage` suffix although the files are
# the Passage dataset — presumably inherited from another notebook; confirm.
with open("data/PassagePC.pickle", "rb") as pc_file:
    PC_faceage = pickle.load(pc_file)
with open("data/PassageDF.pickle", "rb") as df_file:
    df_faceage = pickle.load(df_file)

In [10]:
# Display the table loaded from data/PassageDF.pickle.
df_faceage

Unnamed: 0,label,score
0,"a star. Our planet, Earth, orbits, or circles,...",1
1,"Adam, We did not have plastic toys. I played w...",1
2,Who said the little owl. Who wants to hunt wit...,1
3,dead leaf. This is a mole. Moles burrow underg...,1
4,ereaddatagradepsenvironcomp.html Environment r...,1
...,...,...
467,work over the summer on any changes they wish ...,12
468,between January and December plunged the Unite...,12
469,into a newly opened bank account. I was amazed...,12
470,"occurring phenomenon, manmade by products are ...",12


In [11]:
import opt_fair

# Comparison list produced by opt_fair's private helper (judging by its name,
# the reviewer information is dropped — TODO confirm against opt_fair).
all_pc_faceage = opt_fair._pc_without_reviewers(PC_faceage)

# Number of rows in the ground-truth table; later cells pass it as the item count.
size = len(df_faceage)
print(size)

# One class id (0) per item.
classes = [0 for _ in range(size)]

472


In [12]:
# Number of entries returned by opt_fair._pc_without_reviewers.
print(len(all_pc_faceage))

11763


### Gradient EM

In [17]:
import random
from grad_em import *

In [18]:
# Evaluate Gradient EM over 10 runs; each run contributes one accuracy,
# weighted accuracy and Kendall's tau value to the lists below.
Grad_accs, Grad_waccs, Grad_taus = [], [], []
lr = 0.01
for sd in range(10):
    # `sd` is forwarded to the wrapper (presumably as the random seed — confirm).
    grad_em = GradientEMWrapper(PC_faceage, lr, sd, device)
    r_est, beta_est = grad_em.run_algorithm()

    r_est_np = to_numpy(r_est)
    gt_scores = to_numpy(df_faceage['score'].tolist())

    # When the estimate is negatively correlated with the ground truth, negate
    # it before computing the metrics.
    current_tau = safe_kendalltau(r_est_np, gt_scores)
    r_est_np = -r_est_np if current_tau < 0 else r_est_np

    grad_acc = compute_acc(df_faceage, 1 * r_est_np, device)
    grad_wacc = compute_weighted_acc(df_faceage, 1 * r_est_np, device)
    grad_tau = safe_kendalltau(r_est_np, gt_scores)

    Grad_accs.append(grad_acc)
    Grad_waccs.append(grad_wacc)
    Grad_taus.append(grad_tau)

100%|██████████████████████████████████| 1000/1000 [00:05<00:00, 168.82it/s]



Reached max_epochs without full convergence.


100%|██████████████████████████████████| 1000/1000 [00:05<00:00, 166.75it/s]



Reached max_epochs without full convergence.


100%|██████████████████████████████████| 1000/1000 [00:07<00:00, 136.61it/s]



Reached max_epochs without full convergence.


100%|██████████████████████████████████| 1000/1000 [00:07<00:00, 134.81it/s]



Reached max_epochs without full convergence.


100%|██████████████████████████████████| 1000/1000 [00:05<00:00, 192.69it/s]



Reached max_epochs without full convergence.


100%|██████████████████████████████████| 1000/1000 [00:04<00:00, 208.08it/s]



Reached max_epochs without full convergence.


100%|██████████████████████████████████| 1000/1000 [00:05<00:00, 189.70it/s]



Reached max_epochs without full convergence.


100%|██████████████████████████████████| 1000/1000 [00:07<00:00, 135.11it/s]



Reached max_epochs without full convergence.


100%|██████████████████████████████████| 1000/1000 [00:06<00:00, 161.32it/s]



Reached max_epochs without full convergence.


100%|██████████████████████████████████| 1000/1000 [00:05<00:00, 172.79it/s]


Reached max_epochs without full convergence.





In [19]:
# Mean ± standard deviation of each metric over the GradEM runs.
print(
    f"GradEM -- Accuracy: {np.mean(Grad_accs)} ± {np.std(Grad_accs)}, "
    f"Weighted Accuracy: {np.mean(Grad_waccs)} ± {np.std(Grad_waccs)}, "
    f"Kendall's Tau: {np.mean(Grad_taus)} ± {np.std(Grad_taus)}"
)

GradEM -- Accuracy: 0.6985240876674652 ± 0.0008998396815844701, Weighted Accuracy: 0.7583310902118683 ± 0.0009670574004366093, Kendall's Tau: 0.3748170366665421 ± 0.0016989014194708297


### PG EM

In [13]:
import random
from pgem import EMWrapper

In [14]:
# Evaluate PG EM over 10 runs, skipping any run whose outputs contain NaN.
max_iter = 100

PGEM_accs, PGEM_waccs, PGEM_taus = [], [], []

for sd in range(10):
    # `sd` is forwarded to the wrapper (presumably as the random seed — confirm).
    pg = EMWrapper(PC_faceage, max_iter, device, sd)
    r_est, beta_est, ll = pg.run_algorithm()

    # A NaN in the scores, the second output or the log-likelihood invalidates
    # the run; it is dropped rather than recorded.
    has_nan = np.isnan(r_est).any() or np.isnan(beta_est).any() or np.isnan(ll)
    if has_nan:
        print("Skipping nan")
        continue

    r_est_np = to_numpy(r_est)
    gt_scores = to_numpy(df_faceage['score'].tolist())

    # When the estimate is negatively correlated with the ground truth, negate
    # it before computing the metrics.
    current_tau = safe_kendalltau(r_est_np, gt_scores)
    r_est_np = -r_est_np if current_tau < 0 else r_est_np

    pgem_acc = compute_acc(df_faceage, 1 * r_est_np, device)
    pgem_wacc = compute_weighted_acc(df_faceage, 1 * r_est_np, device)
    pgem_tau = safe_kendalltau(r_est_np, gt_scores)

    PGEM_accs.append(pgem_acc)
    PGEM_waccs.append(pgem_wacc)
    PGEM_taus.append(pgem_tau)

cuda
cuda


  2%|▊                                      | 2/100 [00:00<00:26,  3.68it/s]

Iter 000: Log-likelihood = -0.454190


 34%|████████████▉                         | 34/100 [00:05<00:10,  6.27it/s]


Converged at iter 34, Log-likelihood change = -4.768372e-07
cuda
cuda


  2%|▊                                      | 2/100 [00:00<00:14,  6.73it/s]

Iter 000: Log-likelihood = -0.453150


 20%|███████▌                              | 20/100 [00:03<00:12,  6.47it/s]


Converged at iter 20, Log-likelihood change = -1.490116e-07
cuda
cuda


  1%|▍                                      | 1/100 [00:00<00:15,  6.55it/s]

Iter 000: Log-likelihood = -0.455186


 12%|████▌                                 | 12/100 [00:01<00:14,  6.10it/s]


Converged at iter 12, Log-likelihood change = 0.000000e+00
cuda
cuda


  2%|▊                                      | 2/100 [00:00<00:15,  6.45it/s]

Iter 000: Log-likelihood = -0.454898


 31%|███████████▊                          | 31/100 [00:04<00:09,  7.49it/s]


Converged at iter 31, Log-likelihood change = 9.834766e-07
cuda
cuda


  1%|▍                                      | 1/100 [00:00<00:10,  9.51it/s]

Iter 000: Log-likelihood = -0.455264


  9%|███▌                                   | 9/100 [00:01<00:12,  7.39it/s]


Converged at iter 9, Log-likelihood change = -2.980232e-08
cuda
cuda


  1%|▍                                      | 1/100 [00:00<00:11,  8.45it/s]

Iter 000: Log-likelihood = -0.454877


 10%|███▊                                  | 10/100 [00:01<00:12,  7.33it/s]


Converged at iter 10, Log-likelihood change = 2.384186e-07
cuda
cuda


  1%|▍                                      | 1/100 [00:00<00:11,  8.32it/s]

Iter 000: Log-likelihood = -0.455003


 14%|█████▎                                | 14/100 [00:01<00:10,  7.98it/s]


Converged at iter 14, Log-likelihood change = -6.556511e-07
cuda
cuda


  2%|▊                                      | 2/100 [00:00<00:17,  5.69it/s]

Iter 000: Log-likelihood = -0.585252


 11%|████▏                                 | 11/100 [00:01<00:11,  7.47it/s]


Converged at iter 11, Log-likelihood change = -1.490116e-07
cuda
cuda


  1%|▍                                      | 1/100 [00:00<00:11,  8.95it/s]

Iter 000: Log-likelihood = -0.453979


 26%|█████████▉                            | 26/100 [00:03<00:09,  8.04it/s]


Converged at iter 26, Log-likelihood change = -4.172325e-07
cuda
cuda


  2%|▊                                      | 2/100 [00:00<00:12,  7.84it/s]

Iter 000: Log-likelihood = -0.454710


 17%|██████▍                               | 17/100 [00:02<00:10,  8.06it/s]

Converged at iter 17, Log-likelihood change = -8.642673e-07





In [15]:
# Per-run accuracies collected by the PGEM loop.
PGEM_accs

[0.6941558718681335,
 0.6948524713516235,
 0.6951048374176025,
 0.6942265629768372,
 0.6951553225517273,
 0.6953774094581604,
 0.6948827505111694,
 0.6935905814170837,
 0.6943678855895996,
 0.694751501083374]

In [16]:
# Mean ± standard deviation of each metric over the PGEM runs that were kept.
print(
    f"PGEM -- Accuracy: {np.mean(PGEM_accs)} ± {np.std(PGEM_accs)}, "
    f"Weighted Accuracy: {np.mean(PGEM_waccs)} ± {np.std(PGEM_waccs)}, "
    f"Kendall's Tau: {np.mean(PGEM_taus)} ± {np.std(PGEM_taus)}"
)

PGEM -- Accuracy: 0.6946465194225311 ± 0.0005217393074741697, Weighted Accuracy: 0.755181896686554 ± 0.0006608944057229961, Kendall's Tau: 0.3674961233890967 ± 0.0009850565225552452


### BT

In [13]:
%%time
bt_scores = choix.opt_pairwise(size, all_pc_faceage, alpha=0, method='Newton-CG', initial_params=None, max_iter=None, tol=1e-05)

CPU times: user 45 s, sys: 2min 16s, total: 3min 1s
Wall time: 7.12 s


In [14]:
# Score the BT estimate against the ground truth.
#
# The ground-truth vector is rebuilt here instead of relying on whichever cell
# last assigned `gt_scores`: that name is reassigned elsewhere in the notebook
# (the FactorBT evaluation derives it from a label-sorted CSV), so stale kernel
# state could silently score against a differently ordered vector, and running
# this section without the EM loops would raise NameError.
gt_scores = to_numpy(df_faceage['score'].tolist())

r_est_np = to_numpy(bt_scores)

# When the estimate is negatively correlated with the ground truth, negate it
# before computing the metrics.
current_tau = safe_kendalltau(r_est_np, gt_scores)
if current_tau < 0:
    r_est_np = -r_est_np

bt_acc = compute_acc(df_faceage, 1*r_est_np, device)
bt_wacc = compute_weighted_acc(df_faceage, 1*r_est_np, device)
bt_tau = safe_kendalltau(r_est_np, gt_scores)

In [15]:
# Report the three BT metrics.
print(
    f"Simple BT -- Accuracy: {bt_acc}, "
    f"Weighted Accuracy: {bt_wacc}, "
    f"Kendall's Tau: {bt_tau} "
)

Simple BT -- Accuracy: 0.6804668307304382, Weighted Accuracy: 0.74333256483078, Kendall's Tau: 0.3408721163582005 


### BARP

In [16]:
# Build the BARP model from PC_faceage with a zero penalty; `classes` is the
# all-zero list created in the setup section.
FaceAge = opt_fair.BARP(
    data=PC_faceage,
    penalty=0,
    classes=classes,
    device=device,
)

In [17]:
# Count the distinct values of the `performer` column in the cleaned crowd labels.
crowd_labels = pd.read_csv("data/passage_cleaned.csv")
num_reviewers = crowd_labels["performer"].nunique()

In [18]:
%%time
annot_bt_temp, annot_bias =  opt_fair._alternate_optim_torch(size, num_reviewers, FaceAge, iters = 100)

100%|█████████████████████████████████████| 100/100 [01:37<00:00,  1.03it/s]

CPU times: user 1min 1s, sys: 36.3 s, total: 1min 38s
Wall time: 1min 38s





In [19]:
# Score the BARP estimate against the ground truth.
#
# The ground-truth vector is rebuilt here instead of relying on whichever cell
# last assigned `gt_scores`: that name is reassigned elsewhere in the notebook
# (the FactorBT evaluation derives it from a label-sorted CSV), so stale kernel
# state could silently score against a differently ordered vector, and running
# this section without the EM loops would raise NameError.
gt_scores = to_numpy(df_faceage['score'].tolist())

r_est_np = to_numpy(annot_bt_temp)

# When the estimate is negatively correlated with the ground truth, negate it
# before computing the metrics.
current_tau = safe_kendalltau(r_est_np, gt_scores)
if current_tau < 0:
    r_est_np = -r_est_np

barp_acc = compute_acc(df_faceage, 1*r_est_np, device)
barp_wacc = compute_weighted_acc(df_faceage, 1*r_est_np, device)
barp_tau = safe_kendalltau(r_est_np, gt_scores)

In [20]:
# Report the three BARP metrics.
print(
    f"BARP -- Accuracy: {barp_acc}, "
    f"Weighted Accuracy: {barp_wacc}, "
    f"Kendall's Tau: {barp_tau} "
)

BARP -- Accuracy: 0.6920762658119202, Weighted Accuracy: 0.7587689757347107, Kendall's Tau: 0.36267695314604337 


### RC

In [21]:
%%time

rc_obj = RankCentrality(device)
A = rc_obj.matrix_of_comparisons(size, all_pc_faceage)
# print("A matrix done")
P = rc_obj.trans_prob(A)
# print("P matrix done")
pi = rc_obj.stationary_dist(P)
rank_centrality_scores = torch.log(pi).cpu().numpy()

CPU times: user 45.1 ms, sys: 35.2 ms, total: 80.3 ms
Wall time: 78.1 ms


In [22]:
# Score the Rank Centrality estimate against the ground truth.
#
# The ground-truth vector is rebuilt here instead of relying on whichever cell
# last assigned `gt_scores`: that name is reassigned elsewhere in the notebook
# (the FactorBT evaluation derives it from a label-sorted CSV), so stale kernel
# state could silently score against a differently ordered vector, and running
# this section without the EM loops would raise NameError.
gt_scores = to_numpy(df_faceage['score'].tolist())

r_est_np = to_numpy(rank_centrality_scores)

# When the estimate is negatively correlated with the ground truth, negate it
# before computing the metrics.
current_tau = safe_kendalltau(r_est_np, gt_scores)
if current_tau < 0:
    r_est_np = -r_est_np

rc_acc = compute_acc(df_faceage, 1*r_est_np, device)
rc_wacc = compute_weighted_acc(df_faceage, 1*r_est_np, device)
rc_tau = safe_kendalltau(r_est_np, gt_scores)

In [23]:
# Report the three Rank Centrality metrics.
print(
    f"RC -- Accuracy: {rc_acc}, "
    f"Weighted Accuracy: {rc_wacc}, "
    f"Kendall's Tau: {rc_tau} "
)

RC -- Accuracy: 0.6521800756454468, Weighted Accuracy: 0.71303790807724, Kendall's Tau: 0.2891796316210329 


### CrowdBT

In [30]:
# Count the distinct values of the `performer` column in the cleaned crowd labels.
crowd_labels = pd.read_csv("data/passage_cleaned.csv")
num_reviewers = crowd_labels["performer"].nunique()

In [31]:
# Show the active device, then build the ground-truth score vector from the
# `score` column of df_faceage.
print(device)
score_list = df_faceage['score'].tolist()
gt_scores = to_numpy(score_list)

cuda


In [33]:
# Evaluate CrowdBT over 10 random seeds; each seed contributes one accuracy,
# weighted accuracy and Kendall's tau value to the lists below.
CrowdBT_accs, CrowdBT_waccs, CrowdBT_taus = [], [], []
K = num_reviewers  # number of distinct `performer` values in the crowd labels
gt_df = df_faceage
for seed in range(10):
    try:
        crowdbt_test = opt_fair.CrowdBT_3_0(data=PC_faceage, penalty=0, device=device, random_seed=seed)
        crowdbt_scores, _ = crowdbt_test.alternate_optim(size, K)
        r_est_np = to_numpy(crowdbt_scores)
        gt_scores = to_numpy(df_faceage['score'].tolist())

        # When the estimate is negatively correlated with the ground truth,
        # negate it before computing the metrics.
        current_tau = safe_kendalltau(r_est_np, gt_scores)
        if current_tau < 0:
            r_est_np = -r_est_np

        crowdbt_acc = compute_acc(df_faceage, 1*r_est_np, device)
        crowdbt_wacc = compute_weighted_acc(df_faceage, 1*r_est_np, device)
        crowdbt_tau = safe_kendalltau(r_est_np, gt_scores)
        CrowdBT_accs.append(crowdbt_acc)
        CrowdBT_waccs.append(crowdbt_wacc)
        CrowdBT_taus.append(crowdbt_tau)
    except Exception as e:
        # The message previously formatted `{N}`, a name never defined in this
        # notebook, so any failure raised NameError inside the handler and hid
        # the real error. The item count here is `size`.
        # NOTE(review): recording a failed seed as 0.0 pulls the reported
        # mean/std down; the PGEM loop skips bad runs instead — consider
        # aligning the two.
        print(f"CrowdBT seed {seed} failed for N={size},K={K} with error {e}; appending zeros")
        CrowdBT_accs.append(0.0)
        CrowdBT_waccs.append(0.0)
        CrowdBT_taus.append(0.0)

100%|██████████████████████| 100/100 [00:00<00:00, 172.99it/s, loss=5.07e+3]
100%|██████████████████████| 100/100 [00:00<00:00, 234.26it/s, loss=5.07e+3]
100%|██████████████████████| 100/100 [00:00<00:00, 264.95it/s, loss=5.06e+3]
100%|██████████████████████| 100/100 [00:00<00:00, 221.34it/s, loss=5.07e+3]
100%|██████████████████████| 100/100 [00:00<00:00, 162.98it/s, loss=5.07e+3]
100%|██████████████████████| 100/100 [00:00<00:00, 131.20it/s, loss=5.06e+3]
100%|██████████████████████| 100/100 [00:00<00:00, 125.61it/s, loss=5.07e+3]
100%|██████████████████████| 100/100 [00:00<00:00, 168.03it/s, loss=5.07e+3]
100%|██████████████████████| 100/100 [00:00<00:00, 167.49it/s, loss=5.07e+3]
100%|██████████████████████| 100/100 [00:00<00:00, 173.20it/s, loss=5.07e+3]


In [34]:
# Mean ± standard deviation of each metric over the CrowdBT seeds.
print(
    f"CrowdBT -- Accuracy: {np.mean(CrowdBT_accs)} ± {np.std(CrowdBT_accs)}, "
    f"Weighted Accuracy: {np.mean(CrowdBT_waccs)} ± {np.std(CrowdBT_waccs)}, "
    f"Kendall's Tau: {np.mean(CrowdBT_taus)} ± {np.std(CrowdBT_taus)}"
)

CrowdBT -- Accuracy: 0.7044227123260498 ± 0.0008876429468347528, Weighted Accuracy: 0.7640507102012635 ± 0.000879096424574269, Kendall's Tau: 0.38595374488838885 ± 0.001675885754137508


### FactorBT

In [24]:
from crowdkit.aggregation import NoisyBradleyTerry

  from .autonotebook import tqdm as notebook_tqdm


In [25]:
def sort_df(df, column_name):
    """Return a new DataFrame with the rows of ``df`` in ascending ``column_name`` order.

    The input frame is left unmodified and the original index labels are kept.
    """
    return df.sort_values(by=column_name, ascending=True)


# Read the cleaned crowd labels, renaming `performer` to `worker` before
# passing the frame to NoisyBradleyTerry.
df = pd.read_csv("data/passage_cleaned.csv").rename(columns={'performer': 'worker'})

# Aggregate with 10 iterations; the result is turned into (label, score) rows.
agg_noisybt = NoisyBradleyTerry(n_iter=10).fit_predict(df)
agg_noisybt_df = sort_df(
    pd.DataFrame(list(agg_noisybt.items()), columns=['label', 'score']),
    'label',
)

# Scores in ascending-label order.
factorbt_scores = list(agg_noisybt_df['score'])

In [26]:
# %%time

# factorbt_test = FactorBT(data = PC_faceage, penalty = 0, classes = classes, device=device)
# factorbt_scores,y,z = factorbt_test.alternate_optim(iters=100)

In [27]:
# Ground truth for this section comes from a CSV and is sorted by `label`,
# the same ordering applied to the aggregated scores.
# NOTE(review): this assumes both frames contain exactly the same labels —
# confirm, otherwise the two vectors are misaligned.
gt_df = sort_df(pd.read_csv("data/gt_df_passage.csv"), 'label')
gt_scores = to_numpy(gt_df['score'].tolist())
r_est_np = to_numpy(factorbt_scores)

# When the estimate is negatively correlated with the ground truth, negate it
# before computing the metrics.
current_tau = safe_kendalltau(r_est_np, gt_scores)
r_est_np = -r_est_np if current_tau < 0 else r_est_np

factorbt_acc = compute_acc(gt_df, 1 * r_est_np, device)
factorbt_wacc = compute_weighted_acc(gt_df, 1 * r_est_np, device)
factorbt_tau = safe_kendalltau(r_est_np, gt_scores)

In [28]:
# Report the three FactorBT metrics.
print(
    f"FactorBT -- Accuracy: {factorbt_acc}, "
    f"Weighted Accuracy: {factorbt_wacc}, "
    f"Kendall's Tau: {factorbt_tau} "
)

FactorBT -- Accuracy: 0.6973661780357361, Weighted Accuracy: 0.74295973777771, Kendall's Tau: 0.3726308644381563 
