In [4]:
!pip install choix

Collecting choix
  Downloading choix-0.3.6-py3-none-any.whl.metadata (5.6 kB)
Downloading choix-0.3.6-py3-none-any.whl (18 kB)
Installing collected packages: choix
Successfully installed choix-0.3.6


In [5]:
import torch
import torch.optim as optim
import torch.nn.functional as F
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import itertools
import ast
from torch.utils.data import DataLoader, TensorDataset
from torch.amp import GradScaler, autocast
import choix
from scipy.stats import kendalltau, spearmanr, rankdata
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import make_scorer
from sklearn.base import BaseEstimator, TransformerMixin

In [6]:
generator = np.random.default_rng(42)

In [7]:
expert_df = pd.read_csv('/kaggle/input/dataset-final/expert.csv')
amateur_df = pd.read_csv('/kaggle/input/dataset-final/amateur.csv')
spammer_df = pd.read_csv('/kaggle/input/dataset-final/spammer.csv')
malicious_df = pd.read_csv('/kaggle/input/dataset-final/malicious.csv')

In [8]:
n_items = 60
n_initial_samples = 100
n_iterations = 100
n_samples_per_iter = 5

In [6]:
for df in (expert_df, amateur_df, spammer_df, malicious_df):
    df['subset'] = df['subset'].apply(ast.literal_eval)
    df['true rank'] = df['true rank'].apply(ast.literal_eval)
    df['noisy rank'] = df['noisy rank'].apply(ast.literal_eval)

In [7]:
def kendall_tau_score(y_true, y_pred):
    tau, _ = kendalltau(y_true, y_pred)
    return tau

def pl_predict(data):
    return choix.lsr_rankings(n_items=60, data=data)

In [8]:
true_ranking = [35, 21, 22, 58, 37, 43, 42, 41, 24, 26, 3, 30, 31, 18, 12, 6, 49, 0, 38, 27, 11, 53, 57, 14, 32, 23, 10, 8, 25, 39, 47, 54, 59, 44, 2, 52, 46, 13, 20, 45, 28, 1, 33, 40, 5, 51, 15, 50, 9, 55, 56, 7, 19, 29, 36, 48, 34, 17, 16, 4]

## BASELINE MODEL

In [102]:
expert_pl = choix.lsr_rankings(n_items=60, data = expert_df['noisy rank'])
choix.probabilities(range(60), expert_pl)
predicted_ranking_exp = expert_pl.argsort()
print(predicted_ranking_exp)
spearman_corr, p_value = spearmanr(true_ranking, predicted_ranking_exp)
print(f"Spearman: {spearman_corr}")
print(f"p_value: {p_value}")
tau, p_value = kendalltau(true_ranking, predicted_ranking_exp)
tau_distance = 1-tau
print(f"Kendall tau {tau}")
print(f"tau distance {tau_distance}")

[35 21 42 41 22 58 43 30 37 26 24 31 18  3 38  6 57  0 12 49 11 27 10 53
 14 32 23 47 54 25 39 44 59  8 46  2 52 20 45 13 33  1 28  5 40 15 50 51
  9 55 56  7 19 29 36 48 34 17 16  4]
Spearman: 0.32325646012781334
p_value: 0.011758923306073626
Kendall tau 0.2327683615819209
tau distance 0.7672316384180791


In [12]:
amateur_pl = choix.lsr_rankings(n_items=60, data = amateur_df['noisy rank'])
choix.probabilities(range(60), amateur_pl)
predicted_ranking_am = amateur_pl.argsort()
print(predicted_ranking_am)
spearman_corr, p_value = spearmanr(true_ranking, predicted_ranking_am)
print(f"Spearman: {spearman_corr}")
print(f"p_value: {p_value}")
tau, p_value = kendalltau(true_ranking, predicted_ranking_am)
tau_distance = 1-tau
print(f"Kendall tau {tau}")
print(f"tau distance {tau_distance}")

[21 43 22 58 37 35 26 24  3 42 41 30 31 27  6 12 18 11 38  0 49 32 14 57
 23 53 47 10 39  8 25 46  2 44 52 59 54 20 40 45 13 28 33 15  1 51 50  5
 55 48 36 17 19 29  7 56 34  9 16  4]
Spearman: 0.17126979716587942
p_value: 0.1907259127434609
Kendall tau 0.13333333333333333
tau distance 0.8666666666666667


In [13]:
spammer_pl = choix.lsr_rankings(n_items=60, data = spammer_df['noisy rank'])
choix.probabilities(range(60), spammer_pl)
predicted_ranking_spm = spammer_pl.argsort()
print(predicted_ranking_spm)
spearman_corr, p_value = spearmanr(true_ranking, predicted_ranking_spm)
print(f"Spearman: {spearman_corr}")
print(f"p_value: {p_value}")
tau, p_value = kendalltau(true_ranking, predicted_ranking_spm)
tau_distance = 1-tau
print(f"Kendall tau {tau}")
print(f"tau distance {tau_distance}")

[35 22 43 37 24 31 42 18 21 30 41  6 23 26 49  3 57 38 11 14 12 58  0 27
 53 32 25  2 54 39 15 59  1  8 56 51 45 52 50 34 33 46 28 55  7  5 19 10
 44  4 47  9 13 48 17 20 16 29 40 36]
Spearman: -0.041289247013059194
p_value: 0.7541052486127182
Kendall tau -0.02824858757062147
tau distance 1.0282485875706215


In [14]:
malicious_pl = choix.lsr_rankings(n_items=60, data = malicious_df['noisy rank'])
choix.probabilities(range(60), malicious_pl)
predicted_ranking_mal =malicious_pl.argsort()
print(predicted_ranking_mal)
spearman_corr, p_value = spearmanr(true_ranking, predicted_ranking_mal)
print(f"Spearman: {spearman_corr}")
print(f"p_value: {p_value}")
tau, p_value = kendalltau(true_ranking, predicted_ranking_mal)
tau_distance = 1-tau
print(f"Kendall tau {tau}")
print(f"tau distance {tau_distance}")

[ 4 35 56  1 24 37 21 30 22  5  7 26 55 31  9 27 12 41 54 53 43 38 52 18
 51 14 49 50 59 42 39  2 10 11  8 44 23 16 15 32 13 20 46 25 47 48 40 28
 57 45 19  6 17 33  0  3 34 29 36 58]
Spearman: -0.10286190608502364
p_value: 0.43416643837922786
Kendall tau -0.08022598870056497
tau distance 1.080225988700565


## ROPAL

In [10]:
#initialize item preferences
mu = {i: np.random.normal(0, 1) for i in range(n_items)}
#initialize item uncertainty
sigma2 = {i: 1.0 for i in range(n_items)} 
#initialize worker quality
alpha = {w: 1.0 for w in range(100)}

In [11]:
def num_comparisons(k):
    return k * (k - 1) // 2

In [12]:
def update_scores(mu, sigma2, ranking, alpha_w):
    k = len(ranking)
    for i in range (k):
        for j in range (i+1, k):
            o1, o2 = ranking[i], ranking[j]

            #score update
            delta = alpha_w * (mu[o1] - mu[o2])
            mu[o1] += 0.01 * delta
            mu[o2] -= 0.01 * delta

            #variance update
            sigma2[o1] *= 0.99
            sigma2[o2] *= 0.99


In [None]:
def update_worker_quality(alpha, worker_id, ranking):
    k = len(ranking)
    alpha_w = alpha[worker_id]

    alpha_w += 0.01 * (num_comparisons(k) - alpha_w)
    alpha[worker_id] = max(0.1, alpha_w)

In [None]:
T=5 #iterations
for t in range (T):
    print(f"Iteration {t+1}/{T}")
    generator.shuffle(dataset)
    for worker_id, ranking in dataset:
        update_scores(mu, sigma2, ranking, alpha[worker_id])
        update_worker_quality(alpha, worker_id, ranking)

final_ranking = sorted(mu.items(), key=lambda x: x[1], reverse=True)

## ACTIVE LEARNING

# Random sampling

In [19]:
def random_sampling(dataset, n_samples):
    samples = generator.choice(dataset['noisy rank'], n_samples, replace=False)
    return [list(sample) for sample in samples]

In [22]:
random_samples = random_sampling(expert_df, n_items, n_initial_samples)

expert_params = choix.lsr_rankings(n_items=n_items, data=random_samples, alpha=1e-2)
initial_ranking = expert_params.argsort()
spearman_corr, p_value_s = spearmanr(true_ranking, initial_ranking)
tau, p_value_k = kendalltau(true_ranking, initial_ranking)
tau_distance = 1 - tau

print(f"Spearman: {spearman_corr}, p_value: {p_value_s}")
print(f"Kendall tau: {tau}, tau distance: {tau_distance}")

Spearman: 0.17843845512642403, p_value: 0.17254277555486308
Kendall tau: 0.12203389830508474, tau distance: 0.8779661016949153


In [None]:

for i in range(len(expert_df)):
    new_samples = random_sampling(expert_df, n_items, n_samples_per_iter)
    random_samples.extend(new_samples)

    expert_params = choix.lsr_rankings(n_items=n_items, data=random_samples, alpha=1e-2, initial_params=expert_params)
    predicted_ranking_exp = expert_params.argsort()

    spearman_corr, p_value_s = spearmanr(true_ranking, predicted_ranking_exp)
    tau, p_value_k = kendalltau(true_ranking, predicted_ranking_exp)
    tau_distance = 1 - tau
    
    if (i % 100 == 0):
        print(f"\nIteration {i+1}:")
        print(f"Updated model with {len(random_samples)} triplets")
        print(f"Spearman: {spearman_corr}, p_value: {p_value_s}")
        print(f"Kendall tau: {tau}, tau distance: {tau_distance}")

## Uncertainty sampling

In [57]:
initial_samples = random_sampling(expert_df, n_items, n_initial_samples)

expert_params = choix.lsr_rankings(n_items=n_items, data=initial_samples, alpha=1e-2)
initial_ranking = expert_params.argsort()
spearman_corr, p_value_s = spearmanr(true_ranking, initial_ranking)
tau, p_value_k = kendalltau(true_ranking, initial_ranking)
tau_distance = 1 - tau

print(f"Spearman: {spearman_corr}, p_value: {p_value_s}")
print(f"Kendall tau: {tau}, tau distance: {tau_distance}")

Spearman: -0.10519588774659629, p_value: 0.423753776318918
Kendall tau: -0.06779661016949153, tau distance: 1.0677966101694916


In [54]:
def uncertainty_sampling(dataset, n_items, n_samples, model_params):
    model_params = torch.tensor(model_params, dtype=torch.float32, device="cuda")
    samples = torch.tensor(dataset['noisy rank'], dtype=torch.int64, device="cuda")
    probs = torch.softmax(model_params[samples], dim=-1)  
    sample_entropy = -torch.sum(probs * torch.log(probs + 1e-9), dim=-1)
    top_indices = torch.argsort(sample_entropy, descending=True)[:n_samples]
    selected_samples = samples[top_indices].cpu().tolist()

    return selected_samples

In [None]:
for i in range(len(expert_df)):
    new_samples = uncertainty_sampling(expert_df, n_items, n_samples_per_iter, expert_params)
    initial_samples.extend(new_samples)

    expert_params = choix.lsr_rankings(n_items=n_items, data=initial_samples, alpha=1e-2, initial_params=expert_params)
    predicted_ranking_exp = expert_params.argsort()

    spearman_corr, p_value_s = spearmanr(true_ranking, predicted_ranking_exp)
    tau, p_value_k = kendalltau(true_ranking, predicted_ranking_exp)
    tau_distance = 1 - tau

    if (i% 100 ==0):
        print(f"\nIteration {i + 1}:")
        print(f"Updated model with {len(initial_samples)} triplets")
        print(f"Spearman: {spearman_corr}, p_value: {p_value_s}")
        print(f"Kendall tau: {tau}, tau distance: {tau_distance}")

## Similarity sampling

In [60]:
initial_samples = random_sampling(expert_df, n_items, n_initial_samples)

expert_params = choix.lsr_rankings(n_items=n_items, data=initial_samples, alpha=1e-2)
initial_ranking = expert_params.argsort()
spearman_corr, p_value_s = spearmanr(true_ranking, initial_ranking)
tau, p_value_k = kendalltau(true_ranking, initial_ranking)
tau_distance = 1 - tau

print(f"Spearman: {spearman_corr}, p_value: {p_value_s}")
print(f"Kendall tau: {tau}, tau distance: {tau_distance}")

Spearman: 0.04290080577938317, p_value: 0.7448271498166784
Kendall tau: 0.020338983050847456, tau distance: 0.9796610169491525


In [74]:
def similarity_sampling(dataset, n_items, n_samples, model_params):
    similar_samples = []
    predicted_ranking = model_params.argsort()
    for sample in dataset['noisy rank']:
        
        rank_values = [predicted_ranking[item] for item in sample]
        spearman_corr, _ = spearmanr(rankdata(sample), rankdata(rank_values))

        similar_samples.append((sample, abs(spearman_corr)))

    similar_samples.sort(key=lambda x: x[1], reverse = True)
    selected_samples = [sample for sample,_ in similar_samples[:n_samples]]

    return selected_samples

In [79]:

def similarity_sampling_torch(dataset, n_items, n_samples, model_params):

    model_params = torch.tensor(model_params, dtype=torch.float32, device='cuda')
    predicted_ranking = torch.argsort(model_params).float().to('cuda')

    samples = torch.tensor(dataset['noisy rank'], dtype=torch.long, device='cuda')
    sample_ranks = model_params[samples].argsort(dim=-1).float()

    rank_means = torch.mean(predicted_ranking, dim=-1, keepardim=True)
    covariance = ((sample_ranks - rank_means[:, None]) * (sample_ranks - rank_means[:, None])).sum(dim=-1)

    std_true = torch.sqrt(((sample_ranks - rank_means[:, None]) ** 2).sum(dim=-1))
    std_pred = torch.sqrt(((sample_ranks - rank_means[:, None]) ** 2).sum(dim=-1))

    spearman_corrs = covariance / (std_true * std_pred + 1e-6)

    abs_similarities = torch.abs(spearman_corrs)
    selected_indices = torch.argsort(-abs_similarities)[:n_samples]
    selected_samples = samples[selected_indices].cpu().tolist()
    return selected_samples

In [None]:
for i in range(len(expert_df)):
    new_samples = similarity_sampling_torch(expert_df, n_items, n_samples_per_iter, expert_params)
    initial_samples.extend(new_samples)

    expert_params = choix.lsr_rankings(n_items=n_items, data=initial_samples, alpha=1e-2, initial_params=expert_params)
    predicted_ranking_exp = expert_params.argsort()

    spearman_corr, p_value_s = spearmanr(true_ranking, predicted_ranking_exp)
    tau, p_value_k = kendalltau(true_ranking, predicted_ranking_exp)
    tau_distance = 1 - tau

    if (i% 1 ==0):
        print(f"\nIteration {i + 1}:")
        print(f"Updated model with {len(initial_samples)} triplets")
        print(f"Spearman: {spearman_corr}, p_value: {p_value_s}")
        print(f"Kendall tau: {tau}, tau distance: {tau_distance}")

# 1. Diversity sampling
* find most diverse response -> most information not yet seen
* low activation in logits / hidden layers (lack of information)
* difference to uncertainty (conflicting information)
* between pred probs (falcun)

In [131]:
initial_samples = random_sampling(expert_df, n_items, n_initial_samples)

expert_params = choix.lsr_rankings(n_items=n_items, data=initial_samples, alpha=1e-2)
initial_ranking = expert_params.argsort()
spearman_corr, p_value_s = spearmanr(true_ranking, initial_ranking)
tau, p_value_k = kendalltau(true_ranking, initial_ranking)
tau_distance = 1 - tau

print(f"Spearman: {spearman_corr}, p_value: {p_value_s}")
print(f"Kendall tau: {tau}, tau distance: {tau_distance}")

Spearman: -0.06863017504862463, p_value: 0.6023394801599651
Kendall tau: -0.04632768361581921, tau distance: 1.0463276836158193


In [132]:
def spearman_rank_corr(sample_a, sample_b):
    rank_a = torch.argsort(torch.argsort(sample_a))
    rank_b = torch.argsort(torch.argsort(sample_b))

    d = rank_a - rank_b
    n = len(sample_a)

    spearman_corr = 1 - (6 * torch.sum(d ** 2)) / (n * (n ** 2 - 1))
    return spearman_corr

In [135]:
def diversity_sampling(dataset, n_items, n_samples, model_params):

    model_params = torch.tensor(model_params, dtype=torch.float32, device='cuda')
    selected_samples = []
    
    samples = torch.tensor(dataset['noisy rank'], dtype=torch.long, device='cuda')
    predicted_ranking = torch.argsort(model_params)
    
    spearman_distances = []

    for sample in samples:
        # Compute Spearman's rank correlation for this sample with already selected samples
        spearman_corrs = []
        
        # Compare with already selected samples
        for selected_sample in selected_samples:
            spearman_corr = spearman_rank_corr_gpu(sample, selected_sample)
            spearman_corrs.append(spearman_corr)
        
        # Compute mean Spearman correlation with already selected samples
        mean_spearman_corr = torch.mean(torch.tensor(spearman_corrs, device="cuda")) if spearman_corrs else torch.tensor(0.0, device="cuda")
        
        # Add the sample and its mean correlation (distance) to the list
        spearman_distances.append((sample, mean_spearman_corr))
    
    # Sort the samples based on their dissimilarity (lowest correlation means more dissimilar)
    spearman_distances.sort(key=lambda x: x[1], reverse=False)
    
    # Select the n_samples most dissimilar samples
    selected_samples = [sample.cpu().tolist() for sample, _ in spearman_distances[:n_samples]]
    return selected_samples

In [136]:
for i in range(len(expert_df)):
    new_samples = diversity_sampling(expert_df, n_items, n_samples_per_iter, expert_params)
    initial_samples.extend(new_samples)
    expert_params = choix.lsr_rankings(n_items=n_items, data=initial_samples, alpha=1e-2, initial_params=expert_params)
    
    predicted_ranking_exp = expert_params.argsort()

    spearman_corr, p_value_s = spearmanr(true_ranking, predicted_ranking_exp)
    tau, p_value_k = kendalltau(true_ranking, predicted_ranking_exp)
    tau_distance = 1 - tau

    if (i% 1 ==0):
        print(f"\nIteration {i + 1}:")
        print(f"Updated model with {len(initial_samples)} triplets")
        print(f"Spearman: {spearman_corr}, p_value: {p_value_s}")
        print(f"Kendall tau: {tau}, tau distance: {tau_distance}")


Iteration 1:
Updated model with 105 triplets
Spearman: 0.05234787440955822, p_value: 0.691199589107015
Kendall tau: 0.0384180790960452, tau distance: 0.9615819209039548

Iteration 2:
Updated model with 110 triplets
Spearman: 0.3110864128924702, p_value: 0.015548810112890794
Kendall tau: 0.20112994350282487, tau distance: 0.7988700564971751

Iteration 3:
Updated model with 115 triplets
Spearman: 0.013503751041956101, p_value: 0.9184361097133524
Kendall tau: 0.01694915254237288, tau distance: 0.9830508474576272

Iteration 4:
Updated model with 120 triplets
Spearman: 0.019838844123367606, p_value: 0.8804071075043407
Kendall tau: 0.0192090395480226, tau distance: 0.9807909604519774

Iteration 5:
Updated model with 125 triplets
Spearman: 0.13709363712142264, p_value: 0.2962421777778163
Kendall tau: 0.08813559322033898, tau distance: 0.9118644067796611

Iteration 6:
Updated model with 130 triplets
Spearman: 0.055737704918032795, p_value: 0.6723030232283143
Kendall tau: 0.031638418079096044,

KeyboardInterrupt: 