***
Use 10 random seeds, range(20, 30),
to create the data for each type of spammer.

Invoke Rank Centrality (RC) on each seeded dataset.

Get RC accuracy (± std dev), weighted accuracy (wacc), and Kendall's tau.

Save the results in results/<spammer_type>/rc.csv
***

## Device Setup

In [1]:
!nvidia-smi

Sun Dec 21 08:07:38 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.54.03              Driver Version: 535.54.03    CUDA Version: 12.5     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          Off | 00000000:17:00.0 Off |                    0 |
| N/A   51C    P0              66W / 300W |    529MiB / 81920MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A100 80GB PCIe          Off | 00000000:31:00.0 Off |  

In [2]:
import os

# Pin this process to a single physical GPU. CUDA_VISIBLE_DEVICES only takes effect
# if it is set before CUDA is initialised, so it is assigned before torch is imported
# rather than relying on torch deferring CUDA initialisation.
# NOTE(review): the index "3" is specific to the machine this notebook was run on.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

import torch

# Fall back to the CPU when no CUDA device is visible.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [3]:
# Report the PyTorch / CUDA versions used for this run (one line per fact).
version_lines = [
    f"Current PyTorch version: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
]
# The CUDA toolkit version is only reported when a CUDA device is usable.
if torch.cuda.is_available():
    version_lines.append(f"CUDA version: {torch.version.cuda}")
print("\n".join(version_lines))

Current PyTorch version: 2.9.1+cu128
CUDA available: True
CUDA version: 12.8


### Importing PGEM

In [4]:
import sys

import numpy as np

sys.path.insert(0, "../")
sys.path.insert(1, "../../")

from spammer_types import *
from util import *
import opt_fair
from distribution_utils import crowd_bt_dist, logistic_preference_dist, comparisons_to_df, safe_kendalltau, to_numpy
from metrics import compute_acc, compute_weighted_acc
from pgem import EMWrapper

## Passage dataset

### Get the original df of passage dataset

In [5]:
# Location of the cleaned passage comparison CSV, relative to the notebook's working directory.
df_path = "../../real_data/passage/data/passage_cleaned.csv"

In [6]:
import pandas as pd
# Load the passage comparisons from df_path into a DataFrame.
df = pd.read_csv(df_path)
def sort_df(df, column_name, ascending=True):
    """Return a new DataFrame with the rows of ``df`` ordered by ``column_name``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sort; it is not modified.
    column_name : str
        Column (or list of columns) to sort by.
    ascending : bool, default True
        Sort direction. The default keeps the original ascending behaviour.

    Returns
    -------
    pandas.DataFrame
        Sorted copy of ``df``; the original index labels are kept.
    """
    # sort_values uses pandas' default sort kind, so rows that tie on
    # ``column_name`` are not guaranteed to keep their input order.
    return df.sort_values(by=column_name, ascending=ascending)
# Order the rows by 'performer' so that each performer's rows are contiguous.
df = sort_df(df, 'performer')
# Preview the two compared passages, the chosen label and the performer id.
df[['left', 'right', 'label', 'performer']].head()

Unnamed: 0,left,right,label,performer
0,been wicked. They believed that the end of the...,lichen Sect Content Linking Artid A snake coil...,lichen Sect Content Linking Artid A snake coil...,5
20,"school, you noticed that all the clocks read a...",man in the seat with me and the two women acro...,man in the seat with me and the two women acro...,5
19,foreign animals or plants may be taken into th...,adhere to the wall. Using sizing that has been...,foreign animals or plants may be taken into th...,5
18,many pennies did each child get Which computat...,the picture on its cover. Write two or three s...,many pennies did each child get Which computat...,5
17,direction of the Sun. The length in meters and...,"the ground going into the cave. Oh, my he said...","the ground going into the cave. Oh, my he said...",5


In [7]:
# Spammer percentages swept by each experiment loop in this notebook.
percents = [10, 20, 40, 60, 80]
# For a quick smoke test, restrict the sweep to a single value: percents = [10]

In [8]:
import pickle

# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only load this file when it comes from a trusted source.
with open("../../real_data/passage/data/PassageDF.pickle", "rb") as handle:
    df_passage = pickle.load(handle)
# Display the loaded table (the rendered output shows 'label' and 'score' columns).
df_passage

Unnamed: 0,label,score
0,"a star. Our planet, Earth, orbits, or circles,...",1
1,"Adam, We did not have plastic toys. I played w...",1
2,Who said the little owl. Who wants to hunt wit...,1
3,dead leaf. This is a mole. Moles burrow underg...,1
4,ereaddatagradepsenvironcomp.html Environment r...,1
...,...,...
467,work over the summer on any changes they wish ...,12
468,between January and December plunged the Unite...,12
469,into a newly opened bank account. I was amazed...,12
470,"occurring phenomenon, manmade by products are ...",12


In [9]:
# Number of rows in df_passage; the experiment loops pass it to
# RankCentrality.matrix_of_comparisons as the item count.
size = len(df_passage)
print(size)
# One placeholder class (0) per item.
# NOTE(review): `classes` is not referenced by the cells visible in this part of the notebook.
classes = [0] * size
# for faceage it would be classes = df_passage['gender']

472


In [10]:
# Ground-truth table (an alias of df_passage, not a copy); its 'score' column is the
# reference the experiment loops compare against.
gt_df = df_passage

### Addition of Random Guessers

In [11]:
# Experiment label; it selects the results/<spammer_type>/ output folder.
spammer_type = "random"

In [12]:
# Output CSV that receives one summary row per spammer percentage.
csv_file = f"results/{spammer_type}/rc.csv"

In [13]:
import os
# Create the output folder if it is missing; exist_ok=True makes re-running this cell safe.
os.makedirs(f"results/{spammer_type}", exist_ok=True)

In [14]:
import csv

# Start a fresh results file: mode "w" truncates any earlier rc.csv for this
# spammer type, and the experiment loop appends one row per percentage.
# NOTE(review): the columns carry a PGEM_ prefix although this file stores Rank
# Centrality results; kept as-is in case downstream readers rely on these names.
header = [
    "percent",
    "PGEM_acc_mean",
    "PGEM_acc_std",
    "PGEM_wacc_mean",
    "PGEM_wacc_std",
    "PGEM_tau_mean",
    "PGEM_tau_std",
]

with open(csv_file, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(header)

In [15]:
# Rank Centrality (RC) with random spammers: for every spammer percentage, run RC on
# each seeded dataset and append one mean/std summary row to `csv_file`.
# Names such as PC_faceage are kept unchanged because notebook cells share one
# namespace; in this notebook they hold passage data.
seed_range = range(20, 30)

for percent in percents:
    # Per-seed metrics for this percentage; only seeds that finish are appended.
    RC_accs, RC_waccs, RC_taus = [], [], []

    for sd in seed_range:

        # Build the spammer-augmented dataset for this seed.
        # NOTE(review): the second return value (spammer ids) is not used here.
        random_df, spammer_ids = add_random_spammer(df, percent, seed=sd)
        PC_faceage = df_to_pickle(random_df, df_passage)
        K = len(PC_faceage.keys())  # presumably one key per performer -- confirm in df_to_pickle
        print(K)

        try:
            rc_obj = opt_fair.RankCentrality(device)
            all_pc_faceage = opt_fair._pc_without_reviewers(PC_faceage)
            A = rc_obj.matrix_of_comparisons(size, all_pc_faceage)
            P = rc_obj.trans_prob(A)
            pi = rc_obj.stationary_dist(P)
            # Item scores are the log of the stationary distribution.
            rank_centrality_scores = np.log(to_numpy(pi))
            annot_bt_np = to_numpy(rank_centrality_scores)
            if np.isnan(annot_bt_np).any():
                # Report the dropped seed so an average over fewer seeds is visible.
                print(f"RC skipped (NaN scores) | Percent: {percent} | Seed: {sd}")
                continue
            RC_tau = safe_kendalltau(annot_bt_np, gt_df['score'].to_numpy())
            # Flip the sign when the scores are anti-correlated with the ground truth.
            # NOTE(review): this orientation step uses the ground-truth scores.
            if RC_tau < 0:
                annot_bt_np = -annot_bt_np
            RC_acc = compute_acc(gt_df, annot_bt_np, device)
            RC_wacc = compute_weighted_acc(gt_df, annot_bt_np, device)
            RC_tau = safe_kendalltau(annot_bt_np, gt_df['score'].to_numpy())
        except Exception as e:
            # Best effort: a failing seed is reported and left out of the aggregate.
            print(f"RC failed due to {e}")
            continue
        RC_accs.append(RC_acc)
        RC_waccs.append(RC_wacc)
        RC_taus.append(RC_tau)

    n_ok = len(RC_accs)
    if n_ok < len(seed_range):
        print(f"RC | Percent: {percent} | only {n_ok}/{len(seed_range)} seeds produced metrics")

    # Compute every statistic once. np.std uses ddof=0 (population std) by default.
    # With no successful seed, NaN is written explicitly: np.mean([]) would give the
    # same NaN but emit a RuntimeWarning.
    if n_ok:
        acc_mean, acc_std = np.mean(RC_accs), np.std(RC_accs)
        wacc_mean, wacc_std = np.mean(RC_waccs), np.std(RC_waccs)
        tau_mean, tau_std = np.mean(RC_taus), np.std(RC_taus)
    else:
        acc_mean = acc_std = wacc_mean = wacc_std = tau_mean = tau_std = float("nan")

    row = [
        percent,
        acc_mean, acc_std,
        wacc_mean, wacc_std,
        tau_mean, tau_std
    ]
    # Append so the header row and the rows for earlier percentages are preserved.
    with open(csv_file, mode='a', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(row)
    print(
    f"RC | "
    f"Percent: {percent} |"
    f"Acc: {acc_mean:.4f} ± {acc_std:.4f} | "
    f"WAcc: {wacc_mean:.4f} ± {wacc_std:.4f} | "
    f"Tau: {tau_mean:.4f} ± {tau_std:.4f}")

Unique performers: 686
686
Unique performers: 686
686
Unique performers: 686
686
Unique performers: 686
686
Unique performers: 686
686
Unique performers: 686
686
Unique performers: 686
686
Unique performers: 686
686
Unique performers: 686
686
Unique performers: 686
686
RC | Percent: 10 |Acc: 0.6499 ± 0.0015 | WAcc: 0.7105 ± 0.0020 | Tau: 0.2844 ± 0.0028
Unique performers: 748
748
Unique performers: 748
748
Unique performers: 748
748
Unique performers: 748
748
Unique performers: 748
748
Unique performers: 748
748
Unique performers: 748
748
Unique performers: 748
748
Unique performers: 748
748
Unique performers: 748
748
RC | Percent: 20 |Acc: 0.6524 ± 0.0026 | WAcc: 0.7127 ± 0.0034 | Tau: 0.2885 ± 0.0049
Unique performers: 873
873
Unique performers: 873
873
Unique performers: 873
873
Unique performers: 873
873
Unique performers: 873
873
Unique performers: 873
873
Unique performers: 873
873
Unique performers: 873
873
Unique performers: 873
873
Unique performers: 873
873
RC | Percent: 40 |

### Addition of Anti-Personas

In [16]:
# Experiment label; it selects the results/<spammer_type>/ output folder.
spammer_type = "anti"

In [17]:
# Output CSV that receives one summary row per spammer percentage.
csv_file = f"results/{spammer_type}/rc.csv"

In [18]:
import os
# Create the output folder if it is missing; exist_ok=True makes re-running this cell safe.
os.makedirs(f"results/{spammer_type}", exist_ok=True)

In [19]:
import csv

# Start a fresh results file: mode "w" truncates any earlier rc.csv for this
# spammer type, and the experiment loop appends one row per percentage.
# NOTE(review): the columns carry a PGEM_ prefix although this file stores Rank
# Centrality results; kept as-is in case downstream readers rely on these names.
header = [
    "percent",
    "PGEM_acc_mean",
    "PGEM_acc_std",
    "PGEM_wacc_mean",
    "PGEM_wacc_std",
    "PGEM_tau_mean",
    "PGEM_tau_std",
]

with open(csv_file, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(header)

In [20]:
# Rank Centrality (RC) with anti-personas: for every spammer percentage, run RC on
# each seeded dataset and append one mean/std summary row to `csv_file`.
# Names such as random_df and PC_faceage are kept unchanged because notebook cells
# share one namespace; here they hold the anti-persona passage data.
seed_range = range(20, 30)

for percent in percents:
    # Per-seed metrics for this percentage; only seeds that finish are appended.
    RC_accs, RC_waccs, RC_taus = [], [], []

    for sd in seed_range:

        # Build the spammer-augmented dataset for this seed.
        # NOTE(review): the second return value (spammer ids) is not used here.
        random_df, spammer_ids = add_anti_personas(df, percent, seed=sd)
        PC_faceage = df_to_pickle(random_df, df_passage)
        K = len(PC_faceage.keys())  # presumably one key per performer -- confirm in df_to_pickle
        print(K)

        try:
            rc_obj = opt_fair.RankCentrality(device)
            all_pc_faceage = opt_fair._pc_without_reviewers(PC_faceage)
            A = rc_obj.matrix_of_comparisons(size, all_pc_faceage)
            P = rc_obj.trans_prob(A)
            pi = rc_obj.stationary_dist(P)
            # Item scores are the log of the stationary distribution.
            rank_centrality_scores = np.log(to_numpy(pi))
            annot_bt_np = to_numpy(rank_centrality_scores)
            if np.isnan(annot_bt_np).any():
                # Report the dropped seed so an average over fewer seeds is visible.
                print(f"RC skipped (NaN scores) | Percent: {percent} | Seed: {sd}")
                continue
            RC_tau = safe_kendalltau(annot_bt_np, gt_df['score'].to_numpy())
            # Flip the sign when the scores are anti-correlated with the ground truth.
            # NOTE(review): this orientation step uses the ground-truth scores.
            if RC_tau < 0:
                annot_bt_np = -annot_bt_np
            RC_acc = compute_acc(gt_df, annot_bt_np, device)
            RC_wacc = compute_weighted_acc(gt_df, annot_bt_np, device)
            RC_tau = safe_kendalltau(annot_bt_np, gt_df['score'].to_numpy())
        except Exception as e:
            # Best effort: a failing seed is reported and left out of the aggregate.
            print(f"RC failed due to {e}")
            continue
        RC_accs.append(RC_acc)
        RC_waccs.append(RC_wacc)
        RC_taus.append(RC_tau)

    n_ok = len(RC_accs)
    if n_ok < len(seed_range):
        print(f"RC | Percent: {percent} | only {n_ok}/{len(seed_range)} seeds produced metrics")

    # Compute every statistic once. np.std uses ddof=0 (population std) by default.
    # With no successful seed, NaN is written explicitly: np.mean([]) would give the
    # same NaN but emit a RuntimeWarning.
    if n_ok:
        acc_mean, acc_std = np.mean(RC_accs), np.std(RC_accs)
        wacc_mean, wacc_std = np.mean(RC_waccs), np.std(RC_waccs)
        tau_mean, tau_std = np.mean(RC_taus), np.std(RC_taus)
    else:
        acc_mean = acc_std = wacc_mean = wacc_std = tau_mean = tau_std = float("nan")

    row = [
        percent,
        acc_mean, acc_std,
        wacc_mean, wacc_std,
        tau_mean, tau_std
    ]
    # Append so the header row and the rows for earlier percentages are preserved.
    with open(csv_file, mode='a', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(row)
    print(
    f"RC | "
    f"Percent: {percent} |"
    f"Acc: {acc_mean:.4f} ± {acc_std:.4f} | "
    f"WAcc: {wacc_mean:.4f} ± {wacc_std:.4f} | "
    f"Tau: {tau_mean:.4f} ± {tau_std:.4f}")

Unique performers: 686
686
Unique performers: 686
686
Unique performers: 686
686
Unique performers: 686
686
Unique performers: 686
686
Unique performers: 686
686
Unique performers: 686
686
Unique performers: 686
686
Unique performers: 686
686
Unique performers: 686
686
RC | Percent: 10 |Acc: 0.6501 ± 0.0005 | WAcc: 0.7104 ± 0.0005 | Tau: 0.2839 ± 0.0009
Unique performers: 748
748
Unique performers: 748
748
Unique performers: 748
748
Unique performers: 748
748
Unique performers: 748
748
Unique performers: 748
748
Unique performers: 748
748
Unique performers: 748
748
Unique performers: 748
748
Unique performers: 748
748
RC | Percent: 20 |Acc: 0.6521 ± 0.0006 | WAcc: 0.7128 ± 0.0007 | Tau: 0.2874 ± 0.0011
Unique performers: 873
873
Unique performers: 873
873
Unique performers: 873
873
Unique performers: 873
873
Unique performers: 873
873
Unique performers: 873
873
Unique performers: 873
873
Unique performers: 873
873
Unique performers: 873
873
Unique performers: 873
873
RC | Percent: 40 |

### Addition of Left-Position biased Spammers

In [21]:
# Experiment label; it selects the results/<spammer_type>/ output folder.
spammer_type = "left"

In [22]:
# Output CSV that receives one summary row per spammer percentage.
csv_file = f"results/{spammer_type}/rc.csv"

In [23]:
import os
# Create the output folder if it is missing; exist_ok=True makes re-running this cell safe.
os.makedirs(f"results/{spammer_type}", exist_ok=True)

In [24]:
import csv

# Start a fresh results file: mode "w" truncates any earlier rc.csv for this
# spammer type, and the experiment loop appends one row per percentage.
# NOTE(review): the columns carry a PGEM_ prefix although this file stores Rank
# Centrality results; kept as-is in case downstream readers rely on these names.
header = [
    "percent",
    "PGEM_acc_mean",
    "PGEM_acc_std",
    "PGEM_wacc_mean",
    "PGEM_wacc_std",
    "PGEM_tau_mean",
    "PGEM_tau_std",
]

with open(csv_file, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(header)

In [25]:
# Rank Centrality (RC) with left-position-biased spammers: for every spammer
# percentage, run RC on each seeded dataset and append one mean/std summary row
# to `csv_file`.
# Names such as random_df and PC_faceage are kept unchanged because notebook cells
# share one namespace; here they hold the position-biased passage data.
seed_range = range(20, 30)

for percent in percents:
    # Per-seed metrics for this percentage; only seeds that finish are appended.
    RC_accs, RC_waccs, RC_taus = [], [], []

    for sd in seed_range:

        # Build the spammer-augmented dataset for this seed.
        # NOTE(review): the second return value (spammer ids) is not used here.
        random_df, spammer_ids = add_position_biased_spammers(df, percent, position_bias="left", seed=sd)
        PC_faceage = df_to_pickle(random_df, df_passage)
        K = len(PC_faceage.keys())  # presumably one key per performer -- confirm in df_to_pickle
        print(K)

        try:
            rc_obj = opt_fair.RankCentrality(device)
            all_pc_faceage = opt_fair._pc_without_reviewers(PC_faceage)
            A = rc_obj.matrix_of_comparisons(size, all_pc_faceage)
            P = rc_obj.trans_prob(A)
            pi = rc_obj.stationary_dist(P)
            # Item scores are the log of the stationary distribution.
            rank_centrality_scores = np.log(to_numpy(pi))
            annot_bt_np = to_numpy(rank_centrality_scores)
            if np.isnan(annot_bt_np).any():
                # Report the dropped seed so an average over fewer seeds is visible.
                print(f"RC skipped (NaN scores) | Percent: {percent} | Seed: {sd}")
                continue
            RC_tau = safe_kendalltau(annot_bt_np, gt_df['score'].to_numpy())
            # Flip the sign when the scores are anti-correlated with the ground truth.
            # NOTE(review): this orientation step uses the ground-truth scores.
            if RC_tau < 0:
                annot_bt_np = -annot_bt_np
            RC_acc = compute_acc(gt_df, annot_bt_np, device)
            RC_wacc = compute_weighted_acc(gt_df, annot_bt_np, device)
            RC_tau = safe_kendalltau(annot_bt_np, gt_df['score'].to_numpy())
        except Exception as e:
            # Best effort: a failing seed is reported and left out of the aggregate.
            print(f"RC failed due to {e}")
            continue
        RC_accs.append(RC_acc)
        RC_waccs.append(RC_wacc)
        RC_taus.append(RC_tau)

    n_ok = len(RC_accs)
    if n_ok < len(seed_range):
        print(f"RC | Percent: {percent} | only {n_ok}/{len(seed_range)} seeds produced metrics")

    # Compute every statistic once. np.std uses ddof=0 (population std) by default.
    # With no successful seed, NaN is written explicitly: np.mean([]) would give the
    # same NaN but emit a RuntimeWarning.
    if n_ok:
        acc_mean, acc_std = np.mean(RC_accs), np.std(RC_accs)
        wacc_mean, wacc_std = np.mean(RC_waccs), np.std(RC_waccs)
        tau_mean, tau_std = np.mean(RC_taus), np.std(RC_taus)
    else:
        acc_mean = acc_std = wacc_mean = wacc_std = tau_mean = tau_std = float("nan")

    row = [
        percent,
        acc_mean, acc_std,
        wacc_mean, wacc_std,
        tau_mean, tau_std
    ]
    # Append so the header row and the rows for earlier percentages are preserved.
    with open(csv_file, mode='a', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(row)
    print(
    f"RC | "
    f"Percent: {percent} |"
    f"Acc: {acc_mean:.4f} ± {acc_std:.4f} | "
    f"WAcc: {wacc_mean:.4f} ± {wacc_std:.4f} | "
    f"Tau: {tau_mean:.4f} ± {tau_std:.4f}")

Unique performers: 686
686
Unique performers: 686
686
Unique performers: 686
686
Unique performers: 686
686
Unique performers: 686
686
Unique performers: 686
686
Unique performers: 686
686
Unique performers: 686
686
Unique performers: 686
686
Unique performers: 686
686
RC | Percent: 10 |Acc: 0.6455 ± 0.0004 | WAcc: 0.7048 ± 0.0004 | Tau: 0.2757 ± 0.0007
Unique performers: 748
748
Unique performers: 748
748
Unique performers: 748
748
Unique performers: 748
748
Unique performers: 748
748
Unique performers: 748
748
Unique performers: 748
748
Unique performers: 748
748
Unique performers: 748
748
Unique performers: 748
748
RC | Percent: 20 |Acc: 0.6445 ± 0.0005 | WAcc: 0.7022 ± 0.0005 | Tau: 0.2737 ± 0.0008
Unique performers: 873
873
Unique performers: 873
873
Unique performers: 873
873
Unique performers: 873
873
Unique performers: 873
873
Unique performers: 873
873
Unique performers: 873
873
Unique performers: 873
873
Unique performers: 873
873
Unique performers: 873
873
RC | Percent: 40 |

### Addition of Right-Position biased spammers

In [26]:
# Experiment label; it selects the results/<spammer_type>/ output folder.
spammer_type = "right"

In [27]:
# Output CSV that receives one summary row per spammer percentage.
csv_file = f"results/{spammer_type}/rc.csv"

In [28]:
import os
# Create the output folder if it is missing; exist_ok=True makes re-running this cell safe.
os.makedirs(f"results/{spammer_type}", exist_ok=True)

In [29]:
import csv

# Start a fresh results file: mode "w" truncates any earlier rc.csv for this
# spammer type, and the experiment loop appends one row per percentage.
# NOTE(review): the columns carry a PGEM_ prefix although this file stores Rank
# Centrality results; kept as-is in case downstream readers rely on these names.
header = [
    "percent",
    "PGEM_acc_mean",
    "PGEM_acc_std",
    "PGEM_wacc_mean",
    "PGEM_wacc_std",
    "PGEM_tau_mean",
    "PGEM_tau_std",
]

with open(csv_file, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(header)

In [30]:
# Rank Centrality (RC) with right-position-biased spammers: for every spammer
# percentage, run RC on each seeded dataset and append one mean/std summary row
# to `csv_file`.
# Names such as random_df and PC_faceage are kept unchanged because notebook cells
# share one namespace; here they hold the position-biased passage data.
seed_range = range(20, 30)

for percent in percents:
    # Per-seed metrics for this percentage; only seeds that finish are appended.
    RC_accs, RC_waccs, RC_taus = [], [], []

    for sd in seed_range:

        # Build the spammer-augmented dataset for this seed.
        # NOTE(review): the second return value (spammer ids) is not used here.
        random_df, spammer_ids = add_position_biased_spammers(df, percent, position_bias="right", seed=sd)
        PC_faceage = df_to_pickle(random_df, df_passage)
        K = len(PC_faceage.keys())  # presumably one key per performer -- confirm in df_to_pickle
        print(K)

        try:
            rc_obj = opt_fair.RankCentrality(device)
            all_pc_faceage = opt_fair._pc_without_reviewers(PC_faceage)
            A = rc_obj.matrix_of_comparisons(size, all_pc_faceage)
            P = rc_obj.trans_prob(A)
            pi = rc_obj.stationary_dist(P)
            # Item scores are the log of the stationary distribution.
            rank_centrality_scores = np.log(to_numpy(pi))
            annot_bt_np = to_numpy(rank_centrality_scores)
            if np.isnan(annot_bt_np).any():
                # Report the dropped seed so an average over fewer seeds is visible.
                print(f"RC skipped (NaN scores) | Percent: {percent} | Seed: {sd}")
                continue
            RC_tau = safe_kendalltau(annot_bt_np, gt_df['score'].to_numpy())
            # Flip the sign when the scores are anti-correlated with the ground truth.
            # NOTE(review): this orientation step uses the ground-truth scores.
            if RC_tau < 0:
                annot_bt_np = -annot_bt_np
            RC_acc = compute_acc(gt_df, annot_bt_np, device)
            RC_wacc = compute_weighted_acc(gt_df, annot_bt_np, device)
            RC_tau = safe_kendalltau(annot_bt_np, gt_df['score'].to_numpy())
        except Exception as e:
            # Best effort: a failing seed is reported and left out of the aggregate.
            print(f"RC failed due to {e}")
            continue
        RC_accs.append(RC_acc)
        RC_waccs.append(RC_wacc)
        RC_taus.append(RC_tau)

    n_ok = len(RC_accs)
    if n_ok < len(seed_range):
        print(f"RC | Percent: {percent} | only {n_ok}/{len(seed_range)} seeds produced metrics")

    # Compute every statistic once. np.std uses ddof=0 (population std) by default.
    # With no successful seed, NaN is written explicitly: np.mean([]) would give the
    # same NaN but emit a RuntimeWarning.
    if n_ok:
        acc_mean, acc_std = np.mean(RC_accs), np.std(RC_accs)
        wacc_mean, wacc_std = np.mean(RC_waccs), np.std(RC_waccs)
        tau_mean, tau_std = np.mean(RC_taus), np.std(RC_taus)
    else:
        acc_mean = acc_std = wacc_mean = wacc_std = tau_mean = tau_std = float("nan")

    row = [
        percent,
        acc_mean, acc_std,
        wacc_mean, wacc_std,
        tau_mean, tau_std
    ]
    # Append so the header row and the rows for earlier percentages are preserved.
    with open(csv_file, mode='a', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(row)
    print(
    f"RC | "
    f"Percent: {percent} |"
    f"Acc: {acc_mean:.4f} ± {acc_std:.4f} | "
    f"WAcc: {wacc_mean:.4f} ± {wacc_std:.4f} | "
    f"Tau: {tau_mean:.4f} ± {tau_std:.4f}")

Unique performers: 686
686
Unique performers: 686
686
Unique performers: 686
686
Unique performers: 686
686
Unique performers: 686
686
Unique performers: 686
686
Unique performers: 686
686
Unique performers: 686
686
Unique performers: 686
686
Unique performers: 686
686
RC | Percent: 10 |Acc: 0.6532 ± 0.0004 | WAcc: 0.7145 ± 0.0006 | Tau: 0.2909 ± 0.0008
Unique performers: 748
748
Unique performers: 748
748
Unique performers: 748
748
Unique performers: 748
748
Unique performers: 748
748
Unique performers: 748
748
Unique performers: 748
748
Unique performers: 748
748
Unique performers: 748
748
Unique performers: 748
748
RC | Percent: 20 |Acc: 0.6523 ± 0.0009 | WAcc: 0.7134 ± 0.0011 | Tau: 0.2890 ± 0.0017
Unique performers: 873
873
Unique performers: 873
873
Unique performers: 873
873
Unique performers: 873
873
Unique performers: 873
873
Unique performers: 873
873
Unique performers: 873
873
Unique performers: 873
873
Unique performers: 873
873
Unique performers: 873
873
RC | Percent: 40 |

### Addition of Equal Proportion of all four types of spammers

In [31]:
# Experiment label; it selects the results/<spammer_type>/ output folder.
spammer_type = "equal"

In [32]:
# Output CSV that receives one summary row per spammer percentage.
csv_file = f"results/{spammer_type}/rc.csv"

In [33]:
import os
# Create the output folder if it is missing; exist_ok=True makes re-running this cell safe.
os.makedirs(f"results/{spammer_type}", exist_ok=True)

In [34]:
import csv

# Start a fresh results file: mode "w" truncates any earlier rc.csv for this
# spammer type, and the experiment loop appends one row per percentage.
# NOTE(review): the columns carry a PGEM_ prefix although this file stores Rank
# Centrality results; kept as-is in case downstream readers rely on these names.
header = [
    "percent",
    "PGEM_acc_mean",
    "PGEM_acc_std",
    "PGEM_wacc_mean",
    "PGEM_wacc_std",
    "PGEM_tau_mean",
    "PGEM_tau_std",
]

with open(csv_file, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(header)

In [35]:
# Rank Centrality (RC) with an equal mix of all spammer types: for every spammer
# percentage, run RC on each seeded dataset and append one mean/std summary row
# to `csv_file`.
# Names such as random_df and PC_faceage are kept unchanged because notebook cells
# share one namespace; here they hold the mixed-spammer passage data.
seed_range = range(20, 30)

for percent in percents:
    # Per-seed metrics for this percentage; only seeds that finish are appended.
    RC_accs, RC_waccs, RC_taus = [], [], []

    for sd in seed_range:

        # Build the spammer-augmented dataset for this seed.
        # NOTE(review): the second return value (spammer ids) is not used here.
        random_df, spammer_ids = add_equal_proportion_of_all_spammers(df, percent, seed=sd)
        PC_faceage = df_to_pickle(random_df, df_passage)
        K = len(PC_faceage.keys())  # presumably one key per performer -- confirm in df_to_pickle
        print(K)

        try:
            rc_obj = opt_fair.RankCentrality(device)
            all_pc_faceage = opt_fair._pc_without_reviewers(PC_faceage)
            A = rc_obj.matrix_of_comparisons(size, all_pc_faceage)
            P = rc_obj.trans_prob(A)
            pi = rc_obj.stationary_dist(P)
            # Item scores are the log of the stationary distribution.
            rank_centrality_scores = np.log(to_numpy(pi))
            annot_bt_np = to_numpy(rank_centrality_scores)
            if np.isnan(annot_bt_np).any():
                # Report the dropped seed so an average over fewer seeds is visible.
                print(f"RC skipped (NaN scores) | Percent: {percent} | Seed: {sd}")
                continue
            RC_tau = safe_kendalltau(annot_bt_np, gt_df['score'].to_numpy())
            # Flip the sign when the scores are anti-correlated with the ground truth.
            # NOTE(review): this orientation step uses the ground-truth scores.
            if RC_tau < 0:
                annot_bt_np = -annot_bt_np
            RC_acc = compute_acc(gt_df, annot_bt_np, device)
            RC_wacc = compute_weighted_acc(gt_df, annot_bt_np, device)
            RC_tau = safe_kendalltau(annot_bt_np, gt_df['score'].to_numpy())
        except Exception as e:
            # Best effort: a failing seed is reported and left out of the aggregate.
            print(f"RC failed due to {e}")
            continue
        RC_accs.append(RC_acc)
        RC_waccs.append(RC_wacc)
        RC_taus.append(RC_tau)

    n_ok = len(RC_accs)
    if n_ok < len(seed_range):
        print(f"RC | Percent: {percent} | only {n_ok}/{len(seed_range)} seeds produced metrics")

    # Compute every statistic once. np.std uses ddof=0 (population std) by default.
    # With no successful seed, NaN is written explicitly: np.mean([]) would give the
    # same NaN but emit a RuntimeWarning.
    if n_ok:
        acc_mean, acc_std = np.mean(RC_accs), np.std(RC_accs)
        wacc_mean, wacc_std = np.mean(RC_waccs), np.std(RC_waccs)
        tau_mean, tau_std = np.mean(RC_taus), np.std(RC_taus)
    else:
        acc_mean = acc_std = wacc_mean = wacc_std = tau_mean = tau_std = float("nan")

    row = [
        percent,
        acc_mean, acc_std,
        wacc_mean, wacc_std,
        tau_mean, tau_std
    ]
    # Append so the header row and the rows for earlier percentages are preserved.
    with open(csv_file, mode='a', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(row)
    print(
    f"RC | "
    f"Percent: {percent} |"
    f"Acc: {acc_mean:.4f} ± {acc_std:.4f} | "
    f"WAcc: {wacc_mean:.4f} ± {wacc_std:.4f} | "
    f"Tau: {tau_mean:.4f} ± {tau_std:.4f}")

Unique performers: 686
686
Unique performers: 686
686
Unique performers: 686
686
Unique performers: 686
686
Unique performers: 686
686
Unique performers: 686
686
Unique performers: 686
686
Unique performers: 686
686
Unique performers: 686
686
Unique performers: 686
686
RC | Percent: 10 |Acc: 0.6522 ± 0.0015 | WAcc: 0.7129 ± 0.0015 | Tau: 0.2886 ± 0.0026
Unique performers: 749
749
Unique performers: 749
749
Unique performers: 749
749
Unique performers: 749
749
Unique performers: 749
749
Unique performers: 749
749
Unique performers: 749
749
Unique performers: 749
749
Unique performers: 749
749
Unique performers: 749
749
RC | Percent: 20 |Acc: 0.6525 ± 0.0032 | WAcc: 0.7139 ± 0.0039 | Tau: 0.2884 ± 0.0061
Unique performers: 874
874
Unique performers: 874
874
Unique performers: 874
874
Unique performers: 874
874
Unique performers: 874
874
Unique performers: 874
874
Unique performers: 874
874
Unique performers: 874
874
Unique performers: 874
874
Unique performers: 874
874
RC | Percent: 40 |