# Comparison of Label Ranking via Ranking by Pairwise Comparison (RPC) and Constraint Classification (CC)

In [None]:
%pip install scipy==1.8.1
%pip install pandas==1.4.2
%pip install scikit-learn==1.1.1

## Load Dataset

In [2]:
import pandas as pd
import numpy as np
import scipy as sp
import sklearn.linear_model as lm
import sklearn.model_selection as ms

In [3]:
# Number of labels per instance; the last K columns of the table (L1..LK)
# hold each label's rank.
K = 7

# Tab-separated dataset: the first line is the header, and the line right
# after it (0-indexed file line 1) is skipped.
# NOTE(review): presumably that skipped line is a type/metadata row -- confirm
# against the data file.
data = pd.read_csv(
  "./segment_lr_dataset.txt",
  sep="\t",
  header=0,
  skiprows=[1],
)
data

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,...,A16,A17,A18,L1,L2,L3,L4,L5,L6,L7
0,1.275912,0.949531,2.410147,-0.19451,-0.392976,-0.115095,-0.363873,-0.130916,0.591412,0.560071,...,0.700920,-0.472591,-0.438515,6,4,3,2,5,1,7
1,-0.163301,0.114514,-0.356970,-0.19451,-0.598820,-0.121713,-0.579319,-0.133963,-0.947253,-0.936767,...,-0.992082,2.510221,-0.492031,3,7,2,5,1,6,4
2,1.056604,-1.433747,-0.356970,-0.19451,-0.351806,-0.110089,-0.363873,-0.122733,2.252318,2.256706,...,2.204955,-0.996656,-0.606356,5,1,7,2,6,3,4
3,-1.273551,0.862550,-0.356970,-0.19451,-0.063626,-0.087582,1.821364,-0.025407,0.171336,0.192206,...,0.180594,-0.700711,-0.411537,5,6,3,2,4,1,7
4,-0.876054,1.280058,-0.356970,-0.19451,-0.166549,-0.093518,0.051629,-0.107432,0.328501,0.325398,...,0.382513,-0.542982,-0.426686,4,6,3,2,5,1,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2305,-1.300964,-0.372580,-0.356970,-0.19451,-0.248886,-0.124665,-0.302318,-0.126569,-0.439861,-0.356426,...,-0.469167,-0.200754,-0.124376,1,7,3,4,2,6,5
2306,0.247903,-1.729483,-0.356970,-0.19451,-0.228300,-0.107075,-0.425429,-0.120775,2.372628,2.421618,...,2.248965,-1.127838,-0.638278,4,1,6,2,5,7,3
2307,-0.615625,-0.894465,-0.356970,-0.19451,-0.248886,-0.104927,-0.271540,-0.120320,0.574921,0.528359,...,0.682799,-0.491819,-0.470657,6,3,4,1,5,2,7
2308,-0.368903,0.166702,-0.356970,-0.19451,-0.495898,-0.123470,-0.563930,-0.134599,-0.945312,-0.936767,...,-0.986905,2.510221,-0.492031,3,7,2,5,1,6,4


## RPC Implementation

In [4]:
def to_pairwise_binary_dataset(data: pd.DataFrame, i, j):
  """Build the binary classification problem for the label pair (i, j).

  Args:
    data: Table whose last K columns are the label ranks ``L1``..``LK``;
      all preceding columns are treated as features.
    i: Zero-based index of the first label of the pair.
    j: Zero-based index of the second label of the pair.

  Returns:
    A ``(X, y)`` tuple: ``X`` is the feature part of ``data`` and ``y`` is a
    boolean Series that is True where label ``i`` has a smaller rank value
    than label ``j``.
  """
  feature_columns = data.columns[:-K]
  rank_i = data[f"L{i+1}"]
  rank_j = data[f"L{j+1}"]
  i_ranked_before_j = rank_i < rank_j
  return data[feature_columns], i_ranked_before_j

def train_pairwise_model(data, i, j):
  """Fit a logistic regression that predicts whether label i precedes label j.

  Args:
    data: Training table (features followed by the K label-rank columns).
    i: Zero-based index of the first label of the pair.
    j: Zero-based index of the second label of the pair.

  Returns:
    The fitted ``LogisticRegression`` estimator.
  """
  features, i_before_j = to_pairwise_binary_dataset(data, i, j)
  # ``fit`` returns the estimator itself, so the fitted model can be
  # returned directly; the seed is fixed for reproducibility.
  return lm.LogisticRegression(random_state=42).fit(features, i_before_j)

class RPCModel:
  """Label ranker using ranking by pairwise comparison (RPC).

  One binary model is trained for every label pair ``(i, j)`` with ``i < j``.
  At prediction time the pairwise probabilities are summed per label as soft
  votes and the labels are ranked by descending vote total.
  """

  def __init__(self):
    # Maps (i, j) label-index pairs to fitted binary models; None until fit().
    self.models = None
    # Number of leading feature columns; set by fit().
    self.features = None

  def fit(self, data):
    """Train one pairwise model per label pair.

    Args:
      data: Training table whose last K columns are the label ranks; all
        preceding columns are features.
    """
    self.features = data.shape[1] - K
    self.models = {
      (i, j): train_pairwise_model(data, i, j)
      for i in range(K) for j in range(i+1, K)}

  def predict(self, data):
    """Predict a ranking of the K labels for every row of ``data``.

    Args:
      data: Table whose first ``self.features`` columns are the features
        (any further columns are ignored).

    Returns:
      Integer array of shape ``(len(data), K)``; entry ``[n, k]`` is the rank
      of label ``k`` for row ``n``, where rank 1 is the label with the
      highest vote total. Exact ties are broken by label order.

    Raises:
      AssertionError: If the model has not been trained yet.
    """
    # Explicit raise instead of ``assert`` so the check survives ``python -O``
    # while keeping the same exception type and message.
    if self.models is None:
      raise AssertionError("RPC model has to be trained first.")

    # Import the submodule explicitly: ``import scipy`` alone does not
    # guarantee that ``scipy.stats`` is loaded on older SciPy versions.
    from scipy import stats

    # Accumulate in float64 (the dtype of the predicted probabilities).
    # A float32 buffer rounds near-equal scores to the same value and the
    # resulting artificial ties would be resolved by label index.
    borda_scores = np.zeros((data.shape[0], K), dtype=np.float64)
    feature_data = data[data.columns[:self.features]]
    for (i, j), m in self.models.items():
      preds = m.predict_proba(feature_data)
      # Column 1 is the probability of the model's second class, column 0 of
      # its first. NOTE(review): this assumes both outcomes (False/True)
      # occurred in training so that column 1 means "i precedes j" -- confirm.
      borda_scores[:, i] += preds[:, 1]
      borda_scores[:, j] += preds[:, 0]

    # Negate so the highest score receives rank 1.
    return stats.rankdata(-borda_scores, method="ordinal", axis=-1)

## CC Implementation

In [5]:
def to_global_binary_dataset(data):
  """Expand a label-ranking table into one large binary dataset.

  For every label pair ``(i, j)`` with ``i < j`` a block of N rows is
  produced. Each row is a vector of K feature-sized slots that holds the
  instance's features in slot ``i``, the negated features in slot ``j`` and
  zeros elsewhere. The target is 1 where label ``i`` has a smaller rank value
  than label ``j`` and 0 otherwise.

  Args:
    data: Table whose last K columns are the label ranks ``L1``..``LK``;
      all preceding columns are features.

  Returns:
    ``(X, y)`` with ``X`` a float32 array of shape
    ``(N * K*(K-1)/2, f_dim * K)`` and ``y`` an int32 array of matching length.
  """
  n_rows = data.shape[0]
  f_dim = data.shape[-1] - K
  features = data.to_numpy()[:, :f_dim]
  label_pairs = [(i, j) for i in range(K) for j in range(i + 1, K)]

  total_rows = n_rows * len(label_pairs)
  X = np.zeros((total_rows, f_dim * K), dtype=np.float32)
  y = np.zeros((total_rows,), dtype=np.int32)

  for block, (i, j) in enumerate(label_pairs):
    # Consecutive blocks of n_rows rows, one block per label pair.
    rows = slice(block * n_rows, (block + 1) * n_rows)
    X[rows, i * f_dim:(i + 1) * f_dim] = features
    X[rows, j * f_dim:(j + 1) * f_dim] = -features
    y[rows] = data[f"L{i+1}"] < data[f"L{j+1}"]

  return X, y

def train_cc_model(data):
  """Fit a single logistic regression on the expanded pairwise dataset.

  Args:
    data: Training table (features followed by the K label-rank columns).

  Returns:
    The fitted ``LogisticRegression`` estimator (fixed seed, at most 400
    solver iterations).
  """
  expanded_features, targets = to_global_binary_dataset(data)
  classifier = lm.LogisticRegression(random_state=42, max_iter=400)
  # ``fit`` returns the estimator itself.
  return classifier.fit(expanded_features, targets)

class CCModel:
  """Label ranker using constraint classification (CC).

  A single binary linear model is trained on the expanded pairwise dataset.
  Its coefficient vector is interpreted as K per-label weight vectors; labels
  are ranked by the resulting linear utilities.
  """

  def __init__(self):
    # Fitted binary model; None until fit() has been called.
    self.model = None
    # Number of leading feature columns; set by fit().
    self.features = None

  def fit(self, data):
    """Train the global binary model.

    Args:
      data: Training table whose last K columns are the label ranks; all
        preceding columns are features.
    """
    self.features = data.shape[1] - K
    self.model = train_cc_model(data)

  def predict(self, data):
    """Predict a ranking of the K labels for every row of ``data``.

    Args:
      data: Table whose first ``self.features`` columns are the features
        (any further columns are ignored).

    Returns:
      Integer array of shape ``(len(data), K)``; entry ``[n, k]`` is the rank
      of label ``k`` for row ``n``, where rank 1 is the label with the
      highest utility. Exact ties are broken by label order.

    Raises:
      AssertionError: If the model has not been trained yet.
    """
    # Explicit raise instead of ``assert`` so the check survives ``python -O``
    # while keeping the same exception type and message.
    if self.model is None:
      raise AssertionError("CC model has to be trained first.")

    # Import the submodule explicitly: ``import scipy`` alone does not
    # guarantee that ``scipy.stats`` is loaded on older SciPy versions.
    from scipy import stats

    # Split the flat coefficient vector into one weight vector per label and
    # transpose to (features, K). Only ``coef_`` is used; the model's
    # intercept does not enter the utilities.
    label_weight_matrix = self.model.coef_.reshape((K, self.features)).transpose()
    data_arr = data.to_numpy()[:, :self.features]
    label_utils = np.matmul(data_arr, label_weight_matrix)

    # Negate so the highest utility receives rank 1.
    return stats.rankdata(-label_utils, method="ordinal", axis=-1)

## Cross-Validation

In [6]:
def kendalls_tau(data, pred_rankings):
  """Return the mean per-instance Kendall's tau between true and predicted rankings.

  Args:
    data: Table whose last K columns hold the true label ranks.
    pred_rankings: Row-aligned sequence of predicted rankings.

  Returns:
    The average of the row-wise tau coefficients.
  """
  true_rankings = data.to_numpy()[:, -K:]
  per_instance = []
  for true_row, pred_row in zip(true_rankings, pred_rankings):
    # kendalltau returns (coefficient, p-value); only the coefficient is kept.
    tau, _pvalue = sp.stats.kendalltau(true_row, pred_row)
    per_instance.append(tau)
  return np.mean(per_instance)

def spearmans_rho(data, pred_rankings):
  """Return Spearman's rho between all true and all predicted ranks, pooled.

  Both rank matrices are flattened and a single correlation is computed over
  the pooled values (not one coefficient per row).

  Args:
    data: Table whose last K columns hold the true label ranks.
    pred_rankings: Row-aligned array of predicted rankings.

  Returns:
    The correlation coefficient of the flattened rank vectors.
  """
  true_flat = np.ravel(data.to_numpy()[:, -K:])
  pred_flat = np.ravel(pred_rankings)
  # spearmanr returns (coefficient, p-value); only the coefficient is kept.
  rho, _pvalue = sp.stats.spearmanr(true_flat, pred_flat)
  return rho

def mean_std(vals):
  """Return ``(mean, population standard deviation)`` of ``vals``."""
  values = np.asarray(vals)
  return values.mean(), values.std()

In [7]:
# 10-fold cross-validation with a fixed shuffle seed; every fold is stored as
# a (train_frame, test_frame) pair so all models see identical splits.
folds = [
  (data.iloc[train_idxs], data.iloc[test_idxs])
  for train_idxs, test_idxs
  in ms.KFold(n_splits=10, shuffle=True, random_state=42).split(data)]

def evaluate_fold(models, train_data, test_data):
  """Train and score each model class on a single train/test split.

  Args:
    models: Iterable of model classes; each is instantiated without
      arguments and must provide ``fit(data)`` and ``predict(data)``.
    train_data: Table used for fitting.
    test_data: Table used for prediction and as the source of true ranks.

  Returns:
    A flat list with two entries per model, in model order:
    Kendall's tau followed by Spearman's rho.
  """
  scores = []
  for model_class in models:
    ranker = model_class()
    ranker.fit(train_data)
    predicted = ranker.predict(test_data)
    scores.extend((
      kendalls_tau(test_data, predicted),
      spearmans_rho(test_data, predicted),
    ))
  return scores

# Evaluate both models on every fold, transpose the per-fold result lists into
# one tuple per metric (RPC tau, RPC rho, CC tau, CC rho) and reduce each
# tuple to its (mean, std) over the folds.
rpc_tau, rpc_rho, cc_tau, cc_rho = (
  mean_std(metric_values)
  for metric_values in zip(*(
    evaluate_fold([RPCModel, CCModel], train_data, test_data)
    for train_data, test_data in folds)))

In [8]:
# Rescale the tau mean/std into a pair distance: (1 - tau) * K*(K-1)/4.
# For tie-free rankings of K labels this equals the number of discordant
# label pairs; the std is scaled by the same factor.
rpc_dist = (
  (1 - rpc_tau[0]) * K * (K-1) / 4,
  rpc_tau[1] * K * (K-1) / 4)
cc_dist = (
  (1 - cc_tau[0]) * K * (K-1) / 4,
  cc_tau[1] * K * (K-1) / 4)

# One row per method: (mean, std) for distance, tau and rho.
results = pd.DataFrame(
  data=[
    ("RPC", rpc_dist[0], rpc_dist[1], rpc_tau[0], rpc_tau[1], rpc_rho[0], rpc_rho[1]),
    ("CC", cc_dist[0], cc_dist[1], cc_tau[0], cc_tau[1], cc_rho[0], cc_rho[1]),
  ],
  columns=["name", "dist", "distStd", "tau", "tauStd", "rho", "rhoStd"])
results.to_csv("exercise06.csv", index=False)
results

Unnamed: 0,name,dist,distStd,tau,tauStd,rho,rhoStd
0,RPC,0.737662,0.054799,0.929746,0.005219,0.96662,0.003512
1,CC,2.077922,0.119091,0.802103,0.011342,0.885281,0.010195
