In [1]:
import pandas as pd

In [2]:
def filter_zeroes_rank(rank):
    """Keep only the non-zero entries of `rank` using a boolean mask.

    With a pandas DataFrame the shape is preserved and every zero cell
    becomes NaN; with a pandas Series the zero entries are dropped.
    """
    is_nonzero = rank != 0
    return rank[is_nonzero]

def reciprocal_rank_fusion(rank, c):
    """Return the reciprocal-rank-fusion term ``1 / (c + rank)``.

    `rank` may be a scalar or any object supporting arithmetic with `c`
    (the notebook passes a DataFrame of ranks); `c` is the additive constant.
    """
    shifted = c + rank
    return 1 / shifted

def squared_rrf(rank):
    """Return the reciprocal of the squared rank, ``1 / rank**2``.

    No additive constant is applied here, unlike `reciprocal_rank_fusion`.
    """
    squared = rank ** 2
    return 1 / squared

def rank_value(rank, c):
    """Return ``1 / (rank + c)**2``, the squared reciprocal of the shifted rank."""
    shifted = rank + c
    return 1 / shifted ** 2

def mandelbrot_rank(rank,c,n):
    """Return `rank_value(rank, c)` normalised over the positions 1..n.

    The normaliser is the sum of `rank_value(i, c)` for every integer
    position i from 1 to n inclusive, so the values for ranks 1..n sum to 1.
    """
    normaliser = sum(rank_value(position, c) for position in range(1, n + 1))
    return rank_value(rank, c) / normaliser

In [3]:
# Load the tab-separated aggregation table and preview its first rows.
experiment_df = pd.read_csv(
    './MQ2007-agg/transformed_agg.csv',
    sep='\t',
)
experiment_df.head(5)

Unnamed: 0,qid,docid,relevance,ranking_1,ranking_2,ranking_3,ranking_4,ranking_5,ranking_6,ranking_7,...,ranking_12,ranking_13,ranking_14,ranking_15,ranking_16,ranking_17,ranking_18,ranking_19,ranking_20,ranking_21
0,10,GX000-00-0000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,10,GX000-24-12369390,1,107,107,85,0,113,110,110,...,252,36,52,244,155,0,113,132,44,131
2,10,GX000-62-7863450,1,0,0,0,1,8,14,14,...,18,252,227,6,214,0,0,120,0,0
3,10,GX016-48-5543459,1,0,0,111,4,1,3,3,...,4,15,26,15,96,4,163,2,152,283
4,10,GX037-87-3082362,0,160,160,117,57,102,209,209,...,162,229,205,142,190,0,0,12,4,44


In [10]:
# Quick sanity summary: number of distinct queries, frame shape, relevance labels.
print(
    experiment_df.qid.nunique(),
    experiment_df.shape,
    experiment_df.relevance.unique(),
    sep='\n',
)

1692
(69623, 45)
[0 1 2]


In [4]:
def apply_new_ranking(col_value, old_rank):
    """Return `old_rank` as an int when `col_value` is positive, otherwise 0.

    `col_value` is truncated with ``int()`` before the comparison, so values
    strictly between 0 and 1 count as "not positive". `old_rank` is only
    converted when it is actually returned.
    """
    return int(old_rank) if int(col_value) > 0 else 0

# Raw score columns only. Excluding the `_rank` suffix keeps this cell
# idempotent: the derived `ranking_N_rank` columns created below also start
# with 'ranking_' and would otherwise be ranked again on a re-run.
ranking_cols = [
    col for col in experiment_df.columns
    if col.startswith('ranking_') and not col.endswith('_rank')
]
rank_cols = [f"{col}_rank" for col in ranking_cols]

data_list = []
for _, data in experiment_df.groupby('qid'):
    scores = data[ranking_cols]
    # Dense rank of every score column within this query; the highest score
    # gets rank 1.
    dense_ranks = scores.rank(method='dense', ascending=False).astype(int)
    # Rows whose score is not positive get rank 0. The int() truncation mirrors
    # the `int(col_value) > 0` test in `apply_new_ranking`. Doing this with a
    # vectorised `where` avoids a row-wise `apply` with positional `x[0]`
    # lookups, which pandas has deprecated for label-indexed Series.
    dense_ranks = dense_ranks.where(scores.astype(int) > 0, 0)
    dense_ranks.columns = rank_cols
    # Drop rank columns left over from a previous run before re-attaching them.
    data_list.append(
        pd.concat([data.drop(columns=rank_cols, errors='ignore'), dense_ranks], axis=1)
    )

experiment_df = pd.concat(data_list)
experiment_df.head(5)

Unnamed: 0,qid,docid,relevance,ranking_1,ranking_2,ranking_3,ranking_4,ranking_5,ranking_6,ranking_7,...,ranking_12_rank,ranking_13_rank,ranking_14_rank,ranking_15_rank,ranking_16_rank,ranking_17_rank,ranking_18_rank,ranking_19_rank,ranking_20_rank,ranking_21_rank
0,10,GX000-00-0000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,10,GX000-24-12369390,1,107,107,85,0,113,110,110,...,3,9,8,3,8,0,7,6,14,6
2,10,GX000-62-7863450,1,0,0,0,1,8,14,14,...,21,4,4,29,4,0,0,7,0,0
3,10,GX016-48-5543459,1,0,0,111,4,1,3,3,...,29,11,10,26,10,3,1,18,7,1
4,10,GX037-87-3082362,0,160,160,117,57,102,209,209,...,12,7,7,17,7,0,0,13,22,17


In [5]:
# rrf set to c = 60
# mrrf set to c = 60, n = 21
# numpy stays imported here in case other cells rely on `np`.
import numpy as np

# Only the per-ranker rank columns (`ranking_N_rank`); the stricter prefix
# check keeps `avg_rank` from being picked up if this frame is ever reused.
ranked = [
    col for col in experiment_df.columns
    if col.startswith('ranking_') and col.endswith('_rank')
]
data_applied_functions = []
for _, data in experiment_df.groupby('qid'):
    # Zero ranks become NaN, so the row-wise mean/sum below skip them.
    nonzero_ranks = filter_zeroes_rank(data[ranked])
    # `assign` returns a new frame instead of writing columns onto the
    # groupby slice.
    data_applied_functions.append(
        data.assign(
            # A row with no non-zero rank has a NaN mean; report it as 0.
            avg_rank=nonzero_ranks.mean(axis=1).fillna(0),
            rrf=reciprocal_rank_fusion(nonzero_ranks, 60).sum(axis=1),
            squared_rrf=squared_rrf(nonzero_ranks).sum(axis=1),
            mrrf=mandelbrot_rank(nonzero_ranks, 60, 21).sum(axis=1),
        )
    )

result_df = pd.concat(data_applied_functions)
result_df.head(5)


Unnamed: 0,qid,docid,relevance,ranking_1,ranking_2,ranking_3,ranking_4,ranking_5,ranking_6,ranking_7,...,ranking_16_rank,ranking_17_rank,ranking_18_rank,ranking_19_rank,ranking_20_rank,ranking_21_rank,avg_rank,rrf,squared_rrf,mrrf
0,10,GX000-00-0000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
1,10,GX000-24-12369390,1,107,107,85,0,113,110,110,...,8,0,7,6,14,6,9.470588,0.246088,0.435389,0.840659
2,10,GX000-62-7863450,1,0,0,0,1,8,14,14,...,4,0,0,7,0,0,16.083333,0.160114,0.228318,0.509268
3,10,GX016-48-5543459,1,0,0,111,4,1,3,3,...,10,3,1,18,7,1,15.777778,0.241809,2.211949,0.776428
4,10,GX037-87-3082362,0,160,160,117,57,102,209,209,...,7,0,0,13,22,17,11.294118,0.240359,0.369523,0.804108


In [6]:
# Show every row (one per query) that carries this document id.
result_df.loc[result_df['docid'].eq('GX000-00-0000000')]

Unnamed: 0,qid,docid,relevance,ranking_1,ranking_2,ranking_3,ranking_4,ranking_5,ranking_6,ranking_7,...,ranking_16_rank,ranking_17_rank,ranking_18_rank,ranking_19_rank,ranking_20_rank,ranking_21_rank,avg_rank,rrf,squared_rrf,mrrf
0,10,GX000-00-0000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
120,34,GX000-00-0000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
200,42,GX000-00-0000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
240,44,GX000-00-0000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
320,50,GX000-00-0000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69183,9968,GX000-00-0000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
69263,9976,GX000-00-0000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
69303,9979,GX000-00-0000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
69423,9991,GX000-00-0000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0


In [7]:
# Persist the scored table as a tab-separated file without the index column.
result_df.to_csv(
    './MQ2007-agg/result_agg_functions_mq2007_top21.csv',
    sep='\t',
    index=False,
)