In [1]:
import pandas as pd

In [2]:
def filter_zeroes_rank(rank):
    return rank[rank != 0]

def reciprocal_rank_fusion(rank, c):
    return 1/(c + rank)

def squared_rrf(rank):
    return 1/(rank**2)

def rank_value(rank, c):
    return 1/(rank + c)**2

def mandelbrot_rank(rank,c,n):
    denom = sum([rank_value(i, c) for i in range(1, n+1)])
    num = rank_value(rank, c)
    return num/denom

In [3]:
experiment_df = pd.read_csv('./MQ2008-agg/transformed_agg.csv', sep='\t')
experiment_df.head(5)

Unnamed: 0,qid,docid,relevance,ranking_1,ranking_2,ranking_3,ranking_4,ranking_5,ranking_6,ranking_7,...,ranking_16,ranking_17,ranking_18,ranking_19,ranking_20,ranking_21,ranking_22,ranking_23,ranking_24,ranking_25
0,10002,GX008-86-4444840,0,1,30,48,133,0,265,0,...,0,0,0,0,0,0,287,75,0,0
1,10002,GX037-06-11625428,0,0,0,0,0,0,0,0,...,0,18,100,120,0,0,0,0,0,0
2,10002,GX044-30-4142998,0,0,0,0,0,0,8,0,...,125,0,0,0,0,0,4,0,0,0
3,10002,GX228-42-3888699,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,12,13
4,10002,GX229-14-12863205,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1


In [4]:
def apply_new_ranking(col_value, old_rank):
    # print(f"column_value: {col_value}")
    # print(f"old_rank: {old_rank}")
    if int(col_value) > 0:
        return int(old_rank)
    else:
        return int(0)

ranking_cols = [col for col in experiment_df.columns if col.startswith('ranking_')]
data_list = []
for qid, data in experiment_df.groupby('qid'):
    data_copy = data.copy()
    for col in ranking_cols:
        data_copy[f"{col}_rank_old"] = data_copy[col].rank(method='dense', ascending=False).astype(int)
        data_copy[f"{col}_rank_new"] = data_copy[[col,f"{col}_rank_old"]].apply(lambda x:apply_new_ranking(x[0], x[1]), axis=1)
        data_copy = data_copy.drop(columns=[f"{col}_rank_old"])
        data_copy = data_copy.rename(columns={f"{col}_rank_new":f"{col}_rank"})
    data_list.append(data_copy)

experiment_df = pd.concat(data_list)    
experiment_df.head(5) 

Unnamed: 0,qid,docid,relevance,ranking_1,ranking_2,ranking_3,ranking_4,ranking_5,ranking_6,ranking_7,...,ranking_16_rank,ranking_17_rank,ranking_18_rank,ranking_19_rank,ranking_20_rank,ranking_21_rank,ranking_22_rank,ranking_23_rank,ranking_24_rank,ranking_25_rank
0,10002,GX008-86-4444840,0,1,30,48,133,0,265,0,...,0,0,0,0,0,0,1,1,0,0
1,10002,GX037-06-11625428,0,0,0,0,0,0,0,0,...,0,1,1,1,0,0,0,0,0,0
2,10002,GX044-30-4142998,0,0,0,0,0,0,8,0,...,1,0,0,0,0,0,4,0,0,0
3,10002,GX228-42-3888699,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
4,10002,GX229-14-12863205,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,2


In [5]:
# rrf set to c = 60
# mrrf set to c = 60, n = 10
import numpy as np

ranked = [col for col in experiment_df.columns if col.endswith('_rank')]
data_applied_functions = []
for qid, data in experiment_df.groupby('qid'):
    data['avg_rank'] = filter_zeroes_rank(data[ranked]).mean(axis=1)
    data['avg_rank'] = data['avg_rank'].replace(np.nan, 0)
    data['rrf'] = reciprocal_rank_fusion(filter_zeroes_rank(data[ranked]),60).sum(axis=1)
    data['squared_rrf'] = squared_rrf(filter_zeroes_rank(data[ranked])).sum(axis=1)
    data['mrrf'] = mandelbrot_rank(filter_zeroes_rank(data[ranked]), 60, 25).sum(axis=1)
    data_applied_functions.append(data)

result_df = pd.concat(data_applied_functions)
result_df.head(5)    


NameError: name 'np' is not defined

In [None]:
result_df[result_df['docid'] == 'GX000-00-0000000']

In [None]:
result_df.to_csv('./MQ2008-agg/result_agg_functions_MQ2008_top25.csv', sep='\t', index=False)