In [17]:
import pandas as pd
import numpy as np
from functools import partial
from typing import Tuple, List
from scipy.stats import ttest_1samp

In [2]:
# Load the relevance judgements; the code below reads its query_id, doc_id and relevance columns.
qrels = pd.read_csv('./qrels.csv')

In [3]:
# Load the systems' output; the code below reads its system_number, query_number and doc_number columns.
sys_results = pd.read_csv('./system_results.csv')

In [4]:
# Display the relevance judgements (pandas truncates the middle rows of long frames).
qrels

Unnamed: 0,query_id,doc_id,relevance
0,1,9090,3
1,1,6850,2
2,1,9574,2
3,1,8709,1
4,1,9684,1
...,...,...,...
115,10,7346,3
116,10,8840,2
117,10,3258,2
118,10,5175,1


In [5]:
# Display the system output (pandas truncates the middle rows of long frames).
sys_results

Unnamed: 0,system_number,query_number,doc_number,rank_of_doc,score
0,1,1,6567,1,5.0743
1,1,1,9652,2,4.4829
2,1,1,9684,3,4.3478
3,1,1,7844,4,4.3268
4,1,1,9584,5,4.2160
...,...,...,...,...,...
29299,6,10,5809,496,1.0113
29300,6,10,5762,497,1.0113
29301,6,10,5696,498,1.0113
29302,6,10,5618,499,1.0113


In [6]:
def filter_on_kwargs(data:pd.DataFrame, **kwargs) -> pd.DataFrame:
    """Return the rows of ``data`` whose columns equal the given keyword values.

    Each keyword names a column and its value is the value that column must
    equal; several keywords are combined with a logical AND.

    Parameters
    ----------
    data : the frame to filter.
    **kwargs : ``column_name=value`` pairs. String values are quoted before
        being placed in the query expression; every other value is inserted
        via ``str(value)``.

    Returns
    -------
    A new DataFrame holding only the matching rows. When no keyword is given
    an unfiltered copy of ``data`` is returned.
    """
    if not kwargs:
        # DataFrame.query refuses an empty expression; no condition means no filtering.
        return data.copy()

    conditions = []
    for column, value in kwargs.items():
        # A bare string would be parsed by query() as a column/variable name,
        # so string values have to be turned into quoted literals.
        literal = repr(value) if isinstance(value, str) else str(value)
        conditions.append(column + " == " + literal)
    return data.query(' & '.join(conditions))

In [7]:
def get_number_of_unique_rows(data:pd.DataFrame, col_name:str) -> int:
    """Count how many distinct values occur in column ``col_name`` of ``data``."""
    distinct_values = set()
    distinct_values.update(data[col_name])
    return len(distinct_values)

def get_row_number(system_nr:int, query_nr:int, nr_queries:int) -> int:
    """Map a 1-based (system, query) pair to its 0-based row in the result table.

    Rows are laid out system by system, with ``nr_queries`` consecutive rows
    per system.
    """
    return (system_nr - 1) * nr_queries + (query_nr - 1)

# Size of the evaluation grid: number of distinct query and system ids in the system output.
nr_queries = get_number_of_unique_rows(sys_results, 'query_number')
nr_systems = get_number_of_unique_rows(sys_results, 'system_number')

def create_result_df(nr_systems:int, nr_queries:int) -> pd.DataFrame:
    """Build the empty score table with one row per (system, query) pair.

    Rows are ordered system by system (all queries of system 1, then all
    queries of system 2, ...), with both ids numbered from 1. This is the
    layout in which row ``(system - 1) * nr_queries + (query - 1)`` belongs
    to the given pair.

    Parameters
    ----------
    nr_systems : number of systems being evaluated.
    nr_queries : number of queries per system.

    Returns
    -------
    DataFrame with integer columns ``system_number`` and ``query_number``
    followed by the metric columns, all initialised to 0.0.
    """
    metric_names = ['P@10', 'R@50', 'r-precision', 'AP', 'nDCG@10', 'nDCG@20']

    # Build the id columns in one go as 64-bit integers. Downcasting an
    # all-zero column first would pick the narrowest integer type, which then
    # cannot hold ids above 127 without an upcast on assignment.
    columns = {
        'system_number': np.repeat(np.arange(1, nr_systems + 1, dtype=np.int64), nr_queries),
        'query_number': np.tile(np.arange(1, nr_queries + 1, dtype=np.int64), nr_systems),
    }
    nr_rows = nr_systems * nr_queries
    for metric in metric_names:
        columns[metric] = np.zeros(nr_rows)
    return pd.DataFrame(columns)

# Score table that the metric cells below fill in column by column.
result_df = create_result_df(nr_systems, nr_queries)

In [8]:
def calculate_precision_at_10(sys_results:pd.DataFrame, qrels:pd.DataFrame, system_number:int, 
                             query_number:int) -> float:
    """Precision at cutoff 10 for one system on one query.

    The first ten rows that ``sys_results`` holds for the (system, query)
    pair are taken as the top ten; the rows are used in stored order and
    ``rank_of_doc`` is not consulted. A document counts as relevant when it
    appears in ``qrels`` for the query, whatever its relevance grade.

    Returns the number of relevant documents among those rows divided by 10.
    """
    ranking = filter_on_kwargs(sys_results, system_number=system_number, query_number=query_number)
    judged = filter_on_kwargs(qrels, query_id=query_number)

    relevant_ids = set(judged['doc_id'])
    top_ten = list(ranking['doc_number'])[:10]

    hits = sum(1 for doc in top_ten if doc in relevant_ids)
    return hits/10

def calculate_all_precision_at_10(sys_results:pd.DataFrame, qrels:pd.DataFrame, nr_systems:int, 
                                 nr_queries:int, result_df:pd.DataFrame):
    """Fill the 'P@10' column of ``result_df`` for every (system, query) pair.

    Systems and queries are assumed to be numbered 1..nr_systems and
    1..nr_queries. ``result_df`` is modified in place; nothing is returned.
    """
    for system_nr in range(1, nr_systems+1):
        for query_nr in range(1, nr_queries+1):
            target_row = get_row_number(system_nr, query_nr, nr_queries)
            result_df.at[target_row, 'P@10'] = calculate_precision_at_10(
                sys_results, qrels, system_nr, query_nr)

# Populate the 'P@10' column of result_df in place.
calculate_all_precision_at_10(sys_results, qrels, nr_systems, nr_queries, result_df)

In [9]:
def calculate_recall_at_50(sys_results:pd.DataFrame, qrels:pd.DataFrame, system_number:int, 
                             query_number:int) -> float:
    """Recall at cutoff 50 for one system on one query.

    The first fifty rows that ``sys_results`` holds for the (system, query)
    pair are taken as the top fifty (stored order; ``rank_of_doc`` is not
    consulted). A document counts as relevant when it appears in ``qrels``
    for the query, whatever its relevance grade.

    Returns
    -------
    The fraction of the query's relevant documents found in the top fifty,
    or 0.0 when the query has no relevant documents in ``qrels``.
    """
    retrieved_docs = list(filter_on_kwargs(sys_results, system_number=system_number, 
                                           query_number=query_number)['doc_number'])
    relevant_docs = set(filter_on_kwargs(qrels, query_id=query_number)['doc_id'])

    if not relevant_docs:
        # Nothing to recall: avoid dividing by zero.
        return 0.0

    # Set intersection counts each relevant document once even if the
    # ranking lists it more than once within the cutoff.
    found = relevant_docs.intersection(retrieved_docs[:50])
    return len(found) / len(relevant_docs)

def calculate_all_recall_at_50(sys_results:pd.DataFrame, qrels:pd.DataFrame, nr_systems:int, 
                                 nr_queries:int, result_df:pd.DataFrame):
    """Fill the 'R@50' column of ``result_df`` for every (system, query) pair.

    Systems and queries are assumed to be numbered 1..nr_systems and
    1..nr_queries. ``result_df`` is modified in place; nothing is returned.
    """
    for system_nr in range(1, nr_systems+1):
        for query_nr in range(1, nr_queries+1):
            target_row = get_row_number(system_nr, query_nr, nr_queries)
            result_df.at[target_row, 'R@50'] = calculate_recall_at_50(
                sys_results, qrels, system_nr, query_nr)

# Populate the 'R@50' column of result_df in place.
calculate_all_recall_at_50(sys_results, qrels, nr_systems, nr_queries, result_df)

In [10]:
def calculate_R_precision(sys_results:pd.DataFrame, qrels:pd.DataFrame, system_number:int, 
                          query_number:int) -> float:
    """R-precision for one system on one query.

    ``R`` is the number of distinct documents that ``qrels`` lists for the
    query. The first ``R`` rows that ``sys_results`` holds for the
    (system, query) pair are inspected (stored order; ``rank_of_doc`` is not
    consulted).

    Returns
    -------
    The number of relevant documents among those ``R`` rows divided by
    ``R``, or 0.0 when the query has no relevant documents in ``qrels``.
    """
    retrieved_docs = list(filter_on_kwargs(sys_results, system_number=system_number, 
                                           query_number=query_number)['doc_number'])
    relevant_docs = set(filter_on_kwargs(qrels, query_id=query_number)['doc_id'])
    R = len(relevant_docs)

    if R == 0:
        # No relevant documents: the ratio is undefined, report 0.0 instead of crashing.
        return 0.0

    TP = sum(1 for doc in retrieved_docs[:R] if doc in relevant_docs)
    return TP/R

def calculate_all_R_precision(sys_results:pd.DataFrame, qrels:pd.DataFrame, nr_systems:int, 
                              nr_queries:int, result_df:pd.DataFrame):
    """Fill the 'r-precision' column of ``result_df`` for every (system, query) pair.

    Systems and queries are assumed to be numbered 1..nr_systems and
    1..nr_queries. ``result_df`` is modified in place; nothing is returned.
    """
    for system_nr in range(1, nr_systems+1):
        for query_nr in range(1, nr_queries+1):
            target_row = get_row_number(system_nr, query_nr, nr_queries)
            result_df.at[target_row, 'r-precision'] = calculate_R_precision(
                sys_results, qrels, system_nr, query_nr)

# Populate the 'r-precision' column of result_df in place.
calculate_all_R_precision(sys_results, qrels, nr_systems, nr_queries, result_df)

In [11]:
def calculate_AP(sys_results:pd.DataFrame, qrels:pd.DataFrame, system_number:int, 
                 query_number:int) -> float:
    """Average precision for one system on one query.

    Walks every row that ``sys_results`` holds for the (system, query) pair
    in stored order (``rank_of_doc`` is not consulted). At each position
    holding a relevant document, the precision up to and including that
    position is added to a running sum. The sum is divided by the number of
    distinct documents that ``qrels`` lists for the query.

    Returns
    -------
    The average precision, or 0.0 when the query has no relevant documents
    in ``qrels``.
    """
    retrieved_docs = list(filter_on_kwargs(sys_results, system_number=system_number, 
                                           query_number=query_number)['doc_number'])
    relevant_docs = set(filter_on_kwargs(qrels, query_id=query_number)['doc_id'])

    R = len(relevant_docs)
    if R == 0:
        # No relevant documents: the average is undefined, report 0.0 instead of crashing.
        return 0.0

    precision_sum = 0
    TP = 0
    for position, doc in enumerate(retrieved_docs, start=1):
        if doc in relevant_docs:
            TP += 1
            precision_sum += TP/position
    return precision_sum/R

def calculate_all_AP(sys_results:pd.DataFrame, qrels:pd.DataFrame, nr_systems:int, 
                              nr_queries:int, result_df:pd.DataFrame):
    """Fill the 'AP' column of ``result_df`` for every (system, query) pair.

    Systems and queries are assumed to be numbered 1..nr_systems and
    1..nr_queries. ``result_df`` is modified in place; nothing is returned.
    """
    for system_nr in range(1, nr_systems+1):
        for query_nr in range(1, nr_queries+1):
            target_row = get_row_number(system_nr, query_nr, nr_queries)
            result_df.at[target_row, 'AP'] = calculate_AP(
                sys_results, qrels, system_nr, query_nr)

# Populate the 'AP' column of result_df in place.
calculate_all_AP(sys_results, qrels, nr_systems, nr_queries, result_df)

In [35]:
def calculate_nDCG(sys_results:pd.DataFrame, qrels:pd.DataFrame, system_number:int, 
                   query_number:int) -> Tuple[float, float]:
    """nDCG at cutoffs 10 and 20 for one system on one query.

    The gain of a retrieved document is its ``relevance`` grade in ``qrels``
    for the query, or 0 when the document is not listed there. The rows of
    ``sys_results`` for the (system, query) pair are used in stored order
    (``rank_of_doc`` is not consulted). The ideal ranking is the query's
    relevance grades sorted in descending order.

    The actual and the ideal DCG are computed independently of each other,
    so a ranking with fewer than 10 (or 20) rows is still scored against the
    full ideal ranking at that cutoff.

    Returns
    -------
    ``(nDCG@10, nDCG@20)``. Each value is 0.0 when the ideal DCG is zero,
    e.g. when the query has no rows in ``qrels``.
    """
    retrieved_docs = list(filter_on_kwargs(sys_results, system_number=system_number, 
                                           query_number=query_number)['doc_number'])
    qrels_subset = filter_on_kwargs(qrels, query_id=query_number)

    relevance_by_doc = dict(zip(qrels_subset['doc_id'], qrels_subset['relevance']))
    ideal_gains = sorted(qrels_subset['relevance'], reverse=True)
    gains = [relevance_by_doc.get(doc, 0) for doc in retrieved_docs[:20]]

    scores = []
    for cutoff in (10, 20):
        ideal_dcg = _dcg(ideal_gains[:cutoff])
        # An ideal DCG of zero means nothing could have been gained; report 0.0.
        scores.append(_dcg(gains[:cutoff]) / ideal_dcg if ideal_dcg > 0 else 0.0)
    return scores[0], scores[1]


def _dcg(gains) -> float:
    """Discounted cumulative gain: rank 1 counts in full, rank i >= 2 is divided by log2(i)."""
    total = 0
    for rank, gain in enumerate(gains, start=1):
        total += gain if rank == 1 else gain / np.log2(rank)
    return total

def calculate_all_nDCG(sys_results:pd.DataFrame, qrels:pd.DataFrame, nr_systems:int, 
                       nr_queries:int, result_df:pd.DataFrame):
    """Fill the 'nDCG@10' and 'nDCG@20' columns of ``result_df`` for every (system, query) pair.

    Systems and queries are assumed to be numbered 1..nr_systems and
    1..nr_queries. ``result_df`` is modified in place; nothing is returned.
    """
    for system_nr in range(1, nr_systems+1):
        for query_nr in range(1, nr_queries+1):
            at_10, at_20 = calculate_nDCG(sys_results, qrels, system_nr, query_nr)
            target_row = get_row_number(system_nr, query_nr, nr_queries)
            result_df.at[target_row, 'nDCG@10'] = at_10
            result_df.at[target_row, 'nDCG@20'] = at_20

# Populate the 'nDCG@10' and 'nDCG@20' columns of result_df in place.
calculate_all_nDCG(sys_results, qrels, nr_systems, nr_queries, result_df)

In [36]:
def print_result_df(result_df:pd.DataFrame, nr_systems:int, nr_queries:int, file_name:str='ir_eval.csv'):
    """Write the score table to a CSV file and return the per-system means.

    The file starts with a header row of the column names. For each system
    it then holds one line per query (ids as integers, metric values rounded
    to three decimals) followed by a line whose query field is ``mean`` and
    whose metric fields are the system's means over its queries.

    Parameters
    ----------
    result_df : score table whose first two columns are the system and query
        ids and whose remaining columns are metrics, with rows laid out as
        expected by ``get_row_number``.
    nr_systems : number of systems in the table.
    nr_queries : number of queries per system.
    file_name : path of the CSV file to (over)write.

    Returns
    -------
    Array of shape ``(nr_systems, number of metric columns)`` holding the
    unrounded means.
    """
    # Derive the metric count from the table instead of hard-coding it.
    nr_metrics = len(result_df.columns) - 2
    score_means = np.zeros((nr_systems, nr_metrics))
    with open(file_name, 'w') as f:
        f.write(','.join(result_df.columns)+'\n')
        for ii in range(nr_systems):
            for jj in range(nr_queries):
                row_nr = get_row_number(ii+1, jj+1, nr_queries)
                row = result_df.loc[row_nr, :]
                # The row is labelled by column name, so positions must be
                # addressed through .iloc; plain row[0] relies on a deprecated
                # positional fallback.
                line = str(int(row.iloc[0])) + ',' + str(int(row.iloc[1])) + ','

                rest_of_line = ','.join(map(str, [round(x, 3) for x in row.iloc[2:]]))
                line += rest_of_line + '\n'
                f.write(line)

            # Summary line: mean of every metric over this system's queries.
            line = str(ii+1) + "," + "mean" + ","
            row_nr = get_row_number(ii+1, 1, nr_queries)
            relevant_stats = np.array(result_df.iloc[row_nr:row_nr+nr_queries, 2:])

            means = np.mean(relevant_stats, axis=0)
            score_means[ii, :] = means

            rest_of_line = ','.join(map(str, [round(x, 3) for x in means]))
            line += rest_of_line + '\n'
            f.write(line)
    return score_means
                
# Write the scores to the default 'ir_eval.csv' and keep the per-system means for the significance tests.
score_means = print_result_df(result_df, nr_systems, nr_queries)

In [37]:
# Display the means: one row per system, one column per metric.
score_means

array([[0.39      , 0.83369048, 0.40107143, 0.40021017, 0.36294495,
        0.48517686],
       [0.22      , 0.86702381, 0.2525    , 0.30035249, 0.19988113,
        0.24624795],
       [0.41      , 0.76702381, 0.44845238, 0.45117451, 0.42034843,
        0.51091849],
       [0.08      , 0.18940476, 0.04928571, 0.07456483, 0.06870844,
        0.07554637],
       [0.41      , 0.76702381, 0.35761905, 0.3638288 , 0.33245629,
        0.4242865 ],
       [0.41      , 0.76702381, 0.44845238, 0.44457729, 0.40003119,
        0.49053186]])

In [38]:
# Metric columns to test, in the same order as the columns of score_means.
col_names = ['P@10', 'R@50', 'r-precision', 'AP', 'nDCG@10', 'nDCG@20']
def p_values(result_df:pd.DataFrame, score_means:np.ndarray, nr_queries:int, col_names:List[str]):
    """Print a paired t-test between the best system and its runner-up for every metric.

    For each metric the systems are ordered by their mean score. The best
    system is compared with the next-best one whose per-query score
    differences are not constant. Candidates with a zero standard deviation
    of the differences are reported and skipped, because the t-statistic is
    undefined for them. The paired test is carried out as a one-sample
    t-test of the per-query differences against 0.

    Parameters
    ----------
    result_df : score table holding ``nr_queries`` consecutive rows per
        system, systems in ascending order.
    score_means : array of shape (systems, metrics) with the mean scores;
        column ``i`` must belong to ``col_names[i]``.
    nr_queries : number of queries per system.
    col_names : names of the metric columns of ``result_df`` to test.

    Returns
    -------
    None. The results are printed.
    """
    nr_systems = score_means.shape[0]
    if nr_systems < 2:
        # There is no runner-up to compare the best system with.
        print('At least two systems are needed to compare the best with a runner-up.')
        return

    def scores_of(stat, system_idx):
        # Per-query scores of one system: its block of nr_queries consecutive rows.
        start = nr_queries * system_idx
        return np.array(result_df[stat].iloc[start : start + nr_queries])

    for ii, stat in enumerate(col_names):
        # Ascending order of the means: the last entry is the best system.
        sorted_idx = np.argsort(score_means[:, ii])
        first_idx = sorted_idx[-1]
        first_scores = scores_of(stat, first_idx)

        # Walk from the second best downwards and stop at the last system, so
        # the search can never index past the start of sorted_idx.
        second_idx = None
        for candidate_idx in sorted_idx[-2::-1]:
            diff = first_scores - scores_of(stat, candidate_idx)
            if np.std(diff) != 0:
                second_idx = candidate_idx
                break
            print("System : " + str(candidate_idx+1) + " has identical performance with the first system.")

        if second_idx is None:
            print('For statistic ' + stat + ': no system differs from the best system ' +
                  str(first_idx+1) + '; t-test skipped.')
            continue

        t_value, p_value = ttest_1samp(diff, 0)

        print('For statistic ' + stat + ': best = ' + str(first_idx+1) + '; 2nd best = ' + 
             str(second_idx+1) + '; P-value: ' + str(p_value) + '; T-value: ' + str(t_value))
# Print, for every metric, the best system, its runner-up and the t-test result between them.
p_values(result_df, score_means, nr_queries, col_names)

System : 5 has identical performance with the first system.
System : 3 has identical performance with the first system.
For statistic P@10: best = 6; 2nd best = 1; P-value: 0.7509058687700358; T-value: 0.32732683535398877
For statistic R@50: best = 2; 2nd best = 1; P-value: 0.34343639613791355; T-value: 1.0
System : 3 has identical performance with the first system.
For statistic r-precision: best = 6; 2nd best = 1; P-value: 0.5914724124414916; T-value: 0.5564431782018991
For statistic AP: best = 3; 2nd best = 6; P-value: 0.6716502377784637; T-value: 0.4380858271151794
For statistic nDCG@10: best = 3; 2nd best = 6; P-value: 0.26941487087489624; T-value: 1.1769197447775657
For statistic nDCG@20: best = 3; 2nd best = 6; P-value: 0.246105219013315; T-value: 1.2406391425324226
