In [2]:
import os

import numpy as np
import pandas as pd

In [3]:
def read_answers(year, topic_no, qrel_dir="/Users/jiamingqu/Desktop/proj/data/topics/"):
    '''
    Read the ground truth of a topic in a certain year
    Args: year - used to build the file name "<year>qrel.txt"
          topic_no - the topic whose judgments are returned
          qrel_dir - directory that holds the qrel files; the default is the
                     location that used to be hard-coded, so existing calls
                     behave as before
    Returns: the answer for a certain topic, a dict of <docid, rel> with
             docid converted to str; empty if the file has no rows for
             topic_no
    '''

    # Build the path with os.path.join so qrel_dir works with or without a
    # trailing separator.
    qrel_path = os.path.join(qrel_dir, str(year) + "qrel.txt")

    # The file is read as space-separated with no header row; the four
    # columns are named here.
    answer = pd.read_csv(qrel_path, header=None, sep=" ")
    answer.columns = ["topic", "q0", "docid", "rel"]
    answer["docid"] = answer["docid"].astype(str)

    answer_by_topic = answer[answer.topic == topic_no]
    return dict(zip(answer_by_topic.docid, answer_by_topic.rel))

In [4]:
def calculate_precision(answer, prediction, k):
    '''
    Calculate Precision@K
    Input: answer - a dictionary of <docid, relevance> pairs
           prediction - a *sorted* list of docid ordered by ranking scores
           k: an integer cutoff
    Output: Precision@K - the number of relevant documents among the first k
            predictions, divided by k. A ranking shorter than k is scored as
            if the missing positions were non-relevant.
    Raises: ZeroDivisionError if k == 0
    '''

    # Iterate over the slice itself instead of indexing positions 0..k-1:
    # positional indexing raised IndexError whenever the ranking held fewer
    # than k documents. A non-positive k examines no documents.
    top_k = prediction[:k] if k > 0 else []

    # A document is a hit only when it was judged with relevance 1 or 2;
    # documents absent from `answer` (not judged) count as non-relevant.
    hits = sum(1 for doc_id in top_k if answer.get(doc_id) in (1, 2))

    return hits / k

In [5]:
def calculate_recall(answer, prediction, k):
    '''
    Calculate Recall@K
    Input: answer - a dictionary of <docid, relevance> pairs
           prediction - a *sorted* list of docid ordered by ranking scores
           k: an integer cutoff
    Output: Recall@K - the number of relevant documents among the first k
            predictions, divided by the number of relevant documents in
            `answer`. Returns 0.0 when `answer` holds no relevant document.
    '''

    # Iterate over the slice itself instead of indexing positions 0..k-1:
    # positional indexing raised IndexError whenever the ranking held fewer
    # than k documents. A non-positive k examines no documents.
    top_k = prediction[:k] if k > 0 else []

    # A document is a hit only when it was judged with relevance 1 or 2;
    # documents absent from `answer` (not judged) count as non-relevant.
    hits = sum(1 for doc_id in top_k if answer.get(doc_id) in (1, 2))

    # NOTE(review): the denominator counts every judgment != 0 while hits
    # only count grades 1 and 2; the two agree only if grades are limited to
    # {0, 1, 2} - confirm against the qrel files.
    total_rel_doc = sum(1 for rel in answer.values() if rel != 0)

    # Recall is undefined without relevant documents; report 0.0 rather than
    # failing with ZeroDivisionError.
    if total_rel_doc == 0:
        return 0.0
    return hits / total_rel_doc

In [6]:
def calculate_r_precision(answer, prediction):
    '''
    Calculate R-Precision
    Input: answer - a dictionary of <docid, relevance> pairs
           prediction - a *sorted* list of docid ordered by ranking scores
    Output: R-Precision - with R the number of relevant documents in
            `answer`, the number of relevant documents among the first R
            predictions divided by R. A ranking shorter than R is scored as
            if the missing positions were non-relevant. Returns 0.0 when
            R is 0.
    '''

    # NOTE(review): R counts every judgment != 0 while hits only count
    # grades 1 and 2; the two agree only if grades are limited to {0, 1, 2}
    # - confirm against the qrel files.
    total_rel_doc = sum(1 for rel in answer.values() if rel != 0)

    # Without relevant documents the measure is undefined; report 0.0 rather
    # than failing with ZeroDivisionError.
    if total_rel_doc == 0:
        return 0.0

    # Slice instead of indexing positions 0..R-1: positional indexing raised
    # IndexError whenever the ranking held fewer than R documents.
    top_r = prediction[:total_rel_doc]

    # A document is a hit only when it was judged with relevance 1 or 2;
    # documents absent from `answer` (not judged) count as non-relevant.
    hits = sum(1 for doc_id in top_r if answer.get(doc_id) in (1, 2))

    return hits / total_rel_doc

In [7]:
def calculate_average_precision(answer, prediction):
    '''
    Calculate Average Precision for a query
    Input: answer - a dictionary of <docid, relevance> pairs per query
           prediction - a *sorted* list of docid ordered by ranking scores
    Output: Average Precision - the sum of the precision values taken at the
            rank of every relevant document found in `prediction`, divided by
            the number of relevant documents in `answer`. Returns 0.0 when
            `answer` holds no relevant document.
    '''

    # NOTE(review): the denominator counts every judgment != 0 while hits
    # only count grades 1 and 2; the two agree only if grades are limited to
    # {0, 1, 2} - confirm against the qrel files.
    total_rel_doc = sum(1 for rel in answer.values() if rel != 0)

    # Without relevant documents the division below would be 0/0 on a numpy
    # scalar, i.e. a silent NaN that poisons any later mean; report 0.0.
    if total_rel_doc == 0:
        return 0.0

    list_precision = []

    # iterate over the ranking, rank is 1-based
    for rank, doc_id in enumerate(prediction, start=1):
        # A document is a hit only when it was judged with relevance 1 or 2;
        # not-judged and non-relevant documents are skipped.
        if answer.get(doc_id) in (1, 2):
            # precision at this position: hits seen so far (including this
            # one) divided by the current rank
            list_precision.append((len(list_precision) + 1) / rank)

    return float(np.sum(list_precision) / total_rel_doc)