In [1]:
!pip3 install sklearn



In [2]:
# import required libraries
import sys
sys.path.insert(0, '../../../BERT-FAQ/')

from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_score
from sklearn.metrics import ndcg_score
from shared.utils import load_from_json
from metric import NDCG
from tqdm import tqdm
import pandas as pd
import numpy as np
import os.path
import math

In [3]:
# define functions to compute NDCG@, P@, MAP 

def compute_map(result_filepath):
    query_results = load_from_json(result_filepath)

    sum_ap = 0
    num_queries = 0
    for result in query_results:
        query_string = result['query_string']
        topk_results = result['rerank_preds']

        labels = []
        reranks = []

        for topk in topk_results:
            labels.append(topk['label'])
            reranks.append(topk['score'])

        true_relevance = np.array(labels)
        scores = np.array(reranks)

        ap = 0
        all_zeros = not np.any(labels)
        if labels and reranks and not all_zeros:
            ap = average_precision_score(true_relevance, scores)

        sum_ap = sum_ap + ap
        num_queries = num_queries + 1

    return float(sum_ap / num_queries)

def compute_prec(result_filepath, k):
    query_results = load_from_json(result_filepath)

    sum_prec = 0
    num_queries = 0
    for result in query_results:
        query_string = result['query_string']
        topk_results = result['rerank_preds']

        labels = []
        reranks = []

        for topk in topk_results[:k]:
            labels.append(topk['label'])
            reranks.append(topk['score'])

        true_relevance = np.array(labels)
        scores = np.array(reranks)

        prec = 0
        all_zeros = not np.any(labels)
        if labels and reranks and not all_zeros:
            prec = sum(true_relevance) / len(true_relevance)

        sum_prec = sum_prec + prec
        num_queries = num_queries + 1

    return float(sum_prec / num_queries)

def compute_ndcg(result_filepath, k):
    query_results = load_from_json(result_filepath)

    sum_ndcg = 0
    num_queries = 0
    for result in query_results:
        query_string = result['query_string']
        topk_results = result['rerank_preds']

        labels = []
        reranks = []

        for topk in topk_results[:k]:
            labels.append(topk['label'])
            reranks.append(topk['score'])

        true_relevance = np.asarray([labels])
        scores = np.asarray([reranks])

        ndcg = 0
        if labels and reranks:
            ndcg = ndcg_score(true_relevance, scores)

        sum_ndcg = sum_ndcg + ndcg
        num_queries = num_queries + 1

    return float(sum_ndcg / num_queries)

In [4]:
# define function to generate evaluation DataFrame

def get_evaluation_df(rank_results_filepath):
    output = dict()
    metric_name = []

    i = 0

    rankers = ["unsupervised", "supervised"] 
    loss_type = ["triplet", "softmax"]
    query_types = ["faq", "user_query"]
    neg_types = ["simple", "hard"]

    topk = [3, 5, 10]

    for ranker in rankers:

        # Compute metrics for the unsupervised method
        if ranker == "unsupervised":
            file_path = rank_results_filepath + "/" + ranker

            method = ranker

            answer_metric = []
            question_metric = []
            question_answer_metric = []
            question_answer_concat_metric = []

            i += 1
            for k in topk:
                # compute NDCG@k
                if i == 1:
                    metric_name.append("ndcg@" + str(k))

                answer_metric.append(compute_ndcg(file_path + "/es_query_by_answer.json", k))
                question_metric.append(compute_ndcg(file_path + "/es_query_by_question.json", k))
                question_answer_metric.append(compute_ndcg(file_path + "/es_query_by_question_answer.json", k))
                question_answer_concat_metric.append(compute_ndcg(file_path + "/es_query_by_question_answer_concat.json", k))

                # compute P@k
                if i == 1:
                    metric_name.append("prec@" + str(k))

                answer_metric.append(compute_prec(file_path + "/es_query_by_answer.json", k))
                question_metric.append(compute_prec(file_path + "/es_query_by_question.json", k))
                question_answer_metric.append(compute_prec(file_path + "/es_query_by_question_answer.json", k))
                question_answer_concat_metric.append(compute_prec(file_path + "/es_query_by_question_answer_concat.json", k))

            # compute MAP
            if i == 1:
                metric_name.append("MAP" + str(k))

            answer_metric.append(compute_map(file_path + "/es_query_by_answer.json"))
            question_metric.append(compute_map(file_path + "/es_query_by_question.json"))
            question_answer_metric.append(compute_map(file_path + "/es_query_by_question_answer.json"))
            question_answer_concat_metric.append(compute_map(file_path + "/es_query_by_question_answer_concat.json"))
            

            output[method + "_answer"] = answer_metric
            output[method + "_question"] = question_metric
            output[method + "_question_answer"] = question_answer_metric
            output[method + "_question_answer_concat"] = question_answer_concat_metric

        else:
        # Compute metrics for the supervised method
            for loss in loss_type:
                for query_type in query_types:
                    
                    for neg_type in neg_types:
                        
                        file_path = rank_results_filepath + "/" + ranker + "/" + loss + "/" + query_type + "/" + neg_type
                        
                        if os.path.isdir(file_path):

                            method = loss + "_" + query_type + "_" + neg_type

                            answer_metric = []
                            question_metric = []
                            question_answer_metric = []
                            question_answer_concat_metric = []

                            for k in topk:

                                # compute NDCG@k
                                answer_metric.append(compute_ndcg(file_path + "/reranked_query_by_answer.json", k))
                                question_metric.append(compute_ndcg(file_path + "/reranked_query_by_question.json", k))
                                question_answer_metric.append(compute_ndcg(file_path + "/reranked_query_by_question_answer.json", k))
                                question_answer_concat_metric.append(compute_ndcg(file_path + "/reranked_query_by_question_answer_concat.json", k))

                                # compute P@k
                                answer_metric.append(compute_prec(file_path + "/reranked_query_by_answer.json", k))
                                question_metric.append(compute_prec(file_path + "/reranked_query_by_question.json", k))
                                question_answer_metric.append(compute_prec(file_path + "/reranked_query_by_question_answer.json", k))
                                question_answer_concat_metric.append(compute_prec(file_path + "/reranked_query_by_question_answer_concat.json", k))

                            # compute MAP
                            answer_metric.append(compute_map(file_path + "/reranked_query_by_answer.json"))
                            question_metric.append(compute_map(file_path + "/reranked_query_by_question.json"))
                            question_answer_metric.append(compute_map(file_path + "/reranked_query_by_question_answer.json"))
                            question_answer_concat_metric.append(compute_map(file_path + "/reranked_query_by_question_answer_concat.json"))
                            

                            output[method + "_answer"] = answer_metric
                            output[method + "_question"] = question_metric
                            output[method + "_question_answer"] = question_answer_metric
                            output[method + "_question_answer_concat"] = question_answer_concat_metric
                        else:
                            pass
                        
    data = dict()
    methods = []
    for method in output:
        metrics = output[method]
        metrics = ["{0:.4f}".format(x) for x in metrics]
        data[method] = metrics
        methods.append(method)

    # Generate evaluation DataFrame 
    df = pd.DataFrame.from_dict(data, orient='index')
    df.columns = ["NDCG@3", "NDCG@5","NDCG@10", "P@3",	"P@5", "P@10", "MAP"]
    df['Method'] = methods
    df = df[["Method", "NDCG@3", "NDCG@5","NDCG@10", "P@3",	"P@5", "P@10", "MAP"]]
    df.reset_index(drop=True, inplace=True)
    return df

In [5]:
# generate evaluation dataframe
rank_results_filepath="../../../BERT-FAQ/data/FAQIR/rank_results"
df = get_evaluation_df(rank_results_filepath)

In [6]:
df

Unnamed: 0,Method,NDCG@3,NDCG@5,NDCG@10,P@3,P@5,P@10,MAP
0,unsupervised_answer,0.2667,0.1431,0.2905,0.1212,0.3069,0.0916,0.1884
1,unsupervised_question,0.3798,0.206,0.4045,0.1726,0.4258,0.1303,0.2781
2,unsupervised_question_answer,0.3761,0.2029,0.4008,0.1708,0.4225,0.1327,0.2664
3,unsupervised_question_answer_concat,0.4318,0.2482,0.4573,0.213,0.4783,0.1606,0.3106
4,triplet_faq_simple_answer,0.4501,0.2787,0.4724,0.2455,0.4915,0.1801,0.3909
5,triplet_faq_simple_question,0.5409,0.3347,0.5643,0.2872,0.5774,0.2198,0.4571
6,triplet_faq_simple_question_answer,0.5378,0.335,0.5642,0.2921,0.5811,0.2286,0.4561
7,triplet_faq_simple_question_answer_concat,0.574,0.3652,0.5938,0.3167,0.6095,0.2474,0.4724
8,triplet_faq_hard_answer,0.3991,0.2291,0.4156,0.1858,0.4386,0.1368,0.3202
9,triplet_faq_hard_question,0.4889,0.2818,0.5145,0.2294,0.5335,0.168,0.3788


In [7]:
# dump StackFAQ results to csv file
df.to_csv(rank_results_filepath + "/results.csv", index=False)