In [55]:
%run /Users/jiamingqu/Desktop/proj/scripts/searching/evaluation.functions.ipynb
%run /Users/jiamingqu/Desktop/proj/scripts/data.modeling/training.tree.model.ipynb
%run /Users/jiamingqu/Desktop/proj/scripts/reranking/significance.testing.ipynb

In [56]:
import pandas as pd
import joblib
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import numpy as np

In [57]:
def aggregate_prob(prob):
    
    '''
    Aggregate the predicted probabilities
    Input: An array/list of probabilities of three classes
    Output: Aggregated probability
    '''
    
    # assign weights to three classes
    weights = [0, 2, 4]
    
    probability = 0
    for i in range(0,3):
        probability += prob[0][i] * weights[i]
        
    return probability

In [122]:
def reranking_retrieval_results_hard(year,ndocs_rerank=500):
    
    
    # read predited prob.
    df_features_high = pd.read_csv(str(year)+'.high.features.csv',sep=',')
    df_features_high['topicid'] = df_features_high['topicid'].astype(int)
    df_features_high['docid'] = df_features_high['docid'].astype(str)

    tree_model = joblib.load("../data.modeling/"+str(year)+".tree.model.balanced.pkl") 
    
                                   
    # features 
    feature_names = ['Human_PM', 'Animal_PM', 'Not_PM', 
                     'Disease_Exact','Disease_General', 'Disease_Specific', 'Disease_Not', 
                     'Gene_Exact','Gene_Missing', 'Gene_Missing_Variant', 'Gene_Diff_Variant',
                     'Demo_Match', 'Demo_Notdiscussed', 'Demo_Exclude']          
                                   
    # record results
    precision = []
    ap = []
    
    for topic in [*range(1,51)]:
        
        # read ground truth
        answer = read_answers(year,topic)

        # 500 results under a topic
        df_topic = df_features_high.loc[df_features_high.topicid == topic]
        df_topic = df_topic.head(500)

        # we only rerank the first n results
        df_topic_reanking = df_topic.head(ndocs_rerank)
        df_topic_noreanking = df_topic.tail(500-ndocs_rerank)

        # a dictionary of <docid, predicted_prob>
        doc_prob = dict()
        for index,rows in df_topic_reanking.iterrows():

            docid = str(df_topic_reanking.loc[index,"docid"])
            features = df_topic_reanking.loc[index, feature_names]

            # predict and append to the dict
            doc_prob[docid] = aggregate_prob(tree_model.predict_proba([features]))

        sorted_results = sorted(doc_prob.items(), key=lambda x:x[1], reverse=True)
        assert len(sorted_results) == ndocs_rerank
        sorted_docs = [x[0] for x in sorted_results]

        # add not reanked docs
        sorted_docs.extend(list(df_topic_noreanking.docid))

        precision_topic = calculate_precision(answer,sorted_docs,10)
        ap_topic = calculate_average_precision(answer,sorted_docs)
        precision.append(precision_topic)
        ap.append(ap_topic)

    return [precision,ap]

In [145]:
def reranking_retrieval_results_soft(year,ndocs_rerank=500):
    
    
    # read predited prob.
    df_features_high = pd.read_csv(str(year)+'.high.features.csv',sep=',')
    df_features_high['topicid'] = df_features_high['topicid'].astype(int)
    df_features_high['docid'] = df_features_high['docid'].astype(str)
    
    # features 
    feature_names = ['Human_PM', 'Animal_PM', 'Not_PM', 
                     'Disease_Exact','Disease_General', 'Disease_Specific', 'Disease_Not', 
                     'Gene_Exact','Gene_Missing', 'Gene_Missing_Variant', 'Gene_Diff_Variant',
                     'Demo_Match', 'Demo_Notdiscussed', 'Demo_Exclude']          
                                   
    # record results
    precision = []
    ap = []
    

    for topic in [*range(1,51)]:
        
        # read ground truth
        answer = read_answers(year,topic)

        # 500 results under a topic
        df_topic = df_features_high.loc[df_features_high.topicid == topic]
        df_topic = df_topic.head(500)

        # we only rerank the first n results
        df_topic_reanking = df_topic.head(ndocs_rerank)
        df_topic_noreanking = df_topic.tail(500-ndocs_rerank)

        # a dictionary of <docid, predicted_prob>
        doc_prob = dict()
        for index,rows in df_topic_reanking.iterrows():

            docid = str(df_topic_reanking.loc[index,"docid"])
            features = df_topic_reanking.loc[index, feature_names]
            
            results = probalistic_tree_balanced(dict(features))
            doc_prob[docid] = round(aggregate_prob([results]), 3)

        sorted_results = sorted(doc_prob.items(), key=lambda x:x[1], reverse=True)
        assert len(sorted_results) == ndocs_rerank
        sorted_docs = [x[0] for x in sorted_results]

        # add not reanked docs
        sorted_docs.extend(list(df_topic_noreanking.docid))

        precision_topic = calculate_precision(answer,sorted_docs,10)
        ap_topic = calculate_average_precision(answer,sorted_docs)
        precision.append(precision_topic)
        ap.append(ap_topic)

    return [precision,ap]

In [124]:
# reranking without bm25 scores

In [125]:
precision_tree_hard = reranking_retrieval_results_hard(2018)[0]
ap_tree_hard = reranking_retrieval_results_hard(2018)[1]
print(np.mean(precision_tree_hard), np.mean(ap_tree_hard))

0.574 0.24145502353628337


In [146]:
precision_tree_soft = reranking_retrieval_results_soft(2018)[0]
ap_tree_soft = reranking_retrieval_results_soft(2018)[1]
print(np.mean(precision_tree_soft), np.mean(ap_tree_soft))

0.562 0.23146851008452984


In [186]:
# reranking with bm25 scores

In [173]:
def reranking_retrieval_results_hard_withbm25(year,ndocs_rerank=500):
    
    # read predited prob.
    df_features_high = pd.read_csv(str(year)+'.high.features.csv',sep=',')
    df_features_high['topicid'] = df_features_high['topicid'].astype(int)
    df_features_high['docid'] = df_features_high['docid'].astype(str)
    
    initial_retrieval = pd.read_csv("../searching/"+str(year)+".searching/"+str(year)+".basic.query.result.txt",sep="\t")
    df_features_high['score'] = initial_retrieval['SCORE']
    
    # read tree model
    tree_model = joblib.load("../data.modeling/"+str(year)+".tree.model.balanced.pkl")
    
                                   
    # features 
    feature_names = ['Human_PM', 'Animal_PM', 'Not_PM', 
                     'Disease_Exact','Disease_General', 'Disease_Specific', 'Disease_Not', 
                     'Gene_Exact','Gene_Missing', 'Gene_Missing_Variant', 'Gene_Diff_Variant',
                     'Demo_Match', 'Demo_Notdiscussed', 'Demo_Exclude']          
                                   
    # record results
    precision = []
    ap = []
    recall = []
    rprec = []
    
    for topic in set(df_features_high.topicid):
        
        # read ground truth
        answer = read_answers(year,topic)

        # 500 results under a topic
        df_topic = df_features_high.loc[df_features_high.topicid == topic]
        df_topic = df_topic.head(500)
   
        # we only rerank the first n results
        df_topic_reanking = df_topic.head(ndocs_rerank)
        df_topic_noreanking = df_topic.tail(500-ndocs_rerank)
        
        df_score_reranking = pd.DataFrame(columns = ['docid','tree_prob', 'bm25'])

        # for reranking docs
        for index,rows in df_topic_reanking.iterrows():
            
            docid = str(df_topic_reanking.loc[index,"docid"])
            
            # load intial retrieval scores
            bm25_score = df_topic_reanking.loc[index,"score"]
            
            # using features to predict
            features = df_topic_reanking.loc[index, feature_names]
            tree_score = aggregate_prob(tree_model.predict_proba([features]))

            # add to dataframe
            df_score_reranking = df_score_reranking.append({'docid': docid,
                                                            'tree_prob': tree_score, 
                                                            'bm25': bm25_score},ignore_index=True)
            
        assert df_score_reranking.shape[0] == ndocs_rerank 

        # for unrerank docs
        for index,rows in df_topic_noreanking.iterrows():
            
            docid = str(df_topic_noreanking.loc[index,"docid"])
            # load intial retrieval scores
            bm25_score = df_topic_noreanking.loc[index,"score"]
            # add to dataframe
            df_score_reranking = df_score_reranking.append({'docid': docid,
                                                            'tree_prob': 0, 
                                                            'bm25': bm25_score},ignore_index=True)
            
        assert df_score_reranking.shape[0] == 500
        
        # scaling and adding two score
        scaler = MinMaxScaler()
        #df_score_reranking[['tree_prob','bm25']]=scaler.fit_transform(df_score_reranking[['tree_prob','bm25']])
        df_score_reranking['total_score'] = df_score_reranking['tree_prob'] + df_score_reranking['bm25']
        df_score_reranking.sort_values(by=['total_score'],inplace=True, ascending=False)
        sorted_docs = list(df_score_reranking.docid)

        precision.append(calculate_precision(answer,sorted_docs,10))
        ap.append(calculate_average_precision(answer,sorted_docs))
        recall.append(calculate_recall(answer,sorted_docs,100))
        rprec.append(calculate_r_precision(answer,sorted_docs))

    return [precision, ap, recall, rprec]

In [174]:
def reranking_retrieval_results_soft_withbm25(year,ndocs_rerank=500):
    
    # read predited prob.
    df_features_high = pd.read_csv(str(year)+'.high.features.csv',sep=',')
    df_features_high['topicid'] = df_features_high['topicid'].astype(int)
    df_features_high['docid'] = df_features_high['docid'].astype(str)
    
    initial_retrieval = pd.read_csv("../searching/"+str(year)+".searching/"+str(year)+".basic.query.result.txt",sep="\t")
    df_features_high['score'] = initial_retrieval['SCORE']
                                   
    # features 
    feature_names = ['Human_PM', 'Animal_PM', 'Not_PM', 
                     'Disease_Exact','Disease_General', 'Disease_Specific', 'Disease_Not', 
                     'Gene_Exact','Gene_Missing', 'Gene_Missing_Variant', 'Gene_Diff_Variant',
                     'Demo_Match', 'Demo_Notdiscussed', 'Demo_Exclude']          
                                   
    # record results
    # record results
    precision = []
    ap = []
    recall = []
    rprec = []
    
    for topic in set(df_features_high.topicid):
        
        # read ground truth
        answer = read_answers(year,topic)

        # 500 results under a topic
        df_topic = df_features_high.loc[df_features_high.topicid == topic]
        df_topic = df_topic.head(500)
        
        # we only rerank the first n results
        df_topic_reanking = df_topic.head(ndocs_rerank)
        df_topic_noreanking = df_topic.tail(500-ndocs_rerank)
        
        df_score_reranking = pd.DataFrame(columns = ['docid','tree_prob', 'bm25'])

        # for reranking docs
        for index,rows in df_topic_reanking.iterrows():
            
            docid = str(df_topic_reanking.loc[index,"docid"])
            
            # load intial retrieval scores
            bm25_score = df_topic_reanking.loc[index,"score"]
            
            # using features to predict
            features = df_topic_reanking.loc[index, feature_names]
            results = probalistic_tree_balanced(dict(features))
            tree_score = aggregate_prob_soft([results])

            # add to dataframe
            df_score_reranking = df_score_reranking.append({'docid': docid,
                                                            'tree_prob': tree_score, 
                                                            'bm25': bm25_score},ignore_index=True)
            
        assert df_score_reranking.shape[0] == ndocs_rerank 

        # for unrerank docs
        for index,rows in df_topic_noreanking.iterrows():
            
            docid = str(df_topic_noreanking.loc[index,"docid"])
            # load intial retrieval scores
            bm25_score = df_topic_noreanking.loc[index,"score"]
            # add to dataframe
            df_score_reranking = df_score_reranking.append({'docid': docid,
                                                            'tree_prob': 0, 
                                                            'bm25': bm25_score},ignore_index=True)
            
        assert df_score_reranking.shape[0] == 500
        
        # scaling and adding two score
        scaler = MinMaxScaler()
        df_score_reranking[['tree_prob','bm25']]=scaler.fit_transform(df_score_reranking[['tree_prob','bm25']])
        df_score_reranking['total_score'] = df_score_reranking['tree_prob'] + df_score_reranking['bm25']
        df_score_reranking.sort_values(by=['total_score'],inplace=True, ascending=False)
        sorted_docs = list(df_score_reranking.docid)

        precision.append(calculate_precision(answer,sorted_docs,10))
        ap.append(calculate_average_precision(answer,sorted_docs))
        recall.append(calculate_recall(answer,sorted_docs,100))
        rprec.append(calculate_r_precision(answer,sorted_docs))

    return [precision, ap, recall, rprec]

In [175]:
# add bm25 by min max scaling

In [176]:
precision_tree_hard_bm25, ap_tree_hard_bm25, recall_tree_hard_bm25, rprec_tree_hard_bm25 = \
    reranking_retrieval_results_hard_withbm25(2018)

In [177]:
precision_tree_soft_bm25, ap_tree_soft_bm25, recall_tree_soft_bm25, rprec_tree_soft_bm25 = \
    reranking_retrieval_results_soft_withbm25(2018)

In [178]:
np.mean(precision_tree_hard_bm25), np.mean(ap_tree_hard_bm25), np.mean(recall_tree_hard_bm25), np.mean(rprec_tree_hard_bm25)

(0.546, 0.2377849187143844, 0.3344221502141547, 0.3232313153621363)

In [179]:
np.mean(precision_tree_soft_bm25), np.mean(ap_tree_soft_bm25), np.mean(recall_tree_soft_bm25), np.mean(rprec_tree_soft_bm25)

(0.622, 0.260482763795428, 0.3600719402556245, 0.34631718256635335)

In [180]:
# significance testing

In [187]:
# hard vs soft

In [159]:
paired_permutation_test(precision_tree_hard_bm25, precision_tree_soft_bm25, isTwoSides=True)

0.00058

In [160]:
paired_permutation_test(ap_tree_hard_bm25, ap_tree_soft_bm25, isTwoSides=True)

1e-05

In [181]:
paired_permutation_test(rprec_tree_hard_bm25, rprec_tree_soft_bm25, isTwoSides=True)

5e-05

In [188]:
# ltr-low vs soft

In [163]:
paired_permutation_test(precision_ltr_low, ap_tree_soft_bm25, isTwoSides=True)

0.0

In [164]:
paired_permutation_test(ap_ltr_low, ap_tree_soft_bm25, isTwoSides=True)

2e-05

In [183]:
paired_permutation_test(rprec_ltr_low, rprec_tree_soft_bm25, isTwoSides=True)

0.00559