In [None]:
# retrieve the data
!git clone https://github.com/iai-group/DBpedia-Entity.git

In [2]:
# imports
import time
import numpy as np
import pandas as pd
import sklearn.metrics as metrics
from sklearn.ensemble import RandomForestRegressor

In [3]:
qrels = pd.read_csv('DBpedia-Entity/collection/v2/qrels-v2.txt', sep='\t',names = ['query_id', 'Q', 'tag', 'rel'])
bm25f_run = pd.read_csv('DBpedia-Entity/runs/v2/bm25f-ca_v2.run', sep='\s',header=None, names=['query_id', 'Q', 'tag', 'rank', 'scoring', 'algorithm'])
fsdm_run = pd.read_csv('DBpedia-Entity/runs/v2/fsdm-elr_v2.run', sep='\s',header=None, names=['query_id', 'Q', 'tag', 'rank', 'scoring', 'algorithm'])

print(qrels.shape, bm25f_run.shape, fsdm_run.shape)

  
  This is separate from the ipykernel package so we can avoid doing imports until
(49280, 4) (466022, 6) (466022, 6)


In [12]:
def process_ranks(q_id):
    """ 
    Extract the rankings of FSDM+ELR and BM25F-CA and 
    correspond the relevancy labels for a given query id
    """
    fsdm_qid = fsdm_run[fsdm_run['query_id']==q_id]
    bm25f_qid = bm25f_run[bm25f_run['query_id']==q_id]
    fsdm_bm25f = fsdm_qid.merge(bm25f_qid, how='outer', on='tag')
    fsdm_bm25f.rename(columns={'rank_x':'fsdm_rank', 'rank_y':'bm25f_rank'}, inplace=True)
    fsdm_bm25f['fsdm_rank'] = np.where(fsdm_bm25f['fsdm_rank'].isna(), 1001, fsdm_bm25f['fsdm_rank'])
    fsdm_bm25f['bm25f_rank'] = np.where(fsdm_bm25f['bm25f_rank'].isna(), 1001, fsdm_bm25f['bm25f_rank'])
    fsdm_bm25f = fsdm_bm25f[['tag', 'fsdm_rank', 'bm25f_rank']]
    fsdm_bm25f = fsdm_bm25f.merge(qrels[qrels['query_id']==q_id][['rel','tag']], on='tag', how='left')
    fsdm_bm25f['rel'] = np.where(fsdm_bm25f['rel'].isna(), 0, fsdm_bm25f['rel'])
    X = fsdm_bm25f[['fsdm_rank', 'bm25f_rank']]
    y = fsdm_bm25f['rel']
    return X, y

In [13]:
def precision(query_relevancy_labels, k):
    return (np.sum(query_relevancy_labels[:k]))/k
    
def DCG(query_relevancy_labels, k):
    dcg = 0
    for i in range(min(len(query_relevancy_labels), k)):
      dcg+= query_relevancy_labels[i]/np.log2(i+2)
    return dcg

def NDCG(query_relevancy_labels, k):
    if sum(query_relevancy_labels)==0:   
      return 0
    return DCG(query_relevancy_labels, k) / DCG(np.sort(query_relevancy_labels)[::-1][:k], len(np.sort(query_relevancy_labels)[::-1][:k]))

def evaluate(query_relevancy_labels):
    precisionat10 = precision(query_relevancy_labels,10)
    precisionat100 = precision(query_relevancy_labels,100)
    ndcgat10 = NDCG(query_relevancy_labels,10)
    ndcgat100 = NDCG(query_relevancy_labels,100)
    return precisionat10, precisionat100, ndcgat10, ndcgat100

In [22]:
def pipeline_per_query(query_type):
    """
    Trains and evaluates a Random Forest Regressor for a given query type
    """
    splits_per_fold = pd.read_json('DBpedia-Entity/collection/v2/folds/'+query_type+'.json')
    results_per_fold = []
    for fold in range(5):
        train_qids = splits_per_fold[fold]['training']
        test_qids = splits_per_fold[fold]['testing']

        ################ MODEL TRAINING ################
        models = []
        start_time = time.time()
        for q_id in train_qids:
            X_train, y_train = process_ranks(q_id)

            rf = RandomForestRegressor()
            rf.fit(X_train,y_train)
            models.append(rf)
        print("Training completed in %s seconds"%(time.time()-start_time))

        ################ MODEL TESTING ################
        start_time = time.time()
        scores_per_qid = []
        for query in test_qids:
            for m, model in enumerate(models):
                X_test, y_test = process_ranks(query)
                y_pred = model.predict(X_test)
                y_pred_sorted = np.argsort(y_pred)[::-1]
                query_relevancy_labels = y_test[y_pred_sorted]
                scores_per_qid.append(evaluate(query_relevancy_labels))
        precision10 = sum([x[0] for x in scores_per_qid]) / len(scores_per_qid)
        precision100 = sum([x[1] for x in scores_per_qid]) / len(scores_per_qid)
        ndcg10 = sum([x[2] for x in scores_per_qid]) / len(scores_per_qid)
        ndcg100 = sum([x[3] for x in scores_per_qid]) / len(scores_per_qid)
        results_per_fold.append([precision10, precision100, ndcg10, ndcg100])
        print("Evaluation of fold {} took {} seconds".format(fold, time.time()-start_time))

    print(query_type)
    precision10 = sum([x[0] for x in results_per_fold]) / len(results_per_fold)
    print("Averaged Precision@10", precision10)
    precision100 = sum([x[1] for x in results_per_fold]) / len(results_per_fold)
    print("Averaged Precision@100", precision100)
    ndcg10 = sum([x[2] for x in results_per_fold]) / len(results_per_fold)
    print("Averaged nDCG@10", ndcg10)
    ndcg100 = sum([x[3] for x in results_per_fold]) / len(results_per_fold)
    print("Averaged nDCG@100", ndcg100)

    with open(query_type+'_results.txt', 'w') as f:
        f.write("P@10"+str(precision10)+"\n P@100"+ str(precision100)+ "\n nDCG@10"+ str(ndcg10)+ "\n nDCG@100"+ str(ndcg100))

query_types = ['SemSearch-ES', 'INEX-LD', 'QALD2', 'ListSearch', 'all_queries']
for q_type in query_types:
    pipeline_per_query(q_type) 

Training completed; that took 18.059690713882446 seconds
Evaluation of fold 0 took 142.4795274734497 seconds
Training completed; that took 16.57797145843506 seconds
Evaluation of fold 1 took 141.42291975021362 seconds
Training completed; that took 16.594184637069702 seconds
Evaluation of fold 2 took 140.486074924469 seconds
Training completed; that took 16.708476543426514 seconds
Evaluation of fold 3 took 140.50203657150269 seconds
Training completed; that took 16.892897844314575 seconds
Evaluation of fold 4 took 140.60868191719055 seconds
ListSearch
Averaged Precision@10 0.468204158790171
Averaged Precision@100 0.18843100189035916
Averaged nDCG@10 0.43674963988337384
Averaged nDCG@100 0.5341811804457615


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=882fec53-9a12-496b-9dec-6a7521f3b721' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>