## Libraries & PyTerrier Initialization

In [None]:
!pip install python-terrier
from google.colab import drive
from collections import defaultdict
from pathlib import Path
import pandas as pd
import glob, os
import gensim
import os
from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, strip_numeric
import pyterrier as pt
from pyterrier.measures import *
from bs4 import BeautifulSoup
import warnings
import regex as re
from pyterrier.measures import *
import numpy as np

# Start PyTerrier only if it is not already running in this session; the
# terrier-prf package is requested through boot_packages at start-up.
if not pt.started():
  pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])

# Mount Google Drive under /content/gdrive, forcing a re-mount if needed.
drive.mount("/content/gdrive", force_remount=True)
# Suppress every warning from this point on.
warnings.filterwarnings('ignore')
# Widen pandas' display: up to 300 characters per cell and no column limit.
pd.set_option('display.max_colwidth', 300)
pd.set_option('display.max_columns', None)


In [None]:
# User-supplied locations; fill these in before running the cells below.
# path_to_index: read with pd.read_csv as the TREC collection table.
path_to_index = ''
# path_to_query_variation: location of the selected query-variation topics
# used in the retrieval experiment.
path_to_query_variation = ''
# path_to_original_topics: text file with the raw <topic number="N"> lines.
path_to_original_topics = ''

#  Load TREC2021 Data (.csv), Queries (Raw Description), Qrels

In [None]:
# TREC collection: the first CSV column is the row index, missing cells are
# replaced by a single space, and "doc_id" is renamed to "docno".
collection = (
    pd.read_csv(path_to_index, index_col=0)
    .fillna(' ')
    .rename(columns={"doc_id": "docno"})
)
display(collection.head(2))

# Relevance judgements: header-less, space-separated "qid Q0 docno label"
# rows. The Q0 column is dropped and both identifiers are cast to strings.
qrels = (
    pd.read_csv(
        './qrels/trec_clc/qrels2021.txt',
        names=['qid', 'Q0', 'docno', 'label'],
        sep=" ",
        header=None,
    )
    .drop(columns=['Q0'])
    .astype({"qid": str, "docno": str})
)
display(qrels.head(2))

# Load topics

## Raw Description

In [None]:
# Load the raw queries: one `<topic number="N">...</topic>` element per line.
# The topic number is captured from the markup so that qids stay aligned with
# the qrels even when the file contains blank or non-topic lines (indexing the
# result of findall with [0] used to raise IndexError on such lines).
topic_pattern = re.compile(r'^<topic number=\"(\d+)\">(.*)</topic>$', re.DOTALL)

lid = []  # topic ids
ld = []   # topic descriptions
with open(path_to_original_topics, 'r', encoding='utf-8',
          errors='ignore') as document:
  for line in document:
    topic_match = topic_pattern.search(line)
    if topic_match is None:
      # Not a topic line (e.g. an empty trailing line): nothing to load.
      continue
    # int() normalises the id ("01" -> "1") the same way the integer-parsed
    # qrels ids are rendered once cast to str.
    lid.append(str(int(topic_match.group(1))))
    ld.append(topic_match.group(2))

desc = pd.DataFrame({'qid': lid, 'query': ld})

# Query processing: lower-case the text and remove punctuation.
desc["query"] = desc["query"].map(str.lower)
desc["query"] = desc["query"].map(strip_punctuation)
desc["qid"] = desc["qid"].astype(str)
display(desc.head(5))

## Load the pre-processed queries

In [None]:
# For topics where no medical entities exist.
def handle_empty_queries(query):
  """Return the query as text, or the placeholder 'None' when it is too short.

  The value is converted with ``str`` first; results of fewer than five
  characters (for example an empty string or 'nan') are replaced by the
  literal string 'None'.
  """
  text = str(query)
  return text if len(text) >= 5 else 'None'

def read_processed(path):
  """Load a pre-processed topics CSV as a DataFrame of string qid/query columns.

  The file's own header row is replaced by the column names ``qid`` and
  ``query``. Every query is passed through ``handle_empty_queries``.
  """
  topics = pd.read_csv(path, names=['qid', 'query'], sep=",", header=0)
  topics = topics.astype({"qid": str})
  topics["query"] = topics["query"].map(handle_empty_queries).astype(str)
  return topics


# Retrieval Experiments

Query formulation:
> 1.   Description Queries 
> 2.   Selected query variation

Retrieval Models: 
> 1.   BM25

Document Representation:
> 1.   Indexed all the available sections

In [None]:
# Initialize the retrieval model: BM25 with the Stopwords and PorterStemmer
# term pipeline.
# NOTE(review): `indexall` is not defined in the visible part of this
# notebook; it has to be bound to the index before this cell runs - confirm.
bm25 = pt.BatchRetrieve(
    indexall,
    wmodel="BM25",
    properties={"termpipelines": "Stopwords,PorterStemmer"},
)

## Retrieval with the various query variations
initial_res = bm25.transform(desc)
# transform() operates on a topics DataFrame (qid, query), not on a file
# path, so the query-variation CSV is loaded with read_processed first.
query_variation_topics = read_processed(path_to_query_variation)
query_version1 = bm25.transform(query_variation_topics)

# Evaluate both result sets against the qrels; system 0 (raw descriptions)
# is the baseline for the significance tests.
results_all_des = pt.Experiment(
    [
        initial_res,
        query_version1,
    ],
    desc,
    qrels,
    eval_metrics=[
        AP(rel=2)@1000,
        RR(rel=2)@1000,
        P(rel=2)@1,
        P(rel=2)@5,
        P(rel=2)@10,
        P(rel=2)@25,
        Rprec(rel=2),
        nDCG@5,
        nDCG@10,
        R(rel=2)@10,
        R(rel=2)@25,
        NumRet,
        NumRelRet(rel=2),
        NumRel,
        Bpref(rel=2),
    ],
    names=[
        "raw_desc",
        'query_variation',
    ],
    baseline=0,
    perquery=False,
    correction='b',
    highlight='color',
    # filter_by_topics = True
    # filter_by_qrels = True
)

display(results_all_des)

