# Workflow

Run the baseline experiments on the clinical collection.

## Initialize PyTerrier

In [None]:
!pip install python-terrier
from google.colab import drive
from collections import defaultdict
from pathlib import Path
import pandas as pd
import os
import numpy as np
import gensim
import numpy as np
from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, strip_numeric
import pyterrier as pt
from pyterrier.measures import *
from bs4 import BeautifulSoup
import warnings
import json
# Silence every warning so cell outputs stay readable.
# NOTE(review): this also hides deprecation warnings from pandas/PyTerrier.
warnings.filterwarnings('ignore')
# Do not truncate long cell values (e.g. full query strings) when displaying DataFrames.
pd.set_option('display.max_colwidth', None)

# Start PyTerrier only once per kernel, requesting the terrier-prf package at boot.
if not pt.started():
  pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])

# Mount Google Drive under /content/gdrive (Colab only), remounting if it is already mounted.
drive.mount("/content/gdrive", force_remount=True)

## Paths

In [None]:
# Locations of the experiment inputs; relative paths are resolved against the
# notebook's working directory.
# TODO: point this at the clinical collection .csv before running -- it is
# passed straight to pd.read_csv, which cannot read an empty path.
path_to_clinical_collection = ''
path_to_topics = './experiments/topics/cds_clinical/topics-2014_2015-description.topics'
path_to_adhoc_human = './experiments/topics/cds_clinical/adhoc-queries.json'
# Fixed: a stray double-quote used to sit inside this literal, making the
# filename end in '.txt"' so the qrels file could never be found.
path_to_qrels = './experiments/qrels/cds_clinical/qrels-clinical_trials.txt'

#  Load Clinical Collection (.csv), Queries (Desc. and human adhoc), Qrels

In [None]:
# Collection
# Read the clinical collection CSV into a DataFrame and preview its first row
# as a quick check of the loaded columns.
collection = pd.read_csv(path_to_clinical_collection)
display(collection.head(1))

# Load Queries Description

In [None]:
 with open(path_to_topics, 'r', encoding='utf-8',
                    errors='ignore') as document:
  d = document.read()
  soup = BeautifulSoup(d, 'xml')
  qid = soup.find_all('NUM')
  query = soup.find_all('TITLE')
  lq = []
  for i in qid: 
    lq.append(i.text)
  ld = []
  for x in query: 
    ld.append(x.text)

desc_tr = pd.DataFrame({'qid': lq,'query': ld})
display(desc_tr['query'].head())


print("Description")
desc_tr["query"] = desc_tr["query"].apply(lambda x: x.lower())
desc_tr["query"] = desc_tr["query"].apply(lambda x: strip_punctuation(x))
display(desc_tr.head(2))

# Load Queries Human generated ad-hoc

In [None]:
# Load the human-generated ad-hoc queries. Each JSON entry becomes one row
# holding the topic id and the list of keyword strings found under its
# 'keywords' entries.
with open(path_to_adhoc_human, 'r', encoding='utf-8') as adhoc_file:
  adhoc_topics = json.load(adhoc_file)

# Collect plain records and build the frame once: DataFrame.append was removed
# in pandas 2.0 and re-copied the whole frame on every iteration.
adhoc_records = [
    {
        # Drop the 'trec' prefix and the dashes from the raw id.
        'qid': topic['qId'].replace('trec', '').replace('-', ''),
        'query': [entry['keywords'] for entry in topic['keywords']],
    }
    for topic in adhoc_topics
]
# Explicit columns keep the expected schema even if the file holds no topics.
adhoc_all = pd.DataFrame(adhoc_records, columns=['qid', 'query'])

def keep_unique_qterms(list_of_keywords):
  """Merge keyword strings into one query that contains each term once.

  Args:
    list_of_keywords: iterable of whitespace-separated keyword strings.

  Returns:
    A single space-joined string of the distinct terms, in order of first
    occurrence. Comparison is exact (case-sensitive).
  """
  words = ' '.join(list_of_keywords).split()
  # dict keys keep insertion order, so the result is identical on every run;
  # a set would emit the terms in an arbitrary, run-dependent order.
  return ' '.join(dict.fromkeys(words))

# Collapse each topic's keyword list into a single query string of unique terms.
# Normalisation (lower-casing, punctuation stripping) runs BEFORE de-duplication:
# doing it afterwards let variants such as "Pain"/"pain" survive as separate
# terms and then collapse into repeated terms in the final query.
adhoc_all['query'] = adhoc_all['query'].apply(
    lambda keywords: keep_unique_qterms(
        [strip_punctuation(' '.join(keywords).lower())]))
print("ad-hoc - all merged - unique terms")
display(adhoc_all.head(2))

# Load pre-processed Queries

In [None]:
def handle_empty_queries(query):
  """Return the query as a string, or the placeholder 'None' if it is too short.

  The value is converted with str(); anything whose string form has fewer
  than five characters (including '' and a NaN cell, which becomes 'nan')
  is replaced by the literal string 'None'.
  """
  text = str(query)
  return 'None' if len(text) < 5 else text

def read_processed(path):
  """Load a CSV of pre-processed topics as a (qid, query) DataFrame.

  The file's first row is consumed as a header and its columns are renamed to
  'qid' and 'query'. Both columns are returned as strings, and each query is
  passed through handle_empty_queries.
  """
  topics = pd.read_csv(path, sep=",", header=0, names=['qid', 'query'])
  return topics.assign(
      qid=topics["qid"].astype(str),
      query=topics["query"].apply(handle_empty_queries).astype(str),
  )


# Reformulated topic set "Q01_bert_problems_treats_test", loaded as a (qid, query) frame via read_processed.
Q01_bert_problems_treats_test = read_processed('./experiments/topics/cds_clinical/reformulated_topics/Q01_bert_problems_treats_test.csv')



# Experiments

Query formulation:
> 1.   Description Queries 
> 2.   Human Generated ad-hoc


Retrieval Models: 
> 1.   BM25

Document Representation:
> 1.   Indexed all the available meaningful sections

In [None]:
# Index
# NOTE: `index` is not defined anywhere in this cell -- build or load the
# Terrier index here before running the statements below.
##Code here

# Retrieval model: BM25 with the "Stopwords,PorterStemmer" term pipeline.
bm25 = pt.BatchRetrieve(
    index,
    wmodel="BM25",
    properties={"termpipelines": "Stopwords,PorterStemmer"},
)

# Retrieve once per query formulation.
initial_res = bm25.transform(desc_tr)
query_version1 = bm25.transform(Q01_bert_problems_treats_test)
initial_res_adhoc = bm25.transform(adhoc_all)

# Evaluate the three result sets against the qrels. baseline=0 compares every
# run with the first one (raw descriptions); correction='b' requests a
# multiple-testing correction (presumably Bonferroni -- confirm in the
# pt.Experiment docs).
results_all_des = pt.Experiment(
    [initial_res, query_version1, initial_res_adhoc],
    desc_tr,
    pt.io.read_qrels(path_to_qrels),
    eval_metrics=[
        AP(rel=1)@1000,
        RR(rel=1)@1000,
        P(rel=1)@1,
        P(rel=1)@5,
        P(rel=1)@10,
        P(rel=1)@25,
        Rprec(rel=1),
        nDCG@5,
        nDCG@10,
        R(rel=1)@10,
        R(rel=1)@25,
        NumRet,
        NumRelRet(rel=1),
        NumRel,
        Bpref(rel=1),
    ],
    names=["raw_description", 'Q01_bert_problems_treats_test', 'human_adhoc'],
    baseline=0,
    perquery=False,
    correction='b',
    highlight='color',
    # Optional filters, currently disabled:
    # filter_by_topics = True
    # filter_by_qrels = True
)

display(results_all_des)