# Clinical Decision Support (CDS) 2014 and 2015 task
Given a case report, retrieve full-text biomedical articles that answer questions related to several types of clinical information needs.

The clinical Collection exploits the same topics for clinical trials retrieval.

# Install libraries, run utility functions, and import the queries (raw topics)

In [None]:
# https://ir-datasets.com/pmc.html#pmc/v1/trec-cds-2014
!pip install ir_datasets
!pip install python-terrier


In [None]:
import regex as re
import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()
import spacy
import json
import string
import warnings
warnings.filterwarnings("ignore")
import ir_datasets
from bs4 import BeautifulSoup
import regex as re
import gensim
import os
from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, strip_numeric
from pyterrier.measures import *

import pyterrier as pt

# Initialise PyTerrier only if it has not been started in this session yet;
# the boot package adds the terrier-prf plugin to the Terrier classpath.
if not pt.started():
  pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])

# Mount Google Drive under /content/gdrive (Colab only); force_remount=True
# re-mounts even when a mount is already present.
# NOTE(review): the paths used below are relative ('./experiments/...'), so the
# working directory presumably has to be changed to the Drive folder -- confirm.
from google.colab import drive
drive.mount("/content/gdrive", force_remount=True)


## Paths

In [None]:
# Input files for the experiments, relative to the current working directory.
path_to_topics = './experiments/topics/cds_clinical/topics-2014_2015-description.topics'
path_to_adhoc_human = './experiments/topics/cds_clinical/adhoc-queries.json'
# A stray double quote used to sit inside this literal (".txt\"'"), which made
# the path point at a file name that cannot exist.
path_to_qrels = './experiments/qrels/cds_clinical/qrels-clinical_trials.txt'

# Import the new collection for medical literature retrieval

In [None]:
dataset = pt.get_dataset('irds:pmc/v1')

# Index pmc/v1 -- the document collection shared by the 2014 and 2015 topics.
index_path = './indices/pmc_v1'
index_properties = os.path.join(index_path, 'data.properties')

if os.path.exists(index_properties):
    # The index is already on disk: reuse it. Re-running the indexer against an
    # existing index is rejected by PyTerrier (unless overwrite is requested)
    # and would otherwise repeat a very long indexing job.
    index_ref = pt.IndexRef.of(index_properties)
else:
    indexer = pt.IterDictIndexer(index_path)
    index_ref = indexer.index(
        dataset.get_corpus_iter(),
        fields=['journal', 'title', 'abstract', 'body'],
    )

# Load the index and print its collection statistics and metadata keys.
index = pt.IndexFactory.of(index_ref)
print(index.getCollectionStatistics().toString())
print(index.getMetaIndex().getKeys())


##To save the documents: 
# dataset = ir_datasets.load("pmc/v1/trec-cds-2014")
# collection  = pd.DataFrame(columns=['doc_id','title','abstract','body','journal'])
# for doc in dataset.docs_iter():
#   collection = collection.append({
#             'doc_id': doc.doc_id,
#             'title': doc.title,
#             'abstract': doc.abstract,
#             'body': doc.body,
#             'journal': doc.journal,
#             },ignore_index=True)
# display(collection.sample(45))
# display(collection.head(20))
# collection.to_csv('./2014_2015_PRECTRECDATA.csv')

## Import the raw topics

In [None]:
 with open(path_to_topics, 'r', encoding='utf-8',
                    errors='ignore') as document:
  d = document.read()
  soup = BeautifulSoup(d, 'xml')
  qid = soup.find_all('NUM')
  query = soup.find_all('TITLE')
  lq = []
  for i in qid: 
    lq.append(i.text)
  ld = []
  for x in query: 
    ld.append(x.text)

desc_tr = pd.DataFrame({'qid': lq,'query': ld})
display(desc_tr['query'].head())


print("Description")
desc_tr["query"] = desc_tr["query"].apply(lambda x: x.lower())
desc_tr["query"] = desc_tr["query"].apply(lambda x: strip_punctuation(x))
display(desc_tr.head(2))

## Import the human-generated ad hoc queries

In [None]:
# Load the human-written ad hoc queries. Every JSON entry provides a 'qId' and
# a list under 'keywords' whose items each hold one 'keywords' string.
with open(path_to_adhoc_human, 'r', encoding='utf-8') as document:
    adhoc_topics = json.load(document)

# Build all rows first and create the frame once: DataFrame.append was removed
# in pandas 2.0 and copied the whole frame on every call.
adhoc_rows = [
    {
        # Reduce ids to their bare form by dropping the 'trec' prefix and dashes.
        'qid': topic['qId'].replace('trec', '').replace('-', ''),
        'query': [entry['keywords'] for entry in topic['keywords']],
    }
    for topic in adhoc_topics
]
adhoc_all = pd.DataFrame(adhoc_rows, columns=['qid', 'query'])

def keep_unique_qterms(list_of_keywords):
    """Merge keyword strings into one query that contains each term once.

    The strings are joined, split on whitespace and de-duplicated. Terms keep
    the order of their first occurrence, so the result is identical on every
    run (joining a ``set`` yields an order that varies between interpreter
    sessions).

    Args:
        list_of_keywords: iterable of whitespace-separated keyword strings.

    Returns:
        A single space-separated string of unique terms ('' for empty input).
    """
    words = ' '.join(list_of_keywords).split()
    # dict.fromkeys removes duplicates while preserving insertion order.
    return ' '.join(dict.fromkeys(words))

# Collapse each topic's keyword lists into a single query of unique terms.
# Lower-casing and punctuation stripping happen BEFORE de-duplication;
# normalising afterwards let variants such as "Fever"/"fever" (or terms that
# only differ by punctuation) survive as duplicates in the "unique" query.
adhoc_all['query'] = adhoc_all['query'].apply(
    lambda keywords: keep_unique_qterms(
        [strip_punctuation(keyword.lower()) for keyword in keywords]
    )
)
print("ad-hoc - all merged - unique terms")
display(adhoc_all.head(2))

## Import the automatically synthesized topics

In [None]:
def handle_empty_queries(query):
    """Return ``query`` as a string, or the placeholder 'None' if it is too short.

    The value is converted with ``str`` first; anything shorter than five
    characters is replaced by the literal string 'None'.
    """
    text = str(query)
    return 'None' if len(text) < 5 else text

def read_processed(path):
    """Read a reformulated-topics CSV into a frame with string 'qid'/'query' columns.

    The first line of the file is consumed as a header and its column names
    are replaced by 'qid' and 'query'. Queries are passed through
    ``handle_empty_queries`` before being cast to ``str``.
    """
    frame = pd.read_csv(path, sep=",", header=0, names=['qid', 'query'])
    frame["qid"] = frame["qid"].astype(str)
    frame["query"] = frame["query"].apply(handle_empty_queries).astype(str)
    return frame


# Load the automatically reformulated topic set from its CSV file
# (columns are normalised to string 'qid' / 'query' by read_processed).
Q01_bert_problems_treats_test = read_processed('./experiments/topics/cds_clinical/reformulated_topics/Q01_bert_problems_treats_test.csv')

# Retrieval Experiments on the 2014 Collection/Topics

In [None]:
# BM25 retriever over the pmc/v1 index with stopword removal and Porter stemming.
bm25 = pt.BatchRetrieve(index_ref, wmodel="BM25", properties={"termpipelines": "Stopwords,PorterStemmer"})
dataset2 = pt.get_dataset('irds:pmc/v1/trec-cds-2014')

# Retrieval with the query variations; the first 30 topics belong to the 2014
# collection. The frames are `desc_tr` and `adhoc_all`: the previously used
# names `desc_zu` / `adhoc_all_zu` are not defined anywhere in this notebook
# and raised a NameError.
initial_res = bm25.transform(desc_tr[:30])
query_version1 = bm25.transform(Q01_bert_problems_treats_test[:30])
initial_res_adhoc = bm25.transform(adhoc_all[:30])

# Compare the three pre-computed runs against the 2014 qrels; run 0 (raw
# description) is the baseline for the significance tests.
# NOTE(review): NumRel is used without rel=2 while every other thresholded
# measure uses rel=2 -- confirm this is intended.
display(pt.Experiment(
    [
        initial_res,
        query_version1,
        initial_res_adhoc,
    ],
    adhoc_all[:30],
    dataset2.get_qrels(),
    names=[
        "raw_description",
        'Q01_bert_problems_treats_test',
        'human_adhoc',
    ],
    eval_metrics=[AP(rel=2)@1000,RR(rel=2)@1000,P(rel=2)@1,P(rel=2)@5,P(rel=2)@10,P(rel=2)@25,Rprec(rel=2),nDCG@5,nDCG@10,R(rel=2)@10,R(rel=2)@25,NumRet,NumRelRet(rel=2),NumRel,Bpref(rel=2)],
    baseline=0,
    # perquery = False,
    correction='b',
    highlight='color',
    # filter_by_topics = False,
    # filter_by_qrels = False
))

# Retrieval Experiments on the 2015 Collection/Topics
https://ir-datasets.com/pmc.html#pmc/v1/trec-cds-2015

In [None]:
dataset3 = pt.get_dataset('irds:pmc/v1/trec-cds-2015')
dataset4 = ir_datasets.load("pmc/v1/trec-cds-2015")
# BM25 retriever over the pmc/v1 index with stopword removal and Porter stemming.
bm25 = pt.BatchRetrieve(index_ref, wmodel="BM25", properties={"termpipelines": "Stopwords,PorterStemmer"})

# Retrieval with the query variations; topics from position 30 onwards belong
# to the 2015 collection. The frames are `desc_tr` and `adhoc_all`: the
# previously used names `desc_zu` / `adhoc_all_zu` are not defined anywhere in
# this notebook and raised a NameError.
initial_res = bm25.transform(desc_tr[30:])
query_version1 = bm25.transform(Q01_bert_problems_treats_test[30:])
initial_res_adhoc = bm25.transform(adhoc_all[30:])

# Compare the three pre-computed runs against the 2015 qrels; run 0 (raw
# description) is the baseline for the significance tests.
display(pt.Experiment(
    [
        initial_res,
        query_version1,
        initial_res_adhoc,
    ],
    Q01_bert_problems_treats_test[30:],
    dataset3.get_qrels(),
    names=[
        "raw_description",
        'Q01_bert_problems_treats_test',
        'human_adhoc',
    ],
    eval_metrics=[AP(rel=1)@1000,RR(rel=1)@1000,P(rel=1)@1,P(rel=1)@5,P(rel=1)@10,P(rel=1)@25,Rprec(rel=1),nDCG@5,nDCG@10,R(rel=1)@10,R(rel=1)@25,NumRet,NumRelRet(rel=1),NumRel,Bpref(rel=1)],
    baseline=0,
    perquery=False,
    correction='b',
    highlight='color',
    filter_by_topics=True
    # filter_by_qrels = True
))