# TREC-2022
## Search and Evaluation by Topics and Terms

In [2]:
import typing
!pip install pandas

Collecting pandas
  Downloading pandas-1.5.2-cp38-cp38-win_amd64.whl (11.0 MB)
     ---------------------------------------- 11.0/11.0 MB 7.6 MB/s eta 0:00:00
Collecting numpy>=1.20.3
  Downloading numpy-1.23.5-cp38-cp38-win_amd64.whl (14.7 MB)
     ---------------------------------------- 14.7/14.7 MB 4.0 MB/s eta 0:00:00
Installing collected packages: numpy, pandas
Successfully installed numpy-1.23.5 pandas-1.5.2


In [4]:
!pip install elasticsearch

Collecting elasticsearch
  Downloading elasticsearch-8.5.2-py3-none-any.whl (385 kB)
     ------------------------------------- 385.3/385.3 kB 11.7 MB/s eta 0:00:00
Collecting elastic-transport<9,>=8
  Downloading elastic_transport-8.4.0-py3-none-any.whl (59 kB)
     ---------------------------------------- 59.5/59.5 kB 3.3 MB/s eta 0:00:00
Installing collected packages: elastic-transport, elasticsearch
Successfully installed elastic-transport-8.4.0 elasticsearch-8.5.2


In [6]:
!pip install tqdm

Collecting tqdm
  Downloading tqdm-4.64.1-py2.py3-none-any.whl (78 kB)
     ---------------------------------------- 78.5/78.5 kB 4.5 MB/s eta 0:00:00
Installing collected packages: tqdm
Successfully installed tqdm-4.64.1


In [18]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.7-py3-none-any.whl (1.5 MB)
     ---------------------------------------- 1.5/1.5 MB 7.9 MB/s eta 0:00:00
Collecting regex>=2021.8.3
  Downloading regex-2022.10.31-cp38-cp38-win_amd64.whl (267 kB)
     -------------------------------------- 267.7/267.7 kB 8.3 MB/s eta 0:00:00
Collecting joblib
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
     -------------------------------------- 298.0/298.0 kB 9.3 MB/s eta 0:00:00
Collecting click
  Downloading click-8.1.3-py3-none-any.whl (96 kB)
     ---------------------------------------- 96.6/96.6 kB ? eta 0:00:00
Installing collected packages: regex, joblib, click, nltk
Successfully installed click-8.1.3 joblib-1.2.0 nltk-3.7 regex-2022.10.31


In [21]:
import os
import xml.etree.ElementTree as ET
import pandas as pd
import ast

from elasticsearch import Elasticsearch
from tqdm import tqdm

In [22]:
_OUR_SERVER = "http://210.117.182.30:9200"

***
## 1. Import Datasets
### Load Topics

In [8]:
# Parse the topic XML and collect the text of every <topic> element.
# The file is opened in a `with` block so the handle is closed as soon as
# parsing finishes instead of staying open for the life of the kernel.
with open('datasets/topics-2022.xml', 'r', encoding='UTF8') as topic_fp:
    topic_file = ET.parse(topic_fp)
root = topic_file.getroot()

# root.iter() walks the whole tree, so <topic> elements at any depth are found.
topics = [topic.text for topic in root.iter("topic")]

print(f"Count: {len(topics)}")
print(f"Topic: {topics[0]}")

Count: 50
Topic: 
A 19-year-old male came to clinic with some sexual concern.  He recently engaged in a relationship and is worried about the satisfaction of his girlfriend. He has a "baby face" according to his girlfriend's statement and he is not as muscular as his classmates.  On physical examination, there is some pubic hair and poorly developed secondary sexual characteristics. He is unable to detect coffee smell during the examination, but the visual acuity is normal. Ultrasound reveals the testes volume of 1-2 ml. The hormonal evaluation showed serum testosterone level of 65 ng/dL with low levels of GnRH.



### Load Clinical & Bio Terms

In [9]:
# Each cell of the 'Problem' / 'Bio' columns holds a Python list literal stored
# as text ("[ ... ]"); ast.literal_eval turns that text back into a real list.
df = pd.read_csv('datasets/clinicalTerms-2022.csv')
clinical_terms = [ast.literal_eval(cell) for cell in df['Problem']]

df = pd.read_csv('datasets/bioTerms-2022.csv')
bio_terms = [ast.literal_eval(cell) for cell in df['Bio']]

In [15]:
# Sanity check: how many per-topic term lists were loaded, and the terms of the first topic.
print(f'Counts: {len(clinical_terms)}, Topic_1: {clinical_terms[0]}')
print(f'Counts: {len(bio_terms)}, Topic_1: {bio_terms[0]}')

Counts: 50, Topic_1: ['some pubic hair', 'poorly developed secondary sexual characteristics', 'coffee smell', 'low levels of GnRH']
Counts: 50, Topic_1: ['satisfaction', 'visual acuity', 'testosterone', 'GnRH']


### Load Expanded terms by QEModule

In [12]:
# Read the QEModule output and keep the 'Terms' column as a plain list,
# one entry per CSV row.
df = pd.read_csv('datasets/expandedTerms-2022.csv')
expanded_query = list(df['Terms'])

In [16]:
# Sanity check: number of expanded-term entries loaded and the entry for the first topic.
print(f'Counts: {len(expanded_query)}, Topic_1: {expanded_query[0]}')

Counts: 50, Topic_1: some pubic hair poorly developed secondary sexual characteristics coffee smell low levels of GnRHsatisfaction visual acuity testosterone GnRH fairly sexuality expressiveness heavily symphysis systems less primary drink primarily smells odor unique sex regrowth impairments schools skin imagery characteristic traits wine blond level higher


***
## 2. Elasticsearch

In [29]:
# Client for the single Elasticsearch node configured in _OUR_SERVER;
# request_timeout=60 is passed straight to the client constructor.
urls = [_OUR_SERVER]
es_host = Elasticsearch(urls, request_timeout=60)

# The list of fields to search
# The "^1.5" suffix on inclusion_criteria is Elasticsearch's per-field boost
# syntax: matches in that field are weighted 1.5x relative to the others.
es_fields = ["brief_title",
             "brief_summary",
             "detailed_description",
             "inclusion_criteria^1.5",
             "exclusion_criteria"]

# The name of index
es_index = "idx-clinical-study-final"

### Make queries for Elasticsearch

In [25]:
from methods import get_first_sent, find_terms, get_should_list

# es_bodies: one Elasticsearch request body per topic, in topic order
es_bodies = []

for i, topic_text in enumerate(topics):
    terms = bio_terms[i] + clinical_terms[i]
    normal_text = " ".join(terms)
    expanded_text = expanded_query[i]    # From QEModule; only referenced by the jbnu2 variant below

    # get_first_sent / find_terms come from the project's `methods` module.
    # NOTE(review): judging by their names they select the terms that occur in
    # the topic's first sentence - confirm against methods.py.
    first_sent = get_first_sent(topic_text)
    first_terms = list(set(find_terms(first_sent, terms)))   # Deduplication

    must_clauses = [
        # jbnu1
        { "multi_match": { "query": topic_text + " " + normal_text, "fields": es_fields, "analyzer": "std_analyzer"} },
        # jbnu2
        #{ "multi_match": { "query": topic_text + " " + expanded_text, "fields": es_fields, "analyzer": "std_analyzer"} },
    ]

    # Ask for the top 1000 hits and return only the nct_id field of each.
    es_bodies.append({
        "from" : 0,
        "size" : 1000,
        "_source": ["nct_id"],
        "query": {
            "bool": {
                "must": must_clauses,
                "should": get_should_list(terms + first_terms, es_fields)
            }
        }
    })

In [28]:
# Display the request body built for the first topic as a spot check.
es_bodies[0]

{'from': 0,
 'size': 1000,
 '_source': ['nct_id'],
 'query': {'bool': {'must': [{'multi_match': {'query': '\nA 19-year-old male came to clinic with some sexual concern.  He recently engaged in a relationship and is worried about the satisfaction of his girlfriend. He has a "baby face" according to his girlfriend\'s statement and he is not as muscular as his classmates.  On physical examination, there is some pubic hair and poorly developed secondary sexual characteristics. He is unable to detect coffee smell during the examination, but the visual acuity is normal. Ultrasound reveals the testes volume of 1-2 ml. The hormonal evaluation showed serum testosterone level of 65 ng/dL with low levels of GnRH.\n satisfaction visual acuity testosterone GnRH some pubic hair poorly developed secondary sexual characteristics coffee smell low levels of GnRH',
      'fields': ['brief_title',
       'brief_summary',
       'detailed_description',
       'inclusion_criteria^1.5',
       'exclusion_cri

### Search and Get responses from Elasticsearch

In [31]:
result_list = []

# Run one search per prepared request body (one body per topic); tqdm shows progress.
for es_body in tqdm(es_bodies):
    response = es_host.search(index=es_index, body=es_body)
    hits = response.get("hits").get("hits")

    # list of (nct_id, score), one per hit - at most 1000, the "size" set in each body.
    # The first element of the hit's nct_id field is taken as the document ID.
    # The name `nct_id` is used rather than `id`, which would shadow the builtin
    # for every later cell in the kernel.
    results = [
        (hit.get('_source').get('nct_id')[0], hit.get('_score'))
        for hit in hits
    ]

    result_list.append(results)

  res = es_host.search(index=es_index, body=es_bodies[i])
  res = es_host.search(index=es_index, body=es_bodies[i])
100%|██████████| 50/50 [01:21<00:00,  1.62s/it]


In [32]:
# Number of topics searched, and the last-ranked hit of the 1st topic
# (the 1000th search result when the full 1000 hits came back).
first_topic_hits = result_list[0] if result_list else []
print(f"Count: {len(result_list)}")
print(f"ID, Score: {first_topic_hits[-1] if first_topic_hits else None}")

ID, Score: 50
ID, Score: ('NCT00912470', 70.345894)


***
## 3. TREC_EVAL
### Write Result file

In [43]:
# RUN and SAVE file name
# Written as the last column of every result line and used in the names of the
# result and evaluation files.
run_name = "run_jbnu1"

In [51]:
# Result-file line format:
# TOPIC_NO Q0 ID RANK SCORE RUN_NAME
# TOPIC_NO: query (topic) number
# ID: document ID
# RANK: document rank (ranks 1-1000 in document-score order; at most 1000 documents)
# SCORE: document score
# RUN_NAME: name of the run (up to 12 alphabetic characters)
# ex) 1 Q0 NCT00760162 1 0.9999 my-run

def write_result_file(file, topic_no: int, result: typing.Iterable[typing.Tuple[str, float]],
                      run_tag: typing.Optional[str] = None):
    """Write one topic's ranked hits to `file` in the line format described above.

    Args:
        file: writable text file object; one line is written per hit.
        topic_no: topic number placed in the first column.
        result: (document ID, score) pairs, already in rank order.
        run_tag: value for the RUN_NAME column. When omitted, the notebook-level
            `run_name` variable is used, as before.
    """
    tag = run_name if run_tag is None else run_tag
    # Ranks are 1-based: the first pair in `result` gets rank 1.
    for rank, (doc_id, score) in enumerate(result, start=1):
        file.write(f"{topic_no} Q0 {doc_id} {rank} {score} {tag}\n")

def make_dir(dir_name: str):
    """Create directory `dir_name` (and any missing parents); do nothing if it already exists."""
    # exist_ok=True replaces the separate exists() check, closing the window in
    # which the directory could be created between the check and makedirs().
    os.makedirs(dir_name, exist_ok=True)

# Remember the directory the notebook was started in; later cells build
# absolute paths from it and chdir back to it after running trec_eval.
current_path = os.getcwd()
print(f'Now: {current_path}')

Now: C:\Users\Lee\PycharmProjects\search_script_for_TREC


In [58]:
trec_eval_path = f'{current_path}/trec_eval-9.0.7'

# Create the output directory under current_path, the same root result_file is
# built from, so the two agree even if the working directory has been changed.
make_dir(f'{current_path}/results')
result_file = f'{current_path}/results/{run_name}.txt'
qrels_file = f'{current_path}/datasets/qrels-2022.txt'

# Topics are numbered from 1, in the order their results appear in result_list.
with open(result_file, 'w') as file:
    for topic_no, each_result in enumerate(result_list, start=1):
        write_result_file(file, topic_no, each_result)

### Run TREC_EVAL program
#### Download -> https://trec.nist.gov/trec_eval/

In [61]:
# Move path to trec_eval repos.
os.chdir(trec_eval_path)

# Both file paths are double-quoted so the commands still work when the
# project directory contains spaces.
# Flags (per the trec_eval README): -q also prints per-query rows, -c averages
# over all queries in the qrels, -l 2 sets the minimum relevance level to 2,
# -m selects the measure.
# For non-ndcg
cmd_others = f'./trec_eval -c -q -l 2 "{qrels_file}" "{result_file}"'
# For ndcg
cmd_ndcg = f'./trec_eval -c -q -m ndcg_cut "{qrels_file}" "{result_file}"'

In [62]:
def _run_and_read(cmd: str) -> list:
    """Run `cmd` through the shell and return its standard output split into lines."""
    # The `with` block closes the pipe once the output has been read.
    with os.popen(cmd) as stream:
        return stream.read().split('\n')


def _summary_lines(lines: list) -> list:
    """Keep only the all-topic summary rows of trec_eval output.

    trec_eval prints rows of the form `measure  query_id  value`. With -q the
    per-query rows come first and the summary rows carry the query id "all".
    Selecting on that id replaces the fixed offsets 1350 and 450, which were
    only correct for exactly 50 evaluated topics (27 and 9 rows per topic).
    """
    return [line for line in lines if line.split()[1:2] == ['all']]


try:
    eval_others = _run_and_read(cmd_others)
    eval_ndcg = _run_and_read(cmd_ndcg)
finally:
    # Move back to origin repos. even if running or reading a command failed,
    # so later cells do not run from inside the trec_eval directory.
    os.chdir(current_path)

# Write evaluated file: non-ndcg summary rows first, then the ndcg_cut rows.
make_dir('evaluations')
with open(f'evaluations/eval_{run_name}.txt', 'w') as f:
    for line in _summary_lines(eval_others) + _summary_lines(eval_ndcg):
        f.write(line + '\n')