## Elasticsearch Main Source Code

### Find Phrases

In [1]:
# Load the per-topic key phrases from the edited query XML file.
# get_phrases is a project helper; judging by the recorded output it returns
# one list of phrase strings per topic — TODO confirm against Files_to_topics.
from Files_to_topics import get_phrases

path = "./topics/2021_query_CT_edit.xml"
phrase_query = get_phrases(path)

# Sanity check: number of entries and the phrases of the first topic.
print(f"Count: {len(phrase_query)}")
print(f"Topic: {phrase_query[0]}")

Count: 75
Topic: ['anaplastic astrocytoma', 'urinary retention', 'chronic pain', 'anaplastic astrocytoma', 'urinary retention', 'anaplastic astrocytoma']


### Find Topics

In [2]:
# Load the topic descriptions from the official topics XML file.
# get_topics is a project helper; the recorded output shows each entry is the
# free-text patient description of one topic.
from Files_to_topics import get_topics

# NOTE: this rebinds `path`, which the phrase-loading cell also assigns.
path = "./topics/topics2021.xml"
topics = get_topics(path)
    
# Sanity check: number of topics and the text of the first one.
print(f"Count: {len(topics)}")
print(f"Topic: {topics[0]}")

Count: 75
Topic: 
Patient is a 45-year-old man with a history of anaplastic astrocytoma of the spine complicated by severe lower extremity weakness and urinary retention s/p Foley catheter, high-dose steroids, hypertension, and chronic pain. The tumor is located in the T-L spine, unresectable anaplastic astrocytoma s/p radiation. Complicated by progressive lower extremity weakness and urinary retention. Patient initially presented with RLE weakness where his right knee gave out with difficulty walking and right anterior thigh numbness. MRI showed a spinal cord conus mass which was biopsied and found to be anaplastic astrocytoma. Therapy included field radiation t10-l1 followed by 11 cycles of temozolomide 7 days on and 7 days off. This was followed by CPT-11 Weekly x4 with Avastin Q2 weeks/ 2 weeks rest and repeat cycle. 



### Find Patient Information (Gender and Age)

In [3]:
# NOTE: gender_query and year_query are currently NOT used when building the
# search queries; this cell only extracts and prints them.

from Topics_to_patients import get_genders

# Project helper applied to the topic texts; indexed per topic below.
gender_query = get_genders(topics)
print(f"Gender: {gender_query[0]}")

from Topics_to_patients import get_years

# Presumably the patient's age in years (the recorded output shows 45 for the
# "45-year-old" patient of topic 1) — TODO confirm against Topics_to_patients.
year_query = get_years(topics)
print(f"Year: {year_query[0]}")

Gender: Male
Year: 45


### Elasticsearch#1 Making Query

In [4]:
from elasticsearch import Elasticsearch

# NOTE(review): the cluster address is hard-coded; consider reading it from
# configuration or an environment variable before sharing this notebook.
urls = ["http://210.117.182.30:9200"]
# Client shared by every search below; request_timeout is presumably in
# seconds — confirm against the installed elasticsearch-py version.
es_host = Elasticsearch(urls, request_timeout=60)

# The list of fields to search.
# These strings are passed as the "fields" of a multi_match query, where a
# "^<number>" suffix is a per-field boost: matches in inclusion_criteria are
# weighted 1.5x relative to the other fields.
es_fields = ["brief_title",
            "brief_summary",
            "detailed_description",
            "inclusion_criteria^1.5",
            "exclusion_criteria"]

# The name of the index that is searched.
es_index = "idx-clinical-study-v5"

In [5]:
from Query_functions import get_should_list

# topics and phrase_query are loaded from two different XML files and are
# paired purely by position. Fail fast if they are not aligned instead of
# stopping midway with an IndexError or silently ignoring extra phrase lists.
if len(phrase_query) != len(topics):
    raise ValueError(
        f"topics ({len(topics)}) and phrase_query ({len(phrase_query)}) "
        "must have the same length"
    )

# es_bodies: one search request body per topic, in topic order.
es_bodies = []
for topic_text, topic_phrases in zip(topics, phrase_query):
    # Optional clauses built by the project helper from the topic's key phrases.
    phrases = get_should_list(topic_phrases, es_fields)
    body = {
        "from": 0,
        # The TREC run format used later ranks at most 1000 documents per topic.
        "size": 1000,
        # Only the trial identifier is needed from each hit.
        "_source": ["nct_id"],
        "query": {
            "bool": {
                # The full topic text must match across the searched fields ...
                "must": [
                    {"multi_match": {"query": topic_text, "fields": es_fields, "analyzer": "std_analyzer"}},
                ],
                # ... and the phrase clauses are added as optional "should" clauses.
                "should": phrases,
            }
        },
    }
    es_bodies.append(body)

In [6]:
# Display the request body built for the first topic as a sanity check.
es_bodies[0]

{'from': 0,
 'size': 1000,
 '_source': ['nct_id'],
 'query': {'bool': {'must': [{'multi_match': {'query': '\nPatient is a 45-year-old man with a history of anaplastic astrocytoma of the spine complicated by severe lower extremity weakness and urinary retention s/p Foley catheter, high-dose steroids, hypertension, and chronic pain. The tumor is located in the T-L spine, unresectable anaplastic astrocytoma s/p radiation. Complicated by progressive lower extremity weakness and urinary retention. Patient initially presented with RLE weakness where his right knee gave out with difficulty walking and right anterior thigh numbness. MRI showed a spinal cord conus mass which was biopsied and found to be anaplastic astrocytoma. Therapy included field radiation t10-l1 followed by 11 cycles of temozolomide 7 days on and 7 days off. This was followed by CPT-11 Weekly x4 with Avastin Q2 weeks/ 2 weeks rest and repeat cycle. \n',
      'fields': ['brief_title',
       'brief_summary',
       'detaile

### Elasticsearch#2 Search & Get responses

In [7]:
from tqdm import tqdm

# Run one search per prepared request body (one per topic). Each entry of
# es_responses is the list of raw hit dicts returned for that topic.
es_responses = []

for body in tqdm(es_bodies):
    res = es_host.search(index=es_index, body=body)
    es_responses.append(res.get("hits").get("hits"))

# Lowest-ranked hit returned for the first topic (the 1000th result when the
# query filled its page). Indexing from the end avoids an IndexError when
# fewer than 1000 documents matched; nothing is shown if there are no hits.
es_responses[0][-1] if es_responses and es_responses[0] else None

100%|██████████| 75/75 [04:26<00:00,  3.55s/it]


{'_index': 'idx-clinical-study-v5',
 '_type': '_doc',
 '_id': 'NCT02041169',
 '_score': 67.84117,
 '_source': {'nct_id': ['NCT02041169']}}

In [8]:
# Reduce every raw hit to an (nct_id, score) pair, keeping one list per topic
# (75 topics in this run, up to 1000 hits each).
# Comprehensions are used so that no loop variable leaks into the notebook
# namespace; in particular the builtin `id` is no longer shadowed.
result_list = [
    [(doc.get('_source').get('nct_id')[0], doc.get('_score')) for doc in hits]
    for hits in es_responses
]

# Lowest-ranked pair of the first topic; guarded so that a topic with fewer
# than 1000 hits (or none at all) does not raise an IndexError.
last_pair = result_list[0][-1] if result_list and result_list[0] else None

print(f"ID, Score: {len(result_list)}")
print(f"ID, Score: {last_pair}")

ID, Score: 75
ID, Score: ('NCT02041169', 67.84117)


### TREC Evaluation

In [9]:
# TREC run-file format, one line per retrieved document:
#   TOPIC_NO Q0 ID RANK SCORE RUN_NAME
# TOPIC_NO: query (topic) number
# ID: document number
# RANK: document rank (1 to 1000 in score order; at most 1000 documents)
# SCORE: document score
# RUN_NAME: name of the run (up to 12 letters)
# ex) 1 Q0 NCT00760162 1 0.9999 my-run
def write_result_file(file, topic_no, result, run_name=None):
    """Write the ranked results of one topic to ``file`` in TREC run format.

    Args:
        file: Writable text file-like object; one line is written per result.
        topic_no: Topic (query) number written in the first column.
        result: Iterable of ``(doc_id, score)`` pairs, already in rank order.
            Ranks are assigned from 1 in iteration order.
        run_name: Run tag written in the last column. When omitted, the
            module-level ``run_name`` variable is used, which is how existing
            three-argument calls behave.

    Raises:
        NameError: If ``run_name`` is omitted and no module-level ``run_name``
            has been defined.
    """
    if run_name is None:
        # Backward compatibility: fall back to the notebook-global run name.
        try:
            run_name = globals()["run_name"]
        except KeyError:
            raise NameError("name 'run_name' is not defined") from None

    # enumerate(start=1) yields the 1-based rank directly; `doc_id` avoids
    # shadowing the builtin `id`.
    for rank, (doc_id, score) in enumerate(result, start=1):
        file.write(f"{topic_no} Q0 {doc_id} {rank} {score} {run_name}\n")
        
def make_dir(dir_name):
    """Create directory ``dir_name`` (and any missing parents) if needed.

    Calling it on a directory that already exists is a no-op. Using
    ``exist_ok=True`` removes the gap between a separate existence check and
    the creation, so a directory created concurrently no longer raises.

    Args:
        dir_name: Path of the directory to create.

    Raises:
        FileExistsError: If ``dir_name`` exists but is not a directory.
    """
    os.makedirs(dir_name, exist_ok=True)

import os

# Run name: written as the run tag of every result line and reused as the
# file name of the saved result and evaluation files.
run_name = "BM25_default"

# Working directory at the time this cell runs; later cells build their output
# paths from it and change back to it after running trec_eval.
path_now = os.getcwd()
print(path_now)

/home2/ukyoung/es_search_py


In [10]:
import subprocess

# NOTE(review): absolute, machine-specific install location of trec_eval.
trec_eval_path = '/home2/ukyoung/my-trec/trec_eval-9.0.7'

result_file = f'{path_now}/results/{run_name}.txt'
qrels_file = f'{trec_eval_path}/qrels/qrels2021.txt'

# Create the directory that result_file actually lives in, rather than a
# 'results' folder relative to whatever the current directory happens to be.
make_dir(os.path.dirname(result_file))

# Write the run file: topics are numbered from 1 in result_list order.
with open(result_file, 'w') as file:
    for topic_no, each_result in enumerate(result_list, start=1):
        write_result_file(file, topic_no, each_result)

# Run trec_eval from its own directory WITHOUT changing this process's working
# directory, so a failure here cannot leave the kernel in the wrong folder.
# Per-query view: add "-q"
# Show every evaluation measure: "-m all_trec"
command = [os.path.join(trec_eval_path, "trec_eval"), "-m", "all_trec", qrels_file, result_file]

# check=True raises CalledProcessError on a non-zero exit status instead of
# silently continuing with empty output; only stdout is captured.
completed = subprocess.run(
    command,
    cwd=trec_eval_path,
    stdout=subprocess.PIPE,
    text=True,
    check=True,
)
eval_results = completed.stdout

e = eval_results.split('\n')

# Line positions within the all_trec output to display. In the recorded run
# these are runid, num_q, num_ret, num_rel, num_rel_ret, map, Rprec, P_10 and
# ndcg_cut_10 — the positions may differ for other trec_eval versions.
show_list = 0,1,2,3,4,5,7,22,59
for s in show_list:
    # Skip positions beyond the end of the output instead of raising.
    if s < len(e):
        print(e[s])

runid                 	all	BM25_default
num_q                 	all	75
num_ret               	all	75000
num_rel               	all	11589
num_rel_ret           	all	4545
map                   	all	0.1370
Rprec                 	all	0.1939
P_10                  	all	0.4333
ndcg_cut_10           	all	0.3330


### Save Evaluation Results as a Text File

In [11]:
# Change back to the directory the notebook was started from.
os.chdir(path_now)
# Relative path: resolved against path_now because of the chdir above.
make_dir('trec_evals')
eval_path = f"{path_now}/trec_evals/{run_name}.txt"

# Persist the complete trec_eval output, not only the lines printed earlier.
with open(eval_path, 'w') as file:
    file.write(eval_results)