# TREC-2022
## Query Expander by BERT
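### jbnu2 질의확장에서 사용하기 위한 소스코드 (source code used for the jbnu2 query expansion)
Reference: [fastforwardlabs — Improving the Retriever on Natural Questions](https://colab.research.google.com/github/fastforwardlabs/ff14_blog/blob/master/_notebooks/2020-07-22-Improving_the_Retriever_on_Natural_Questions.ipynb#scrollTo=1xwiI8KJ7D1V)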

In [1]:
import ast
import spacy
import pandas as pd
import gensim.downloader as api

from tqdm import tqdm
from transformers import pipeline, AutoTokenizer
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Initialize models.
# The fill-mask pipeline and the tokenizer share one checkpoint, so the name
# is spelled out once.
BERT_CHECKPOINT = "bert-base-uncased"

# spaCy pipeline and GloVe vectors; these are passed to the query expander
# via `entity_args` / `synonym_args` in the expansion cell.
nlp = spacy.load("en_core_web_sm")
word_vectors = api.load("glove-wiki-gigaword-50")

# Masked-language-model pipeline and the matching fast tokenizer.
unmasker = pipeline('fill-mask', model=BERT_CHECKPOINT, tokenizer=BERT_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT, use_fast=True)

Downloading:   0%|          | 0.00/230 [00:00<?, ?B/s]

***
## 1. Import Datasets

In [3]:
# Load the per-topic term tables from CSV.
bioTerms = pd.read_csv('datasets/bioTerms-2022.csv')
clinicalTerms = pd.read_csv('datasets/clinicalTerms-2022.csv')

# Each cell holds the text of a Python literal; parse it back into an object
# (lists of term strings, judging by the printed sample).
clinical_terms = [ast.literal_eval(raw) for raw in clinicalTerms['Problem']]
bio_terms = [ast.literal_eval(raw) for raw in bioTerms['Bio']]

# Sanity check: number of topics and the first topic's terms, clinical first.
for parsed in (clinical_terms, bio_terms):
    print(f'Counts: {len(parsed)}, Topic_1: {parsed[0]}')

Counts: 50, Topic_1: ['some pubic hair', 'poorly developed secondary sexual characteristics', 'coffee smell', 'low levels of GnRH']
Counts: 50, Topic_1: ['satisfaction', 'visual acuity', 'testosterone', 'GnRH']


***
## 2. Run Expanding Module

In [4]:
from qe_methods import QueryExpander

def get_expanded_query(text):
    """Expand a query string and return the expanded text.

    Builds a ``QueryExpander`` for ``text`` using the notebook-level
    ``entity_args`` and ``synonym_args`` dictionaries, which must be defined
    before this function is called, and returns the expander's
    ``expanded_question`` attribute.

    Args:
        text: The query to expand.

    Returns:
        The value of ``QueryExpander(...).expanded_question``.
    """
    # Debugging aid: call `.explain_expansion(entities=True)` on the expander
    # before returning to inspect how the expansion was produced.
    return QueryExpander(text, entity_args, synonym_args).expanded_question

In [5]:
# Static embedding similarity.
# These two dictionaries are read by get_expanded_query() as notebook-level
# names, so they must be defined before the loop below runs.
entity_args = {'spacy_model': nlp}
synonym_args = {'gensim_model': word_vectors, 'n_syns': 2}

# Expanded queries, one entry per topic, in topic order.
ex_clinicalTerms = []
ex_bioTerms = []
ex_mixedTerms = []

for i in tqdm(range(len(bio_terms))):
    # Flatten each topic's term list into a single space-separated query.
    ct, bt = ' '.join(clinical_terms[i]), ' '.join(bio_terms[i])
    ex_clinicalTerms.append(get_expanded_query(ct))
    ex_bioTerms.append(get_expanded_query(bt))

    # Join the two queries with a space. Plain `ct + bt` glued the last
    # clinical word to the first bio word (e.g. "GnRHsatisfaction"), which
    # corrupted both tokens before expansion. Empty parts are skipped so no
    # stray leading/trailing space is introduced.
    mixed = ' '.join(part for part in (ct, bt) if part)
    ex_mixedTerms.append(get_expanded_query(mixed))

100%|██████████| 50/50 [01:02<00:00,  1.26s/it]


In [7]:
# Show the original and expanded terms of the first topic side by side.
topic_1_preview = (
    ("CT for topic_1:\n", clinical_terms[0]),
    ("Expanded CT for topic_1:\n", ex_clinicalTerms[0]),
    ("BT for topic_1:\n", bio_terms[0]),
    ("Expanded BT for topic_1:\n", ex_bioTerms[0]),
)

for label, value in topic_1_preview:
    print(label, value)

CT for topic_1:
 ['some pubic hair', 'poorly developed secondary sexual characteristics', 'coffee smell', 'low levels of GnRH']
Expanded CT for topic_1:
 some pubic hair poorly developed secondary sexual characteristics coffee smell low levels of GnRH blond traits skin less odor sex characteristic higher level regrowth sexuality wine heavily symphysis fairly primarily drink smells systems schools primary
BT for topic_1:
 ['satisfaction', 'visual acuity', 'testosterone', 'GnRH']
Expanded BT for topic_1:
 satisfaction visual acuity testosterone GnRH impairments expressiveness unique expressing confidence imagery


***
## 3. Save expanded terms as CSV

In [9]:
# Destination directory for the exported files.
# NOTE: `to_csv` does not create missing directories; 'outputs/' must exist.
o_path = 'outputs/'

# Output file name -> expanded queries to store in it (one row per topic).
expanded_outputs = {
    'expanded_CT-2022.csv': ex_clinicalTerms,
    'expanded_BT-2022.csv': ex_bioTerms,
    'expandedTerms-2022.csv': ex_mixedTerms,
}

for file_name, terms in expanded_outputs.items():
    # Topic ids are 1-based and derived from the data instead of a fixed
    # range(1, 51), so the export works for any number of topics, not only 50.
    df_output = pd.DataFrame({'Topic': range(1, len(terms) + 1), 'Terms': terms})
    df_output.to_csv(o_path + file_name, header=True, index=False)