Input: a set of free-text topics. This notebook creates various query representations based on the drugs, dosages, problems, treatments, and tests identified in each topic.

# Install libraries, run utility functions, and import the queries (raw topics)

In [None]:
!pip install regex
!pip install transformers
!pip install scispacy
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz
!pip install medspacy
!pip install spacy==3.0.6
!pip install git+https://github.com/explosion/spacy-transformers
!pip install tqdm

In [None]:
import regex as re
import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()
from transformers import pipeline
import scispacy
import spacy
import string
import medspacy
from bs4 import BeautifulSoup
from medspacy.ner import TargetRule
from medspacy.visualization import visualize_ent
from scispacy.abbreviation import AbbreviationDetector
from scispacy.linking import EntityLinker
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
import warnings
warnings.filterwarnings("ignore")

from google.colab import drive
drive.mount("/content/gdrive", force_remount=True)

## Model for NER recognition: `tokenizer` and `model` are the module-level
## names that get_entities passes to the 'ner' pipeline.
from transformers import AutoTokenizer, AutoModelForTokenClassification
tokenizer = AutoTokenizer.from_pretrained("samrawal/bert-base-uncased_clinical-ner")
model = AutoModelForTokenClassification.from_pretrained("samrawal/bert-base-uncased_clinical-ner")

## Model for negated content identification: `tokenizer2` and `model2` are
## handed to get_negations, which wraps them in a TextClassificationPipeline.
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
tokenizer2 = AutoTokenizer.from_pretrained("bvanaken/clinical-assertion-negation-bert")
model2 = AutoModelForSequenceClassification.from_pretrained("bvanaken/clinical-assertion-negation-bert")

## Select the collection and its related topics to create the reformulated topics

In [None]:
collection = 'cds_clinical' #trec_clc for trec2021 clinical or cds_clinical for the remaining 3 collections as they share the same topics

# One row per supported collection:
# (topics file, folder with the extracted entities, folder for the reformulated topics)
_COLLECTION_PATHS = {
  'trec_clc': (
    './experiments/topics/trec_clc/topics2021.txt',
    './experiments/topics/trec_clc/extracted_med_entities/',
    './experiments/topics/trec_clc/reformulated_topics/',
  ),
  'cds_clinical': (
    './topics-2014_2015-description.topics',
    './experiments/topics/cds_clinical/extracted_med_entities/',
    './experiments/topics/cds_clinical/reformulated_topics/',
  ),
}

# Fail early on an unknown collection instead of leaving the paths undefined.
if collection not in _COLLECTION_PATHS:
  raise Exception("Invalid selection of topics. Use appropriate collections (trec_clc for TREC 2021 clinical. cds_clinical for cds and clinical collections)")

path_to_topics, path_to_load_entities, save_reform_queries = _COLLECTION_PATHS[collection]


## Utility Functions

In [None]:
"""Gets a topic returns sentences/segments splitted using spacy model
"""
nlp_sentence= spacy.load("en_core_sci_sm")
def split_sent(topic,nlp_sentence):
  """Run `nlp_sentence` on `topic` and return the resulting sentences as a list.

  Args:
    topic: Free-text topic to segment.
    nlp_sentence: Callable pipeline whose result exposes a `.sents` iterable.

  Returns:
    A list holding every item yielded by `.sents`, in order.
  """
  return [segment for segment in nlp_sentence(topic).sents]

"""Get a sentence identifies/returns NERs -- MODEL samrawal/bert-base-uncased_clinical-ner [Problem, treatment, test]
"""
def get_entities(sentence):
  """Return the entities the clinical NER pipeline finds in `sentence`.

  The sentence is converted with `str()` first, so spaCy spans can be passed
  directly. The pipeline is built from the module-level `model` and
  `tokenizer` with `aggregation_strategy="simple"`.
  """
  # NOTE(review): the pipeline object is rebuilt on every call; callers invoke
  # this once per sentence.
  ner_pipeline = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
  return ner_pipeline(str(sentence))

"""Get a sentence and all its N entities, marked as problem; outputs N senteneces in which the entily is marked ['entity']. This format is required by the transformer based model.
"""
def modify_segment_negation(sentence, topic_ent,extracted_entity: list):
  """Build one classifier input per selected entity by wrapping it in [entity] markers.

  Args:
    sentence: Sentence text that the entity offsets refer to.
    topic_ent: Entity dicts carrying 'entity_group', 'start' and 'end'
      (character offsets into `sentence`).
    extracted_entity: Entity groups to keep, e.g. ['problem', 'treatment', 'test'].
      Only 'problem', 'treatment' and 'test' are recognised; any other value
      (such as a 'None' placeholder) is ignored. The list may be shorter than
      three items: the previous positional lookup raised IndexError in that case.

  Returns:
    A list with one marked copy of `sentence` per kept entity, in `topic_ent` order.
  """
  marker = "[entity]"
  supported_groups = ('problem', 'treatment', 'test')
  wanted_groups = {group for group in extracted_entity if group in supported_groups}

  modified_sentences = []
  for ent in topic_ent:
    if ent['entity_group'] not in wanted_groups:
      continue
    start, end = int(ent['start']), int(ent['end'])
    # "<before>[entity] <entity text> [entity]<after>"
    modified_sentences.append(
      sentence[:start] + marker + ' ' + sentence[start:end] + ' ' + marker + sentence[end:])
  return modified_sentences

"""Get each sentence, the model for negation recognition and the topic's entities; returns the classified entities.
"""
def get_negations(sentence, model2, tokenizer2,topic_ent):
  """Classify the marked sentence(s) with a text-classification pipeline.

  Args:
    sentence: Input handed to the classifier as-is (callers pass the list
      produced by modify_segment_negation).
    model2: Sequence-classification model for the pipeline.
    tokenizer2: Tokenizer matching `model2`.
    topic_ent: Entities of the sentence; returned unchanged.

  Returns:
    A `(classification, topic_ent)` tuple.
  """
  assertion_classifier = TextClassificationPipeline(model=model2, tokenizer=tokenizer2)
  return assertion_classifier(sentence), topic_ent

"""Get a list of extracted entities and return the ad-hoc query for search with PyTerrier
"""
def create_adhoc(list_ner):
  """Join the extracted entities into one punctuation-free ad-hoc query string.

  Apostrophes are replaced by a space first; every other character of
  `string.punctuation` is then deleted outright.

  Args:
    list_ner: List of entity strings (may be empty or None).

  Returns:
    The space-joined, cleaned query, or None (after printing a notice) when
    `list_ner` is empty.
  """
  if not list_ner:
    print("Empty Entity list")
    return None
  strip_punctuation = str.maketrans(' ', ' ', string.punctuation)
  joined = ' '.join(list_ner)
  return joined.replace("\'"," ").translate(strip_punctuation)

"""Get a sentence and all its entities, uses the context algorithm on the entities and identifies their class
"""
def apply_context_med_entities(sentence, topic_ent,nlp_context):
  """Register the sentence's entities as target rules and read their ConText flags.

  Args:
    sentence: Sentence text to process with `nlp_context`.
    topic_ent: Entity dicts with 'word' and 'entity_group' keys.
    nlp_context: Pipeline exposing a "medspacy_target_matcher" pipe. Rules are
      added to that pipe, so they remain registered on `nlp_context` after this
      call returns.

  Returns:
    A list of `[entity, is_negated, is_family, is_historical]` rows, one per
    entity in `doc.ents`. When `topic_ent` is non-empty and the document has at
    least one entity, an extra leading row is emitted first (see the note below).
  """
  target_matcher = nlp_context.get_pipe("medspacy_target_matcher")

  # Register each identified entity exactly once. Previously the growing rule
  # list was re-added on every loop iteration, registering rule k (n - k + 1) times.
  target_rules_ = [TargetRule(ent1['word'], ent1['entity_group']) for ent1 in topic_ent]
  if target_rules_:
    target_matcher.add(target_rules_)

  doc = nlp_context(sentence)
  entities = []

  # NOTE(review): this leading row pairs the word of the LAST entry in topic_ent
  # with the flags of the FIRST entity in doc.ents. It is kept unchanged because
  # callers consume it, but it looks unintended - confirm before relying on it.
  if topic_ent:
    last_word = topic_ent[-1]['word']
    for ent in doc.ents:
      entities.append([last_word, ent._.is_negated, ent._.is_family, ent._.is_historical])
      break

  # One row per matched entity; is_uncertain / is_hypothetical are not collected.
  for ent in doc.ents:
    entities.append([ent, ent._.is_negated, ent._.is_family, ent._.is_historical])
  return entities

"""Transforms the columns of a dataframe to list
TODO: Use a more efficient way.""" 
def col_to_list(df):
  """Rewrite every column from its list form, then fill missing values with ' '.

  The column re-assignment happens on the frame that was passed in; the
  returned frame is the result of `fillna(' ')`.

  TODO: Use a more efficient way.
  """
  for name in df.columns:
    as_python_list = df[name].values.tolist()
    df[name] = as_python_list
  return df.fillna(' ')

"""Receives as input the dataframe with the lists of extracted NER per category and merges them into one
"""
def create_adhoc_from_csv(x):
  """Merge the string cells of one row into a single punctuation-free query.

  Every cell has all `string.punctuation` characters deleted and is prefixed
  with a single space before being concatenated, so a non-empty result always
  starts with a space.

  Args:
    x: Iterable of strings (e.g. the selected cells of a DataFrame row).

  Returns:
    The concatenated string; '' for an empty iterable.
  """
  strip_punctuation = str.maketrans(' ', ' ', string.punctuation)
  pieces = [' ' + cell.translate(strip_punctuation) for cell in list(x)]
  return ''.join(pieces)

"""Expand the spacy nlp with abbreviation detector and umls expander
"""
def create_nlp_umls(nlp_sentence):
  """Extend `nlp_sentence` in place with the abbreviation detector and the UMLS linker.

  The abbreviation detector is placed before the linker: the linker is
  configured with `resolve_abbreviations=True`, which only has an effect when
  the detector is already part of the pipeline ahead of it. Pipes that are
  already present are not added again, so the cell can be re-run safely.

  Args:
    nlp_sentence: spaCy pipeline to extend (modified in place).

  Returns:
    A `(nlp_sentence, linker)` tuple: the same pipeline object and its
    "scispacy_linker" pipe.
  """
  if not nlp_sentence.has_pipe("abbreviation_detector"):
    if nlp_sentence.has_pipe("scispacy_linker"):
      # Linker was added earlier (e.g. by a previous run): slot the detector in front of it.
      nlp_sentence.add_pipe("abbreviation_detector", before="scispacy_linker")
    else:
      nlp_sentence.add_pipe("abbreviation_detector")
  if not nlp_sentence.has_pipe("scispacy_linker"):
    nlp_sentence.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "umls", "max_entities_per_mention": 1, 'k':10})
  linker = nlp_sentence.get_pipe("scispacy_linker")
  return nlp_sentence,linker

# create_nlp_umls returns the pipeline it was given, so nlp_umls and
# nlp_sentence refer to the same (now extended) pipeline object.
nlp_umls,linker = create_nlp_umls(nlp_sentence)

## Load topics

In [None]:
def load_topics(path_to_topics, collection):
  """Load the raw topics of a collection into a DataFrame with `qid` and `query`.

  Args:
    path_to_topics: Path of the topics file.
    collection: 'trec_clc' (one `<topic number="N">...</topic>` element per
      line) or 'cds_clinical' (XML with NUM and TITLE elements).

  Returns:
    A DataFrame with columns `qid` and `query`. For 'trec_clc' the qid is the
    1-based position of the topic among the matched lines; for 'cds_clinical'
    it is the text of the NUM element.

  Raises:
    Exception: If `collection` is not one of the supported values.
  """
  if collection == 'trec_clc':
    ld = []
    with open(path_to_topics, 'r', encoding='utf-8',
                    errors='ignore') as document:
      for line in document:
        found = re.findall(r'^<topic number=\"\d+\">(.*)</topic>$',line,re.DOTALL)
        # Blank or non-topic lines are skipped; indexing [0] on them used to
        # raise IndexError.
        if found:
          ld.append(found[0])
    lid = list(range(1, len(ld) + 1))
    desc_tr = pd.DataFrame({'qid':lid,'query': ld})
  elif collection == 'cds_clinical':
    with open(path_to_topics, 'r', encoding='utf-8',
                    errors='ignore') as document:
      soup = BeautifulSoup(document.read(), 'xml')
    lq = [tag.text for tag in soup.find_all('NUM')]
    ld = [tag.text for tag in soup.find_all('TITLE')]
    desc_tr = pd.DataFrame({'qid': lq,'query': ld})
  else:
    raise Exception("No topics have been loaded! Use appropriate collections (trec_clc, csd,clinical)")

  display(desc_tr['query'].head())
  return desc_tr

# Load the topics of the selected collection.
desc_tr = load_topics(path_to_topics,collection)
# Normalise qid to str: the trec_clc branch yields integer ids while the
# cds_clinical branch yields text ids.
desc_tr['qid'] = desc_tr['qid'].astype(str)

0    A 58-year-old African-American woman presents ...
1    An 8-year-old male presents in March to the ER...
2    A 58-year-old nonsmoker white female with mild...
3    A 2-year-old boy is brought to the emergency d...
4    A 56-year-old female on 20th day post-left mas...
Name: query, dtype: object

## LOAD the extracted Drugs and Dosages for the selected Topics

Reads the drugs and dosages extracted by various models. The code for each extraction model can be found in the corresponding folder.

In [None]:
method = 'dis_chem_stanza'

def _load_extracted_entities(csv_path):
  """Read one extracted-entities CSV, rename TOPIC to qid and build the `query` column.

  The `query` column is created by passing the columns at positions 1 and 2
  (after the rename) through create_adhoc_from_csv, row by row.
  """
  frame = pd.read_csv(csv_path,  dtype = {'TOPIC': str})
  frame = col_to_list(frame)
  frame = frame.rename(columns={"TOPIC": "qid"})
  frame['query'] = frame[frame.columns[1:3]].apply(lambda x: create_adhoc_from_csv(x),axis=1)
  display(frame.head(2))
  return frame

# Each method keeps its own result variable; only the selected one is created.
if method =='chem_med7':
  path = path_to_load_entities +method+'.csv'
  chem_med7 = _load_extracted_entities(path)

elif method =='dis_chem_bio_bert':
  path = path_to_load_entities +method+'.csv'
  dis_chem_bio_bert = _load_extracted_entities(path)

elif method =='dis_chem_scispacy':
  path = path_to_load_entities +method+'.csv'
  dis_chem_scispacy = _load_extracted_entities(path)

elif method =='dis_chem_stanza':
  path = path_to_load_entities +method+'.csv'
  dis_chem_stanza = _load_extracted_entities(path)




Unnamed: 0,qid,query,DISEASE,CHEMICAL
0,20141,A 58yearold AfricanAmerican woman presents to...,"['chest pain', 'pain', 'nausea', 'diaphoresis'...",['smoking']
1,20142,An 8yearold male presents in March to the ER ...,"['fever', 'dyspnea', 'cough', 'fever', 'cough'...",


# Functions for identification of various NERs in the queries

## Identify Negated NERs using BERT and NegBert

Models:
1. NER - bert-base-uncased_clinical-ner 
2. Negated NERs - bvanaken/clinical-assertion-negation-bert

The model expects input in the form of **spans/sentences with one marked entity to classify** as PRESENT(0), ABSENT(1) or POSSIBLE(2). The entity in question is identified with the special token [entity] surrounding it.

1. Input: Raw Topics, information needed to classify (present_ners,negated_ners,possible_ners), NERs to be extracted list: ['problem','treatment','test']
2. Output: one dedicated list with either the negated, possible and present problems, treatments or tests

Steps: 
1. Given a query (raw topic), identify its entities using the NER model (bert-base-uncased_clinical-ner).
2. Then split it into sentences and put the [entity] token around each entity (so that the BERT model can predict that entity's status).
3. If a sentence contains two entities, two distinct inputs for the model are created. 
4. Finally, for each query this method provides three lists of entities, containing the negated, possible and present entities. 

In [None]:
def identify_ners(topic: str,clasify_information: str, extracted_NER: list, keep_noNER_sent: bool, remove_negated_NERS: bool):
  """Extract the entities of a topic and group them by assertion label.

  Args:
    topic: Raw topic text.
    clasify_information: Which group to return when `remove_negated_NERS` is
      true: 'present_ners', 'negated_ners' or 'possible_ners'. The historical
      misspelling 'negated__ners' is still accepted.
    extracted_NER: Entity groups to classify, e.g. ['problem', 'treatment', 'test'].
    keep_noNER_sent: If true, sentences in which no entity was found are
      appended to the returned list.
    remove_negated_NERS: If false, the present, absent and possible lists are
      returned together and `clasify_information` is ignored.

  Returns:
    A list of strings, or None when the requested group is empty or
    `clasify_information` is not recognised. Each group list is produced by a
    '|' join/split, so an empty group is represented as [''].
  """
  supported_groups = ('problem', 'treatment', 'test')
  terms_by_label = {'PRESENT': [], 'ABSENT': [], 'POSSIBLE': []}
  sentences_with_no_entities = []

  #Split the topic into sentences using the en_core_sci_sm spacy model
  for sentence in split_sent(topic,nlp_sentence):
    #Use bert-base-uncased_clinical-ner to extract [Problem, treatment, test]
    topic_ent = get_entities(sentence)
    #Keep sentences that have no identified NER
    if not topic_ent:
      sentences_with_no_entities.append(str(sentence))

    #Add the required tokens in the segment so each entity can be classified
    ready_sentences = modify_segment_negation(str(sentence), topic_ent,extracted_NER)
    classification, _ = get_negations(ready_sentences, model2, tokenizer2, topic_ent)

    # Only the requested entity groups receive a marked sentence, so pair the
    # predictions with those entities only. Zipping against every entity
    # attached labels to the wrong words whenever a group was left out.
    selected_ent = [ent for ent in topic_ent
                    if ent['entity_group'] in supported_groups and ent['entity_group'] in extracted_NER]
    for ent, prediction in zip(selected_ent, classification):
      label = str(prediction['label'])
      if label in terms_by_label:
        terms_by_label[label].append(str(ent['word']))

  def _merge_wordpieces(terms):
    # Glue '##' continuation pieces back onto the preceding term.
    return '|'.join(terms).replace('|##', '').split('|')

  final_ners_present = _merge_wordpieces(terms_by_label['PRESENT'])
  final_ners_absent = _merge_wordpieces(terms_by_label['ABSENT'])
  final_ners_possible = _merge_wordpieces(terms_by_label['POSSIBLE'])

  #returns all extracted NERS
  if not remove_negated_NERS:
    everything = final_ners_present+final_ners_absent+final_ners_possible
    if keep_noNER_sent:
      return everything+sentences_with_no_entities
    return everything

  #returns only the requested group of problems, treatments or tests
  requested = {
    'present_ners': final_ners_present,
    'negated_ners': final_ners_absent,
    'negated__ners': final_ners_absent,  # legacy spelling, kept for existing callers
    'possible_ners': final_ners_possible,
  }.get(clasify_information)
  if requested is None:
    return None
  if not requested[0]:
    print('No information.')
    return None
  if keep_noNER_sent:
    return requested+sentences_with_no_entities
  return requested

          
# Demo on the topic at index 33. The two trailing booleans of identify_ners are
# (keep_noNER_sent, remove_negated_NERS); with remove_negated_NERS=False the
# 'present_ners' argument is ignored and all three groups are returned.
display(desc_tr['query'][33])
print('All identified NERS + Sentences without NERS \n') #keep_noNER_sent: bool, remove_negated_NERS: bool
display(create_adhoc(identify_ners(desc_tr['query'][33],'present_ners',['problem','treatment','test'],True,False)))
print('All not_negated NERS \n')
display(create_adhoc(identify_ners(desc_tr['query'][33],'present_ners',['problem','treatment','test'],False,True)))
print('All not_negated NERS + Sentences without NERS \n')
display(create_adhoc(identify_ners(desc_tr['query'][33],'present_ners',['problem','treatment','test'],True,True)))
print('All NERS\n')
display(create_adhoc(identify_ners(desc_tr['query'][33],'present_ners',['problem','treatment','test'],False,False)))

'An 82-year-old woman comes to the emergency department because of chest pain and shortness of breath after being awakened in the morning by stabbing substernal chest pain radiating to the left shoulder and jaw. The patient had hypertension, renal-artery stenosis with chronic renal insufficiency, hypercholesterolemia, osteoporosis and dementia. Blood pressure was 199/108 mm Hg, respiratory rate 18 bpm, oxygen saturation 98% on ambient air. The heart sounds were rapid and with no murmurs. CK-MB was 10.9 ng/ml, CK was 89 U/l, CK index was 12.2% and Troponin T was 0.40 ng/ml. An EKG showed sinus regular tachycardia of 119 bpm, with ST-segment elevations up to 3 mm in V1, V2, and V3. A chest radiograph showed low lung volumes and right basilar subsegmental atelectasis. Coronary angiography showed no stenosis or clinically significant disease. Left ventriculography revealed akinesis of the anterior wall, hypokinesis of the apical and distal inferior walls, and compensatory hyperkinesis of t

All identified NERS + Sentences without NERS 



'chest pain shortness of breath stabbing substernal chest pain hypertension renal  artery stenosis chronic renal insufficiency hypercholesterolemia osteoporosis dementia blood pressure respiratory rate oxygen saturation sounds ck  mb ck ck index troponin t an ekg sinus regular tachycardia st  segment elevations a chest radiograph low lung volumes right basilar subsegmental atelectasis coronary angiography left ventriculography akinesis of the anterior wall hypokinesis of the apical and distal inferior walls compensatory hyperkinesis of the basal anterior and basal inferior walls a transthoracic echocardiogram severe segmental left ventricular dysfunction anterosept the overall left ventricular systolic function mildly impaired mild mitral regurgitation murmurs stenosis clinically significant disease  \n        '

All not_negated NERS 



'chest pain shortness of breath stabbing substernal chest pain hypertension renal  artery stenosis chronic renal insufficiency hypercholesterolemia osteoporosis dementia blood pressure respiratory rate oxygen saturation sounds ck  mb ck ck index troponin t an ekg sinus regular tachycardia st  segment elevations a chest radiograph low lung volumes right basilar subsegmental atelectasis coronary angiography left ventriculography akinesis of the anterior wall hypokinesis of the apical and distal inferior walls compensatory hyperkinesis of the basal anterior and basal inferior walls a transthoracic echocardiogram severe segmental left ventricular dysfunction anterosept the overall left ventricular systolic function mildly impaired mild mitral regurgitation'

All not_negated NERS + Sentences without NERS 



'chest pain shortness of breath stabbing substernal chest pain hypertension renal  artery stenosis chronic renal insufficiency hypercholesterolemia osteoporosis dementia blood pressure respiratory rate oxygen saturation sounds ck  mb ck ck index troponin t an ekg sinus regular tachycardia st  segment elevations a chest radiograph low lung volumes right basilar subsegmental atelectasis coronary angiography left ventriculography akinesis of the anterior wall hypokinesis of the apical and distal inferior walls compensatory hyperkinesis of the basal anterior and basal inferior walls a transthoracic echocardiogram severe segmental left ventricular dysfunction anterosept the overall left ventricular systolic function mildly impaired mild mitral regurgitation \n        '

All NERS



'chest pain shortness of breath stabbing substernal chest pain hypertension renal  artery stenosis chronic renal insufficiency hypercholesterolemia osteoporosis dementia blood pressure respiratory rate oxygen saturation sounds ck  mb ck ck index troponin t an ekg sinus regular tachycardia st  segment elevations a chest radiograph low lung volumes right basilar subsegmental atelectasis coronary angiography left ventriculography akinesis of the anterior wall hypokinesis of the apical and distal inferior walls compensatory hyperkinesis of the basal anterior and basal inferior walls a transthoracic echocardiogram severe segmental left ventricular dysfunction anterosept the overall left ventricular systolic function mildly impaired mild mitral regurgitation murmurs stenosis clinically significant disease '

## Identify Negated, Family History and Temporal/historical Info ConText algorithm

Models:
1. NERs -- MODEL samrawal/bert-base-uncased_clinical-ner [Problem, treatment, test]
2. NER - using Med spacy
3. Med_spacy for applying ConText and categorizing the entities


1. Input: Raw topics and the information to be classified (not_negated, family_history, historical_information)
2. Output: One list with the NOT ent._.is_negated (so present information), ent._.is_family, ent._.is_historical

Steps: 
1. Split the topic into sentences using spacy sci model
2. Get entities with the NERs Bert model
3. Annotate the entities with the ConText Algorithm

In [None]:
#TODO: Work on its efficiency
def apply_context(topic: str,extracted_information: list,keep_noNER_sent: bool):
  """Extract the entities of a topic and select them by their ConText flags.

  Args:
    topic: Raw topic text.
    extracted_information: Three-item selector `[negation, family, historical]`.
      Supported combinations:
        ['not_negated', 'None', 'None']                  all not-negated entities
        ['None', 'family_history', 'None']               all family entities
        ['None', 'None', 'historical_information']       all historical entities
        ['not_negated', 'not_family_history', 'not_historical_information']
        ['not_negated', 'family_history', 'not_historical_information']
        ['not_negated', 'not_family_history', 'historical_information']
        ['not_negated', 'family_history', 'historical_information']
    keep_noNER_sent: If true, sentences in which no entity was found are
      appended to the returned list.

  Returns:
    The selected list of entity strings, or None when the selection is empty
    (a message is printed) or the selector is not one of the supported
    combinations.
  """
  # apply_context_med_entities adds target rules to this pipeline, so the rules
  # of every sentence of the topic end up on the same nlp_context object.
  nlp_context = medspacy.load()
  #Splits the topic into sentences using nlp spacy
  sentences = split_sent(topic,nlp_sentence)

  topic_entities = []
  sentences_with_no_entities = []
  for sentence in sentences:
    #Using bert-base-uncased_clinical-ner identify [Problem, treatment, test]
    topic_ent = get_entities(sentence)
    #For each sentence, the entity is categorized as negated, family or historical
    topic_entities.append(apply_context_med_entities(str(sentence), topic_ent,nlp_context))
    if not topic_ent:
      sentences_with_no_entities.append(str(sentence))

  # 'present' / 'family' / 'historical' are single-flag views; the three-digit
  # keys encode (negated, family, historical) for not-negated entities only.
  buckets = {name: [] for name in ('present', 'family', 'historical', '000', '010', '001', '011')}
  for sent_entities in topic_entities:
    for entity in sent_entities:
      term = str(entity[0])
      negated, family, historical = bool(entity[1]), bool(entity[2]), bool(entity[3])
      if family:
        buckets['family'].append(term)
      if historical:
        buckets['historical'].append(term)
      if not negated:
        buckets['present'].append(term)
        combination = '0' + ('1' if family else '0') + ('1' if historical else '0')
        buckets[combination].append(term)

  # Glue '##' continuation pieces back onto the preceding term. An empty bucket
  # becomes [''] here, which is why emptiness is tested on the first item.
  merged = {name: '|'.join(terms).replace('|##', '').split('|') for name, terms in buckets.items()}

  selections = {
    ('not_negated', 'None', 'None'): ('present', 'All information is negated.'),
    ('None', 'family_history', 'None'): ('family', 'No family history.'),
    ('None', 'None', 'historical_information'): ('historical', 'No past medical history.'),
    ('not_negated', 'not_family_history', 'not_historical_information'): ('000', 'Empty query, not identified NERS'),
    ('not_negated', 'family_history', 'not_historical_information'): ('010', 'Empty query, not identified NERS'),
    ('not_negated', 'not_family_history', 'historical_information'): ('001', 'Empty query, not identified NERS'),
    ('not_negated', 'family_history', 'historical_information'): ('011', 'Empty query, not identified NERS'),
  }
  request = (extracted_information[0], extracted_information[1], extracted_information[2])
  if request not in selections:
    return None

  bucket_name, empty_message = selections[request]
  selected = merged[bucket_name]
  # The 010/001/011 selections used to test the list itself, which is never
  # empty, so their empty case returned [''] instead of reporting it.
  if not selected[0]:
    print(empty_message)
    return None
  if keep_noNER_sent:
    return selected+sentences_with_no_entities
  return selected

# Demo on the topic at index 33: not-negated, non-family, non-historical
# entities, first with and then without the sentences that had no entities
# (the last argument is keep_noNER_sent).
display(desc_tr['query'][33])
print(create_adhoc(apply_context(desc_tr['query'][33], ['not_negated','not_family_history','not_historical_information'],True)))
print(create_adhoc(apply_context(desc_tr['query'][33], ['not_negated','not_family_history','not_historical_information'],False)))



'An 82-year-old woman comes to the emergency department because of chest pain and shortness of breath after being awakened in the morning by stabbing substernal chest pain radiating to the left shoulder and jaw. The patient had hypertension, renal-artery stenosis with chronic renal insufficiency, hypercholesterolemia, osteoporosis and dementia. Blood pressure was 199/108 mm Hg, respiratory rate 18 bpm, oxygen saturation 98% on ambient air. The heart sounds were rapid and with no murmurs. CK-MB was 10.9 ng/ml, CK was 89 U/l, CK index was 12.2% and Troponin T was 0.40 ng/ml. An EKG showed sinus regular tachycardia of 119 bpm, with ST-segment elevations up to 3 mm in V1, V2, and V3. A chest radiograph showed low lung volumes and right basilar subsegmental atelectasis. Coronary angiography showed no stenosis or clinically significant disease. Left ventriculography revealed akinesis of the anterior wall, hypokinesis of the apical and distal inferior walls, and compensatory hyperkinesis of t

stabbing substernal chest pain chest pain stabbing substernal chest pain dementia hypertension renalartery stenosis chronic renal insufficiency hypercholesterolemia osteoporosis dementia oxygen saturation Blood pressure respiratory rate oxygen saturation murmurs sounds troponin t CKMB CK CK index Troponin T st  segment elevations An EKG STsegment elevations right basilar subsegmental atelectasis A chest radiograph low lung volumes right basilar subsegmental atelectasisly significant disease Coronary angiography compensatory hyperkinesis of the basal anterior and basal inferior walls Left ventriculography akinesis of the anterior wall compensatory hyperkinesis of the basal anterior and basal inferior walls anterosept A transthoracic echocardiogram severe segmental left ventricular dysfunction mild mitral regurgitation The overall left ventricular systolic function mildly impaired mild mitral regurgitation 
        
stabbing substernal chest pain chest pain stabbing substernal chest pain

## Identify Negated using NegBert and Family History and Temporal/historical Information using the ConText algorithm

Models:
1. NERs -- MODEL samrawal/bert-base-uncased_clinical-ner [Problem, treatment, test]
2. NER - using Med spacy
3. Med_spacy for applying ConText and categorizing the not_negated entities


1. Input: Raw Topics
2. Output: A list of unique tokens that are not_negated(NEG_BERT) and then can be either related to the family or not, either historical or not

Steps: 
1. Split the topic into sentences using spacy sci model
2. Get entities with the NERs Bert model
3. Remove identified negated sentences/NERs
4. Annotate the not_negated entities with the ConText Algorithm (family/historical)

In [None]:
def _merge_wordpieces(terms: list) -> list:
  """Re-attach BERT word-piece continuations ('##...') to the term that precedes them.

  An empty input yields [''], so callers test the first element to detect "nothing found".
  """
  return '|'.join(terms).replace('|##', '').split('|')


def identify_notnegated_ners_NegBERT_apply_context(topic: str, extracted_NER: list ,extracted_information: list,keep_noNER_sent):
  """Select entities from sentences with a NegBERT-PRESENT entity, filtered by ConText family/historical flags.

  Args:
    topic: Raw free-text topic.
    extracted_NER: Entity types to keep; forwarded unchanged to modify_segment_negation.
    extracted_information: List whose element [1] is the family mode and element [2] the
      historical mode (element [0] is not read here). Supported (family, historical) pairs:
        ('remove_family', 'None')                              -> entities not flagged as family
        ('None', 'remove_historical_information')              -> entities not flagged as historical
        ('family_history', 'None')                             -> family entities
        ('None', 'historical_information')                     -> historical entities
        ('not_family_history', 'not_historical_information')   -> neither family nor historical
        ('family_history', 'not_historical_information')       -> family, not historical
        ('not_family_history', 'historical_information')       -> not family, historical
        ('family_history', 'historical_information')           -> family and historical
    keep_noNER_sent: When truthy, sentences in which no entity was found are added to the result.

  Returns:
    A de-duplicated list of strings (order is not defined because it comes from a set),
    or None when the mode pair is not supported or no entity matches it.
  """
  # (family mode, historical mode) -> (required family flag, required historical flag, message when empty).
  # A required flag of None means "do not filter on this flag".
  selections = {
    ('remove_family', 'None'): (False, None, 'No family history.'),
    ('None', 'remove_historical_information'): (None, False, 'No historical history.'),
    ('family_history', 'None'): (True, None, 'No family history.'),
    ('None', 'historical_information'): (None, True, 'No past medical history.'),
    ('not_family_history', 'not_historical_information'): (False, False, 'Empty query, not identified NERS'),
    ('family_history', 'not_historical_information'): (True, False, 'Empty query, not identified NERS'),
    ('not_family_history', 'historical_information'): (False, True, 'Empty query, not identified NERS'),
    ('family_history', 'historical_information'): (True, True, 'Empty query, not identified NERS'),
  }
  selection = selections.get((extracted_information[1], extracted_information[2]))
  if selection is None:
    # Unsupported combination: nothing would be returned, so skip the model calls altogether.
    return None
  want_family, want_historical, empty_message = selection

  #Split the topic into sentences using the en_core_sci_sm spacy model
  sentences = split_sent(topic,nlp_sentence)
  nlp_context = medspacy.load()

  sentences_with_no_entities = []
  # One ConText annotation result per sentence that holds at least one PRESENT entity.
  topic_entities = []

  for sentence in sentences:
    #Use bert-base-uncased_clinical-ner to extract [Problem, treatment, test]
    topic_ent = get_entities(sentence)
    #Add the required tokens in the segment to identify Negated using NegBert
    ready_sentences = modify_segment_negation(str(sentence), topic_ent,extracted_NER)
    classification,topic_ent  = get_negations(ready_sentences, model2, tokenizer2, topic_ent)

    #Sentences without any identified entity are stored as plain text so they can be joined into a query
    if not topic_ent:
      sentences_with_no_entities.append(str(sentence))

    # Annotate the sentence once if NegBERT labelled any of its entities PRESENT.
    # NOTE(review): every entity in topic_ent is handed to ConText, not only the PRESENT
    # ones; whether get_negations already filtered topic_ent is not visible here.
    has_present = any(str(label['label']) == 'PRESENT' for _, label in zip(topic_ent, classification))
    if has_present:
      topic_entities.append(apply_context_med_entities(str(sentence), topic_ent, nlp_context))

  # Each annotated entity is indexed as: [0] text, [2] family flag, [3] historical flag.
  selected_terms = [
    str(entity[0])
    for sent_entities in topic_entities
    for entity in sent_entities
    if (want_family is None or entity[2] == want_family)
    and (want_historical is None or entity[3] == want_historical)
  ]

  merged_terms = _merge_wordpieces(selected_terms)
  if not merged_terms[0]:
    print(empty_message)
    return None
  if keep_noNER_sent:
    return list(set(merged_terms + sentences_with_no_entities))
  return list(set(merged_terms))

# Demo on topic 33: show the raw topic, then the ad-hoc query for three family/historical modes.
demo_topic = desc_tr['query'][33]
display(demo_topic)
for demo_label, demo_mode in (
    ('not_negated(BERT) remove family', ['', 'remove_family', 'None']),
    ('\n not_negated(BERT) remove history', ['', 'None', 'remove_historical_information']),
    ('\n not_negated(BERT) remove family and history', ['', 'not_family_history', 'not_historical_information']),
):
  print(demo_label)
  print(create_adhoc(identify_notnegated_ners_NegBERT_apply_context(demo_topic, ['problem','treatment','test'], demo_mode, False)))


'An 82-year-old woman comes to the emergency department because of chest pain and shortness of breath after being awakened in the morning by stabbing substernal chest pain radiating to the left shoulder and jaw. The patient had hypertension, renal-artery stenosis with chronic renal insufficiency, hypercholesterolemia, osteoporosis and dementia. Blood pressure was 199/108 mm Hg, respiratory rate 18 bpm, oxygen saturation 98% on ambient air. The heart sounds were rapid and with no murmurs. CK-MB was 10.9 ng/ml, CK was 89 U/l, CK index was 12.2% and Troponin T was 0.40 ng/ml. An EKG showed sinus regular tachycardia of 119 bpm, with ST-segment elevations up to 3 mm in V1, V2, and V3. A chest radiograph showed low lung volumes and right basilar subsegmental atelectasis. Coronary angiography showed no stenosis or clinically significant disease. Left ventriculography revealed akinesis of the anterior wall, hypokinesis of the apical and distal inferior walls, and compensatory hyperkinesis of t

not_negated(BERT) remove family
sounds mild mitral regurgitation Coronary angiography CK compensatory hyperkinesis of the basal anterior and basal inferior walls CK index st  segment elevations low lung volumes akinesis of the anterior wall dementia CKMB A transthoracic echocardiogram respiratory rate hypercholesterolemia Left ventriculography severe segmental left ventricular dysfunction renalartery stenosis murmurs chronic renal insufficiency oxygen saturation Blood pressure osteoporosis right basilar subsegmental atelectasisly significant disease anterosept A chest radiograph An EKG The overall left ventricular systolic function stabbing substernal chest pain STsegment elevations chest pain hypertension Troponin T right basilar subsegmental atelectasis troponin t mildly impaired

 not_negated(BERT) remove history
sounds mild mitral regurgitation Coronary angiography CK compensatory hyperkinesis of the basal anterior and basal inferior walls CK index st  segment elevations low lung v

## Identify Negated using NegBert and Context

Models:
1. NERs -- MODEL samrawal/bert-base-uncased_clinical-ner [Problem, treatment, test]
2. NER - using Med spacy
3. Med_spacy for applying ConText and categorizing the not_negated entities


1. Input: Raw Topics
2. Output: list with not_negated NERS, (identified by both methods)

Steps: 
1. Split the topic into sentences using spacy sci model
2. Get entities with the NERs Bert model
3. Remove identified negated sentences/NERs
4. Annotate the not_negated entities with the ConText Algorithm (family/historical)

In [None]:
def identify_notnegated_ners_NegBERT_context(topic: str,clasify_information: str, extracted_NER: list, keep_noNER_sent: bool):
  """Return the entities that NegBERT and ConText both treat as not negated.

  Args:
    topic: Raw free-text topic.
    clasify_information: Accepted for call compatibility; not read by this function.
    extracted_NER: Entity types to keep; forwarded unchanged to modify_segment_negation.
    keep_noNER_sent: When true, sentences without any identified entity are appended
      (as text) after the entity list.

  Returns:
    A list of entity strings found by both methods (order comes from a set intersection),
    optionally followed by the entity-free sentences; None when NegBERT found no PRESENT entity.
  """
  topic_sentences = split_sent(topic, nlp_sentence)
  context_pipeline = medspacy.load()

  negbert_present = []        # entity words NegBERT labelled PRESENT
  context_present = []        # entity texts whose ConText negation flag (index 1) is False
  entity_free_sentences = []  # text of sentences in which no entity was identified

  for sent in topic_sentences:
    sent_text = str(sent)
    # NER first, then mark the segment and let NegBERT classify each entity.
    found = get_entities(sent)
    marked = modify_segment_negation(sent_text, found, extracted_NER)
    labels, found = get_negations(marked, model2, tokenizer2, found)

    # ConText annotation of the same sentence; keep what it does not flag as negated.
    annotated = apply_context_med_entities(sent_text, found, context_pipeline)
    context_present.extend(str(ann[0]) for ann in annotated if ann[1] == False)

    if not found:
      entity_free_sentences.append(sent_text)

    negbert_present += [
      str(ent['word'])
      for ent, label in zip(found, labels)
      if str(label['label']) == 'PRESENT'
    ]

  # The word-piece-merged list is only used to decide whether NegBERT found anything at all.
  merged_present = '|'.join(negbert_present).replace('|##', '').split('|')
  not_negated_NERs = list(set(negbert_present) & set(context_present))

  if not merged_present[0]:
    print('No information.')
    return None
  if keep_noNER_sent:
    return (not_negated_NERs + entity_free_sentences)
  return not_negated_NERs
  
          
# Demo on topic 33: show the raw topic, then the query with and without the entity-free sentences.
demo_topic = desc_tr['query'][33]
display(demo_topic)
for demo_heading, demo_keep_sentences in (
    ('Not Negated NERS by both methods + Sentences without NERS \n', True),
    ('Not Negated NERS by both methods \n', False),
):
  print(demo_heading)
  display(create_adhoc(identify_notnegated_ners_NegBERT_context(demo_topic, 'present_ners', ['problem','treatment','test'], demo_keep_sentences)))



'An 82-year-old woman comes to the emergency department because of chest pain and shortness of breath after being awakened in the morning by stabbing substernal chest pain radiating to the left shoulder and jaw. The patient had hypertension, renal-artery stenosis with chronic renal insufficiency, hypercholesterolemia, osteoporosis and dementia. Blood pressure was 199/108 mm Hg, respiratory rate 18 bpm, oxygen saturation 98% on ambient air. The heart sounds were rapid and with no murmurs. CK-MB was 10.9 ng/ml, CK was 89 U/l, CK index was 12.2% and Troponin T was 0.40 ng/ml. An EKG showed sinus regular tachycardia of 119 bpm, with ST-segment elevations up to 3 mm in V1, V2, and V3. A chest radiograph showed low lung volumes and right basilar subsegmental atelectasis. Coronary angiography showed no stenosis or clinically significant disease. Left ventriculography revealed akinesis of the anterior wall, hypokinesis of the apical and distal inferior walls, and compensatory hyperkinesis of t

Not Negated NERS by both methods + Sentences without NERS 



'sounds mild mitral regurgitation compensatory hyperkinesis of the basal anterior and basal inferior walls st  segment elevations low lung volumes akinesis of the anterior wall dementia respiratory rate hypercholesterolemia severe segmental left ventricular dysfunction chronic renal insufficiency oxygen saturation osteoporosis anterosept stabbing substernal chest pain chest pain hypertension right basilar subsegmental atelectasis troponin t mildly impaired \n        '

Not Negated NERS by both methods 



'sounds mild mitral regurgitation compensatory hyperkinesis of the basal anterior and basal inferior walls st  segment elevations low lung volumes akinesis of the anterior wall dementia respiratory rate hypercholesterolemia severe segmental left ventricular dysfunction chronic renal insufficiency oxygen saturation osteoporosis anterosept stabbing substernal chest pain chest pain hypertension right basilar subsegmental atelectasis troponin t mildly impaired'

## Expand NER using UMLS concepts

Models:
1. NER with BERT
2. Expansion with UMLS

The model expects input in the form of **spans/sentences with one marked entity to classify** as PRESENT(0), ABSENT(1) or POSSIBLE(2). The entity in question is identified with the special token [entity] surrounding it.

1. Input: NER entity string or topic string, a nlp pipeline with umls expansion, a list with the expansion text ['aliases','definition','code','TUI']
2. Output: Expanded terms [definition, aliases, etc.]


In [None]:
def expand_entity(topic,nlp_umls, expand_with: list):
  """Expand a topic (or single entity string) with the UMLS concepts linked to its entities.

  For every entity in nlp_umls(topic) and every linked candidate concept, the entity
  text and the concept's official name are added, followed by the optional fields
  requested in expand_with. Concepts are looked up in the module-level `linker`.

  Args:
    topic: Text to expand.
    nlp_umls: Callable pipeline whose result exposes `.ents`, each entity carrying
      `._.kb_ents` (candidates whose first element is the concept id).
    expand_with: Any of 'aliases', 'definition', 'code', 'TUI'.

  Returns:
    The expansion as one space-joined string with punctuation removed. When nothing
    could be expanded (empty topic, no entities, or no linked concepts) the original
    topic is returned unchanged, as the displayed message states.
  """
  expanded_topic = []
  if topic:
    doc = nlp_umls(topic)
    for entity in doc.ents:
      for umls_ent in entity._.kb_ents:
        # Concept record fields used here: [0] code, [1] official name, [2] aliases, [3] TUI, [4] definition.
        concept = linker.kb.cui_to_entity[umls_ent[0]]
        #Add the entity and its official name
        expanded_topic.append(str(entity))
        expanded_topic.append(str(concept[1]))

        # Optional fields are added only when requested and non-empty.
        if 'aliases' in expand_with and concept[2]:
          expanded_topic.append(' '.join(concept[2]))
        if 'definition' in expand_with and concept[4]:
          expanded_topic.append(str(concept[4]))
        if 'code' in expand_with and concept[0]:
          expanded_topic.append(str(concept[0]))
        if 'TUI' in expand_with and concept[3]:
          expanded_topic.append(str(concept[3]))

  # This fallback must run before the join below; otherwise an unexpanded topic becomes ''.
  if not expanded_topic:
    display('Topic not expanded.Returned the original.')
    return topic

  return ' '.join(expanded_topic).translate(str.maketrans(' ', ' ', string.punctuation))

# Demo: expand topic 33 with every optional UMLS field; the returned string is the cell output.
expand_entity(desc_tr['query'][33], nlp_umls, expand_with=['aliases', 'definition', 'code', 'TUI'])

'woman comes Woman female humans Girl Human Females Girls Woman women human female female woman Women female human WOMAN adult female Woman person Human females as cultural psychological sociological political and economic entities C0043210 T098 emergency department Accident and Emergency department EMERGENCY ROOM emergency service hospital Accident and Emergency Department A  E  Accident and Emergency Department Hospital Services Emergency Service Emergency Hospital Service Emergency Hospital Units Emergency Hospital Emergency Services Rooms Emergency Room Emergency Emergency Room Emergency Emergency Departments emergency rooms Accident and Emergency department emergencies room Accident and Emergency department environment emergency department Hospital Service Emergency emergency hospital service Unit Emergency Hospital Emergency Service Emergency Service Hospital Service Emergencies Hospital AED  Accident and Emergency department Emergency Hospital Services Services Emergency Hospita

# Creation of the various query representations used in the paper

Each cell outputs a dataframe (in .txt) with the columns: qid, query. This .txt is ready to be used for retrieval with PyTerrier


## Creates Q1 to Q4 (by changing the entity extract list) and Q10 (by setting the bool values to True, True): 

Returns a query with NERs extracted with BERT.
Returns a query with not negated NERs extracted with BERT and NegBERT
Returns a query with not negated NERs and with the sentences with no NERs [True]

In [None]:
# Q10: not-negated NERs (NegBERT) plus the sentences that contain no medical NER.
experiment = 'Q10bert_All_not_negated_NERs_keep_Sentences'
extract = ['problem','treatment','test']
filename = f'{experiment}.csv'
save_path = save_reform_queries + filename


def _q10_query(raw_topic):
  """Build the Q10 query text for one raw topic.

  keep_noNER_sent=True keeps sentences without any medical NER;
  remove_negated_NERS=True drops the NERs identified as negated.
  """
  kept = identify_ners(raw_topic, 'present_ners', extract, keep_noNER_sent=True, remove_negated_NERS=True)
  return create_adhoc(kept)


desc_tr[experiment] = desc_tr['query'].apply(_q10_query)
display(desc_tr)

# Persist only the topic id and the reformulated query.
desc_tr[['qid', experiment]].to_csv(save_path)

Unnamed: 0,qid,query,Q10bert_All_not_negated_NERs_keep_Sentences
0,20141,A 58-year-old African-American woman presents ...,episodic pressing burning anterior chest pain ...
1,20142,An 8-year-old male presents in March to the ER...,fever dyspnea cough fever cough loose stools e...
2,20143,A 58-year-old nonsmoker white female with mild...,mild exertional dyspnea occasional cough a lef...
3,20144,A 2-year-old boy is brought to the emergency d...,high fever irritability the physical exam conj...
4,20145,A 56-year-old female on 20th day post-left mas...,post left mastectomy shortness of breath mala...
5,20146,64-year-old obese female with diagnosis of dia...,diabetes mellitus persistently elevated hba1c ...
6,20147,A 26-year-old obese woman with a history of bi...,bipolar disorder her recent struggles with her...
7,20148,A 62-year-old man sees a neurologist for progr...,progressive memory loss jerking movements of t...
8,20149,A 43-year-old woman visits her dermatologist f...,lesions on her neck examination multiple lesio...
9,201410,A physician is called to see a 67-year-old wom...,cardiac catheterization a cool right foot exam...


## Creates Q9: Remove negated NERs with Context and Neg_BERT

In [None]:
# Q9: NERs that both NegBERT and ConText consider not negated; entity-free sentences are dropped.
experiment = 'Q9_bert_problems_treatments_test_notnegated_bothmethods'
extracted = ['problem','treatment','test']  # entity types to extract
filename = f'{experiment}.csv'
save_path = save_reform_queries + filename


def _q9_query(raw_topic):
  """Build the Q9 query text for one raw topic."""
  kept = identify_notnegated_ners_NegBERT_context(raw_topic, 'present_ners', extracted, keep_noNER_sent=False)
  return create_adhoc(kept)


desc_tr[experiment] = desc_tr['query'].apply(_q9_query)
display(desc_tr)

# Persist only the topic id and the reformulated query.
desc_tr[['qid', experiment]].to_csv(save_path)

## Q12 - Q14: All not_negated NERs with BERT, Family history and historical with ConText

In [None]:
# Q12-Q14: not-negated NERs (NegBERT) filtered by the ConText family / historical flags.
# Each entry is (experiment name, [unused, family mode, historical mode]).
for experiment, context_mode in (
    ('Q12_All_notnegated_NER_Remove_Family_Historical', ['', 'not_family_history', 'not_historical_information']),
    ('Q13_All_notnegated_NER_Remove_Family', ['', 'remove_family', 'None']),
    ('Q14_All_notnegated_NER_Remove_historical', ['', 'None', 'remove_historical_information']),
):
  filename = experiment + '.csv'
  save_path = save_reform_queries + filename
  # The lambda runs inside this iteration, so it always sees the current context_mode.
  desc_tr[experiment] = desc_tr['query'].apply(
      lambda x: create_adhoc(identify_notnegated_ners_NegBERT_apply_context(x, ['problem','treatment','test'], context_mode, False)))
  desc_tr[['qid', experiment]].to_csv(save_path)


## Creates: 
1. Query (Q6): Merge all NERs with the Dosages and Drugs

2. Query (Q11): Merge all not_negated NERs BERT with the Dosages and Drugs. Set the bool values to: [True, True]

In [None]:
# Q11: not-negated NERs (plus entity-free sentences) merged with the extracted drugs and dosages.
selected_extraction_entities = dis_chem_stanza # Add the dataframe with the medical entities default dis_chem_stanza
experiment = 'Q11_bert_notnegated_NERs_sentences_add_drugs_dosages_dis_chem_stanza'
filename = experiment+'.csv'
save_path = save_reform_queries+ filename
desc_tr = load_topics(path_to_topics,collection)
# qid is cast to str before the merge; presumably the drug/dosage frame stores qid as str — TODO confirm.
desc_tr['qid'] = desc_tr['qid'].astype(str)

desc_tr[experiment] = desc_tr['query'].apply(lambda x: (create_adhoc(identify_ners(x,'present_ners',['problem','treatment','test'],keep_noNER_sent=True,remove_negated_NERS=True))))
# Left merge keeps every topic; the drug/dosage text is read from the suffixed 'query_y' column below.
desc_tr = pd.merge(desc_tr, selected_extraction_entities, on=["qid"],how='left')

# na_rep='' keeps the NER query for topics without a drug/dosage row. Without it, str.cat
# yields NaN whenever either side is missing and the whole query for that topic is lost.
desc_tr[experiment] = desc_tr[experiment].str.cat(desc_tr['query_y'], sep=' ', na_rep='')
display(desc_tr.head(2))

desc_tr[['qid',experiment]].to_csv(save_path)

## Query: Creates a query that contains the NOT negated (present) identified NERS [problems, treatments and test] of a patient. 


Use bert_clinical and neg+bert to identify not negated problems, treatments, and tests.

> 1. NER - bert-base-uncased_clinical-ner to identify ['problem', 'treatment','test']
> 2. Negated NERs - bvanaken/clinical-assertion-negation-bert 

In [None]:
# Not-negated problems, treatments and tests (clinical-NER BERT + NegBERT), entities only.
experiment = 'bert_problems_treat_test_not_negated_Neg_BERT'
filename = f'{experiment}.csv'
save_path = save_reform_queries + filename
desc_tr = load_topics(path_to_topics, collection)


def _present_ner_query(raw_topic):
  """Build the query of not-negated problem/treatment/test entities for one raw topic."""
  kept = identify_ners(raw_topic, 'present_ners', ['problem','treatment','test'], keep_noNER_sent=False, remove_negated_NERS=True)
  return create_adhoc(kept)


desc_tr[experiment] = desc_tr['query'].apply(_present_ner_query)
display(desc_tr)
desc_tr[['qid', experiment]].to_csv(save_path)

0     Patient is a 45-year-old man with a history o...
1     48 M with a h/o HTN hyperlipidemia, bicuspid ...
2     A 32 yo woman who presents following a severe...
3     This is a 44 year old female with PMH of PCOS...
4     74M hx of CAD s/p CABG, EF 60% prior CVA (no ...
Name: query, dtype: object

Unnamed: 0,qid,query,bert_problems_treat_test_not_negated_Neg_BERT
0,1,Patient is a 45-year-old man with a history o...,anaplastic astrocytoma of the spine severe low...
1,2,"48 M with a h/o HTN hyperlipidemia, bicuspid ...",htn hyperlipidemia bicuspid aortic valve progr...
2,3,A 32 yo woman who presents following a severe...,a severe exploding headache a sudden exp...
3,4,This is a 44 year old female with PMH of PCOS...,pcos obesity htn cholecystitis a large pericar...
4,5,"74M hx of CAD s/p CABG, EF 60% prior CVA (no ...",cad cabg ef htn hl dmii moderate to severe pvd...
...,...,...,...
70,71,The patient is a 34-year-old obese woman who ...,weight concerns weight bmi antiobesity agents ...
71,72,The patient is a 16-year-old girl recently di...,myasthenia gravis diplopia weakness positive a...
72,73,The patient is a 3-day-old female infant with...,jaundice an incubator vital signs axillary tem...
73,74,The patient is a 53-year-old man complaining ...,frequent headaches generalized bone pain diffi...


## Query: Creates a query that contains the NOT negated (present) identified NERS [problems] of a patient. 


Use bert_clinical and neg+bert to identify not negated problems, treatments, and tests.

> 1. NER - bert-base-uncased_clinical-ner to identify ['problem', 'treatment','test']
> 2. Negated NERs - bvanaken/clinical-assertion-negation-bert 

In [None]:
# Not-negated PROBLEM entities only (clinical-NER BERT + NegBERT).
# The experiment name mirrors the extracted entity label. It was previously labelled "treat"
# although this cell extracts problems, which mislabelled the saved column and CSV.
experiment = 'bert_problem_not_negated_Neg_BERT'
filename = experiment+'.csv'
save_path = save_reform_queries+ filename
desc_tr = load_topics(path_to_topics,collection)

desc_tr[experiment] = desc_tr['query'].apply(lambda x: (create_adhoc(identify_ners(x,'present_ners',['problem','',''],keep_noNER_sent=False,remove_negated_NERS=True))))
display(desc_tr)

desc_tr[['qid',experiment]].to_csv(save_path)

## Query: Creates a query that contains the NOT negated (present) identified NERS [treatments] of a patient. 


Use bert_clinical and neg+bert to identify not negated problems, treatments, and tests.

> 1. NER - bert-base-uncased_clinical-ner to identify ['problem', 'treatment','test']
> 2. Negated NERs - bvanaken/clinical-assertion-negation-bert 

In [None]:
# Not-negated TREATMENT entities only (clinical-NER BERT + NegBERT).
# The experiment name mirrors the extracted entity label. It was previously labelled "problems"
# although this cell extracts treatments, which mislabelled the saved column and CSV.
experiment = 'bert_treatment_not_negated_Neg_BERT'
filename = experiment+'.csv'
save_path = save_reform_queries+ filename
desc_tr = load_topics(path_to_topics,collection)

desc_tr[experiment] = desc_tr['query'].apply(lambda x: (create_adhoc(identify_ners(x,'present_ners',['','treatment',''],keep_noNER_sent=False,remove_negated_NERS=True))))
display(desc_tr)

desc_tr[['qid',experiment]].to_csv(save_path)

## Query: Creates a query that contains the NOT negated (present) identified NERS [tests] of a patient. 


Use bert_clinical and neg+bert to identify not negated problems, treatments, and tests.

> 1. NER - bert-base-uncased_clinical-ner to identify ['problem', 'treatment','test']
> 2. Negated NERs - bvanaken/clinical-assertion-negation-bert 

In [None]:
# Not-negated TEST entities only (clinical-NER BERT + NegBERT).
experiment = 'bert_test_not_negated_Neg_BERT'
filename = experiment+'.csv'
save_path = save_reform_queries+ filename
desc_tr = load_topics(path_to_topics,collection)

# 'test' sits in the third slot, matching the ['problem','treatment','test'] layout used by the
# sibling cells (['problem','',''], ['','treatment','']).
# NOTE(review): this is safe whether identify_ners reads the list by position or by membership — confirm which.
desc_tr[experiment] = desc_tr['query'].apply(lambda x: (create_adhoc(identify_ners(x,'present_ners',['','','test'],keep_noNER_sent=False,remove_negated_NERS=True))))
display(desc_tr)

desc_tr[['qid',experiment]].to_csv(save_path)

## Query: Create a query that contains NERs (identified with BERT_uncased) --> [problems, treatments, tests] that: 
1. are NOT negated (i.e., the present) information, 

Identified using Context algorithm and the BERT uncased_clinical. 



In [None]:
# ConText: entities that are not negated (family and historical flags ignored).
experiment = 'context_problems_treat_test_not_negated_ners'
filename = f'{experiment}.csv'
save_path = save_reform_queries + filename
desc_tr = load_topics(path_to_topics, collection)


def _context_not_negated_query(raw_topic):
  """Build the ConText not-negated query text for one raw topic."""
  kept = apply_context(raw_topic, ['not_negated','None','None'], keep_noNER_sent=False)
  return create_adhoc(kept)


desc_tr[experiment] = desc_tr['query'].apply(_context_not_negated_query)
display(desc_tr)

desc_tr[['qid', experiment]].to_csv(save_path)


## Query: Create a query that contains NERs (identified with BERT_uncased) --> [problems, treatments, tests] that: 
2. that is related to the patient's family, 

Identified using Context and the BERT uncased_clinical. 


In [None]:
# ConText: entities related to the patient's family history.
experiment = 'context_problems_treat_test_family'
filename = f'{experiment}.csv'
save_path = save_reform_queries + filename
desc_tr = load_topics(path_to_topics, collection)


def _context_family_query(raw_topic):
  """Build the ConText family-history query text for one raw topic."""
  kept = apply_context(raw_topic, ['None','family_history','None'], keep_noNER_sent=False)
  return create_adhoc(kept)


desc_tr[experiment] = desc_tr['query'].apply(_context_family_query)
display(desc_tr.head(2))

desc_tr[['qid', experiment]].to_csv(save_path)

## Query: Create a query that contains NERs (identified with BERT_uncased) --> [problems, treatments, tests] that: 
2. that is historical information, 

Identified using Context and the BERT uncased_clinical. 

In [None]:
# ConText: entities flagged as historical information.
experiment = 'context_problems_treat_test_historical'
filename = f'{experiment}.csv'
save_path = save_reform_queries + filename
desc_tr = load_topics(path_to_topics, collection)


def _context_historical_query(raw_topic):
  """Build the ConText historical-information query text for one raw topic."""
  kept = apply_context(raw_topic, ['None','None','historical_information'], keep_noNER_sent=False)
  return create_adhoc(kept)


desc_tr[experiment] = desc_tr['query'].apply(_context_historical_query)
display(desc_tr.head(2))

desc_tr[['qid', experiment]].to_csv(save_path)

## Query: Create a query that contains NERs (identified with BERT_uncased) --> [problems, treatments, tests] that: 
1. Not negated (i.e., present)
2. that is not_family (i.e., patient related)
3. not historical information (i.e, active currently), 

I.e., all not negated patient only information. 

Identified using Context and the BERT uncased_clinical. 

In [None]:
# ConText: entities that are not negated, not family-related and not historical
# (i.e., current, patient-only information).
# The name previously duplicated 'context_problems_treat_test_historical', so this cell
# overwrote the CSV written by the historical-information cell.
experiment = 'context_problems_treat_test_not_negated_notfam_nothist'
filename = experiment+'.csv'
save_path = save_reform_queries+ filename
desc_tr = load_topics(path_to_topics,collection)

desc_tr[experiment] = (desc_tr['query'].apply(lambda x: create_adhoc(apply_context(x, ['not_negated','not_family_history','not_historical_information'],keep_noNER_sent=False))))
display(desc_tr.head(2))

desc_tr[['qid',experiment]].to_csv(save_path)


## Query: Create a query that contains NERs (identified with BERT_uncased) --> [problems, treatments, tests] that: 
1. Not negated (i.e., present)
2. family history (i.e., related to the family history)
3. not historical information (i.e, active currently), 

Identified using Context and the BERT uncased_clinical. 

In [None]:
# Experiment: entities that are not negated, belong to the family history and
# are not historical. `experiment` names both the new column and the CSV file.
experiment = 'context_problems_treat_test_not_negated_fam_nothist'
filename = f'{experiment}.csv'
save_path = save_reform_queries + filename
desc_tr = load_topics(path_to_topics, collection)

# Filter slots are [negation, family, historical].
desc_tr[experiment] = desc_tr['query'].apply(
    lambda topic: create_adhoc(
        apply_context(
            topic,
            ['not_negated', 'family_history', 'not_historical_information'],
            keep_noNER_sent=False,
        )
    )
)
display(desc_tr.head(2))

# Persist only the topic id and the reformulated query.
desc_tr[['qid', experiment]].to_csv(save_path)


Unnamed: 0,qid,query,bert_problems_treat_test_not_negated_Neg_BERT,context_problems_treat_test_not_negated_ners,context_problems_treat_test_family,context_problems_treat_test_historical,context_problems_treat_test_not_negated_notfam_nothist,context_problems_treat_test_not_negated_fam_nothist
0,1,Patient is a 45-year-old man with a history o...,anaplastic astrocytoma of the spine severe low...,chronic pain severe lower extremity weakness F...,,chronic pain severe lower extremity weakness F...,radiation The tumor unresectable anaplastic as...,
1,2,"48 M with a h/o HTN hyperlipidemia, bicuspid ...",htn hyperlipidemia bicuspid aortic valve progr...,le edema h bicuspid aortic valve progressive S...,,bicuspid aortic valve progressive SOB LE edema,le edema hning lv function TTE severe aortic s...,


## Query: Create a query that contains NERs (identified with BERT_uncased) --> [problems, treatments, tests] that: 
1. Not negated (i.e., present)
2. not family history (i.e., patient related)
3. historical information (i.e., not currently active), 

Identified using Context and the BERT uncased_clinical. 

In [None]:
# Experiment: entities that are not negated, not family history, and historical.
# `experiment` is both the new column name and the stem of the output CSV. It was
# previously a copy of the family/not-historical experiment's name
# ('..._not_negated_fam_nothist'), which overwrote that experiment's file with
# results produced by different filters.
experiment = 'context_problems_treat_test_not_negated_notfam_hist'
filename = experiment + '.csv'
save_path = save_reform_queries + filename
desc_tr = load_topics(path_to_topics, collection)

# Filter slots are [negation, family, historical].
desc_tr[experiment] = desc_tr['query'].apply(
    lambda x: create_adhoc(
        apply_context(
            x,
            ['not_negated', 'not_family_history', 'historical_information'],
            keep_noNER_sent=False,
        )
    )
)
display(desc_tr.head(2))

# Persist only the topic id and the reformulated query.
desc_tr[['qid', experiment]].to_csv(save_path)


Unnamed: 0,qid,query,bert_problems_treat_test_not_negated_Neg_BERT,context_problems_treat_test_not_negated_ners,context_problems_treat_test_family,context_problems_treat_test_historical,context_problems_treat_test_not_negated_notfam_nothist,context_problems_treat_test_not_negated_fam_nothist,context_problems_treat_test_not_negated_notfam_hist
0,1,Patient is a 45-year-old man with a history o...,anaplastic astrocytoma of the spine severe low...,chronic pain severe lower extremity weakness F...,,chronic pain severe lower extremity weakness F...,radiation The tumor unresectable anaplastic as...,,chronic pain severe lower extremity weakness F...
1,2,"48 M with a h/o HTN hyperlipidemia, bicuspid ...",htn hyperlipidemia bicuspid aortic valve progr...,le edema h bicuspid aortic valve progressive S...,,bicuspid aortic valve progressive SOB LE edema,le edema hning lv function TTE severe aortic s...,,bicuspid aortic valve progressive SOB LE edema


## Query: Create a query that contains NERs (identified with BERT_uncased) --> [problems, treatments, tests] that: 
1. Not negated (i.e., present)
2. family history (i.e., related to the family history)
3. historical information (i.e., not currently active), 

Identified using Context and the BERT uncased_clinical. 

In [None]:
# Experiment: entities that are not negated, belong to the family history and
# are historical. `experiment` names both the new column and the CSV file.
experiment = 'context_problems_treat_test_not_negated_fam_hist'
filename = f'{experiment}.csv'
save_path = save_reform_queries + filename
desc_tr = load_topics(path_to_topics, collection)

# Filter slots are [negation, family, historical].
desc_tr[experiment] = desc_tr['query'].apply(
    lambda topic: create_adhoc(
        apply_context(
            topic,
            ['not_negated', 'family_history', 'historical_information'],
            keep_noNER_sent=False,
        )
    )
)
display(desc_tr.head(2))

# Persist only the topic id and the reformulated query.
desc_tr[['qid', experiment]].to_csv(save_path)

# Query expansion using UMLS 

## Expansion of Not_negated,not_related_family,not_historical with aliases, codes and TUIS

In [None]:
# Experiment: take the not-negated / not-family / not-historical entities and
# expand them through expand_entity with the UMLS pipeline (`nlp_umls`),
# requesting aliases, codes and TUIs (the definition slot is disabled with 'None').
experiment = 'context_expanded_aliases_not_negated_notfam_nothist'
filename = experiment + '.csv'
save_path = save_reform_queries + filename
desc_tr = load_topics(path_to_topics, collection)

desc_tr[experiment] = desc_tr['query'].apply(
    lambda x: expand_entity(
        create_adhoc(
            apply_context(
                x,
                ['not_negated', 'not_family_history', 'not_historical_information'],
                keep_noNER_sent=False,
            )
        ),
        nlp_umls,
        ['aliases', 'None', 'code', 'TUI'],
    )
)
display(desc_tr.head(2))

# Write to `save_path` like every other experiment cell. The earlier version
# ignored `save_path`, wrote the CSV to the working directory and copied it to a
# hard-coded "./Created_Queries/TREC/" folder, which is wrong whenever
# `collection` is not the TREC one.
# NOTE(review): if something still reads the file from ./Created_Queries/TREC/,
# point it at `save_reform_queries` instead.
desc_tr[['qid', experiment]].to_csv(save_path)

Unnamed: 0,qid,query,bert_problems_treat_test_not_negated_Neg_BERT,context_problems_treat_test_not_negated_ners,context_problems_treat_test_family,context_problems_treat_test_historical,context_problems_treat_test_not_negated_notfam_nothist,context_problems_treat_test_not_negated_fam_nothist,context_problems_treat_test_not_negated_notfam_hist,context_problems_treat_test_not_negated_fam_hist,context_expanded_aliases_not_negated_notfam_nothist
0,1,Patient is a 45-year-old man with a history o...,anaplastic astrocytoma of the spine severe low...,chronic pain severe lower extremity weakness F...,,chronic pain severe lower extremity weakness F...,radiation The tumor unresectable anaplastic as...,,chronic pain severe lower extremity weakness F...,,radiation Electromagnetic Radiation wave radia...
1,2,"48 M with a h/o HTN hyperlipidemia, bicuspid ...",htn hyperlipidemia bicuspid aortic valve progr...,le edema h bicuspid aortic valve progressive S...,,bicuspid aortic valve progressive SOB LE edema,le edema hning lv function TTE severe aortic s...,,bicuspid aortic valve progressive SOB LE edema,,edema Edema dropsy Edematous EDEMAS Edema find...


## Expansion of Not_negated,not_related_family,not_historical with aliases, codes,definition and TUIS

In [None]:
# Experiment: take the not-negated / not-family / not-historical entities and
# expand them through expand_entity with the UMLS pipeline (`nlp_umls`),
# requesting aliases, definitions, codes and TUIs.
experiment = 'context_expanded_aliases_def_not_negated_notfam_nothist'
filename = experiment + '.csv'
save_path = save_reform_queries + filename
desc_tr = load_topics(path_to_topics, collection)

desc_tr[experiment] = desc_tr['query'].apply(
    lambda x: expand_entity(
        create_adhoc(
            apply_context(
                x,
                ['not_negated', 'not_family_history', 'not_historical_information'],
                keep_noNER_sent=False,
            )
        ),
        nlp_umls,
        ['aliases', 'definition', 'code', 'TUI'],
    )
)
display(desc_tr.head(2))

# Select the column by its name (`experiment`). Using `filename` here raised a
# KeyError because no column is called '<experiment>.csv'.
desc_tr[['qid', experiment]].to_csv(save_path)

## Load an already created adhoc representation and expand it with UMLS.

In [None]:
# Name of the previously saved experiment whose queries will be expanded.
# This must NOT be stored in a variable called `load_topics`: that would replace
# the topic-loading function with a string and make the load_topics(...) call
# below (and in any cell run afterwards) fail with "'str' object is not callable".
source_experiment = 'Q06_bert_NERs_add_drugs_dosages_dis_chem_stanza' #Q13_All_notnegated_NER_Remove_Family
read_file = save_reform_queries + source_experiment + '.csv'
#The one conducted here:
experiment = 'Q17_expanded_allias_def_bert_NERs_add_drugs_dosages_dis_chem_stanza'  #Q13_All_notnegated_NER_Remove_Family
filename = experiment+'.csv'
save_path = save_reform_queries+ filename
desc_tr = load_topics(path_to_topics,collection)

#Read queries to be expanded:
Q06_bert_NERs_add_drugs_dosages_dis_chem_stanza = pd.read_csv(read_file)
# Q13_All_notnegated_NER_Remove_Family = pd.read_csv(read_file)
display(Q06_bert_NERs_add_drugs_dosages_dis_chem_stanza)

# Expand every stored query with UMLS aliases, definitions, codes and TUIs.
# NOTE(review): the source column is 'Q6_bert_NERs_add_drugs_dosages', which does
# not match the file stem ('Q06_..._dis_chem_stanza') -- confirm it exists in the CSV.
Q06_bert_NERs_add_drugs_dosages_dis_chem_stanza[experiment] = Q06_bert_NERs_add_drugs_dosages_dis_chem_stanza['Q6_bert_NERs_add_drugs_dosages'].apply(lambda x: expand_entity(x,nlp_umls,['aliases','definition','code','TUI']))
display(Q06_bert_NERs_add_drugs_dosages_dis_chem_stanza.head(2))
# Persist only the topic id and the expanded query.
Q06_bert_NERs_add_drugs_dosages_dis_chem_stanza[['qid',experiment]].to_csv(save_path)
