<a href="https://colab.research.google.com/github/HuyenNguyenHelen/MyTREC_2021/blob/main/BioBERT_word_embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Loading processed queries and documents
- Processed query file: produced by n-gram tokenization, MetaMap extraction, keyword extraction, or named entity recognition
- Processed document file: produced by MetaMap extraction, keyword extraction, or named entity recognition

#### Query: keyword expansion
#### Doc: ngram tokenization


In [None]:
import pandas as pd

In [None]:
# Query table with keyword-extraction columns; the file is decoded as cp1252.
queries_csv_path = r'/content/PRF_kwExtraction_Query2016_1-3gram.csv'
with open(queries_csv_path, 'r', encoding='cp1252') as queries_file:
    queries = pd.read_csv(queries_file)
queries.head(3)

Unnamed: 0.1,Unnamed: 0,queryID,summary,summary_keyword,description,description_keyword,note,note_keyword
0,0,1,A 78 year old male presents with frequent stoo...,"{'male': 0.29736558256021506, 'presents': 0.29...",78 M transferred to nursing home for rehab aft...,"{'approximately': 0.3881970960906714, 'melanot...",78 M w/ pmh of CABG in early [**Month (only...,"{'nursing': 0.16048483002786335, 'home': 0.160..."
1,1,2,An elderly female with past medical history of...,"{'elderly': 0.15831692877998726, 'female': 0.1...",An elderly female with past medical history of...,"{'elderly': 0.16383273847958243, 'female': 0.1...",Ms [**Known patient lastname 241**] is a [*...,"{'hyperlipidemia': 0.14664469725594667, 'Ortho..."
2,2,3,A 75F found to be hypoglycemic with hypotensio...,"{'leukocytosis': 0.5590855488092952, 'creatini...","A 75F with a PMHx significant for severe PVD, ...",{'hypotension and confusion': 0.18857126108325...,Pt is a 75F with a PMHx significant for sev...,"{'unresponsive at home': 0.16805088855153935, ..."


In [None]:
# (rows, columns) of the loaded query table.
queries.shape

(30, 8)

In [None]:
# Document table holding brief titles and their n-gram tokens; decoded as UTF-8.
docs_csv_path = r'/content/ngram_token_brief_titles.csv'
with open(docs_csv_path, 'r', encoding='utf-8') as docs_file:
    docs = pd.read_csv(docs_file)
docs.head(3)

Unnamed: 0.1,Unnamed: 0,queryID,brief_title,ngrams_tokens
0,0,1,Dabrafenib and Trametinib in Treating Patients...,"['Dabrafenib', 'and', 'Trametinib', 'in', 'Tre..."
1,1,2,Dabrafenib and Trametinib in Treating Patients...,"['Dabrafenib', 'and', 'Trametinib', 'in', 'Tre..."
2,2,3,Functionality of an 8-Channel Paddle Coil for ...,"['Functionality', 'of', 'an', '8-Channel', 'Pa..."


In [None]:
# (rows, columns) of the loaded document table.
docs.shape

(40, 4)

In [None]:
!pip install biobert-embedding==0.1.2




In [None]:
from biobert_embedding.embedding import BiobertEmbedding
from sklearn.metrics.pairwise import cosine_similarity

def bioBERT_topsim(query_term, doc_terms, k):
    """Return the `k` document terms most similar to `query_term`.

    Args:
        query_term: Text handed to `bioBERTfit` to obtain the query embedding.
        doc_terms: Iterable of hashable candidate terms; each distinct term is
            embedded with `bioBERTfit`.
        k: Maximum number of terms to keep.

    Returns:
        Dict mapping each retained term to the similarity value produced by
        `cal_cosine_sim`, as selected and ordered by `most_similar`.
    """
    query_vec = bioBERTfit(query_term)
    # Scores are keyed by term, so a repeated term can only ever contribute one
    # entry. Embedding each distinct term once (first-occurrence order kept by
    # dict.fromkeys) avoids redundant model calls without changing the result.
    cos_sim = {
        item: cal_cosine_sim(bioBERTfit(item), query_vec)
        for item in dict.fromkeys(doc_terms)
    }
    return most_similar(cos_sim, k)
        
def bioBERTfit(word):
    """Return the BioBERT embedding of `word`.

    Args:
        word: Text passed to `BiobertEmbedding.word_vector`.

    Returns:
        Whatever `BiobertEmbedding.word_vector(word)` returns.
    """
    # Constructing BiobertEmbedding on every call repeats the model set-up for
    # each word looked up. Build it once and keep it on the function object so
    # later calls reuse the same instance.
    biobert = getattr(bioBERTfit, "_biobert", None)
    if biobert is None:
        biobert = BiobertEmbedding()
        bioBERTfit._biobert = biobert
    return biobert.word_vector(word)
    
def cal_cosine_sim(single_vec_query, sing_vec):
    """Cosine similarity between the first vectors of the two inputs.

    Both arguments are indexed with `[0]` and flattened to a single row before
    being passed to scikit-learn's `cosine_similarity`; the result of that call
    is returned unchanged.

    NOTE(review): only element [0] of each input is compared. If the inputs
    hold one vector per token, multi-token terms are scored by their first
    token alone -- confirm this is intended.
    """
    first_row = single_vec_query[0].reshape(1, -1)
    second_row = sing_vec[0].reshape(1, -1)
    return cosine_similarity(first_row, second_row)

def most_similar(dic, k_):
    """Return the `k_` entries of `dic` with the largest values.

    Args:
        dic: Mapping whose values can be compared with each other.
        k_: Number of entries to keep.

    Returns:
        A new dict with at most `k_` items, ordered from largest to smallest
        value; entries with equal values keep their original relative order.
    """
    ranked = sorted(dic.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked[:k_])
    



In [None]:
# Example: rank a handful of candidate terms against one query word and
# keep the three most similar.
query = 'work'
doc_terms = [
    'job',
    'sky',
    'sweet tea',
    'do',
    'scientific papers',
    'working',
    'high performance',
]

bioBERT_topsim(query, doc_terms, k=3)

{'do': array([[0.829417]], dtype=float32),
 'job': array([[0.8601637]], dtype=float32),
 'working': array([[0.88085926]], dtype=float32)}

#### Applying the model to the data

In [None]:
import ast
def convertFormat(txt):
  """Parse the string form of a Python literal into the object it describes.

  Args:
    txt: A string such as "['a', 'b']" or "{'a': 0.5}". A value that is
      already a dict, list, tuple or set is returned unchanged, so the
      conversion can be applied again to a column that was already converted.

  Returns:
    The object produced by `ast.literal_eval`, or `txt` itself when it is
    already a container.

  Raises:
    ValueError, SyntaxError: propagated from `ast.literal_eval` when `txt`
      is not a valid literal.
  """
  if isinstance(txt, (dict, list, tuple, set)):
    return txt
  return ast.literal_eval(txt)
# Put the query id and summary keywords next to the first 30 rows of document
# n-gram tokens; rows are matched on the shared index.
# NOTE(review): this pairs query i with document row i by position -- confirm
# that this is the intended query/document matching.
temp_df = pd.concat(
    [queries[['queryID', 'summary_keyword']], docs['ngrams_tokens'][:30]],
    axis=1,
)
temp_df.head()


Unnamed: 0,queryID,summary_keyword,ngrams_tokens
0,1,"{'male': 0.29736558256021506, 'presents': 0.29...","['Dabrafenib', 'and', 'Trametinib', 'in', 'Tre..."
1,2,"{'elderly': 0.15831692877998726, 'female': 0.1...","['Dabrafenib', 'and', 'Trametinib', 'in', 'Tre..."
2,3,"{'leukocytosis': 0.5590855488092952, 'creatini...","['Functionality', 'of', 'an', '8-Channel', 'Pa..."
3,4,"{'woman': 0.15831692877998726, 'anxiety': 0.15...","['Patients', 'With', 'Refractory', 'Metastatic..."
4,5,"{'multiple': 0.15831692877998726, 'chronic': 0...","['HPV', 'Self-Test', 'Intervention', 'in', 'Oh..."


In [None]:
# Both columns hold the string form of Python literals (a dict of keyword
# weights and a list of tokens); parse them into real objects.
# Series.map is the element-wise call for a single column; DataFrame.applymap
# is deprecated in recent pandas releases.
temp_df['summary_keyword'] = temp_df['summary_keyword'].map(convertFormat)
temp_df['ngrams_tokens'] = temp_df['ngrams_tokens'].map(convertFormat)

In [None]:
def topSimTerm(q, d, k, max_query_terms=3, max_doc_terms=6):
  """Find the most similar document terms for the leading query terms.

  Args:
    q: Dict whose keys are the query terms, taken in the dict's own order.
    d: Sequence of document terms.
    k: Number of similar terms to keep per query term.
    max_query_terms: Only the first this-many query terms are processed.
      Defaults to 3; pass None to process every term.
    max_doc_terms: Only the first this-many document terms are considered.
      Defaults to 6; pass None to consider all of them.

  Returns:
    Dict mapping each processed query term to the result of
    `bioBERT_topsim(term, <document terms>, k)`.
  """
  # The limits were previously fixed inside the body; the defaults keep the
  # same truncation while letting callers widen or remove it.
  doc_terms = d[:max_doc_terms]
  return {
      term: bioBERT_topsim(term, doc_terms, k)
      for term in list(q)[:max_query_terms]
  }
  

In [None]:
# Hand-written example: one query's keyword weights and one list of title tokens.
q = {
    'male': 0.29736558256021506,
    'presents': 0.29736558256021506,
    'frequent': 0.29736558256021506,
    'stools': 0.29736558256021506,
    'year': 0.15831692877998726,
    'melena': 0.15831692877998726,
    'male presents': 0.09700399286574239,
    'presents with frequent': 0.09700399286574239,
    'frequent stools': 0.09700399286574239,
    'year old male': 0.04940384002065631,
    'stools and melena': 0.04940384002065631,
}
d = [
    'Fotemustine', 'FTM', 'and', 'Ipilimumab', 'and', 'Nivolumab', 'in',
    'Melanoma', 'Brain', 'Metastasis', 'Study', 'to', 'Determine',
    'Tolerability', 'After', 'Intravenous', 'Administration', 'of', 'BIBN',
    '4096', 'BS', 'in', 'Healthy', 'Male', 'and', 'Female', 'Volunteers',
    'Minoxidil', '2', 'Solution', 'and', 'Botanical',
]
k = 3

# For every query keyword, collect the k document tokens most similar to it.
term_top_sim = {}
query_terms = list(q)
for term in query_terms:
    term_top_sim[term] = bioBERT_topsim(term, d, k)

In [None]:
# Show the per-keyword similarity results collected in `term_top_sim`.
term_top_sim


{'stools': {'BS': array([[0.8258561]], dtype=float32),
  'Solution': array([[0.81991756]], dtype=float32),
  'Volunteers': array([[0.83191717]], dtype=float32)}}

In [None]:
# Row by row, call topSimTerm on that row's parsed keyword dict and token list,
# keeping 4 similar terms per keyword, and store the result in a new column.
# topSimTerm needs one row's dict and list at a time, hence apply(axis=1)
# rather than a call on whole columns.
temp_df['top_similar_terms']= temp_df.apply(lambda x: topSimTerm(x.summary_keyword,x.ngrams_tokens, 4), axis=1)

In [None]:
# Display the column of per-row similarity results. The column must already
# exist on temp_df; the saved output of this cell records a KeyError.
temp_df['top_similar_terms']

KeyError: ignored

In [None]:
# NOTE(review): neither `df` nor `add` is defined anywhere in the visible
# notebook, so this cell would raise NameError on a fresh kernel. It looks like
# a leftover scratch example unrelated to the analysis -- confirm and remove.
df["Age + Weight"] = add(df["Age"], df["Weight"])

In [None]:
from transformers import AutoTokenizer, AutoModel
# Load the tokenizer and the model from the same pretrained checkpoint.
clinical_bert_checkpoint = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(clinical_bert_checkpoint)
model = AutoModel.from_pretrained(clinical_bert_checkpoint)
