<a href="https://colab.research.google.com/github/HuyenNguyenHelen/CORD-19-KG/blob/master/BioBERT_word_embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Loading processed queries and documents
- Processed query file: produced by n-gram tokenization, MetaMap extraction, keyword extraction, or named entity recognition
- Processed documents: produced by MetaMap extraction, keyword extraction, or named entity recognition

#### Query: keyword expansion
#### Doc: ngram tokenization


In [2]:
import pandas as pd

In [3]:
# Read the query table; the file is decoded as cp1252 while it is read.
query_csv_path = r'/content/PRF_kwExtraction_Query2016_1-3gram.csv'
with open(query_csv_path, mode='r', encoding='cp1252') as query_file:
    queries = pd.read_csv(query_file)

# Preview the first three rows.
queries.head(3)

Unnamed: 0.1,Unnamed: 0,queryID,summary,summary_keyword,description,description_keyword,note,note_keyword
0,0,1,A 78 year old male presents with frequent stoo...,"{'male': 0.29736558256021506, 'presents': 0.29...",78 M transferred to nursing home for rehab aft...,"{'approximately': 0.3881970960906714, 'melanot...",78 M w/ pmh of CABG in early [**Month (only...,"{'nursing': 0.16048483002786335, 'home': 0.160..."
1,1,2,An elderly female with past medical history of...,"{'elderly': 0.15831692877998726, 'female': 0.1...",An elderly female with past medical history of...,"{'elderly': 0.16383273847958243, 'female': 0.1...",Ms [**Known patient lastname 241**] is a [*...,"{'hyperlipidemia': 0.14664469725594667, 'Ortho..."
2,2,3,A 75F found to be hypoglycemic with hypotensio...,"{'leukocytosis': 0.5590855488092952, 'creatini...","A 75F with a PMHx significant for severe PVD, ...",{'hypotension and confusion': 0.18857126108325...,Pt is a 75F with a PMHx significant for sev...,"{'unresponsive at home': 0.16805088855153935, ..."


In [4]:
# Read the document table; the file is decoded as UTF-8 while it is read.
docs_csv_path = r'/content/ngram_token_brief_titles.csv'
with open(docs_csv_path, mode='r', encoding='utf-8') as docs_file:
    docs = pd.read_csv(docs_file)

# Preview the first three rows.
docs.head(3)

Unnamed: 0.1,Unnamed: 0,queryID,brief_title,ngrams_tokens
0,0,1,Dabrafenib and Trametinib in Treating Patients...,"['Dabrafenib', 'and', 'Trametinib', 'in', 'Tre..."
1,1,2,Dabrafenib and Trametinib in Treating Patients...,"['Dabrafenib', 'and', 'Trametinib', 'in', 'Tre..."
2,2,3,Functionality of an 8-Channel Paddle Coil for ...,"['Functionality', 'of', 'an', '8-Channel', 'Pa..."


In [5]:
!pip install biobert-embedding==0.1.2


Collecting biobert-embedding==0.1.2
  Downloading https://files.pythonhosted.org/packages/d2/f0/f5bd3fd4a0bcef4d85e5e82347ae73d376d68dc8086afde75838ba0473a2/biobert-embedding-0.1.2.tar.gz
Collecting torch==1.2.0
[?25l  Downloading https://files.pythonhosted.org/packages/05/65/5248be50c55ab7429dd5c11f5e2f9f5865606b80e854ca63139ad1a584f2/torch-1.2.0-cp37-cp37m-manylinux1_x86_64.whl (748.9MB)
[K     |████████████████████████████████| 748.9MB 13kB/s 
[?25hCollecting pytorch-pretrained-bert==0.6.2
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K     |████████████████████████████████| 133kB 33.0MB/s 
Collecting boto3
[?25l  Downloading https://files.pythonhosted.org/packages/3f/77/cc19511d0fe4672890209ebcbfb9b3f4746572f5a48f7ed2654e7f8c2f29/boto3-1.17.89-py2.py3-none-any.whl (131kB)
[K     |████████████████████████████████| 133kB 59.3MB/s 
Collecting b

In [6]:
from biobert_embedding.embedding import BiobertEmbedding
from sklearn.metrics.pairwise import cosine_similarity

def bioBERT_topsim(query_term, doc_terms, k):
    """Return the ``k`` entries of ``doc_terms`` most similar to ``query_term``.

    Every term is embedded with ``bioBERTfit`` and scored against the query
    embedding with ``cal_cosine_sim``; ``most_similar`` then keeps the ``k``
    best-scoring terms.

    Args:
        query_term: Term whose embedding every candidate is compared with.
        doc_terms: Iterable of hashable candidate terms. A term that occurs
            several times is embedded and scored only once.
        k: Maximum number of terms to keep.

    Returns:
        The result of ``most_similar`` for the ``{term: similarity}`` mapping,
        i.e. a dict with at most ``k`` terms ordered from highest to lowest
        similarity.
    """
    query_vec = bioBERTfit(query_term)
    # The result is keyed by term, so scoring a repeated term again would only
    # overwrite an identical entry. dict.fromkeys removes the repeats while
    # keeping first-seen order, which saves one embedding pass per duplicate.
    unique_terms = dict.fromkeys(doc_terms)
    cos_sim = {
        term: cal_cosine_sim(bioBERTfit(term), query_vec)
        for term in unique_terms
    }
    return most_similar(cos_sim, k)
        
# Shared BiobertEmbedding instance, created lazily on the first bioBERTfit call.
_BIOBERT_MODEL = None

def bioBERTfit(word):
    """Embed ``word`` with BioBERT.

    The ``BiobertEmbedding`` object is built on the first call and reused
    afterwards. Constructing it is expensive (the first construction in the
    example run prints "Downloading the biobert model, will take a minute..."),
    and this function is called once per term, so building a fresh instance on
    every call dominates the runtime.

    Args:
        word: Text passed to ``BiobertEmbedding.word_vector``.

    Returns:
        Whatever ``BiobertEmbedding.word_vector`` returns for ``word``.
        NOTE(review): callers index the result with ``[0]``, so it is
        presumably a sequence of per-token vectors; confirm against the
        library documentation.
    """
    global _BIOBERT_MODEL
    if _BIOBERT_MODEL is None:
        _BIOBERT_MODEL = BiobertEmbedding()
    return _BIOBERT_MODEL.word_vector(word)
    
def cal_cosine_sim(single_vec_query, sing_vec):
    """Cosine similarity between the first vectors of two embeddings.

    Only element ``[0]`` of each argument is used; it is flattened into a
    single row before being handed to ``cosine_similarity``.
    NOTE(review): if an embedding holds one vector per token, a multi-word
    term is represented here by its first token only; confirm that this is
    intended.

    Args:
        single_vec_query: Indexable embedding whose first element supports
            ``reshape``.
        sing_vec: Second embedding, same requirements.

    Returns:
        The value returned by ``cosine_similarity`` for the two single-row
        inputs.
    """
    first_row = single_vec_query[0].reshape(1, -1)
    second_row = sing_vec[0].reshape(1, -1)
    return cosine_similarity(first_row, second_row)

def most_similar(dic, k_):
    """Keep the ``k_`` entries of ``dic`` with the largest values.

    Args:
        dic: Mapping from item to a comparable score.
        k_: Number of leading entries to keep after sorting.

    Returns:
        A new dict with the selected entries, ordered from largest to
        smallest score. Entries with equal scores keep their original
        relative order.
    """
    def _score(pair):
        # pair is a (key, value) tuple; rank by the value.
        return pair[1]

    ranked = sorted(dic.items(), key=_score, reverse=True)
    return dict(ranked[:k_])
    



In [7]:
# Example: rank a few candidate terms against one query word and keep the
# three best matches.
doc_terms = [
    'job',
    'sky',
    'sweet tea',
    'do',
    'scientific papers',
    'working',
    'high performance',
]
query = 'work'

bioBERT_topsim(query_term=query, doc_terms=doc_terms, k=3)

Downloading the biobert model, will take a minute...




{'do': array([[0.829417]], dtype=float32),
 'job': array([[0.8601637]], dtype=float32),
 'working': array([[0.88085926]], dtype=float32)}

#### Applying the model to the data

In [None]:
import ast

# Rows of the two tables are paired by position. For each pair, look up for
# every query keyword the five document tokens that score highest against it.
top_sim_by_q = []
keyword_column = queries['summary_keyword']
token_column = docs['ngrams_tokens']
for raw_keywords, raw_tokens in zip(keyword_column, token_column):
    # Both columns hold Python literals stored as text, so parse them back.
    keyword_scores = ast.literal_eval(raw_keywords)
    title_tokens = ast.literal_eval(raw_tokens)
    top_sim_by_q.append(
        {
            keyword: bioBERT_topsim(keyword, title_tokens, 5)
            for keyword in keyword_scores.keys()
        }
    )
  

In [None]:
from transformers import AutoTokenizer, AutoModel
# The tokenizer and the model are loaded from the same pretrained identifier.
clinical_bert_name = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(clinical_bert_name)
model = AutoModel.from_pretrained(clinical_bert_name)
