# An Example of How to Use Siamese BioBERT to Create an IR System

In [4]:
import pandas as pd
import glob
import os
from elasticsearch import Elasticsearch, helpers
import requests
import numpy as np
import csv
import sys

ROOT_DIR = '../'
sys.path.append(ROOT_DIR)

from utils.model_data_utils import *
from utils.annoy_helper import *

In [2]:
# Choose the compute device: the first CUDA GPU when PyTorch reports one, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'
device

'cpu'

## Setting UP Elastic Search Parameters

In [3]:
# Host of the Elasticsearch node used by this notebook (hard-coded; adjust for your environment).
ES_HOST = '172.17.0.3'
# Elasticsearch HTTP port, kept as a string; it is interpolated into the search URLs built below.
ES_PORT = '9200'
# Name of the index that the reference data is written to and queried from.
EVAL_INDEX = 'uniprot_protein'

This setting is specific to the Novartis machines: because this example connects to a local host, we clear all proxy-related environment variables for the duration of this notebook.

In [13]:
# Blank out every proxy-related variable (both spellings) so requests go straight to the local host.
for _proxy_var in ('http_proxy', 'HTTP_PROXY', 'https_proxy', 'HTTPS_PROXY', 'NO_PROXY', 'no_proxy'):
    os.environ[_proxy_var] = ""

## Elastic Search Indexing

In [23]:
# Create the Elasticsearch client used by the indexing cells below, pointed at ES_HOST:ES_PORT.
es = Elasticsearch([ {'host': ES_HOST, 'port': ES_PORT}])

### Select Dataset to index
In this case we will use reference_data.tsv as an example dataset to index

In [9]:
# Read the tab-separated reference corpus located under ROOT_DIR and preview it.
REFERENCE_DATA_PATH = os.path.join(ROOT_DIR, 'data/reference_data.tsv')
reference_copus_df = pd.read_csv(REFERENCE_DATA_PATH, sep='\t')
reference_copus_df

Unnamed: 0,name,id
0,14-3-3 protein beta/alpha,P31946
1,Protein 1054,P31946
2,Protein kinase C inhibitor protein 1,P31946
3,KCIP-1,P31946
4,"14-3-3 protein beta/alpha, N-terminally processed",P31946
...,...,...
124465,y(_)L-type amino acid transporter 2,Q92536
124466,y_ system cationic amino acid transporter,Q01650
124467,y_LAT-1,Q9UM01
124468,y_LAT-2,Q92536


In [26]:
def index_to_es_embedding(corpus_df, index='', model=None):
    """Index every row of ``corpus_df`` into Elasticsearch, optionally with an embedding.

    Each row becomes one document with the fields ``name`` (the ``name`` column,
    stripped of surrounding whitespace), ``qid`` (the ``id`` column) and, when
    ``model`` is truthy, ``word_embedding`` (``model.encode`` applied to the raw
    name). The DataFrame index label of the row is used as the document id.

    Args:
        corpus_df: DataFrame with at least the columns ``name`` and ``id``.
        index: Name of the Elasticsearch index to write to.
        model: Optional encoder exposing ``encode(text, device=...)``. When it is
            falsy, documents are indexed without an embedding.

    Returns:
        None. A failing ``es.create`` call is printed and the loop continues
        with the next row (best-effort indexing).
    """
    total = len(corpus_df)
    # Report progress roughly every 10%. Clamp to 1 so that frames with fewer
    # than ten rows do not cause a modulo-by-zero.
    report_every = max(1, total // 10)

    # Progress is tracked by row position rather than index label, so it also
    # works when the frame does not carry a default 0..n-1 index.
    for position, (idx, row) in enumerate(corpus_df.iterrows()):
        if position % report_every == 0 or position == total - 1:
            print(position * 100 / total)

        payload = {
            'name': row['name'].strip(),
            'qid': row['id'],
        }

        if model:
            word_emb = model.encode(row['name'], device=device)
            # Send a plain list so the payload is JSON-serialisable whatever
            # array type the encoder returns.
            payload['word_embedding'] = word_emb.tolist() if hasattr(word_emb, 'tolist') else word_emb

        try:
            es.create(index=index, body=payload, id=str(idx))
        except Exception as e:
            print(e)

### Create an index mapping for Elasticsearch
Before we start indexing with Elasticsearch, we have to define how we want to index our items first.

In [None]:
# Create the index with an explicit mapping: analysed text for `name`, an exact-match
# keyword for `qid`, and a dense vector field that stores the embedding of each name.
# `ignore=400` asks the client not to raise on an HTTP 400 response
# (presumably so that re-running this cell when the index already exists is harmless).
es.indices.create(
    index=EVAL_INDEX,
    body={
        "mappings": {
            "properties": {
                "name": {
                    "type": "text"
                },
                "qid": {
                    "type": "keyword"
                },
                "word_embedding": {
                    "type": "dense_vector",
                    # NOTE(review): this presumably has to equal the encoder's output size;
                    # the Annoy section of this notebook uses 768 — confirm before indexing.
                    "dims": 128 # 768 by default
                }
            }
        }
    },
    ignore=400
)

In [None]:
# Load the encoder, then index the whole reference corpus together with its embeddings.
# NOTE: `model_name` is a placeholder and must be set before this cell is run.
model_name = None # Please specify model you want to use here
model = get_model(model_name, device=device)
index_to_es_embedding(reference_copus_df, index=EVAL_INDEX, model=model)

## Query to Elasticsearch with our embedding

### A Function to Query ElasticSearch with Fuzzy Match

In [27]:
def query_elastic_search_fuzzy(keyword, **kwargs):
    """Search the ``name`` field of EVAL_INDEX with a fuzzy text match.

    Args:
        keyword: Text to search for.
        **kwargs: ``top_k`` (optional, default 10) is the maximum number of
            hits to return. Other keyword arguments are ignored.

    Returns:
        A list of ``(name, qid)`` tuples taken from the ``_source`` of each hit,
        at most ``top_k`` long. Any error while querying or parsing the response
        is printed and an empty list is returned.
    """
    # Resolve top_k before the payload is built: the payload reads it, and
    # resolving it afterwards made every call fail with UnboundLocalError.
    top_k = kwargs.get('top_k', 10)
    url = "http://{}:{}/{}/_search".format(ES_HOST, ES_PORT, EVAL_INDEX)
    payload = {
        "size": top_k,
        "query": {
            "match": {
                "name": {
                    "query": keyword,
                    "fuzziness": "auto"
                }
            }
        }
    }
    try:
        r = requests.get(url, json=payload)
        hits = r.json()['hits']['hits']
        return [(doc['_source']['name'], doc['_source']['qid']) for doc in hits[:top_k]]
    except Exception as e:
        print(e)
        return []

### A Function to Query Elasticsearch with ElasticSearch Vector Field (Linear Scan) 

In [28]:
def query_elastic_search_with_embed(keyword, **kwargs):
    """Score every document in EVAL_INDEX by cosine similarity to the embedded keyword.

    Args:
        keyword: Query text; it is converted with ``str`` before being encoded.
        **kwargs: ``model`` (required) is an encoder exposing
            ``encode(text, device=...)``; ``top_k`` (optional, default 10) is
            the maximum number of hits to return.

    Returns:
        A list of ``(name, qid)`` tuples, at most ``top_k`` long. Returns an
        empty list when no model is supplied, or when the request or response
        parsing fails (the error is printed).
    """
    if 'model' not in kwargs:
        print("No Model Found, returning without querying ES")
        return []

    encoder = kwargs['model']
    top_k = kwargs.get('top_k', 10)
    url = "http://{}:{}/{}/_search".format(ES_HOST, ES_PORT, EVAL_INDEX)
    query_vector = list(encoder.encode(str(keyword), device=device).astype(float))

    # "+ 1.0" is part of the scoring script sent to Elasticsearch.
    similarity_script = {
        "source": "cosineSimilarity(params.query_vector, 'word_embedding') + 1.0",
        "params": {"query_vector": query_vector},
    }
    payload = {
        "size": top_k,
        "query": {
            "script_score": {
                "query": {"match_all": {}},
                "script": similarity_script,
            },
        },
    }

    try:
        response = requests.get(url, json=payload)
        results = []
        for doc in response.json()['hits']['hits'][:top_k]:
            source = doc['_source']
            results.append((source['name'], source['qid']))
        return results
    except Exception as e:
        print(e)
        return []

### A Function to Query Elasticsearch with Vector and Fuzzy Match

This function first filters the search results with a fuzzy match and then re-ranks them using the embedding vectors.

In [29]:
def query_elastic_search_with_fuzzy_and_embed(keyword, **kwargs):
    """Fuzzy-match on ``name`` and score the matches by embedding cosine similarity.

    Args:
        keyword: Query text. The raw value is used for the fuzzy match and its
            ``str`` form is encoded for the vector score.
        **kwargs: ``model`` (required) is an encoder exposing
            ``encode(text, device=...)``; ``top_k`` (optional, default 10) is
            the maximum number of hits to return.

    Returns:
        A list of ``(name, qid)`` tuples, at most ``top_k`` long. Returns an
        empty list when no model is supplied, or when the request or response
        parsing fails (the error is printed).
    """
    if 'model' not in kwargs:
        print("No Model Found, returning without querying ES")
        return []

    encoder = kwargs['model']
    top_k = kwargs.get('top_k', 10)
    url = "http://{}:{}/{}/_search".format(ES_HOST, ES_PORT, EVAL_INDEX)
    query_vector = list(encoder.encode(str(keyword), device=device).astype(float))

    # The fuzzy match selects the candidate documents ...
    fuzzy_filter = {
        "match": {
            "name": {"query": keyword, "fuzziness": "auto"},
        },
    }
    # ... and the script scores each candidate against the query vector.
    similarity_script = {
        "source": "cosineSimilarity(params.query_vector, 'word_embedding') + 1.0",
        "params": {"query_vector": query_vector},
    }
    payload = {
        "size": top_k,
        "query": {
            "script_score": {
                "query": fuzzy_filter,
                "script": similarity_script,
            },
        },
    }

    try:
        response = requests.get(url, json=payload)
        results = []
        for doc in response.json()['hits']['hits'][:top_k]:
            source = doc['_source']
            results.append((source['name'], source['qid']))
        return results
    except Exception as e:
        print(e)
        return []

### Querying Example

In [None]:
# Lexical baseline: fuzzy text match on the `name` field only, no embeddings involved.
search_term = 'Protein kinase C'
query_elastic_search_fuzzy(search_term, top_k=10)

In [None]:
# Embedding-only search: all documents are scored by cosine similarity to the encoded query.
query_elastic_search_with_embed(search_term, model=model, top_k=10)

In [None]:
# Hybrid search: the fuzzy match selects candidates, which are then scored by embedding similarity.
query_elastic_search_with_fuzzy_and_embed(search_term, model=model, top_k=10)

## Annoy Indexing
To use Annoy as the index over the reference data, we first need to create the Annoy index file and can then issue queries against it.

In [27]:
import annoy

In [29]:
# Load the encoder and build the Annoy index over the reference data.
# NOTE: `model_name` is a placeholder and must be set before this cell is run.
# NOTE(review): both flags below are True, which by their names requests a fresh
# embedding file and a fresh index — confirm against AnnoyObjectWrapper.
model_name = None
model = get_model(model_name, device=device)
annoy_object_wrapper = AnnoyObjectWrapper(index_path='./protein-embedding-4096-trees.ann', 
                          embedding_path='./protein-768-embedding.pkl', 
                          reference_dataset_path=REFERENCE_DATA_PATH, 
                          name2id_path='./protein-name2id-embedding-size-1500000', 
                          model=model, n_trees=4096, embedding_size=768, max_corpus_size=1500000)
annoy_object_wrapper.create_embedding_and_index(create_new_embedding=True, create_new_index=True)

Load pre-computed embeddings from disc


In [7]:
def get_id2name(dataset_path):
    """Build a mapping from each reference id to the set of names recorded for it.

    Args:
        dataset_path: Path to a tab-separated file with ``id`` and ``name`` columns.

    Returns:
        A dict whose keys are the ids exactly as they appear in the file and
        whose values are sets of the whitespace-stripped names sharing that id.
        Rows containing any missing value are ignored.
    """
    all_name = pd.read_csv(dataset_path, delimiter='\t').dropna()

    id2name = {}
    for raw_id, raw_name in zip(all_name['id'], all_name['name']):
        # setdefault tests and stores under the same key. Testing with the
        # stripped id but storing under the raw one used to reset the set for
        # ids with surrounding whitespace, dropping earlier names.
        id2name.setdefault(raw_id, set()).add(raw_name.strip())

    return id2name


def get_name2id(dataset_path):
    """Build a mapping from each name to the set of reference ids it belongs to.

    Args:
        dataset_path: Path to a tab-separated file with ``id`` and ``name`` columns.

    Returns:
        A dict whose keys are the names exactly as they appear in the file and
        whose values are sets of the whitespace-stripped ids recorded for that
        name. Rows containing any missing value are ignored.
    """
    all_name = pd.read_csv(dataset_path, delimiter='\t').dropna()

    name2id = {}
    for raw_name, raw_id in zip(all_name['name'], all_name['id']):
        # setdefault tests and stores under the same key. Testing with the
        # stripped name but storing under the raw one used to reset the set for
        # names with surrounding whitespace, dropping earlier ids.
        name2id.setdefault(raw_name, set()).add(raw_id.strip())

    return name2id

In [10]:
# Build both lookup tables (id -> names, name -> ids) from the reference data file.
id2name = get_id2name(REFERENCE_DATA_PATH)
name2id = get_name2id(REFERENCE_DATA_PATH)

### An example of how to query with Annoy Indexing
To query the Annoy index, we need to supply an extra argument so that we can recover the actual ids of the reference data. Because Annoy uses its own internal item numbering, the values it returns cannot be used directly as reference data ids.

In [11]:
def generate_candidate_with_annoy(keyword, **kwargs):
    """Return ``(name, id)`` candidates for ``keyword`` using the Annoy index.

    Keyword Args:
        model: Encoder exposing ``encode(text, device=...)``.
        annoy_object_wrapper: Object exposing ``annoy_index`` and
            ``embedding_object.corpus_sentences``.
        top_k: Number of nearest neighbours to request from Annoy.
        name2id: Mapping from a corpus name to an iterable of reference ids.
        id2name: Accepted for backward compatibility; it is not used.

    Returns:
        A list of ``(name, id)`` tuples in the order Annoy returns the
        neighbours. A name mapped to several ids contributes one tuple per id,
        in sorted id order. Names missing from ``name2id`` contribute nothing.

    Raises:
        KeyError: If ``model``, ``annoy_object_wrapper``, ``top_k`` or
            ``name2id`` is not supplied.
    """
    model = kwargs['model']
    annoy_object_wrapper = kwargs['annoy_object_wrapper']
    top_k = kwargs['top_k']
    name2id = kwargs['name2id']

    corpus_sentences = annoy_object_wrapper.embedding_object.corpus_sentences
    annoy_index = annoy_object_wrapper.annoy_index

    query_embedding = model.encode(str(keyword), device=device)

    # Only the neighbour ids are needed: the similarity scores were never part
    # of the return value, so distances are no longer requested. This also
    # removes a stray timing call on a module the notebook never imported.
    found_corpus_ids = annoy_index.get_nns_by_vector(query_embedding, top_k)

    return_hits = []
    for corpus_id in found_corpus_ids:
        name = corpus_sentences[corpus_id]
        # Sort the ids so the output is deterministic when name2id holds sets.
        for _id in sorted(name2id.get(name, [])):
            return_hits.append((name, _id))

    return return_hits

In [None]:
# Query the Annoy index; `name2id` is what turns the returned corpus names into reference ids.
search_term = 'Protein Kinase C'
generate_candidate_with_annoy(search_term, model=model, top_k=10, annoy_object_wrapper=annoy_object_wrapper, id2name=id2name, name2id=name2id)