```
# biosentvec-bioconceptvec

For a term label of interest
- use biosentvec to obtain sentence embedding
- use bioconceptvec to obtain entity embedding of mentioned entities
  - use pubtator to perform NER on the term label => obtain the entity terms and their ids
  - get entity embedding from bioconceptvec
- harmonize sentence vector and entity vector(s): zero-pad the shorter entity vector on both sides to match the shape of the sentence vector, then do a simple element-wise sum
- => get the biosentvec-bioconceptvec vector for the term label
```

In [43]:
# Term label for which the combined biosentvec-bioconceptvec vector is built.
EFO_TERM = "genetic eye tumor"

# Second term label; compared against EFO_TERM via cosine similarity further down.
QUERY_TERM = "eye tumor"

# Entity terms; their bioconceptvec ids are ENT_0_ID / ENT_1_ID.
ENT_0 = "tumor"
ENT_1 = "genetic diseases"

In [48]:
# bioconceptvec key for ENT_0; MESH:D009369 is the id tagged for "tumor" in the sample vocab.
ENT_0_ID = "Disease_MESH_D009369" # NER results from pubtator, or from the bioconceptvec vocab

In [49]:
# bioconceptvec key used for ENT_1 -- presumably the MeSH id for "genetic diseases"; TODO confirm against pubtator.
ENT_1_ID = "Disease_MESH_D030342"

---

In [120]:
import requests
from pprint import pprint
from typing import List

import sent2vec
from scipy.spatial import distance
from gensim.models import KeyedVectors
import pandas as pd
import numpy as np

from funcs import utils, paths

In [45]:
# Resolve the project/data roots and make sure the bioconceptvec data is there.
proj_root = utils.find_project_root()
data_root = utils.find_data_root()
bioconceptvec_dir = data_root / "bioconceptvec"
assert bioconceptvec_dir.exists()

sample_path = bioconceptvec_dir / "bioconcepts2pubtatorcentral.sample"
assert sample_path.exists()

# Tab-separated sample with explicitly supplied column names; rows with any
# missing field are dropped and the index is renumbered afterwards.
sample_df = (
    pd.read_csv(
        sample_path,
        sep="\t",
        names=["idx", "ent_type", "ent_id", "term", "source"],
    )
    .dropna()
    .reset_index(drop=True)
)

sample_df

Unnamed: 0,idx,ent_type,ent_id,term,source
0,3958000,Species,9606,patients,SR4GN
1,3958000,Disease,MESH:D012544,Scheuermann's kyphosis,TaggerOne
2,23574000,Chemical,MESH:D008012,Astrazeneca,TaggerOne
3,23574000,Species,9615,dogs,SR4GN
4,23574000,Disease,MESH:D007153,antibody omalizumab,TaggerOne
...,...,...,...,...,...
802,20635000,Mutation,rs779184767,C243A,tmVar
803,20635000,Gene,155030,Gag,GNormPlus
804,20635000,Gene,57379,AID,GNormPlus
805,20635000,Chemical,MESH:D009584,N,TaggerOne


----

In [22]:
# Path of the biosentvec model file, looked up in the project's `paths.init` mapping.
biosentvec_model_path = paths.init["biosentvec_model"]
assert biosentvec_model_path.exists()

# Instantiate a sent2vec model and load the weights; the path is passed as a string.
biosentvec_model = sent2vec.Sent2vecModel()
biosentvec_model.load_model(str(biosentvec_model_path))

In [80]:
# Sentence embeddings of the two term labels (the printed shape is (1, 700)).
efo_term_biosentvec = biosentvec_model.embed_sentence(EFO_TERM)
print(efo_term_biosentvec.shape)

query_term_biosentvec = biosentvec_model.embed_sentence(QUERY_TERM)

# Cosine similarity = 1 - cosine distance. scipy's distance functions require
# 1-D vectors (recent SciPy releases reject 2-D input), so flatten the
# (1, dim) embeddings before comparing them.
print(1 - distance.cosine(efo_term_biosentvec.ravel(), query_term_biosentvec.ravel()))

(1, 700)
0.8159835934638977


----

In [46]:
# vocab: rows of the sample whose term text contains ENT_0 as a substring
sample_df.loc[sample_df["term"].map(lambda term: ENT_0 in term)]

Unnamed: 0,idx,ent_type,ent_id,term,source
59,17505000,Disease,MESH:D018358,malignant neuroendocrine tumors|neuroendocrine...,TaggerOne|MESH
61,17505000,Disease,MESH:D009369,Tumors|tumor|tumors,TaggerOne
64,17505000,Disease,MESH:D010190,pancreatic endocrine tumors,TaggerOne
86,20337000,Disease,MESH:D009369,tumor,TaggerOne
375,7444000,Disease,MESH:D009369,tumor,TaggerOne
448,9191000,Disease,MESH:D009369,tumors,TaggerOne
693,27992000,Disease,MESH:D009369,tumor,TaggerOne
696,27992000,Disease,MESH:D018302,dysembryoplastic neuroepithelial tumors,TaggerOne
697,27992000,Disease,MESH:D001932,brain tumors|brain tumor,TaggerOne
706,22884000,Gene,7124,tumor necrosis factor a|TNF-a,GNormPlus


----

In [38]:
# Location of the bioconceptvec word2vec skip-gram binary under the project root.
bioconceptvec_model_path = proj_root / "models" / "bioconceptvec" / "bioconceptvec_word2vec_skipgram.bin"
assert bioconceptvec_model_path.exists()

# Load the embeddings as gensim KeyedVectors from the binary word2vec format.
bioconceptvec_embeddings = KeyedVectors.load_word2vec_format(str(bioconceptvec_model_path), binary=True)

In [50]:
# Both entity ids must exist in the bioconceptvec vocabulary before lookup.
assert ENT_0_ID in bioconceptvec_embeddings.key_to_index
assert ENT_1_ID in bioconceptvec_embeddings.key_to_index

In [51]:
# Look up the entity embeddings by their bioconceptvec ids.
ent_0_embedding = bioconceptvec_embeddings[ENT_0_ID]
ent_1_embedding = bioconceptvec_embeddings[ENT_1_ID]
# Show the embedding shapes, which are smaller than the (1, 700) sentence embedding.
print(ent_0_embedding.shape)
print(ent_1_embedding.shape)

(100,)
(100,)


----

In [109]:
# Baseline: cosine similarity of the two plain biosentvec embeddings.
# scipy's cosine distance requires 1-D vectors, so flatten the (1, dim) arrays.
1 - distance.cosine(efo_term_biosentvec.ravel(), query_term_biosentvec.ravel())

0.8159835934638977

In [129]:
def harmonize_vectors(main_vector: np.ndarray, addons: List[np.ndarray]) -> np.ndarray:
    """Add shorter add-on vectors to a main vector after zero-padding them.

    Each add-on is flattened and zero-padded on both sides so that it sits in
    the middle of a vector with as many elements as ``main_vector``. When the
    size difference is odd, the extra zero goes at the end. The padded add-on
    is then reshaped to ``main_vector``'s shape and summed element-wise.

    Args:
        main_vector: Reference vector, e.g. a ``(1, 700)`` sentence embedding.
        addons: Vectors with at most ``main_vector.size`` elements each,
            e.g. ``(100,)`` entity embeddings. May be empty.

    Returns:
        An array with the shape of ``main_vector`` holding the element-wise
        sum; ``main_vector`` itself when ``addons`` is empty.

    Raises:
        ValueError: If an add-on has more elements than ``main_vector``.
    """
    main_vector = np.asarray(main_vector)
    target_shape = main_vector.shape
    target_size = main_vector.size

    res_vector = main_vector
    for addon in addons:
        addon_flat = np.ravel(addon)
        size_gap = target_size - addon_flat.size
        if size_gap < 0:
            raise ValueError(
                f"addon with {addon_flat.size} elements is larger than "
                f"main_vector with {target_size} elements"
            )
        # Split the padding as evenly as possible around the add-on.
        pad_before = size_gap // 2
        pad_after = size_gap - pad_before
        addon_padded = np.pad(
            addon_flat, (pad_before, pad_after), mode="constant", constant_values=0
        )
        # Out-of-place sum so the caller's main_vector is never mutated.
        res_vector = res_vector + addon_padded.reshape(target_shape)
    return res_vector

In [132]:
# Combine the EFO term's sentence embedding with both entity embeddings.
efo_term_bioconceptvec = harmonize_vectors(efo_term_biosentvec, [ent_0_embedding, ent_1_embedding])
print(efo_term_bioconceptvec.shape)

# The query term is combined with the ENT_0 embedding only.
query_term_bioconceptvec = harmonize_vectors(query_term_biosentvec, [ent_0_embedding])

(1, 700)


In [133]:
# Cosine similarity of the two combined (sentence + entity) vectors.
# scipy's cosine distance requires 1-D vectors, so flatten the (1, dim) arrays.
1 - distance.cosine(efo_term_bioconceptvec.ravel(), query_term_bioconceptvec.ravel())

0.7836057543754578

In [134]:
# How far adding the entity vectors moved the EFO term from its plain sentence embedding.
# scipy's cosine distance requires 1-D vectors, so flatten the (1, dim) arrays.
1 - distance.cosine(efo_term_bioconceptvec.ravel(), efo_term_biosentvec.ravel())

0.8426003456115723

In [135]:
# How far adding the entity vector moved the query term from its plain sentence embedding.
# scipy's cosine distance requires 1-D vectors, so flatten the (1, dim) arrays.
1 - distance.cosine(query_term_bioconceptvec.ravel(), query_term_biosentvec.ravel())

0.9586260914802551