In [37]:
from typing import List, Optional

import pandas as pd
import numpy as np

import sent2vec
from gensim.models import KeyedVectors
from scipy.spatial import distance

from icecream import ic

from funcs import utils, paths

In [4]:
# Resolve the project and data roots through the project-local helpers in funcs.utils.
proj_root = utils.find_project_root()
data_root = utils.find_data_root()

# Directory holding the PubTator outputs that the cells below read from.
pubtator_dir = data_root / "output" / "pubtator"
# Fail early if the directory is missing instead of erroring later on file access.
assert pubtator_dir.exists()

In [8]:
session_df_path = pubtator_dir / "session_df_sanitized.csv"

# Load the session table and flag, per session id, whether its result file
# <pubtator_dir>/batch_requests/<session_id>.txt is present on disk.
session_df = pd.read_csv(session_df_path).assign(
    exists=lambda frame: frame["session_id"].apply(
        lambda sid: (pubtator_dir / "batch_requests" / f"{sid}.txt").exists()
    )
)

session_df

Unnamed: 0,batch,term,session_id,exists
0,ebi,gonarthrosis,7562-9938-5522-3240,True
1,ebi,psoriatic and enteropathic arthropathies,9489-1267-1820-7640,True
2,ebi,pain associated with micturition,2941-3822-8420-2463,True
3,ebi,other mood,2730-8351-2793-7515,True
4,ebi,preterm delivery,9914-5677-7332-5034,True
...,...,...,...,...
26443,efo,acetazolamide responsive myotonia,1871-7615-4998-3292,False
26444,efo,complete androgen insensitivity syndrome,6838-6592-5630-1511,False
26445,efo,intermediate dend syndrome,8972-6191-8547-6869,False
26446,efo,epiblepharon,5678-3170-7573-4996,False


In [10]:
# Per batch: number of sessions, and how many of them have a result file on disk.
for batch in session_df["batch"].unique():
    in_batch = session_df["batch"] == batch
    df = session_df.loc[in_batch]
    print(batch, len(df), df["exists"].sum())

ebi 1191 1112
efo 25257 9665


In [14]:
def read_session_res(session_id: str) -> Optional[str]:
    """Read the raw result text stored for one session.

    Args:
        session_id: Session identifier; the result is looked up at
            ``pubtator_dir / "batch_requests" / f"{session_id}.txt"``.

    Returns:
        The full file content as a string, or ``None`` when the file does
        not exist.
    """
    file_path = pubtator_dir / "batch_requests" / f"{session_id}.txt"
    # Guard clause: a missing file is reported as None rather than raising.
    if not file_path.exists():
        return None
    with file_path.open("r") as f:
        return f.read()
        
def verify_ner(session_res: Optional[str]) -> Optional[List[str]]:
    """Extract the annotation lines from a raw session result.

    The text is stripped and split on newlines; everything after the first
    two lines is treated as annotation lines.
    NOTE(review): the first two lines are presumably the title/abstract
    header of the PubTator response — confirm against the file format.

    Args:
        session_res: Raw result text, or ``None`` when no result was read
            (e.g. ``read_session_res`` found no file).

    Returns:
        The list of lines following the first two, or ``None`` when the
        input is ``None`` or contains no such lines.
    """
    # Tolerate a missing result so chaining after read_session_res cannot
    # fail with AttributeError on None.
    if session_res is None:
        return None
    annotation_lines = session_res.strip().split("\n")[2:]
    # An empty remainder means nothing was annotated; report it as None so
    # that a later dropna() removes the row.
    return annotation_lines if annotation_lines else None
        
# Keep only sessions whose result file exists, read each file and keep the
# annotation lines; rows without any annotation line become NaN and are dropped.
# NOTE(review): dropna() without `subset` also drops rows that have a missing
# value in any other column (e.g. `term`) — confirm that this is intended.
session_res_df = (
    session_df[session_df["exists"]]
    .assign(
        session_res=lambda df: df["session_id"].apply(read_session_res).apply(verify_ner)
    )
    .dropna()
    .reset_index(drop=True)
)

# DataFrame.info() writes its report itself and returns None, so it must not be
# wrapped in print() (that appended a stray "None" line to the output).
session_res_df.info()
session_res_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   batch        1338 non-null   object
 1   term         1338 non-null   object
 2   session_id   1338 non-null   object
 3   exists       1338 non-null   bool  
 4   session_res  1338 non-null   object
dtypes: bool(1), object(4)
memory usage: 43.2+ KB
None


Unnamed: 0,batch,term,session_id,exists,session_res
0,ebi,gonarthrosis,7562-9938-5522-3240,True,[00000\t0\t12\tgonarthrosis\tDisease]
1,ebi,psoriatic and enteropathic arthropathies,9489-1267-1820-7640,True,[00000\t14\t40\tenteropathic arthropathies\tDi...
2,ebi,pain associated with micturition,2941-3822-8420-2463,True,[00000\t0\t4\tpain\tDisease\tMESH:D010146]
3,ebi,gastritis and duodenitis,3350-1413-5269-5786,True,[00000\t0\t9\tgastritis\tDisease\tMESH:D005756...
4,ebi,pre existing hypertension complicating pregnan...,8353-1791-1234-4659,True,[00000\t13\t25\thypertension\tDisease\tMESH:D0...
...,...,...,...,...,...
1333,efo,severe intellectual disability epilepsy an...,4488-8842-4877-4241,True,[00000\t20\t89\tdisability epilepsy anal a...
1334,efo,achalasia microcephaly,9148-6348-6604-6907,True,[00000\t0\t24\tachalasia microcephaly\tDisea...
1335,efo,hypocalcemic vitamin d resistant rickets,6147-9712-7696-3318,True,[00000\t13\t20\tvitamin\tChemical\tMESH:D014807]
1336,efo,7q11 23 microduplication syndrome,5225-3416-3696-8566,True,[00000\t154\t160\tStatus\tDisease\tMESH:D013226]


In [24]:
def ner_res_to_ent_id(ner_res: str) -> Optional[str]:
    """Convert one tab-separated annotation line into an entity identifier.

    The line is split on tabs; field 4 is taken as the entity type and
    field 5 as the entity id. The identifier is ``"<type>_<id>"`` with every
    ``":"`` in the id replaced by ``"_"``, e.g.
    ``"...\\tDisease\\tMESH:D010146"`` -> ``"Disease_MESH_D010146"``.

    Args:
        ner_res: A single annotation line.

    Returns:
        The identifier string, or ``None`` when the line has fewer than six
        fields or when the type or id field is empty.
    """
    ent_type_idx = 4
    ent_idx = 5
    expect_res_len = 6

    fields = ner_res.split("\t")
    # Lines without an id column cannot be mapped to an identifier.
    if len(fields) < expect_res_len:
        return None

    ent_type = fields[ent_type_idx]
    ent = fields[ent_idx]
    if not ent_type or not ent:
        return None

    return "{ent_type}_{ent}".format(ent_type=ent_type, ent=ent.replace(":", "_"))

# Map every annotation line to an entity identifier, discard lines that could
# not be mapped, and drop sessions left without any identifier (an empty list
# is turned into None so that dropna() removes the row).
session_res_df = session_res_df.assign(
    ner_res=lambda df: df["session_res"].apply(
        lambda lines: [
            ent_id
            for ent_id in (ner_res_to_ent_id(line) for line in lines)
            if ent_id is not None
        ] or None
    )
).dropna().reset_index(drop=True)

# DataFrame.info() writes its report itself and returns None, so it must not be
# wrapped in print() (that appended a stray "None" line to the output).
session_res_df.info()
session_res_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1295 entries, 0 to 1294
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   batch        1295 non-null   object
 1   term         1295 non-null   object
 2   session_id   1295 non-null   object
 3   exists       1295 non-null   bool  
 4   session_res  1295 non-null   object
 5   ner_res      1295 non-null   object
dtypes: bool(1), object(5)
memory usage: 52.0+ KB
None


Unnamed: 0,batch,term,session_id,exists,session_res,ner_res
0,ebi,psoriatic and enteropathic arthropathies,9489-1267-1820-7640,True,[00000\t14\t40\tenteropathic arthropathies\tDi...,[Disease_MESH_D001177]
1,ebi,pain associated with micturition,2941-3822-8420-2463,True,[00000\t0\t4\tpain\tDisease\tMESH:D010146],[Disease_MESH_D010146]
2,ebi,gastritis and duodenitis,3350-1413-5269-5786,True,[00000\t0\t9\tgastritis\tDisease\tMESH:D005756...,"[Disease_MESH_D005756, Disease_MESH_D004382]"
3,ebi,pre existing hypertension complicating pregnan...,8353-1791-1234-4659,True,[00000\t13\t25\thypertension\tDisease\tMESH:D0...,[Disease_MESH_D006973]
4,ebi,pre existing hypertensive disorder with superi...,7755-9327-8596-7950,True,[00000\t13\t34\thypertensive disorder\tDisease...,"[Disease_MESH_D006973, Disease_MESH_D011507]"
...,...,...,...,...,...,...
1290,efo,severe intellectual disability epilepsy an...,4488-8842-4877-4241,True,[00000\t20\t89\tdisability epilepsy anal a...,[Disease_MESH_C537766]
1291,efo,achalasia microcephaly,9148-6348-6604-6907,True,[00000\t0\t24\tachalasia microcephaly\tDisea...,[Disease_MESH_C536010]
1292,efo,hypocalcemic vitamin d resistant rickets,6147-9712-7696-3318,True,[00000\t13\t20\tvitamin\tChemical\tMESH:D014807],[Chemical_MESH_D014807]
1293,efo,7q11 23 microduplication syndrome,5225-3416-3696-8566,True,[00000\t154\t160\tStatus\tDisease\tMESH:D013226],[Disease_MESH_D013226]


In [40]:
# Positional row of session_res_df used as the worked example in the cells below.
IDX = 4

In [42]:
# Pull the example row once and unpack the pieces used by the following cells.
example_row = session_res_df.iloc[IDX]

term = example_row["term"]
ic(term)

# Split every annotation line into its tab-separated fields for inspection.
session_res = [line.split("\t") for line in example_row["session_res"]]
ic(session_res)

ner_res = example_row["ner_res"]
ic(ner_res)

2022-05-26 13:40:14.604917 |> term: 'pre existing hypertensive disorder with superimposed proteinuria'
2022-05-26 13:40:14.632886 |> session_res: [['00000', '13', '34', 'hypertensive disorder', 'Disease', 'MESH:D006973'],
                                            ['00000', '53', '64', 'proteinuria', 'Disease', 'MESH:D011507']]
2022-05-26 13:40:14.678557 |> ner_res: ['Disease_MESH_D006973', 'Disease_MESH_D011507']


['Disease_MESH_D006973', 'Disease_MESH_D011507']

In [28]:
# Location of the BioConceptVec word2vec binary inside the project tree.
bioconceptvec_dir = proj_root / "models" / "bioconceptvec"
bioconceptvec_model_path = bioconceptvec_dir / "bioconceptvec_word2vec_skipgram.bin"
assert bioconceptvec_model_path.exists()

# Load the vectors in word2vec binary format.
bioconceptvec_embeddings = KeyedVectors.load_word2vec_format(
    str(bioconceptvec_model_path),
    binary=True,
)

In [29]:
# Path of the BioSentVec model, looked up in the project-local paths.init mapping.
biosentvec_model_path = paths.init["biosentvec_model"]
# Fail early if the model file is missing.
assert biosentvec_model_path.exists()

# Create an empty sent2vec model, then load the model file from disk into it.
biosentvec_model = sent2vec.Sent2vecModel()
biosentvec_model.load_model(str(biosentvec_model_path))

In [43]:
# Sentence embedding of the example term.
biosentvec_vector = biosentvec_model.embed_sentence(term)

# Concept embeddings for the example's entity ids; ids that are not in the
# BioConceptVec vocabulary are skipped.
bioconceptvec_vectors = [
    bioconceptvec_embeddings[ent_id]
    for ent_id in ner_res
    if ent_id in bioconceptvec_embeddings.key_to_index
]

print(biosentvec_vector.shape)
# The list is empty when none of the ids is in the vocabulary; indexing [0]
# unconditionally would raise IndexError in that case.
if bioconceptvec_vectors:
    print(bioconceptvec_vectors[0].shape)
else:
    print(f"No BioConceptVec embedding found for any of {ner_res}")

(1, 700)
(100,)


In [34]:
def harmonize_vectors(main_vector: np.ndarray, addons: List[np.ndarray]) -> np.ndarray:
    """Add zero-padded addon vectors onto a main vector.

    Each addon is flattened, zero-padded on both sides up to the number of
    elements of ``main_vector`` (centred; when the size gap is odd the extra
    zero goes at the end), reshaped to ``main_vector.shape`` and added
    element-wise. For a ``(1, 700)`` main vector and ``(100,)`` addons this
    pads 300 zeros on each side.

    NOTE(review): the addon values land in the middle positions of the main
    vector; whether summing vectors from two different embedding models this
    way is meaningful should be confirmed.

    Args:
        main_vector: Base vector of any shape.
        addons: Vectors with at most as many elements as ``main_vector``.

    Returns:
        ``main_vector`` plus all padded addons, with ``main_vector``'s shape.
        When ``addons`` is empty, ``main_vector`` itself is returned.

    Raises:
        ValueError: If an addon has more elements than ``main_vector``.
    """
    main_vector = np.asarray(main_vector)
    target_size = main_vector.size

    res_vector = main_vector
    for addon in addons:
        addon_flat = np.asarray(addon).reshape(-1)
        size_gap = target_size - addon_flat.size
        if size_gap < 0:
            raise ValueError(
                f"addon has {addon_flat.size} elements, more than the "
                f"{target_size} elements of main_vector"
            )
        # Split the padding explicitly so that an odd gap still produces
        # exactly target_size elements.
        pad_before = size_gap // 2
        pad_after = size_gap - pad_before
        padded = np.pad(
            addon_flat, (pad_before, pad_after), mode="constant", constant_values=0
        )
        res_vector = res_vector + padded.reshape(main_vector.shape)
    return res_vector

In [44]:
# Combine the sentence embedding with the zero-padded concept embeddings.
biosentvec_bioconceptvec_vector = harmonize_vectors(
    biosentvec_vector, bioconceptvec_vectors
)
print(biosentvec_bioconceptvec_vector.shape)

(1, 700)


In [45]:
# Cosine similarity between the plain sentence embedding and the combined one.
# Both arrays have shape (1, 700); scipy's distance.cosine expects 1-D vectors
# (recent SciPy versions raise ValueError otherwise), so flatten them first.
1 - distance.cosine(
    biosentvec_vector.ravel(),
    biosentvec_bioconceptvec_vector.ravel(),
)

0.6307975649833679