In [12]:
import requests
import pandas as pd

from analysis_funcs import settings, paths

import local_utils

In [6]:
# Locations for this sub-project's inputs and outputs.
# NOTE(review): assumes `paths.data_root` is a pathlib-style path (it is joined
# with `/` here, and `.exists()` / `.open()` are called on derived paths later).
data_dir = paths.data_root
SUB_PROJ_NAME = "mvp-ontology-terms-2023-03"
OUTPUT_DIR = data_dir / "output" / SUB_PROJ_NAME
# Asserting on the path object itself can never fail (a path is always truthy),
# so check that the directory is actually present on disk instead.
assert OUTPUT_DIR.is_dir(), OUTPUT_DIR

In [13]:
# Connectivity check: stop here if the Elasticsearch endpoint is not reachable.
ES_URL = settings.es_url
# Without a timeout `requests` may wait indefinitely on an unresponsive host,
# which would hang the notebook at this cell.
r = requests.get(ES_URL, timeout=10)
assert r.ok, ES_URL

In [19]:
def gen_target_vector_payload(target_id):
    """Build the search request body used to look up one entity.

    Args:
        target_id: Value placed in the `term` query on the `ent_id` field.

    Returns:
        A dict with a `query` section (the `term` filter) and a `_source`
        section limiting the returned fields to `ent_id`, `vector_term`
        and `vector`.
    """
    wanted_fields = ["ent_id", "vector_term", "vector"]
    id_filter = {"term": {"ent_id": target_id}}
    return {"query": id_filter, "_source": wanted_fields}

def get_target_vector(target_id: str, source: str):
    """Fetch the stored vector document for one entity from Elasticsearch.

    Args:
        target_id: Value matched against the `ent_id` field.
        source: Data source name; passed to
            `local_utils.get_es_index_for_source` to pick the index and
            echoed back in the result.

    Returns:
        The `_source` dict of the first search hit (fields `ent_id`,
        `vector_term`, `vector`, as requested by the payload) with an
        additional `"source"` key set to `source`.

    Raises:
        requests.HTTPError: If Elasticsearch responds with an error status.
        requests.Timeout: If the server does not answer within the timeout.
        IndexError: If the search returns no hit for `target_id`.
    """
    index_name = local_utils.get_es_index_for_source(source=source, trial=False)
    url = ES_URL + f"/{index_name}/_search"
    payload = gen_target_vector_payload(target_id=target_id)
    # A timeout keeps one stalled request from hanging a long batch of lookups.
    r = requests.get(url, json=payload, timeout=30)
    # Must be *called*: the bare attribute reference is a no-op, which let HTTP
    # errors slip through and surface later as a confusing KeyError on "hits".
    r.raise_for_status()
    results = r.json()["hits"]["hits"]
    if not results:
        # Same exception type as indexing the empty list, but with context.
        raise IndexError(
            f"no document with ent_id={target_id!r} in index {index_name!r}"
        )
    res = results[0]["_source"]
    res["source"] = source
    return res

In [10]:
# Read clean_terms.csv from OUTPUT_DIR into `clean_terms`.
path_terms_df = OUTPUT_DIR / "clean_terms.csv"
# Stop with the offending path in the message if the input file is missing.
assert path_terms_df.exists(), path_terms_df
clean_terms = pd.read_csv(path_terms_df)
# Print the schema summary, then display the frame as the cell's output.
clean_terms.info()
clean_terms

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70301 entries, 0 to 70300
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          70301 non-null  object
 1   term        70301 non-null  object
 2   source      70301 non-null  object
 3   term_clean  70301 non-null  object
dtypes: object(4)
memory usage: 2.1+ MB


Unnamed: 0,id,term,source,term_clean
0,Safety:1:MONDO_0020460,acquired von willebrands disease,Safety,acquired von willebrands disease
1,Safety:1:MONDO_0015129,addisons disease,Safety,addisons disease
2,Safety:1:MONDO_0008762,alports syndrome,Safety,alports syndrome
3,Safety:1:MONDO_0010520,alports syndrome,Safety,alports syndrome
4,Safety:1:MONDO_0018965,alports syndrome,Safety,alports syndrome
...,...,...,...,...
70296,GWAS_Catalog:11:EFO_0006510,zoster infection,GWAS_Catalog,zoster infection
70297,OMIM:1:MONDO_0014936,zttk syndrome,OMIM,zttk syndrome
70298,Clinvar:1:MONDO_0014936,zttk syndrome,Clinvar,zttk syndrome
70299,Knockout_models:1:MGI_knockout_9748,zygomatic arch hypoplasia,Knockout_models,zygomatic arch hypoplasia


In [11]:
# Keep only the terms whose cleaned text mentions "coronary".
# `.str.contains(..., regex=False)` is the vectorised literal-substring test;
# `na=False` treats a missing term as "no match" instead of raising the
# TypeError that `"coronary" in x` would give for a non-string value.
subset_df = clean_terms[
    clean_terms["term_clean"].str.contains("coronary", regex=False, na=False)
].reset_index(drop=True)
subset_df.info()
subset_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350 entries, 0 to 349
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          350 non-null    object
 1   term        350 non-null    object
 2   source      350 non-null    object
 3   term_clean  350 non-null    object
dtypes: object(4)
memory usage: 11.1+ KB


Unnamed: 0,id,term,source,term_clean
0,GWAS_Catalog:3:EFO_0021543,1-methylhistidine levels in coronary artery di...,GWAS_Catalog,1-methylhistidine levels in coronary artery di...
1,GWAS_Catalog:4:EFO_0021543,1-methylhistidine levels in coronary artery di...,GWAS_Catalog,1-methylhistidine levels in coronary artery di...
2,GWAS_Catalog:3:EFO_0021174,1-methylxanthine levels in coronary artery dis...,GWAS_Catalog,1-methylxanthine levels in coronary artery dis...
3,GWAS_Catalog:4:EFO_0021174,1-methylxanthine levels in coronary artery dis...,GWAS_Catalog,1-methylxanthine levels in coronary artery dis...
4,GWAS_Catalog:1:EFO_0021552,2-methylsuccinic acid levels in coronary arter...,GWAS_Catalog,2-methylsuccinic acid levels in coronary arter...
...,...,...,...,...
345,GWAS_Catalog:18:EFO_0004761,uric acid levels in coronary artery disease,GWAS_Catalog,uric acid levels in coronary artery disease
346,GWAS_Catalog:3:EFO_0010546,uridine levels in coronary artery disease,GWAS_Catalog,uridine levels in coronary artery disease
347,GWAS_Catalog:2:EFO_0021676,urocanic acid levels in coronary artery disease,GWAS_Catalog,urocanic acid levels in coronary artery disease
348,GWAS_Catalog:1:EFO_0007051,vein graft stenosis in coronary artery bypass ...,GWAS_Catalog,vein graft stenosis in coronary artery bypass ...


In [17]:
# Example target id: look up the vector document for a single entity to check
# that get_target_vector works before it is called for every row of subset_df.

TARGET_ID = "GWAS_Catalog:3:EFO_0021543"
SOURCE = "GWAS_Catalog"

res = get_target_vector(target_id=TARGET_ID, source=SOURCE)
# NOTE(review): displaying `res` prints the full `vector` list, which makes for
# a very long cell output; consider showing only a slice of it.
res

{'ent_id': 'GWAS_Catalog:3:EFO_0021543',
 'vector_term': '1-methylhistidine levels in coronary artery disease',
 'vector': [0.15066923201084137,
  0.16206885874271393,
  0.0523233562707901,
  -0.07461526989936829,
  0.09177636355161667,
  -0.11675640940666199,
  -0.026920342817902565,
  0.07486748695373535,
  0.006536351982504129,
  -0.03451351821422577,
  -0.22848261892795563,
  -0.02418621815741062,
  0.2058793306350708,
  -0.07559361308813095,
  -0.018484430387616158,
  -0.08090201765298843,
  -0.07190059870481491,
  0.07653684169054031,
  0.04173903167247772,
  0.14553700387477875,
  0.11827245354652405,
  -0.0004900693893432617,
  0.13092118501663208,
  -0.007489947136491537,
  0.04157186672091484,
  -0.0568133182823658,
  0.1624237447977066,
  0.11777066439390182,
  0.06510361284017563,
  -0.01651606522500515,
  -0.007963019423186779,
  -0.16137419641017914,
  0.030894190073013306,
  -0.2135588526725769,
  0.08811839669942856,
  -0.13688649237155914,
  0.2201194018125534,
  0.154

In [20]:
# Look up the vector document for every row of subset_df (one request per row)
# and collect the returned dicts into a single frame, preserving row order.
vector_records = [
    get_target_vector(target_id=ent_id, source=ent_source)
    for ent_id, ent_source in zip(subset_df["id"], subset_df["source"])
]
vector_info = pd.DataFrame(vector_records)
vector_info.info()
vector_info

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350 entries, 0 to 349
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ent_id       350 non-null    object
 1   vector_term  350 non-null    object
 2   vector       350 non-null    object
 3   source       350 non-null    object
dtypes: object(4)
memory usage: 11.1+ KB


Unnamed: 0,ent_id,vector_term,vector,source
0,GWAS_Catalog:3:EFO_0021543,1-methylhistidine levels in coronary artery di...,"[0.15066923201084137, 0.16206885874271393, 0.0...",GWAS_Catalog
1,GWAS_Catalog:4:EFO_0021543,1-methylhistidine levels in coronary artery di...,"[0.15066923201084137, 0.16206885874271393, 0.0...",GWAS_Catalog
2,GWAS_Catalog:3:EFO_0021174,1-methylxanthine levels in coronary artery dis...,"[0.05016488954424858, 0.11256597191095352, 0.0...",GWAS_Catalog
3,GWAS_Catalog:4:EFO_0021174,1-methylxanthine levels in coronary artery dis...,"[0.05016488954424858, 0.11256597191095352, 0.0...",GWAS_Catalog
4,GWAS_Catalog:1:EFO_0021552,2-methylsuccinic acid levels in coronary arter...,"[0.09945414215326309, 0.12764762341976166, 0.0...",GWAS_Catalog
...,...,...,...,...
345,GWAS_Catalog:18:EFO_0004761,uric acid levels in coronary artery disease,"[0.08915071189403534, 0.14681017398834229, 0.0...",GWAS_Catalog
346,GWAS_Catalog:3:EFO_0010546,uridine levels in coronary artery disease,"[0.10234122723340988, 0.12592585384845734, 0.0...",GWAS_Catalog
347,GWAS_Catalog:2:EFO_0021676,urocanic acid levels in coronary artery disease,"[0.05687214806675911, 0.1460123062133789, 0.01...",GWAS_Catalog
348,GWAS_Catalog:1:EFO_0007051,vein graft stenosis in coronary artery bypass ...,"[0.11790578067302704, 0.24837318062782288, -0....",GWAS_Catalog


In [24]:
def format_vector(vector):
    """Render a vector as a single tab-separated line.

    Each element is converted with `str` and the pieces are joined with a
    tab character; no trailing newline is added.
    """
    return "\t".join(map(str, vector))

# One tab-separated line per row of vector_info, in row order.
vector_data = list(map(format_vector, vector_info["vector"].tolist()))

In [None]:
# Write the formatted vectors to a TSV file, one vector per line.
path_vector_output = OUTPUT_DIR / "example_vector.tsv"
with path_vector_output.open("w") as f:
    # Use a generator with a named variable rather than a cell-level
    # `for _ in ...` loop: in IPython `_` holds the last cell output, and
    # rebinding it at the top level of a cell shadows that for later cells.
    f.writelines(line + "\n" for line in vector_data)

In [36]:
# Metadata table for the vectors: the term text (exposed as `label`) and the
# data source, with rows in the same order as vector_info.
vector_metadata = (
    vector_info[["vector_term", "source"]]
    .rename(columns={"vector_term": "label"})
)

# Tab-separated, header row included, no index column.
path_metadata_output = OUTPUT_DIR / "example_vector_metadata.tsv"
vector_metadata.to_csv(path_metadata_output, index=False, sep="\t")