In [1]:
%load_ext autoreload
%load_ext lab_black

%autoreload 2

In [122]:
import json
from typing import List, Dict, Any, Union

import requests
import numpy as np
import pandas as pd
from pydash import py_
from pydash.objects import assign
import altair as alt
import seaborn as sns
import matplotlib.pyplot as plt

from analysis import utils
from analysis.settings import config
from analysis.funcs.generic import interval_str
from common_processing.resources import EVIDENCE_LABELS, epigraphdb

----

# Setup params

In [3]:
# Interval of the experiment run to analyse, written as "start/end".
INTERVAL = "2020-01-01/2021-12-31"
# Interval rendered for use as a directory name (see interval_str in
# analysis.funcs.generic for the exact format).
interval_fmt = interval_str(INTERVAL)
DATA_ROOT = utils.find_data_root()

# key dirs
# Input directory for this interval; it must already exist, so fail fast here
# rather than in a later cell.
experiment_data_dir = DATA_ROOT / "medrxiv_experiments" / interval_fmt / "default"
assert experiment_data_dir.exists()
print(f"{experiment_data_dir=}")

# Output directory for this analysis; created on first run if missing.
analysis_dir = DATA_ROOT / "analysis"
analysis_dir.mkdir(exist_ok=True)
print(f"{analysis_dir=}")

# Artifacts directory resolved by the project helper; the assert only checks
# that the helper returned a truthy value.
analysis_assets_dir = utils.find_analysis_artifacts_dir()
assert analysis_assets_dir
print(f"{analysis_assets_dir=}")

experiment_data_dir=PosixPath('/data/ik18445_cache/projects/epigraphdb-asq/data/medrxiv_experiments/2020-01-01__2021-12-31/default')
analysis_dir=PosixPath('/data/ik18445_cache/projects/epigraphdb-asq/data/analysis')
analysis_assets_dir=PosixPath('/data/ik18445_cache/projects/epigraphdb-asq/data/analysis-artifacts')


In [4]:
# Display the loaded settings object so the configuration used for this run
# is recorded alongside the results.
config

Config(semrep_api_url='http://localhost:8067', melodi_presto_api_url='https://melodi-presto.mrcieu.ac.uk/api', medline_api_url='http://ieu-db-interface.epi.bris.ac.uk:6451', epigraphdb_api_url='http://ieu-mrbssd1.epi.bris.ac.uk:28046', epigraphdb_web_backend_url='http://ieu-mrbssd1.epi.bris.ac.uk:28050', epigraphdb_neural_url='http://ieu-mrbssd1.epi.bris.ac.uk:28015', neural_transformers_url='http://ieu-mrbssd1.epi.bris.ac.uk:8017', neural_models_url='http://ieu-mrbssd1.epi.bris.ac.uk:8016', epigraphdb_es_url='http://ieu-mrbssd1.epi.bris.ac.uk:26550', backend_url='http://localhost:8615', data_path=PosixPath('/data/ik18445_cache/projects/epigraphdb-asq/data'))

----

# General params

In [219]:
# Similarity cut-offs to compare, ordered from strictest to most permissive.
# The leading "epigraphdb" entry has no numeric threshold.
THRESHOLDS = [
    {"label": label, "threshold": cutoff}
    for label, cutoff in (
        ("epigraphdb", None),
        ("asq_0.99", 0.99),
        ("asq_0.95", 0.95),
        ("asq_0.90", 0.90),
        ("asq_0.85", 0.85),
        ("asq_0.80", 0.80),
        ("asq_default(0.7)", 0.7),
    )
]
# Every entry after the first, i.e. only those with a numeric threshold.
ASQ_THRESHOLDS = THRESHOLDS[1:]
# Labels in declaration order.
THRESHOLDS_LABELS_ORDERED = [entry["label"] for entry in THRESHOLDS]

----

# Funcs

In [46]:
def asq_check_similarity_match(e: Dict[str, Any], threshold: Union[float, str]):
    """Report, per query term, whether any matched entity clears ``threshold``.

    Args:
        e: A record holding ``subject_term`` / ``object_term`` and, under both
            ``trait_ents`` and ``umls_ents``, the lists ``subject_ents`` and
            ``object_ents``. Each entity needs a ``similarity_score`` when
            ``threshold`` is numeric.
        threshold: Minimum similarity score (inclusive). Passing any string
            turns score filtering off: a candidate list then counts as a match
            as soon as it is non-empty.

    Returns:
        A two-element list (subject first, then object) of dicts with keys
        ``term``, ``gwas`` and ``literature_term``; the last two are booleans.
    """

    def _check(items: List[Dict[str, Any]], threshold: Union[float, str]) -> bool:
        if isinstance(threshold, str):
            # Inspect the candidate list itself, not the enclosing record `e`:
            # `e` always has keys, which made every unfiltered check pass even
            # when no entity had been matched.
            return len(items) > 0
        return any(_["similarity_score"] >= threshold for _ in items)

    res = [
        {
            "term": e["subject_term"],
            "gwas": _check(e["trait_ents"]["subject_ents"], threshold),
            "literature_term": _check(e["umls_ents"]["subject_ents"], threshold),
        },
        {
            "term": e["object_term"],
            "gwas": _check(e["trait_ents"]["object_ents"], threshold),
            "literature_term": _check(e["umls_ents"]["object_ents"], threshold),
        },
    ]
    return res

In [79]:
# Same lookup as the boolean check, but returns the entities that pass
# the threshold rather than a True/False flag.
def asq_find_similarity_match(e: Dict[str, Any], threshold: Union[float, str]):
    """Collect, per query term, the matched entities that clear ``threshold``.

    Args:
        e: A record holding ``subject_term`` / ``object_term`` and, under both
            ``trait_ents`` and ``umls_ents``, the lists ``subject_ents`` and
            ``object_ents``.
        threshold: Minimum similarity score (inclusive). With a string value
            the candidate lists are returned unfiltered.

    Returns:
        A two-element list (subject first, then object) of dicts with keys
        ``term``, ``gwas`` and ``literature_term``; the last two hold lists
        of entities.
    """

    def _select(candidates, cutoff):
        # A string cutoff means "no filtering": hand back the list untouched.
        if isinstance(cutoff, str):
            return candidates
        return [ent for ent in candidates if ent["similarity_score"] >= cutoff]

    def _entry(term_key, ents_key):
        # One output row for either the subject or the object side.
        return {
            "term": e[term_key],
            "gwas": _select(e["trait_ents"][ents_key], threshold),
            "literature_term": _select(e["umls_ents"][ents_key], threshold),
        }

    return [
        _entry("subject_term", "subject_ents"),
        _entry("object_term", "object_ents"),
    ]