In [None]:
import pathlib
import socket

import hydra
import numpy as np
import pandas as pd
import ray
import torch
from omegaconf import DictConfig, OmegaConf
from torch_geometric import seed_everything
from tqdm.auto import tqdm

In [None]:
# Resolve the machine name in pure Python instead of the IPython-only
# `!hostname` shell escape, so the host switch below does not depend on the
# notebook kernel's shell integration.
# `node` stays a one-element list so that any code indexing `node[0]` keeps working.
node = [socket.gethostname()]

# Select the project root for the current machine: host names containing "sc"
# use the /sc-projects tree, every other host uses the shared analysis volume.
# NOTE(review): this is a plain substring match — confirm no unrelated host name contains "sc".
if "sc" in node[0]:
    base_path = "/sc-projects/sc-proj-ukb-cvd"
else:
    base_path = "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
print(base_path)

# Project-level locations derived from the root.
project_label = "22_medical_records"
project_path = f"{base_path}/results/projects/{project_label}"
figure_path = f"{project_path}/figures"
output_path = f"{project_path}/data"

# Per-experiment output directory; created on first run, left alone if it already exists.
experiment = 220627
experiment_path = f"{output_path}/{experiment}"
pathlib.Path(experiment_path).mkdir(parents=True, exist_ok=True)

In [None]:
# Phecode definitions, ordered by the `endpoint` column.
endpoint_defs = (
    pd.read_feather(f"{output_path}/phecode_defs_220306.feather")
    .sort_values("endpoint")
)

# Endpoint metadata of this experiment plus the sorted list of its endpoint values.
endpoints_md = pd.read_csv(f"{experiment_path}/endpoints.csv")
endpoints = sorted(endpoints_md["endpoint"].to_list())

In [None]:
# List of record names taken from the `record` column of the frequency table.
# The path is rooted at `base_path` (as the other data paths in this notebook are)
# instead of a hard-coded /sc-projects prefix; on the "sc" host it resolves to
# exactly the same file as before.
# NOTE(review): assumes the 2_datasets_pre tree is mirrored under the non-"sc" root — confirm.
records = pd.read_feather(
    f"{base_path}/data/2_datasets_pre/211110_anewbeginning/artifacts/record_frequencies_220627.feather"
).record.to_list()

In [None]:
# Baseline outcomes table, indexed by `eid`.
data_outcomes = (
    pd.read_feather(f"{output_path}/baseline_outcomes_220627.feather")
    .set_index("eid")
)

In [None]:
# Load only `eid` plus the selected record columns from the baseline records table.
data_records = (
    pd.read_feather(
        f"{output_path}/baseline_records_220627.feather",
        columns=["eid", *records],
    )
    .set_index("eid")
)

# Attach the outcomes by index; the left join keeps every row of the records table.
data_all = pd.merge(
    data_records,
    data_outcomes,
    how="left",
    left_index=True,
    right_index=True,
)

In [None]:
# Table with one `eid_list` entry per endpoint.
eligable_eids = pd.read_feather(f"{output_path}/eligable_eids_220627.feather")

# Lookup from endpoint to its `eid_list` value.
eids_dict = (
    eligable_eids
    .set_index("endpoint")
    .loc[:, "eid_list"]
    .to_dict()
)

# Augment attributions with eligibility

In [None]:
# Directory holding the attribution feather files. Rooted at `base_path` rather
# than a hard-coded /sc-projects prefix; on the "sc" host this is the same directory as before.
# NOTE(review): assumes results/recordgraphs exists under the non-"sc" root as well — confirm.
path = pathlib.Path(base_path) / "results" / "recordgraphs" / "attributions"

# glob yields matches in arbitrary order; sorting makes the file list reproducible between runs.
files = sorted(path.glob('2207*fijcpulg*.feather'))

In [None]:
# make sure the files are from the correct run

In [None]:
# Stop here with a clear message when the glob matched nothing; otherwise the
# later pd.concat over an empty list fails with a far less helpful error.
assert len(files) > 0, f"no attribution files found in {path}"
# Number of matched attribution files, displayed for a manual check.
len(files)

In [None]:
def prepare_df(fp):
    """Read one attribution feather file and return it in long format.

    The path is split on "_" and, counting from the end, token -5 is stored
    as ``run_id``, token -4 as ``iterations`` and tokens -3 and -2 are joined
    with "_" into ``endpoint`` (dots in token -2 are replaced by dashes).
    All columns other than ``eid`` and these three are melted into
    ``record`` / ``shapley`` pairs, and rows containing missing values are
    dropped.

    Parameters
    ----------
    fp : path-like
        Location of the feather file; its name supplies the metadata tokens.

    Returns
    -------
    pandas.DataFrame
        Columns ``eid``, ``run_id``, ``iterations``, ``endpoint``,
        ``record`` and ``shapley``.
    """
    wide_df = pd.read_feather(fp)
    tokens = str(fp).split("_")

    # Attach the metadata parsed from the file name to every row.
    wide_df = wide_df.assign(
        run_id=tokens[-5],
        iterations=tokens[-4],
        endpoint="_".join([tokens[-3], tokens[-2].replace(".", "-")]),
    )

    # One row per (eid, record) pair instead of one column per record.
    melted = wide_df.melt(
        id_vars=["eid", "run_id", "iterations", "endpoint"],
        var_name="record",
        value_name="shapley",
    )
    return melted.dropna()

In [None]:
# Read every attribution file in long format (with a progress bar), stack them,
# keep the four analysis columns and order the rows by endpoint, eid and record.
attribution_df = (
    pd.concat([prepare_df(fp) for fp in tqdm(files)], axis=0)[
        ["eid", "endpoint", "record", "shapley"]
    ]
    .sort_values(["endpoint", "eid", "record"])
    .reset_index(drop=True)
)

In [None]:
# NaN never compares equal to itself, so this self-comparison displays only the
# rows whose `shapley` value is not NaN. prepare_df already drops rows with
# missing values, so the displayed frame is expected to contain every row.
attribution_df.query("shapley==shapley")

In [None]:
# Long-format eligibility table; every row it contains is flagged with eligable=1.
eligable_eids_long = (
    pd.read_feather(f"{output_path}/eligable_eids_long_220627.feather")
    .assign(eligable=1)
)

# Left merge on (eid, endpoint): all attribution rows are kept, and rows without
# a match in the eligibility table get a missing `eligable` value (no filtering here).
attribution_df_eligable = pd.merge(
    attribution_df,
    eligable_eids_long,
    how="left",
    on=["eid", "endpoint"],
)

In [None]:
# Replace every missing value in the merged frame with 0 (this turns the missing
# `eligable` flags left by the merge into 0) and renumber the rows from 0.
attribution_df_eligable = (
    attribution_df_eligable
    .fillna(value=0)
    .reset_index(drop=True)
)

In [None]:
# Short sanity check: per (endpoint, record) row count and mean Shapley value,
# largest mean first.
attribution_df_agg = (
    attribution_df_eligable
    .groupby(["endpoint", "record"])[["shapley"]]
    .agg(count=("shapley", "size"), mean_shapley=("shapley", "mean"))
    .sort_values("mean_shapley", ascending=False)
    .reset_index()
)

# Tab-separated concept table; record keys are built as "OMOP_" + concept_id
# so they can be joined on the `record` column.
concept = pd.read_csv(f"{base_path}/data/mapping/athena/CONCEPT.csv", sep='\t')
concept_clean = (
    concept
    .assign(record=lambda df: "OMOP_" + df["concept_id"].astype(str))
    .loc[:, ["record", "concept_name", "domain_id", "concept_class_id"]]
)

# Add the phecode string per endpoint and the concept description per record.
attribution_df_prepared = (
    attribution_df_agg
    .merge(endpoints_md[["endpoint", "phecode_string"]], on="endpoint", how="left")
    .merge(concept_clean, on="record", how="left")
)
attribution_df_prepared

In [None]:
# Records with more than 100 rows for the myocardial infarction endpoint,
# ordered from the highest to the lowest mean Shapley value.
(
    attribution_df_prepared
    .query("count>100")
    .query("phecode_string=='Myocardial infarction [Heart attack]'")
    .sort_values("mean_shapley", ascending=False)
)

In [None]:
pd.set_option('display.max_rows', 100)

# Top 10 records per endpoint by mean Shapley value (records with more than 10 rows).
# A single two-key sort fixes the order inside each endpoint. The previous
# version re-sorted by endpoint alone with the default, non-stable algorithm,
# which does not guarantee that the Shapley ordering survives.
# Rank 1 is now the record with the highest mean Shapley value.
test_rank = (
    attribution_df_prepared
    .query("count>10")
    .sort_values(["endpoint", "mean_shapley"], ascending=[True, False])
    .groupby(["endpoint"])
    .head(10)  # groupby.head keeps the sorted row order
    .reset_index(drop=True)
    .loc[:, ["phecode_string", "concept_name"]]
    # Position within each phecode_string group, starting at 1.
    .assign(rank=lambda df: df.groupby(["phecode_string"]).cumcount() + 1)
)

# One row per phecode string, one column per rank, cells hold the concept name.
test_rank.pivot(index="phecode_string", columns='rank', values='concept_name')

In [None]:
# Write the attributions including the `eligable` flag to the experiment directory.
attribution_df_eligable.to_feather(f"{experiment_path}/attributions_pre.feather")