In [None]:
import pickle
import pandas as pd
import os
import glob
import numpy as np
import pathlib

In [None]:
# Pick the storage root from the host name. The lookup uses the standard
# library instead of the IPython-only `!hostname` shell escape, so this cell
# is plain Python. `node` stays a one-element list so `node[0]` keeps working.
node = [os.uname().nodename]
if "sc" in node[0]:
    base_path = "/sc-projects/sc-proj-ukb-cvd"
else:
    base_path = "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
print(base_path)

# Input dataset locations for this project.
project_name = "210714_metabolomics"
data_pre = f"{base_path}/data/2_datasets_pre/{project_name}"
data_post = f"{base_path}/data/3_datasets_post/{project_name}"

# Output locations; both directories are created if they do not exist yet.
project_label = "21_metabolomics_multitask"
project_path = f"{base_path}/results/projects/{project_label}"
figures_path = f"{project_path}/figures"
data_results_path = f"{project_path}/data"
pathlib.Path(figures_path).mkdir(parents=True, exist_ok=True)
pathlib.Path(data_results_path).mkdir(parents=True, exist_ok=True)

In [None]:
from dask.distributed import Client, LocalCluster
# Start a local Dask cluster (one worker process, 20 threads) and connect a
# client to it.
cluster = LocalCluster(
    n_workers=1,
    threads_per_worker=20,
)
client = Client(cluster)

In [None]:
tag = "211006"  # NOTE(review): not referenced by any cell visible here; the paths below use 211026 - confirm
path = os.getcwd()
# Unconditionally overrides the host-dependent base_path chosen in the config cell.
base_path = "/sc-projects/sc-proj-ukb-cvd"
# glob returns matches in arbitrary, filesystem-dependent order; sort them so
# the row order of everything built from this list is reproducible.
csv_files = sorted(
    glob.glob(
        os.path.join(
            "/sc-projects/sc-proj-ukb-cvd/results/models/NeptuneLogger/interpretability/211026/shap/DeepExplainer/",
            "attribution*.csv",
        )
    )
)
charite_path = "/sc-projects/sc-proj-ukb-cvd/results/models/NeptuneLogger/interpretability/shap/DeepExplainer/"

In [None]:
def get_endpoint(s):
    """Reduce attribution CSV paths to their bare endpoint names.

    Three fixed fragments are removed, in order, from every string in ``s``:
    the attribution directory prefix ending in ``attribution_``, the
    ``DeepExplainer_`` marker, and the ``_10.csv`` suffix.

    Parameters
    ----------
    s : pandas.Series
        String series of attribution file paths.

    Returns
    -------
    pandas.Series
        New series with the fragments removed; ``s`` itself is not modified.
    """
    fragments = (
        "/sc-projects/sc-proj-ukb-cvd/results/models/NeptuneLogger/interpretability/211026/shap/DeepExplainer/attribution_",
        "DeepExplainer_",
        "_10.csv",
    )
    for fragment in fragments:
        # regex=False: the fragments are literal text. Without it, pandas
        # versions that default to regex matching treat the "." in "_10.csv"
        # as a wildcard.
        s = s.str.replace(fragment, "", regex=False)
    return s

def adapt_paths(s, old_prefix=None, new_prefix=None):
    """Re-root paths and strip the ``_10.0.csv`` suffix.

    Parameters
    ----------
    s : pandas.Series
        String series of paths.
    old_prefix : str, optional
        Literal text to replace. Defaults to the notebook-level ``charite_path``.
    new_prefix : str, optional
        Replacement text. Defaults to the notebook-level ``base_path``.

    Returns
    -------
    pandas.Series
        New series with ``old_prefix`` replaced by ``new_prefix`` and every
        occurrence of ``_10.0.csv`` removed.
    """
    if old_prefix is None:
        old_prefix = f"{charite_path}"
    if new_prefix is None:
        new_prefix = f"{base_path}"
    rerooted = s.str.replace(old_prefix, new_prefix, regex=False)
    # The original chained Series.replace here, which only matches elements
    # that are exactly "_10.0.csv", so the suffix was never removed. A
    # substring replacement needs the .str accessor.
    return rerooted.str.replace("_10.0.csv", "", regex=False)

In [None]:
# Stack every attribution CSV into one frame indexed by run_id. Each row is
# tagged with the path of the file it came from, stored in "endpoint".
df = pd.concat(
    [
        pd.read_csv(csv_path, index_col="run_id").assign(endpoint=csv_path)
        for csv_path in csv_files
    ]
).drop(columns=["Unnamed: 0"])

In [None]:
def load_pickle(p):
    """Return the object deserialized from the pickle file at path ``p``."""
    # NOTE: unpickling can execute arbitrary code; only use on trusted files.
    with open(p, "rb") as handle:
        return pickle.load(handle)

In [None]:
def aggregate_attributions(i):
    """Build the long-format attribution table for row ``i`` of ``df``.

    The run's SHAP CSV and input-data CSV are read without a header, using the
    run's feature names as column names. Each is melted to one row per
    (eid, metabolite) and the two are left-joined on those keys.

    Parameters
    ----------
    i : int
        Positional index of the run in the notebook-level ``df``.

    Returns
    -------
    pandas.DataFrame
        Columns ``eid``, ``metabolite``, ``shap``, ``met_value``,
        ``partition``, ``endpoint`` and ``run_id`` (the row's index label).
    """
    run = df.iloc[i]

    # NOTE(security): eval() executes whatever expression is stored in these
    # CSV cells. It is kept because the stored format is not visible here;
    # if the cells are plain list literals, ast.literal_eval is the safe
    # replacement. Each string is parsed once and reused for both files.
    # The first two "_"-separated tokens of every raw feature name are dropped.
    feature_names = ["_".join(raw.split("_")[2:]) for raw in eval(run["feature_names"])]
    eids = eval(run["eids"])

    def _load_long(csv_path, value_name):
        # Read a header-less per-run CSV and melt it to (eid, metabolite, value).
        return (
            pd.read_csv(csv_path, names=feature_names)
            .assign(eid=eids)
            .melt(
                id_vars="eid",
                value_vars=feature_names,
                var_name="metabolite",
                value_name=value_name,
            )
        )

    shap = _load_long(run["shap_filepath"], "shap")
    met_values = _load_long(run["data_filepath"], "met_value")

    return shap.merge(met_values, on=["eid", "metabolite"], how="left").assign(
        partition=run["partition"],
        endpoint=run["endpoint"],
        run_id=run.name,
    )

In [None]:
from tqdm.auto import tqdm
import joblib

In [None]:
# Replace the full file path stored in "endpoint" with the bare endpoint name.
df["endpoint"] = get_endpoint(df["endpoint"])

In [None]:
# Run the per-run aggregation for every row of df through joblib's "dask"
# backend; tqdm shows progress as tasks are dispatched.
with joblib.parallel_backend("dask"):
    df_agg_list = joblib.Parallel(verbose=100)(
        joblib.delayed(aggregate_attributions)(row_idx) for row_idx in tqdm(range(len(df)))
    )

In [None]:
# Stack the per-run tables and renumber the rows 0..n-1.
df_agg = pd.concat(df_agg_list, ignore_index=True)

In [None]:
# Number of distinct eids across all runs.
df_agg["eid"].nunique()

In [None]:
# Store the repeated key columns as categoricals.
for col in ("eid", "metabolite", "partition", "endpoint", "run_id"):
    df_agg[col] = df_agg[col].astype("category")

In [None]:
# Write the aggregated attribution table to the project's results data directory as a Feather file.
df_agg.to_feather(f"{data_results_path}/attributions_211026.feather")