In [1]:
import numpy as np
import pandas as pd
import pathlib
import os
from tqdm.auto import tqdm


from omegaconf import DictConfig, OmegaConf

import torch
#from torch_geometric import seed_everything

import ray

In [2]:
# Root of the workspace; every path in this notebook is derived from it.
# NOTE(review): absolute, user-specific path — consider making it configurable.
base_path = "/home/jakobs"
project_path = f"{base_path}/data"

# All artefacts of this run are grouped under an experiment-specific folder,
# which is created up front so later cells can write into it.
experiment = '230323'
experiment_path = f"{project_path}/{experiment}"
pathlib.Path(experiment_path).mkdir(exist_ok=True, parents=True)

# Partition ids 0..21.
partitions = list(range(22))

## Process Predictions

In [3]:
# Endpoint metadata, indexed by endpoint id so it can be joined on the index.
endpoints_md = (
    pd.read_feather(f"{base_path}/data/endpoints_md.feather")
    .set_index("endpoint")
)
endpoints_md

Unnamed: 0_level_0,eligable,n,freq,phecode,phecode_string,phecode_category,sex
endpoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
OMOP_4306655,61256,3548,0.057921,4306655.00,All-Cause Death,Death,Both
phecode_002,60945,658,0.010797,2.00,Staphylococcus,ID,Both
phecode_002-1,61010,486,0.007966,2.10,Staphylococcus aureus,ID,Both
phecode_003,60757,1017,0.016739,3.00,Escherichia coli,ID,Both
phecode_004,60584,494,0.008154,4.00,Streptococcus,ID,Both
...,...,...,...,...,...,...,...
phecode_977-52,31669,520,0.016420,977.52,Hormone replacement therapy (postmenopausal),Rx,Female
phecode_977-7,60032,2231,0.037164,977.70,Long term (current) use of insulin or oral hyp...,Rx,Both
phecode_977-71,60936,472,0.007746,977.71,Long term (current) use of insulin,Rx,Both
phecode_977-72,60207,2148,0.035677,977.72,Long term (current) use of oral hypoglycemic d...,Rx,Both


In [4]:
# Names of the identifier columns.
id_vars = [
    "eid",
    "model",
    "partition",
    "split",
]

In [5]:
# Output folder for the exported tables; created here so the export loop can
# write into it.
out_path = f"{experiment_path}/loghs"
pathlib.Path(out_path).mkdir(exist_ok=True, parents=True)

In [6]:
# Baseline table; only its `eid` column is used further down.
data_baseline = pd.read_feather(
    f"{base_path}/data/data_baseline_230321.feather"
)

In [7]:
# Outcomes in long format (one row per eid/endpoint pair, judging by the
# sample shown below — TODO confirm).
data_outcomes_long = pd.read_feather(
    f"{base_path}/data/data_outcomes_long_230320.feather"
)

In [8]:
# Spot-check ten random rows with an observed event. The seed is fixed so the
# displayed sample is identical on every Restart & Run All.
data_outcomes_long.query("event==1").sample(10, random_state=0)

Unnamed: 0,eid,endpoint,prev,event,time
3280346,RID04343,phecode_462,0.0,1.0,9.295003
813617,RID01882,phecode_120-2,1.0,1.0,0.300707
2877151,RID07098,phecode_406,0.0,1.0,3.275838
2137037,RID06262,phecode_329-9,0.0,1.0,7.939767
915632,RID02435,phecode_130,1.0,1.0,5.564225
2965673,RID01267,phecode_416,0.0,1.0,9.965777
3036801,RID07298,phecode_430,0.0,1.0,9.396304
1308961,RID04416,phecode_177-2,0.0,1.0,3.559206
693498,RID05044,phecode_108-5,1.0,1.0,0.237166
3894883,RID02768,phecode_522-12,0.0,1.0,4.136893


In [9]:
# Hand-picked endpoints that are inspected in more detail below.
endpoint_selection = [
    # generally very important
    "phecode_202",     # Diabetes mellitus
    "phecode_401",     # Hypertension
    "phecode_404",     # Ischemic heart disease
    "phecode_404-1",   # Myocardial infarction [Heart attack]
    "phecode_431-11",  # Cerebral infarction [Ischemic stroke]
    "phecode_424",     # Heart failure

    "phecode_059-1",   # COVID 19
    "phecode_468",     # Pneumonia
    "phecode_474",     # Chronic obstructive pulmonary disease [COPD]

    "phecode_286-2",   # Major depressive disorder
    "phecode_324-11",  # Parkinson's disease
    "phecode_328",     # Dementias and cerebral degeneration

    "phecode_164",     # Anemia
    "phecode_726-1",   # Osteoporosis
    "phecode_371",     # Cataract
    "phecode_374-42",  # Diabetic retinopathy
    "phecode_374-5",   # Macular degeneration
    "phecode_375-1",   # Glaucoma

    "phecode_103",     # Malignant neoplasm of the skin
    "phecode_101",     # Malignant neoplasm of the digestive organs
    "phecode_102",     # Lung cancer

    "phecode_583",     # Chronic kidney disease
    "phecode_542",     # Chronic liver disease and sequelae
    "OMOP_4306655",    # All-Cause Death
]

In [10]:
# Count the observed events per endpoint (`n_epic`) and attach the endpoint
# metadata. The left merge keeps endpoints without a metadata entry; their
# metadata columns are NaN.
endpoint_frequencies = (
    data_outcomes_long
    .query("event==1")
    .value_counts("endpoint")
    .to_frame("n_epic")
    .merge(endpoints_md, how="left", left_index=True, right_index=True)
)

In [11]:
# Show the event counts for the hand-picked endpoints only; `endpoint` in the
# query refers to the index of `endpoint_frequencies`.
endpoint_frequencies.query("endpoint==@endpoint_selection")

Unnamed: 0_level_0,n_epic,eligable,n,freq,phecode,phecode_string,phecode_category,sex
endpoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
phecode_401,2637,48533.0,11114.0,0.228999,401.0,Hypertension,Cardio,Both
phecode_371,1209,59805.0,6798.0,0.113669,371.0,Cataract,Eye,Both
phecode_404,969,58791.0,4055.0,0.068973,404.0,Ischemic heart disease,Cardio,Both
phecode_103,651,59332.0,3620.0,0.061013,103.0,Malignant neoplasm of the skin,Neoplasms,Both
phecode_164,648,57646.0,4945.0,0.085782,164.0,Anemia,Blood,Both
phecode_202,620,57936.0,4259.0,0.073512,202.0,Diabetes mellitus,Endo,Both
phecode_468,534,59697.0,3199.0,0.053587,468.0,Pneumonia,Resp,Both
phecode_583,485,59363.0,3264.0,0.054984,583.0,Chronic kidney disease,Genitourinary,Both
phecode_424,430,60846.0,1996.0,0.032804,424.0,Heart failure,Cardio,Both
phecode_286-2,329,52388.0,4781.0,0.091261,286.2,Major depressive disorder,Mental,Both


In [12]:
# Keep endpoints with more than 20 observed events and a non-missing phecode,
# then persist them sorted by endpoint id.
# `phecode==phecode` is False only for NaN, so it drops rows whose phecode is
# missing (e.g. endpoints the left merge could not match to metadata).
(
    endpoint_frequencies
    .query("n_epic>20&phecode==phecode")
    .sort_index()
    .reset_index()
    .to_feather(f"{base_path}/data/endpoints_epic_md.feather")
)

In [13]:
# Reload the filtered endpoint table from disk, indexed by endpoint id.
# NOTE(review): this rebinds `endpoints_md`, which until here held the full
# metadata table. Re-running the `endpoint_frequencies` cell after this one
# would merge against the filtered table instead — a distinct name would make
# the notebook safe to re-run.
endpoints_md = (
    pd.read_feather(f"{base_path}/data/endpoints_epic_md.feather")
    .set_index("endpoint")
)
endpoints_md

Unnamed: 0_level_0,n_epic,eligable,n,freq,phecode,phecode_string,phecode_category,sex
endpoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
OMOP_4306655,111,61256.0,3548.0,0.057921,4306655.00,All-Cause Death,Death,Both
phecode_052,31,56105.0,3872.0,0.069013,52.00,Herpesvirus,ID,Both
phecode_052-3,28,57980.0,3342.0,0.057641,52.30,Varicella zoster virus,ID,Both
phecode_052-32,27,58885.0,3301.0,0.056058,52.32,Herpes zoster,ID,Both
phecode_061,40,59986.0,362.0,0.006035,61.00,Influenza virus,ID,Both
...,...,...,...,...,...,...,...,...
phecode_815,279,61028.0,560.0,0.009176,815.00,Symptoms and signs concerning food and fluid i...,Signs/Symptoms,Both
phecode_819,209,54957.0,10652.0,0.193824,819.00,General symptoms and other findings,Signs/Symptoms,Both
phecode_829,129,58248.0,4321.0,0.074183,829.00,Nonspecific findings on examination of blood,Signs/Symptoms,Both
phecode_848,85,58856.0,2765.0,0.046979,848.00,Nonspecific abnormal findings of other body st...,Signs/Symptoms,Both


In [14]:
# Image table; `distfilename` is renamed to `img_name` so it shares its key
# column with the predictions table.
data_images = (
    pd.read_feather(f"{base_path}/data/data_images_230321.feather")
    .rename(columns={"distfilename": "img_name"})
)
data_images

Unnamed: 0,eid,fundusid,img_name,side_ml,ml_glaucoma_gradability_grad,ml_glaucoma_gradability_wdiff,quality
0,RID00001,epiceye00650,0AIULA8E18C97Z2K_epiceye00650.png,Right,0.760914,0.226360,
1,RID00001,epiceye00650,0AIULA8E18C983T5_epiceye00650.png,Left,0.955591,0.043725,
2,RID00002,epiceye04756,0AIULA8E2ENQ92RP_epiceye04756.png,Left,0.986005,0.013981,
3,RID00002,epiceye04756,0AIULA8E2ENQ8YKN_epiceye04756.png,Right,0.943931,0.054606,
4,RID00003,epiceye01707,0AIULA8E149C9D03_epiceye01707.png,Right,0.981540,0.017813,
...,...,...,...,...,...,...,...
16037,RID07409,epiceye01738,0AIULA8E1ADD86BV_epiceye01738.png,Right,0.926931,0.072251,
16038,RID07410,epiceye05558,0AIULA8E2F62N2AI_epiceye05558.png,Right,0.054378,0.028145,Poor
16039,RID07410,epiceye05558,0AIULA8E2F62N5UZ_epiceye05558.png,Left,0.861651,0.133713,
16040,RID07411,epiceye04182,0AIULA8E1KAO83TM_epiceye04182.png,Right,0.988673,0.011106,


In [15]:
# Wide predictions table (one column per endpoint), ordered by image name.
# The preview shows several rows per image that differ in partition and iteration.
predictions = (
    pd.read_feather(f"{base_path}/data/predictionstta_wide_230323.feather")
    .sort_values("img_name")
)
predictions

Unnamed: 0,partition,img_name,iteration,OMOP_4306655,phecode_002,phecode_002-1,phecode_003,phecode_004,phecode_005,phecode_007,...,phecode_977,phecode_977-4,phecode_977-41,phecode_977-5,phecode_977-51,phecode_977-52,phecode_977-7,phecode_977-71,phecode_977-72,phecode_979
10430885,12,0AIULA8E0NV51BDR_epiceye07238.png,29,1.280273,2.730469,2.250000,2.904297,2.277344,0.553223,1.388672,...,1.080078,2.337891,2.304688,0.081726,-1.103516,-0.320068,0.668457,2.466797,0.606934,1.605469
33812819,19,0AIULA8E0NV51BDR_epiceye07238.png,95,1.199219,2.869141,2.384766,3.177734,2.380859,0.612793,1.291016,...,1.095703,2.460938,2.455078,0.614258,-0.616699,0.090393,0.607910,2.673828,0.494629,1.666016
24517739,11,0AIULA8E0NV51BDR_epiceye07238.png,69,1.319336,2.810547,2.304688,3.001953,2.359375,0.713867,1.322266,...,1.104492,2.353516,2.355469,0.021072,-1.285156,-0.350586,0.761719,2.593750,0.657227,1.750977
35191055,17,0AIULA8E0NV51BDR_epiceye07238.png,99,1.003906,2.613281,2.099609,2.951172,2.205078,0.677246,1.050781,...,0.958008,2.384766,2.398438,1.013672,-0.337646,0.531250,0.461426,2.406250,0.302490,1.605469
6969269,16,0AIULA8E0NV51BDR_epiceye07238.png,19,0.904785,2.431641,2.068359,2.761719,2.025391,0.534180,1.094727,...,0.918457,2.224609,2.240234,0.406738,-0.707031,-0.047668,0.415039,2.244141,0.344482,1.357422
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4533795,18,0AIULA8E33581ADU_epiceye03802.png,12,-0.200684,1.663086,1.294922,2.121094,1.392578,0.176392,0.308838,...,0.590332,1.885742,1.885742,2.521484,1.698242,1.869141,-0.259277,1.598633,-0.358887,1.022461
25463751,4,0AIULA8E33581ADU_epiceye03802.png,72,-0.392334,1.401367,1.156250,2.177734,1.279297,0.123230,0.309082,...,0.512207,1.604492,1.596680,2.875000,2.136719,2.105469,-0.294434,1.529297,-0.447998,0.696289
33572907,4,0AIULA8E33581ADU_epiceye03802.png,95,-0.392822,1.380859,1.136719,2.162109,1.277344,0.132812,0.332764,...,0.498779,1.612305,1.606445,2.761719,2.027344,1.996094,-0.311279,1.493164,-0.460938,0.708496
28668951,6,0AIULA8E33581ADU_epiceye03802.png,81,0.064575,1.833984,1.494141,2.587891,1.647461,0.340820,0.563965,...,0.758301,1.944336,1.963867,3.251953,2.316406,2.615234,-0.146729,1.881836,-0.298584,0.989746


In [16]:
# Collapse the predictions to one row per image by averaging each retained
# endpoint column over all rows that share the same `img_name`.
predictions_agg = (
    predictions
    .groupby("img_name")[endpoints_md.index]
    .mean()
    .sort_index()
    .reset_index()
)
predictions_agg

Unnamed: 0,img_name,OMOP_4306655,phecode_052,phecode_052-3,phecode_052-32,phecode_061,phecode_070,phecode_089,phecode_089-1,phecode_089-2,...,phecode_805,phecode_807,phecode_808,phecode_809,phecode_812,phecode_815,phecode_819,phecode_829,phecode_848,phecode_979
0,0AIULA8E0NV51BDR_epiceye07238.png,1.250430,0.904743,0.704174,0.601466,1.707630,0.676531,1.710261,1.086535,0.080808,...,2.102852,0.737877,0.721672,-0.150435,0.899138,2.384058,0.346769,1.413093,0.648661,1.742909
1,0AIULA8E0NV8EL29_epiceye03059.png,0.985030,0.934271,0.715693,0.617054,1.606891,0.634246,1.674194,0.924649,0.084884,...,1.972209,0.734779,0.591876,-0.162062,0.744131,2.274708,0.250907,1.251815,0.473214,1.590524
2,0AIULA8E0NV8ELNJ_epiceye03059.png,2.036294,1.080711,0.854566,0.762938,2.484882,1.160339,2.006146,1.833149,0.316328,...,2.801883,1.118522,1.199934,0.014818,1.498740,3.078325,0.863987,1.747250,1.140210,2.885452
3,0AIULA8E0OXLWXKO_epiceye01658.png,-0.248630,0.850174,0.562711,0.442252,1.380471,0.622844,1.569636,0.585199,0.100187,...,1.559489,0.675361,0.120407,-0.240933,0.119882,2.022638,-0.190412,0.561720,-0.073537,1.090932
4,0AIULA8E0SB0WTRT_epiceye04256.png,1.013236,0.929900,0.718426,0.613935,1.631787,0.649609,1.649335,0.905148,0.068500,...,1.926886,0.699310,0.582872,-0.196001,0.749671,2.216569,0.224692,1.212549,0.540368,1.423512
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16021,0AIULA8E3357Z7FI_epiceye04112.png,-0.568300,0.635159,0.420249,0.291801,0.910476,0.041241,1.450257,0.309551,-0.060739,...,1.137419,0.346178,-0.002090,-0.395252,-0.258566,1.863422,-0.372055,0.786197,-0.318232,0.757490
16022,0AIULA8E33580BP5_epiceye03694.png,0.227253,0.954763,0.693600,0.586591,1.392737,0.631265,1.594106,0.652160,0.098398,...,1.680732,0.671826,0.302262,-0.198819,0.368561,2.004800,-0.053213,0.737203,0.189269,1.114401
16023,0AIULA8E33580HKJ_epiceye03694.png,0.327095,0.808091,0.585329,0.475553,1.332482,0.422166,1.584829,0.669156,0.027339,...,1.658821,0.563176,0.335925,-0.234013,0.344780,2.105219,-0.041355,1.018515,0.172577,1.282135
16024,0AIULA8E335815YK_epiceye03802.png,-0.196878,0.872132,0.608133,0.481012,1.277362,0.516716,1.529034,0.515255,0.059188,...,1.446831,0.597388,0.095294,-0.283274,0.081002,1.901214,-0.214429,0.612208,-0.081987,0.958602


In [17]:
# Attach image metadata and the per-image predictions to every baseline `eid`.
# Both joins are left joins, so an eid without images or predictions is kept
# with NaN in the joined columns.
data_predictions_all = (
    data_baseline[["eid"]]
    .merge(data_images, how="left", on="eid")
    .merge(predictions_agg, how="left", on="img_name")
    .reset_index(drop=True)
)

In [18]:
# Inspect the merged per-image table (eid, image metadata, endpoint predictions).
data_predictions_all

Unnamed: 0,eid,fundusid,img_name,side_ml,ml_glaucoma_gradability_grad,ml_glaucoma_gradability_wdiff,quality,OMOP_4306655,phecode_052,phecode_052-3,...,phecode_805,phecode_807,phecode_808,phecode_809,phecode_812,phecode_815,phecode_819,phecode_829,phecode_848,phecode_979
0,RID00001,epiceye00650,0AIULA8E18C97Z2K_epiceye00650.png,Right,0.760914,0.226360,,1.317749,1.114587,0.897538,...,2.368644,1.049419,0.796536,-0.078187,1.076583,2.436456,0.446262,1.025383,0.734533,2.060055
1,RID00001,epiceye00650,0AIULA8E18C983T5_epiceye00650.png,Left,0.955591,0.043725,,2.356176,1.275791,1.036786,...,3.235446,1.476009,1.437275,0.238013,1.881919,3.458896,1.138059,1.745204,1.388024,3.645285
2,RID00002,epiceye04756,0AIULA8E2ENQ92RP_epiceye04756.png,Left,0.986005,0.013981,,1.139290,1.035561,0.805338,...,2.130437,0.906943,0.602047,-0.077795,0.951870,2.362872,0.336029,1.019404,0.650525,1.751784
3,RID00002,epiceye04756,0AIULA8E2ENQ8YKN_epiceye04756.png,Right,0.943931,0.054606,,0.751222,1.054915,0.823314,...,1.975394,0.831329,0.483997,-0.158685,0.712518,1.986213,0.180313,0.647587,0.523878,1.295367
4,RID00003,epiceye01707,0AIULA8E149C9D03_epiceye01707.png,Right,0.981540,0.017813,,1.194235,1.103552,0.879866,...,2.232068,0.998687,0.702928,-0.103715,1.003654,2.312775,0.378078,0.933488,0.721325,1.777943
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15706,RID07409,epiceye01738,0AIULA8E1ADD86BV_epiceye01738.png,Right,0.926931,0.072251,,1.806697,1.090313,0.871732,...,2.544142,1.112743,1.005581,0.010497,1.422392,2.727786,0.713430,1.449586,1.068018,2.401653
15707,RID07410,epiceye05558,0AIULA8E2F62N2AI_epiceye05558.png,Right,0.054378,0.028145,Poor,1.608046,1.031466,0.809817,...,2.565900,1.084438,0.927079,0.059094,1.386869,2.848601,0.652433,1.414737,0.962405,2.864641
15708,RID07410,epiceye05558,0AIULA8E2F62N5UZ_epiceye05558.png,Left,0.861651,0.133713,,2.230710,1.122038,0.881175,...,3.031094,1.336763,1.286941,0.141695,1.766954,3.349601,1.005676,1.775996,1.334383,3.463860
15709,RID07411,epiceye04182,0AIULA8E1KAO83TM_epiceye04182.png,Right,0.988673,0.011106,,0.372900,1.092676,0.845663,...,1.810774,0.841144,0.294215,-0.133096,0.584640,2.102329,0.069968,0.344106,0.449917,1.048863


In [20]:
# Discard images graded 'Poor', then average the remaining images per `eid`.
# Rows with a missing quality grade pass the filter, because a missing value
# compares unequal to 'Poor'.
data_predictions_qc = (
    data_predictions_all
    .query("quality!='Poor'")
    .groupby("eid")[endpoints_md.index]
    .mean()
)

In [21]:
# Assignment of each `eid` to a partition and split.
splits = pd.read_feather(
    f"{base_path}/data/splits_230321.feather"
)

In [22]:
# Attach the quality-controlled per-eid predictions to every split row.
# `data_predictions_qc` is indexed by eid, hence the column-to-index join.
predictions_clean = splits.merge(
    data_predictions_qc,
    how="left",
    left_on="eid",
    right_index=True,
)

In [23]:
# Inspect the table that is exported per partition and split below.
predictions_clean

Unnamed: 0,partition,split,eid,OMOP_4306655,phecode_052,phecode_052-3,phecode_052-32,phecode_061,phecode_070,phecode_089,...,phecode_805,phecode_807,phecode_808,phecode_809,phecode_812,phecode_815,phecode_819,phecode_829,phecode_848,phecode_979
0,0,test,RID00011,0.974159,0.882430,0.670352,0.563170,1.556140,0.596321,1.663411,...,1.968946,0.690745,0.585570,-0.174243,0.724986,2.300610,0.210935,1.280175,0.473034,1.663874
1,0,test,RID00013,0.437074,0.812232,0.580275,0.466973,1.456630,0.541389,1.606705,...,1.744031,0.647063,0.319734,-0.200343,0.419151,2.154223,0.015853,0.967203,0.187895,1.347428
2,0,test,RID00019,1.574982,0.918246,0.714786,0.614410,1.986349,0.812530,1.808642,...,2.360059,0.851241,0.872776,-0.102057,1.121853,2.672771,0.521652,1.555427,0.816110,2.257434
3,0,test,RID00028,2.331847,1.132417,0.908059,0.814419,2.846271,1.436968,2.128679,...,3.060687,1.284923,1.361447,0.110824,1.768232,3.318809,1.052749,1.819558,1.345244,3.341218
4,0,test,RID00061,1.067389,1.065847,0.851983,0.740560,1.682662,0.914967,1.691930,...,2.117054,0.885513,0.670558,-0.214482,0.856786,2.075166,0.277365,0.925843,0.631648,1.495818
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72475,9,train,RID07407,1.505541,1.004509,0.787599,0.682683,2.083855,1.011194,1.820936,...,2.463823,0.989010,0.878715,-0.104145,1.173768,2.582327,0.515076,1.341556,0.846422,2.319316
72476,9,train,RID07408,-0.188612,0.885886,0.591641,0.467072,1.454239,0.846932,1.591897,...,1.627818,0.798470,0.133533,-0.171545,0.235290,1.911081,-0.131402,0.427994,-0.110118,1.113012
72477,9,train,RID07409,1.790926,1.098324,0.859974,0.757435,2.430420,1.227740,1.892058,...,2.541275,1.126197,1.004223,0.019392,1.427817,2.785213,0.707000,1.475247,1.068137,2.486933
72478,9,train,RID07410,2.230710,1.122038,0.881175,0.789160,2.978818,1.475864,2.127971,...,3.031094,1.336763,1.286941,0.141695,1.766954,3.349601,1.005676,1.775996,1.334383,3.463860


In [24]:
model = "RetinaUKB"

# Write one train and one test table per partition to
# {out_path}/{model}/{partition}/{split}.feather and print each written path.
# NOTE(review): the partition count is hard-coded to 10 here, while the
# configuration cell defines `partitions` with 22 entries — confirm which one
# is intended.
for partition in range(10):
    # The target folder depends only on the partition, so create it once.
    fp_out = f"{out_path}/{model}/{partition}"
    pathlib.Path(fp_out).mkdir(parents=True, exist_ok=True)

    partition_rows = predictions_clean[predictions_clean["partition"] == partition]
    for split in ("train", "test"):
        split_rows = partition_rows[partition_rows["split"] == split]
        fp_split = f"{fp_out}/{split}.feather"
        # Feather needs a default index, hence the reset after filtering.
        split_rows.reset_index(drop=True).to_feather(fp_split)
        print(fp_split)

/home/jakobs/data/230323/loghs/RetinaUKB/0/train.feather
/home/jakobs/data/230323/loghs/RetinaUKB/0/test.feather
/home/jakobs/data/230323/loghs/RetinaUKB/1/train.feather
/home/jakobs/data/230323/loghs/RetinaUKB/1/test.feather
/home/jakobs/data/230323/loghs/RetinaUKB/2/train.feather
/home/jakobs/data/230323/loghs/RetinaUKB/2/test.feather
/home/jakobs/data/230323/loghs/RetinaUKB/3/train.feather
/home/jakobs/data/230323/loghs/RetinaUKB/3/test.feather
/home/jakobs/data/230323/loghs/RetinaUKB/4/train.feather
/home/jakobs/data/230323/loghs/RetinaUKB/4/test.feather
/home/jakobs/data/230323/loghs/RetinaUKB/5/train.feather
/home/jakobs/data/230323/loghs/RetinaUKB/5/test.feather
/home/jakobs/data/230323/loghs/RetinaUKB/6/train.feather
/home/jakobs/data/230323/loghs/RetinaUKB/6/test.feather
/home/jakobs/data/230323/loghs/RetinaUKB/7/train.feather
/home/jakobs/data/230323/loghs/RetinaUKB/7/test.feather
/home/jakobs/data/230323/loghs/RetinaUKB/8/train.feather
/home/jakobs/data/230323/loghs/RetinaUK

In [134]:
# NOTE(review): ad-hoc inspection of `out_path`, executed out of order
# (execution count 134). The recorded output names experiment folder 230321
# while the configuration cell sets 230323, so the output is stale — consider
# removing this cell.
out_path

'/home/jakobs/data/230321/loghs'

In [20]:
# NOTE(review): leftover scratch cell with no bearing on the analysis —
# consider removing.
1+1

2