In [2]:
import numpy as np
import pandas as pd
import pathlib
import os
from tqdm.auto import tqdm


from omegaconf import DictConfig, OmegaConf

import torch
#from torch_geometric import seed_everything

import ray

In [3]:
# Root directory; every input and output path in this notebook is built from it.
base_path = "/home/jakobs"
project_path = f"{base_path}/data"

# Experiment tag; names the directory that receives this run's outputs.
experiment = '230629'
experiment_path = f"{project_path}/{experiment}"
pathlib.Path(experiment_path).mkdir(parents=True, exist_ok=True)

# NOTE(review): not referenced by the cells visible in this notebook
# (the export loop iterates over range(10)) — confirm whether still needed.
partitions = list(range(22))

## Process Predictions

In [4]:
# Endpoint metadata table, keyed by the "endpoint" column.
endpoints_md = (
    pd.read_feather(f"{base_path}/data/endpoints_md.feather")
    .set_index("endpoint")
)
endpoints_md

Unnamed: 0_level_0,eligable,n,freq,phecode,phecode_string,phecode_category,sex
endpoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
OMOP_4306655,61256,3548,0.057921,4306655.00,All-Cause Death,Death,Both
phecode_002,60945,658,0.010797,2.00,Staphylococcus,ID,Both
phecode_002-1,61010,486,0.007966,2.10,Staphylococcus aureus,ID,Both
phecode_003,60757,1017,0.016739,3.00,Escherichia coli,ID,Both
phecode_004,60584,494,0.008154,4.00,Streptococcus,ID,Both
...,...,...,...,...,...,...,...
phecode_977-52,31669,520,0.016420,977.52,Hormone replacement therapy (postmenopausal),Rx,Female
phecode_977-7,60032,2231,0.037164,977.70,Long term (current) use of insulin or oral hyp...,Rx,Both
phecode_977-71,60936,472,0.007746,977.71,Long term (current) use of insulin,Rx,Both
phecode_977-72,60207,2148,0.035677,977.72,Long term (current) use of oral hypoglycemic d...,Rx,Both


In [5]:
# Presumably the identifier (non-endpoint) columns of the exported prediction frames.
# NOTE(review): not referenced by any cell visible in this notebook — confirm whether still needed.
id_vars = ["eid", "model", "partition", "split"]

In [6]:
# Directory under the experiment folder that receives the exported prediction files.
out_path = f"{experiment_path}/loghs"
# Create it up front (including missing parents); a no-op when it already exists.
pathlib.Path(out_path).mkdir(parents=True, exist_ok=True)

In [7]:
# Baseline table; only its "eid" column is used further down, as the left side of the predictions merge.
data_baseline = pd.read_feather(f"{base_path}/data/data_baseline_230321.feather")

In [8]:
# Outcomes table; later cells filter it on "event" and count rows per "endpoint".
data_outcomes_long = pd.read_feather(f"{base_path}/data/data_outcomes_long_230320.feather")

In [9]:
# Peek at ten rows with an observed event. The seed is fixed so the preview is
# identical on every Restart & Run All instead of changing with each execution.
data_outcomes_long.query("event==1").sample(10, random_state=0)

Unnamed: 0,eid,endpoint,prev,event,time
2968207,RID03859,phecode_416,0.0,1.0,0.887064
428191,RID00572,phecode_101-4,0.0,1.0,7.367899
4855529,RID06762,phecode_670,0.0,1.0,0.251882
2987754,RID01612,phecode_416-4,0.0,1.0,10.053388
1610483,RID01457,phecode_244,0.0,1.0,6.116359
2673911,RID06793,phecode_388,0.0,1.0,7.78371
4264777,RID03018,phecode_582,0.0,1.0,11.734428
1621491,RID05304,phecode_247,0.0,1.0,3.405886
3482404,RID03435,phecode_474,0.0,1.0,2.043806
918670,RID05546,phecode_130,0.0,1.0,5.049966


In [10]:
# Hand-picked endpoints of interest; used below to look up their event counts.
endpoint_selection = [
    # generally very important
    "phecode_202", # Diabetes mellitus
    "phecode_401",	# Hypertension
    "phecode_404", # Ischemic heart disease
    "phecode_404-1", # Myocardial infarction [Heart attack]
    "phecode_431-11", # Cerebral infarction [Ischemic stroke]
    "phecode_424", # Heart failure

    
    "phecode_059-1", # COVID-19
    "phecode_468", # Pneumonia
    "phecode_474", # Chronic obstructive pulmonary disease [COPD]
      
    "phecode_286-2", # Major depressive disorder
    "phecode_324-11", # Parkinson's disease
    "phecode_328", # Dementias and cerebral degeneration

    
    "phecode_164", # Anemia
    "phecode_726-1", # Osteoporosis
    "phecode_371", # Cataract
    "phecode_374-42", # Diabetic retinopathy
    "phecode_374-5", # Macular degeneration
    "phecode_375-1", # Glaucoma
    
    
    "phecode_103", # Malignant neoplasm of the skin
    "phecode_101", # Malignant neoplasm of the digestive organs
    "phecode_102", # Lung cancer
    
    "phecode_583", # Chronic kidney disease
    "phecode_542", # Chronic liver disease and sequelae
    "OMOP_4306655"] # All-Cause Death

In [11]:
# Count rows with event==1 per endpoint ("n_epic") and attach the endpoint metadata.
# Left merge: endpoints without a metadata entry are kept, with NaN metadata columns.
endpoint_frequencies = (
    data_outcomes_long
    .query("event==1")
    .value_counts("endpoint")
    .to_frame("n_epic")
    .merge(endpoints_md, how="left", left_index=True, right_index=True)
)

In [12]:
# Event counts for the hand-picked endpoints. Endpoints from the selection that have
# no event==1 row are absent here, because the table is built from event rows only.
endpoint_frequencies.query("endpoint==@endpoint_selection")

Unnamed: 0_level_0,n_epic,eligable,n,freq,phecode,phecode_string,phecode_category,sex
endpoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
phecode_401,2637,48533.0,11114.0,0.228999,401.0,Hypertension,Cardio,Both
phecode_371,1209,59805.0,6798.0,0.113669,371.0,Cataract,Eye,Both
phecode_404,969,58791.0,4055.0,0.068973,404.0,Ischemic heart disease,Cardio,Both
phecode_103,651,59332.0,3620.0,0.061013,103.0,Malignant neoplasm of the skin,Neoplasms,Both
phecode_164,648,57646.0,4945.0,0.085782,164.0,Anemia,Blood,Both
phecode_202,620,57936.0,4259.0,0.073512,202.0,Diabetes mellitus,Endo,Both
phecode_468,534,59697.0,3199.0,0.053587,468.0,Pneumonia,Resp,Both
phecode_583,485,59363.0,3264.0,0.054984,583.0,Chronic kidney disease,Genitourinary,Both
phecode_424,430,60846.0,1996.0,0.032804,424.0,Heart failure,Cardio,Both
phecode_286-2,329,52388.0,4781.0,0.091261,286.2,Major depressive disorder,Mental,Both


In [13]:
# Persist the endpoints with more than 20 events that also have metadata.
# "phecode==phecode" is False for NaN, so it drops rows whose metadata is missing.
(
    endpoint_frequencies
    .query("n_epic>20&phecode==phecode")
    .sort_index()
    .reset_index()
    .to_feather(f"{base_path}/data/endpoints_epic_md.feather")
)

In [14]:
# NOTE: rebinds `endpoints_md` to the contents of endpoints_epic_md.feather; every
# later use of `endpoints_md.index` refers to this table, not the one loaded first.
endpoints_md = (
    pd.read_feather(f"{base_path}/data/endpoints_epic_md.feather")
    .set_index("endpoint")
)
endpoints_md

Unnamed: 0_level_0,n_epic,eligable,n,freq,phecode,phecode_string,phecode_category,sex
endpoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
OMOP_4306655,111,61256.0,3548.0,0.057921,4306655.00,All-Cause Death,Death,Both
phecode_052,31,56105.0,3872.0,0.069013,52.00,Herpesvirus,ID,Both
phecode_052-3,28,57980.0,3342.0,0.057641,52.30,Varicella zoster virus,ID,Both
phecode_052-32,27,58885.0,3301.0,0.056058,52.32,Herpes zoster,ID,Both
phecode_061,40,59986.0,362.0,0.006035,61.00,Influenza virus,ID,Both
...,...,...,...,...,...,...,...,...
phecode_815,279,61028.0,560.0,0.009176,815.00,Symptoms and signs concerning food and fluid i...,Signs/Symptoms,Both
phecode_819,209,54957.0,10652.0,0.193824,819.00,General symptoms and other findings,Signs/Symptoms,Both
phecode_829,129,58248.0,4321.0,0.074183,829.00,Nonspecific findings on examination of blood,Signs/Symptoms,Both
phecode_848,85,58856.0,2765.0,0.046979,848.00,Nonspecific abnormal findings of other body st...,Signs/Symptoms,Both


In [15]:
# Image table; "distfilename" is renamed to "img_name", the key the predictions are merged on.
data_images = (
    pd.read_feather(f"{base_path}/data/data_images_230321.feather")
    .rename(columns={"distfilename": "img_name"})
)
data_images

Unnamed: 0,eid,fundusid,img_name,side_ml,ml_glaucoma_gradability_grad,ml_glaucoma_gradability_wdiff,quality
0,RID00001,epiceye00650,0AIULA8E18C97Z2K_epiceye00650.png,Right,0.760914,0.226360,
1,RID00001,epiceye00650,0AIULA8E18C983T5_epiceye00650.png,Left,0.955591,0.043725,
2,RID00002,epiceye04756,0AIULA8E2ENQ92RP_epiceye04756.png,Left,0.986005,0.013981,
3,RID00002,epiceye04756,0AIULA8E2ENQ8YKN_epiceye04756.png,Right,0.943931,0.054606,
4,RID00003,epiceye01707,0AIULA8E149C9D03_epiceye01707.png,Right,0.981540,0.017813,
...,...,...,...,...,...,...,...
16037,RID07409,epiceye01738,0AIULA8E1ADD86BV_epiceye01738.png,Right,0.926931,0.072251,
16038,RID07410,epiceye05558,0AIULA8E2F62N2AI_epiceye05558.png,Right,0.054378,0.028145,Poor
16039,RID07410,epiceye05558,0AIULA8E2F62N5UZ_epiceye05558.png,Left,0.861651,0.133713,
16040,RID07411,epiceye04182,0AIULA8E1KAO83TM_epiceye04182.png,Right,0.988673,0.011106,


In [16]:
# Wide predictions table (one column per endpoint), ordered by image name.
predictions = (
    pd.read_feather(f"{base_path}/data/predictionstta_wide_230629.feather")
    .sort_values("img_name")
)
predictions

Unnamed: 0,partition,img_name,iteration,OMOP_4306655,phecode_002,phecode_002-1,phecode_003,phecode_004,phecode_005,phecode_007,...,phecode_977,phecode_977-4,phecode_977-41,phecode_977-5,phecode_977-51,phecode_977-52,phecode_977-7,phecode_977-71,phecode_977-72,phecode_979
6392333,7,0AIULA8E0NV51BDR_epiceye07238.png,66,1.554688,2.957031,2.583984,2.931641,2.328125,0.645508,1.393555,...,1.196289,2.210938,2.220703,-0.078918,-1.443359,-0.283203,1.039062,2.667969,0.944336,1.952148
1568507,5,0AIULA8E0NV51BDR_epiceye07238.png,16,0.915039,1.794922,1.447266,1.781250,1.344727,-0.233032,0.519531,...,0.554688,1.335938,1.331055,-0.259521,-0.875488,-0.669434,0.375488,1.500000,0.355957,1.035156
8107115,5,0AIULA8E0NV51BDR_epiceye07238.png,84,1.030273,1.909180,1.553711,1.883789,1.402344,-0.187012,0.540527,...,0.594238,1.366211,1.365234,-0.373535,-1.064453,-0.759766,0.429688,1.590820,0.404785,1.159180
1600559,9,0AIULA8E0NV51BDR_epiceye07238.png,16,1.244141,2.349609,2.072266,2.154297,1.725586,0.214233,1.257812,...,0.868652,1.641602,1.669922,-0.869629,-1.704102,-1.022461,0.787598,1.941406,0.720215,1.236328
6985295,9,0AIULA8E0NV51BDR_epiceye07238.png,72,0.955566,2.080078,1.827148,1.899414,1.509766,0.062012,1.055664,...,0.691895,1.503906,1.533203,-0.617676,-1.292969,-0.843750,0.568359,1.594727,0.511230,1.000977
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6665253,5,0AIULA8E33581ADU_epiceye03802.png,69,-0.300049,0.940918,0.687988,1.137695,0.816895,-0.323242,0.369385,...,0.222534,1.065430,1.038086,1.520508,1.191406,0.985840,-0.340820,0.859375,-0.372314,0.299561
511269,5,0AIULA8E33581ADU_epiceye03802.png,5,-0.285889,0.945312,0.691406,1.148438,0.826172,-0.307129,0.375488,...,0.227417,1.067383,1.041992,1.552734,1.205078,1.031250,-0.340088,0.865723,-0.371826,0.303955
3027351,7,0AIULA8E33581ADU_epiceye03802.png,31,-0.220825,1.366211,1.075195,1.848633,0.960449,-0.110229,0.323975,...,0.460205,1.736328,1.722656,2.533203,1.955078,1.900391,-0.382324,1.064453,-0.489258,0.611816
7210137,20,0AIULA8E33581ADU_epiceye03802.png,74,-0.440918,1.119141,0.851074,1.694336,1.102539,0.099609,0.233032,...,0.432373,1.380859,1.376953,2.810547,2.210938,2.236328,-0.482178,0.975586,-0.618164,0.679199


In [17]:
# Collapse to one row per image: mean over all rows sharing an img_name,
# restricted to the endpoint columns listed in endpoints_md.
predictions_agg = (
    predictions
    .groupby("img_name")[endpoints_md.index]
    .mean()
    .sort_index()
    .reset_index()
)
predictions_agg

Unnamed: 0,img_name,OMOP_4306655,phecode_052,phecode_052-3,phecode_052-32,phecode_061,phecode_070,phecode_089,phecode_089-1,phecode_089-2,...,phecode_805,phecode_807,phecode_808,phecode_809,phecode_812,phecode_815,phecode_819,phecode_829,phecode_848,phecode_979
0,0AIULA8E0NV51BDR_epiceye07238.png,1.152122,0.619561,0.494284,0.416791,1.281541,0.455238,1.205841,0.887489,0.030358,...,1.557669,0.469634,0.642598,-0.099779,0.717516,1.750422,0.359955,1.153481,0.611892,1.358125
1,0AIULA8E0NV8EL29_epiceye03059.png,0.978151,0.630411,0.494341,0.413438,1.198184,0.421193,1.169047,0.735534,0.019482,...,1.457587,0.466171,0.531000,-0.098991,0.611377,1.692357,0.298177,1.043476,0.482985,1.259256
2,0AIULA8E0NV8ELNJ_epiceye03059.png,1.695091,0.705417,0.582027,0.489227,1.810161,0.782575,1.359416,1.337329,0.147735,...,1.942773,0.727365,0.970470,0.016611,1.153981,2.156032,0.691570,1.390539,0.996386,2.082702
3,0AIULA8E0OXLWXKO_epiceye01658.png,-0.635878,0.557171,0.301613,0.199206,0.866935,0.458823,1.051277,0.292479,0.060907,...,0.953971,0.415734,-0.090266,-0.242658,-0.193588,1.237041,-0.247636,0.123448,-0.265053,0.538169
4,0AIULA8E0SB0WTRT_epiceye04256.png,0.876212,0.640482,0.502252,0.423725,1.087242,0.392654,1.158087,0.680051,0.031741,...,1.397672,0.410283,0.490581,-0.152849,0.524115,1.551693,0.227239,0.972378,0.472172,1.040830
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16021,0AIULA8E3357Z7FI_epiceye04112.png,-0.581431,0.339943,0.162863,0.040903,0.440448,-0.229816,0.948834,0.073249,-0.107568,...,0.738342,0.041241,-0.083775,-0.392848,-0.469761,1.242431,-0.357722,0.493159,-0.351861,0.355384
16022,0AIULA8E33580BP5_epiceye03694.png,-0.022110,0.614696,0.426838,0.337240,0.826384,0.326444,1.069722,0.353818,0.025674,...,1.046188,0.325215,0.133474,-0.221606,0.054867,1.296820,-0.099022,0.512135,0.026296,0.644033
16023,0AIULA8E33580HKJ_epiceye03694.png,0.074906,0.493467,0.323401,0.233041,0.811479,0.170157,1.064986,0.375989,-0.025704,...,1.070255,0.251681,0.156783,-0.240426,0.041022,1.397931,-0.092470,0.661570,0.000924,0.751828
16024,0AIULA8E335815YK_epiceye03802.png,-0.267852,0.563629,0.367545,0.269997,0.714584,0.215135,1.017214,0.234751,-0.009486,...,0.915639,0.271122,0.037472,-0.277108,-0.130851,1.167481,-0.191160,0.458424,-0.119169,0.516942


In [18]:
# eid -> images -> per-image predictions. Both joins are left joins, so an eid
# without images (or an image without predictions) is kept with NaN values.
data_predictions_all = (
    data_baseline[["eid"]]
    .merge(data_images, how="left", on="eid")
    .merge(predictions_agg, how="left", on="img_name")
    .reset_index(drop=True)
)

In [19]:
# Preview the merged eid / image / prediction table.
data_predictions_all

Unnamed: 0,eid,fundusid,img_name,side_ml,ml_glaucoma_gradability_grad,ml_glaucoma_gradability_wdiff,quality,OMOP_4306655,phecode_052,phecode_052-3,...,phecode_805,phecode_807,phecode_808,phecode_809,phecode_812,phecode_815,phecode_819,phecode_829,phecode_848,phecode_979
0,RID00001,epiceye00650,0AIULA8E18C97Z2K_epiceye00650.png,Right,0.760914,0.226360,,1.161929,0.842503,0.724875,...,1.774966,0.778805,0.782289,-0.018927,0.925395,1.693075,0.450632,0.789768,0.741762,1.498957
1,RID00001,epiceye00650,0AIULA8E18C983T5_epiceye00650.png,Left,0.955591,0.043725,,2.085479,0.921009,0.774051,...,2.478905,1.208280,1.343831,0.266498,1.739141,2.661221,1.055053,1.431580,1.433085,3.125399
2,RID00002,epiceye04756,0AIULA8E2ENQ92RP_epiceye04756.png,Left,0.986005,0.013981,,0.916981,0.722393,0.572976,...,1.496328,0.539610,0.495119,-0.076082,0.647641,1.645129,0.285480,0.848866,0.520053,1.284900
3,RID00002,epiceye04756,0AIULA8E2ENQ8YKN_epiceye04756.png,Right,0.943931,0.054606,,0.681868,0.781068,0.629728,...,1.425725,0.575981,0.459583,-0.096552,0.539627,1.414370,0.195589,0.566625,0.457895,1.039563
4,RID00003,epiceye01707,0AIULA8E149C9D03_epiceye01707.png,Right,0.981540,0.017813,,0.932515,0.781802,0.634375,...,1.558000,0.612603,0.587644,-0.086130,0.696476,1.618223,0.304534,0.733557,0.599397,1.240693
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15706,RID07409,epiceye01738,0AIULA8E1ADD86BV_epiceye01738.png,Right,0.926931,0.072251,,1.382362,0.737661,0.599050,...,1.738607,0.691283,0.771512,-0.002394,0.997187,1.864155,0.513594,1.096260,0.847049,1.693337
15707,RID07410,epiceye05558,0AIULA8E2F62N2AI_epiceye05558.png,Right,0.054378,0.028145,Poor,1.470806,0.761277,0.630946,...,1.896834,0.771846,0.840374,0.073863,1.153735,2.127007,0.620154,1.106350,0.854534,2.268866
15708,RID07410,epiceye05558,0AIULA8E2F62N5UZ_epiceye05558.png,Left,0.861651,0.133713,,1.658586,0.779961,0.641285,...,2.019072,0.869569,0.979780,0.095714,1.268140,2.233407,0.708515,1.245212,1.030786,2.353768
15709,RID07411,epiceye04182,0AIULA8E1KAO83TM_epiceye04182.png,Right,0.988673,0.011106,,0.370056,0.778698,0.604673,...,1.256229,0.505826,0.290212,-0.134613,0.355900,1.329143,0.081698,0.383881,0.322182,0.713418


In [20]:
# Drop rows whose quality is 'Poor', then average the remaining rows per eid.
data_predictions_qc = (
    data_predictions_all
    .query("quality!='Poor'")
    .groupby("eid")[endpoints_md.index]
    .mean()
)

In [21]:
# Split assignment table; merged with the predictions on "eid" and later filtered by "partition" and "split".
splits = pd.read_feather(f"{base_path}/data/splits_230321.feather")

In [22]:
# Attach the per-eid predictions to every split row. Left join: an eid without
# predictions is kept with NaN values.
predictions_clean = splits.merge(
    data_predictions_qc,
    how="left",
    left_on="eid",
    right_index=True,
)

In [23]:
# Preview the table that is exported below.
predictions_clean

Unnamed: 0,partition,split,eid,OMOP_4306655,phecode_052,phecode_052-3,phecode_052-32,phecode_061,phecode_070,phecode_089,...,phecode_805,phecode_807,phecode_808,phecode_809,phecode_812,phecode_815,phecode_819,phecode_829,phecode_848,phecode_979
0,0,test,RID00011,0.920107,0.609725,0.473195,0.392524,1.151024,0.381939,1.164801,...,1.434371,0.424233,0.518362,-0.126159,0.559940,1.651957,0.248817,1.038141,0.461589,1.222668
1,0,test,RID00013,0.166866,0.498391,0.332256,0.243513,0.830356,0.153377,1.068734,...,1.098557,0.270998,0.178845,-0.228336,0.077102,1.416418,-0.053360,0.718035,0.029681,0.777224
2,0,test,RID00019,1.180717,0.615814,0.501079,0.416221,1.259379,0.412984,1.201780,...,1.558032,0.469675,0.647966,-0.105366,0.716693,1.776197,0.369375,1.181938,0.595590,1.407502
3,0,test,RID00028,1.663356,0.733618,0.612397,0.523316,1.881624,0.878097,1.376335,...,1.953146,0.772012,0.954678,0.040303,1.182387,2.155947,0.672043,1.298690,0.998859,2.134889
4,0,test,RID00061,0.821737,0.736347,0.595336,0.530758,1.179804,0.567481,1.156265,...,1.464097,0.528390,0.564782,-0.161170,0.549222,1.447542,0.237777,0.737768,0.517570,0.981726
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72475,9,train,RID07407,0.884741,0.663421,0.524111,0.445316,1.221537,0.524451,1.170540,...,1.501533,0.498730,0.571464,-0.144724,0.590110,1.611845,0.255397,0.913965,0.531675,1.208119
72476,9,train,RID07408,-0.088974,0.645230,0.422308,0.327551,1.044865,0.589903,1.112473,...,1.170047,0.518900,0.085566,-0.128972,0.146478,1.308545,-0.041111,0.318563,-0.024351,0.815033
72477,9,train,RID07409,1.308521,0.740145,0.595622,0.518504,1.601039,0.767965,1.264668,...,1.688726,0.670103,0.724852,-0.003597,0.947376,1.834036,0.475000,1.062920,0.788036,1.635073
72478,9,train,RID07410,1.658586,0.779961,0.641285,0.554511,2.031056,1.012919,1.413065,...,2.019072,0.869569,0.979780,0.095714,1.268140,2.233407,0.708515,1.245212,1.030786,2.353768


In [24]:
model = "RetinaUKB"

# Write one feather file per (partition, split):
#   {out_path}/{model}/{partition}/{split}.feather
# NOTE(review): iterates partitions 0-9 only, while the config cell defines
# `partitions` with 22 entries — confirm that 10 is the intended count.
for partition in range(10):
    partition_rows = predictions_clean.query("partition==@partition")

    # The target directory depends only on the partition, so create it once
    # per partition instead of once per split.
    fp_out = f"{out_path}/{model}/{partition}"
    pathlib.Path(fp_out).mkdir(parents=True, exist_ok=True)

    for split in ["train", "test"]:
        # Feather needs a default index, hence the reset after filtering.
        split_rows = partition_rows.query("split==@split").reset_index(drop=True)
        split_fp = f"{fp_out}/{split}.feather"
        split_rows.to_feather(split_fp)
        print(split_fp)

/home/jakobs/data/230629/loghs/RetinaUKB/0/train.feather
/home/jakobs/data/230629/loghs/RetinaUKB/0/test.feather
/home/jakobs/data/230629/loghs/RetinaUKB/1/train.feather
/home/jakobs/data/230629/loghs/RetinaUKB/1/test.feather
/home/jakobs/data/230629/loghs/RetinaUKB/2/train.feather
/home/jakobs/data/230629/loghs/RetinaUKB/2/test.feather
/home/jakobs/data/230629/loghs/RetinaUKB/3/train.feather
/home/jakobs/data/230629/loghs/RetinaUKB/3/test.feather
/home/jakobs/data/230629/loghs/RetinaUKB/4/train.feather
/home/jakobs/data/230629/loghs/RetinaUKB/4/test.feather
/home/jakobs/data/230629/loghs/RetinaUKB/5/train.feather
/home/jakobs/data/230629/loghs/RetinaUKB/5/test.feather
/home/jakobs/data/230629/loghs/RetinaUKB/6/train.feather
/home/jakobs/data/230629/loghs/RetinaUKB/6/test.feather
/home/jakobs/data/230629/loghs/RetinaUKB/7/train.feather
/home/jakobs/data/230629/loghs/RetinaUKB/7/test.feather
/home/jakobs/data/230629/loghs/RetinaUKB/8/train.feather
/home/jakobs/data/230629/loghs/RetinaUK

In [134]:
# NOTE(review): scratch cell executed out of order. Its recorded output
# ('.../230321/loghs') does not match experiment = '230629' set in the config
# cell, so it is stale — re-run on a fresh kernel or delete this cell.
out_path

'/home/jakobs/data/230321/loghs'

In [20]:
# NOTE(review): leftover kernel sanity check; not part of the analysis and safe to delete.
1+1

2