## File paths and train,validation split

In [1]:
import pandas as pd
import tqdm
import numpy as np

labels_file = 'labels.csv'
ukb_field_to_icd10_map_file = 'icd10_codes_mod.tsv'
output_prefix = 'ukb_real'


## Read icd10 mapping file and defined index label link

In [2]:
icdict ={}
icdcodes = []
with open(ukb_field_to_icd10_map_file,'r') as f:
    for l in f:
        lvals=l.strip().split()
        icdict[lvals[0]]=lvals[5]
        icdcodes.append(lvals[5])

i = -1
label_dict = {}
with open(labels_file,'r') as f:
    for l in f:
        label_dict[l.strip().split(' ')[0]]=i
        i += 1

# hard coded sex and dob
icdict['f.31.0.0'] = "sex"
icdict['f.34.0.0'] = "YEAR"
icdict['f.52.0.0'] = "MONTH"
icdict['f.40000.0.0'] = "Death"

# cancer fields
for j in range(17):
    icdict['f.40005.'+str(j)+'.0'] = "cancer_date_"+str(j)
    icdict['f.40006.'+str(j)+'.0'] = "cancer_type_"+str(j)

# cancer hes fields 
#for j in range(213):
#    icdict['f.41270.0.'+str(j)] = "hicd_"+str(j)
#    icdict['f.41280.0.'+str(j)] = "hicd_date_"+str(j)

icdict['f.53.0.0'] = "assessment_date"
icdict['f.21001.0.0']="BMI"
icdict['f.1239.0.0']="smoking"
icdict['f.1558.0.0']="alcohol"

len_icd = len(icdcodes)
#icdcodes.extend(['Death','assessment_date']+['cancer_date_'+str(j) for j in range(17)]+['hicd_date_'+str(j) for j in range(213)])
icdcodes.extend(['Death','assessment_date']+['cancer_date_'+str(j) for j in range(17)])


## Read ukb basket file in chunks, select icd10 code occurance and dates, format for delphi

In [13]:
import polars as pl
import os
import tqdm

path = "/lustre/groups/shared/ukbb-87065/dataset/phenos"
out_path = "/lustre/groups/shared/ukbb-87065/users/lucas.arnoldt/"

eid = pl.read_csv(os.path.join(path, "eid.csv"))
eid_col = eid.columns[0]
eid_series = eid[eid_col]

def key_to_filename(key: str):
    _, a, b, c = key.split(".")
    return f"{a}-{b}.{c}.csv"

def rename_col(col: str):
    if col == "eid":
        return col
    a, rest = col.split("-")
    b, c = rest.split(".")
    return f"f.{a}.{b}.{c}"

phenos = []

for key in tqdm.tqdm(icdict.keys()):
    filename = key_to_filename(key)
    fullpath = os.path.join(path, filename)

    df = pl.read_csv(fullpath)

    pheno_col = df.columns[0]

    df = df.rename({pheno_col: rename_col(pheno_col)})

    phenos.append(df.lazy())

big = pl.concat(phenos, how="horizontal")

result = big.with_columns([
    eid_series.alias("eid")
]).select(["eid", *big.columns])

result.sink_csv(os.path.join(out_path, "merged_phenos.csv"))

100%|██████████| 1171/1171 [01:17<00:00, 15.02it/s]
  ]).select(["eid", *big.columns])


In [27]:
dd = pd.read_csv(os.path.join(out_path, "merged_phenos.csv"))
dd = dd.rename(columns={"eid": "f.eid"})
dd = dd.set_index("f.eid")

  dd = pd.read_csv(os.path.join(out_path, "merged_phenos.csv"))


In [29]:
data_list = []

dd = dd.rename(columns=icdict)
dd.dropna(subset=['sex'], inplace=True)
dd['sex'] += 1
dd = dd[[col for col in dd.columns if not col.startswith('f.')]]
dd['dob'] =  pd.to_datetime(dd[['YEAR', 'MONTH']].assign(DAY=1))
dd[icdcodes] = dd[icdcodes].apply(pd.to_datetime, format="%Y-%m-%d")
dd[icdcodes]=dd[icdcodes].sub(dd['dob'], axis=0)
dd[icdcodes]=dd[icdcodes].apply(lambda x : x.dt.days)

for col in icdcodes[:len_icd+1]:
    X = dd[col].dropna().reset_index().to_numpy().astype(int)
    data_list.append(np.hstack((X,label_dict[col]*np.ones([X.shape[0],1],X.dtype))))

X = dd['sex'].reset_index().to_numpy().astype(int)
data_list.append(np.c_[X[:,0],np.zeros(X.shape[0]),X[:,1]].astype(int))

for j in range(17):
    dd_cancer = dd[['cancer_date_'+str(j),'cancer_type_'+str(j)]].dropna().reset_index()
    if not dd_cancer.empty:
        dd_cancer['cancer'] = dd_cancer['cancer_type_'+str(j)].str.slice(0,3)
        dd_cancer['cancer_label'] = dd_cancer["cancer"].map(label_dict)
        data_list.append(dd_cancer[['f.eid','cancer_date_'+str(j),'cancer_label']].dropna().astype(int).to_numpy())

#for j in range(213):
#    dd_hicd = dd[['hicd_date_'+str(j),'hicd_'+str(j)]].dropna().reset_index()
#    if not dd_hicd.empty:
#        dd_hicd['hicd'] = dd_hicd['hicd_'+str(j)].str.slice(0,3)
#        dd_hicd['hicd_label'] = dd_hicd["hicd"].map(label_dict)
#        data_list.append(dd_hicd[['f.eid','hicd_date_'+str(j),'hicd_label']].dropna().astype(int).to_numpy())
    
dd_bmi = dd[['assessment_date','BMI']].dropna().reset_index()
dd_bmi['bmi_status'] = np.where(dd_bmi['BMI']>28,5,np.where(dd_bmi.BMI>22,4,3))
data_list.append(dd_bmi[['f.eid','assessment_date','bmi_status']].astype(int).to_numpy())

dd_sm = dd[['assessment_date','smoking']].dropna().reset_index()
dd_sm = dd_sm[dd_sm['smoking']!=-3]
dd_sm['smoking_status'] = np.where(dd_sm['smoking']==1,8,np.where(dd_sm.smoking==2,7,6))
data_list.append(dd_sm[['f.eid','assessment_date','smoking_status']].astype(int).to_numpy())

dd_al = dd[['assessment_date','alcohol']].dropna().reset_index()
dd_al = dd_al[dd_al['alcohol']!=-3]
dd_al['alcohol_status'] = np.where(dd_al['alcohol']==1,11,np.where(dd_al.alcohol < 4,10,9))
data_list.append(dd_al[['f.eid','assessment_date','alcohol_status']].astype(int).to_numpy())

## reformat, split train and val and output to delphi format

In [53]:
os.makedirs("../ukb_real_data/", exist_ok=True)

In [81]:
data = np.vstack(data_list)

data = data[np.lexsort((data[:,1], data[:,2]==data[:,2].max(), data[:,0]))]
data = data[data[:,1] >= 0]
data = pd.DataFrame(data).drop_duplicates([0,2]).values
data = data.astype(np.uint32)

data.tofile("../ukb_real_data/" + output_prefix + '.bin')

train_ids = pd.read_csv('/lustre/groups/shared/ukbb-87065/ukbb-found/data_benedikt/splits/train_ids_seed2024.csv')['id'].values.astype(np.uint32)
val_ids   = pd.read_csv('/lustre/groups/shared/ukbb-87065/ukbb-found/data_benedikt/splits/val_ids_seed2024.csv')['id'].values.astype(np.uint32)
test_ids  = pd.read_csv('/lustre/groups/shared/ukbb-87065/ukbb-found/data_benedikt/splits/test_ids_seed2024.csv')['id'].values.astype(np.uint32)

train_mask = np.isin(data[:,0], train_ids)
val_mask   = np.isin(data[:,0], val_ids)
test_mask  = np.isin(data[:,0], test_ids)

data[train_mask].tofile("../ukb_real_data/" + 'train.bin')
data[val_mask].tofile("../ukb_real_data/" + 'val.bin')
data[test_mask].tofile("../ukb_real_data/" + 'test.bin')

In [None]:
#conda create -p /lustre/groups/ml01/workspace/lucas.arnoldt/delphi python=3.12 -y
#conda activate delphi
#cd /ictstr01/groups/ml01/workspace/lucas.arnoldt/projects/Delphi/
#pip install -r requirements.txt
#Change:
#torch.backends.cuda.matmul.fp32_precision = 'tf32'
#torch.backends.cudnn.conv.fp32_precision = 'tf32'
#to:
#torch.backends.cuda.matmul.allow_tf32 = True
#torch.backends.cudnn.allow_tf32 = True

#python train.py config/train_delphi.py --out_dir=delphi_trained --device=cuda 
#python evaluate_auc.py --input_path=./data/ukb_real_data --model_ckpt_path=./delphi_trained/ckpt.pt --no_event_token_rate 5 --output_path delphi_trained_auc --dataset_subset_size 5000 

#pd.read_parquet("/ictstr01/groups/ml01/workspace/lucas.arnoldt/projects/Delphi/delphi_trained_auc/df_auc_unpooled.parquet")
#pd.read_parquet("/ictstr01/groups/ml01/workspace/lucas.arnoldt/projects/Delphi/delphi_trained_auc/df_both.parquet")