# 06 Convert exams to HDF5 and add metadata to files

For the automated version of this notebook, see 06_to_hdf5.py.

In [2]:
import nibabel as nib
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import h5py
import glob
import pickle
from utils import get_best_mask

### Determine the patient IDs that we want to convert and save

These lists are used in 06_to_hdf5.py.

In [2]:
# Earlier ways of building the PID list, kept for reference.
#
# (a) every folder under data2022, minus the PIDs listed as not registered:
#data_path = "/data/larson2/RCC_dl/data2022/"
#all_pids = os.listdir(data_path)
#print(len(all_pids))
#notreg_pids = set(pd.read_excel("not_registered_8_18_23.xlsx", engine='openpyxl', header=None).values.T.tolist()[0]) #take out pids that arent registered
#print(len(notreg_pids))
#pids = [p for p in all_pids if p not in notreg_pids]
#
# (b) only the exams that have been registered:
#pids = pd.read_csv("./pids_registered_stacked_7_17_24.csv", header=None, names=["PID"]).values.T.tolist()[0]
#print(len(pids))

# Current source: the "pid" column of the phase registration key.
phase_key = pd.read_csv("phase_reg_key_notreg.csv")
pids = phase_key["pid"].tolist()
print(len(pids))

269


In [3]:
# Restrict the PID list to exams that can be matched to a label row.

# Anonymization key: first row per anonymized MRN, reduced to the ID columns,
# with the original MRN renamed to "mrn" so it can serve as the join key.
anon_labels = (
    pd.read_excel("/working/larson2/ssahin/rcc_dl/spreadsheets/sage_all.xlsx", engine='openpyxl')
    .drop_duplicates(subset="Anon MRN")
    .loc[:, ["Orig MRN", "Anon MRN", "Orig Acc #"]]
    .rename(columns={"Orig MRN": "mrn"})
)

# RedCap export (original note: ~988 rows with labels). Keep rows that have an
# accession, tumor type and pathology, then the first row per MRN.
df_labels = (
    pd.read_csv("/data/larson2/RCC_dl/metadata/RenalMass_RedCap_Backup_2022-06-27_0933.csv")
    .dropna(subset=["accession", "tumor_type", "pathology"])
    .drop_duplicates(subset="mrn")
)

# NOTE(review): how='outer' also keeps anonymization rows that have no RedCap
# match (their label columns are NaN) -- confirm this is intended, given the
# goal of converting only labelled exams.
df_labels = pd.merge(df_labels, anon_labels, how='outer', on='mrn', validate="1:1")

# Rows whose anonymized MRN is in the PID list, reduced to the label columns.
in_pid_list = df_labels["Anon MRN"].isin(pids)
df_labels_pids = df_labels.loc[in_pid_list, ['Anon MRN', 'tumor_type', 'pathology', 'grade']]
updated_pids = df_labels_pids["Anon MRN"].tolist()
print(len(updated_pids))

258


In [10]:
# PIDs from the registration key that did not survive the label filter.
# Membership is tested against a set so each lookup is O(1) rather than a
# scan of the whole updated_pids list.
labelled_pids = set(updated_pids)
exc = [p for p in pids if p not in labelled_pids]
# NOTE(review): the recorded output of this cell shows IDs ending in a space
# (e.g. 'YdMUtxL088 '); these may be excluded only because of that trailing
# whitespace rather than a missing label -- verify against the CSV.
print(exc)

['YdMUtxL088 ', 'uxwwIFPKGd ', 'XtP1qigqQC ', 'ERd6Ak32XT ', 'rqwAZBUHI7 ', '5WRrw1AwgJ ', 'ab4QoMEEcO ', 'PZJbKmDaJM ', 'Zaimur5bcI ', 'I06IcvD9ix ', 'vZp7c1E66e ']


In [12]:
# Save the filtered PID list and label dataframe so the .py script can load them.
# NOTE: despite its .txt extension, the PID file is a binary pickle and must be
# read back with pickle.load in 'rb' mode.

#print(updated_pids[:10])
with open('pids_reg_label_notreg_10312024.txt', 'wb') as f:
    pickle.dump(updated_pids, f)

df_labels_pids.to_pickle("./df_labels_pid_notreg_10312024.pkl")

# No explicit f.close() here: the with-block has already closed the file.

### Converting not registered cases

In [None]:
# Reload the pickled PID list and label dataframe, read the per-phase
# registration key, and define the paths and file names used for conversion.
with open('./pids_reg_label_notreg_10312024.txt', 'rb') as pid_file:
    updated_pids = pickle.load(pid_file)
df_labels_pids = pd.read_pickle("./df_labels_pid_notreg_10312024.pkl")

phase_reg = pd.read_csv("phase_reg_key_notreg.csv")

# Output directory for the HDF5 files and root directory of the per-PID NIfTI folders.
save_dir ="/data/larson2/RCC_dl/hdf5_dir/"
data_path = "/data/larson2/RCC_dl/data2022/"

# File name of each phase when it has been registered ...
phase_fname_reg = {
    "noncon": "noncon_cropped.nii.gz",
    "arterial": "arterial_reg.nii.gz",
    "delay": "delay_reg.nii.gz",
    "portven": "portven_reg.nii.gz",
}
# ... and when it has not.
phase_fname_notreg = {
    "noncon": "noncon_cropped.nii.gz",
    "arterial": "arterial.nii.gz",
    "delay": "delay.nii.gz",
    "portven": "portven.nii.gz",
}

# Quick look at what was loaded.
print(df_labels_pids.head())
print(phase_reg.head())
print(updated_pids[:5])

In [None]:
# Convert each exam to one HDF5 file: label metadata as attributes, one dataset
# per available phase (plus a "<phase>_pixdim" attribute), and a binarized mask.

#updated_pids = ["0HW0RoqB2y"] #to test
istart = 257
#iend = 260


def _store_volume(h5, name, nifti_path):
    """Write the NIfTI at `nifti_path` into `h5` as dataset `name` and record
    header pixdim[1:4] in the `<name>_pixdim` attribute."""
    volume = nib.load(nifti_path)
    h5.create_dataset(name, data=volume.get_fdata())
    h5.attrs[name + "_pixdim"] = volume.header["pixdim"][1:4]


for pid in tqdm(updated_pids[istart:]):
    print("PID:", pid)
    # get reg status (per the checks below: 0 = phase absent, 1 = present but
    # not registered, 2 = registered)
    reg_stat = phase_reg.loc[phase_reg["pid"] == pid].to_dict('records')[0]

    # Filter the label table once instead of once per attribute. Doing the
    # lookups before opening the output also means a missing row fails before
    # an existing file is truncated.
    labels = df_labels_pids.loc[df_labels_pids["Anon MRN"] == pid]
    exam_dir = os.path.join(data_path, pid)
    h5_fname = os.path.join(save_dir, pid + ".hdf5")

    # Reset per PID so a value from the previous exam can never leak through.
    masktag = None

    # The context manager guarantees the file is closed even if a load fails.
    with h5py.File(h5_fname, "w") as f:
        # add metadata
        f.attrs["PID"] = pid
        f.attrs["tumor_type"] = labels["tumor_type"].values[0]
        f.attrs["pathology"] = labels["pathology"].values[0]
        f.attrs["pathology_grade"] = labels["grade"].values[0]

        # add all phase images to hdf5
        for phase in phase_fname_reg:
            if phase == "noncon" and not os.path.exists(os.path.join(exam_dir, phase_fname_reg[phase])):
                # No cropped noncon on disk: fall back to the uncropped volume.
                print("noncon isnt cropped")
                _store_volume(f, phase, os.path.join(exam_dir, "noncon.nii.gz"))
                masktag = "not_cropped"
                # NOTE(review): this break (kept from the original) stops the
                # phase loop, so no contrast phase is stored for such exams.
                # Confirm that is intended.
                break

            status = reg_stat[phase]
            if status > 0: #if phase exists
                if status == 2: #if phase is registered
                    phase_file = phase_fname_reg[phase]
                elif status == 1: #if phase isnt registered
                    phase_file = phase_fname_notreg[phase]
                else:
                    # Previously an unexpected status silently re-stored the
                    # image loaded for the preceding phase.
                    raise ValueError("unexpected registration status " + repr(status)
                                     + " for phase " + phase + " of " + pid)
                _store_volume(f, phase, os.path.join(exam_dir, phase_file))
                masktag = "cropped"

        if masktag is None:
            raise RuntimeError("no phase image was stored for " + pid
                               + "; cannot choose a mask")

        # pick best mask and convert to hdf5
        mask_fname = get_best_mask(exam_dir, masktag, reg_stat)
        image = nib.load(os.path.join(exam_dir, mask_fname))
        mask_np = image.get_fdata()
        # threshold mask to {0, 1}
        mask_np[mask_np < 0.5] = 0
        mask_np[mask_np >= 0.5] = 1

        f.create_dataset("mask", data=mask_np)

        # add mask pixel spacing
        f.attrs["mask_pixdim"] = image.header["pixdim"][1:4]

In [9]:
# Manually add a specific mask to the HDF5 file of the current `pid`
# (whatever value `pid` holds from the previously executed cell).
mask_fname = "tumor_R_portven.nii.gz"
image = nib.load(os.path.join(data_path,pid,mask_fname))
mask_np = image.get_fdata()
# threshold mask to {0, 1}
mask_np[mask_np < 0.5] = 0
mask_np[mask_np >= 0.5] = 1

# Reopen the file by path in append mode instead of depending on a handle `f`
# that an earlier cell happened to leave open; the handle is closed on exit
# even if writing fails.
with h5py.File(os.path.join(save_dir, pid + ".hdf5"), "a") as f:
    # save image as hdf5
    f.create_dataset("mask", data=mask_np)

    # add mask pixel spacing
    f.attrs["mask_pixdim"] = image.header["pixdim"][1:4]

In [37]:
# Manually close whatever handle is currently bound to `f` (presumably an HDF5
# file left open by an interrupted conversion run -- confirm before re-running).
f.close()

### clean up labels

In [11]:
# Print the pathology attribute and the dataset keys of every HDF5 file in
# hdf5_dir, to see which label spellings and phases are present.
hdf5_dir = "/data/ssahin/RCC_DL/multiphase/hdf5_dir"
hdf5_pattern = os.path.join(hdf5_dir, "*.hdf5")
files = glob.glob(hdf5_pattern)

for f in files:
    with h5py.File(f, "r") as hdf:
        for item in (hdf.attrs["pathology"], hdf.keys()):
            print(item)

clear cell
<KeysViewHDF5 ['delay', 'mask', 'noncon', 'portven']>
clear cell
<KeysViewHDF5 ['delay', 'mask', 'noncon', 'portven']>
clear cell
<KeysViewHDF5 ['arterial', 'delay', 'mask', 'noncon']>
renal cell carcinoma, NOS
<KeysViewHDF5 ['mask', 'noncon', 'portven']>
clear cell
<KeysViewHDF5 ['delay', 'mask', 'noncon', 'portven']>
oncocytoma
<KeysViewHDF5 ['arterial', 'mask', 'noncon']>
clear cell
<KeysViewHDF5 ['delay', 'mask', 'noncon', 'portven']>
clear cell
<KeysViewHDF5 ['delay', 'mask', 'noncon']>
oncocytoma
<KeysViewHDF5 ['delay', 'mask', 'noncon', 'portven']>
papillary
<KeysViewHDF5 ['arterial', 'delay', 'mask', 'noncon', 'portven']>
arteriovenous malformation; prior hemorrhage with thrombosis
<KeysViewHDF5 ['arterial', 'mask', 'noncon', 'portven']>
renal cell carcinoma, NOS
<KeysViewHDF5 ['mask', 'noncon', 'portven']>
clear cell
<KeysViewHDF5 ['arterial', 'delay', 'mask', 'noncon']>
clear cell
<KeysViewHDF5 ['arterial', 'mask', 'noncon']>
papillary
<KeysViewHDF5 ['arterial', 'm

In [9]:
# Rewrite misspelled or variant pathology labels to one canonical spelling.
# Each entry is (spellings to replace, canonical label). Rules are applied in
# order and the attribute is re-read for every rule.
PATHOLOGY_RULES = (
    (("oncoctyoma",), "oncocytoma"),
    (
        (
            "renal cell carcinoma NOS",
            "renal cell carncinoma, NOS",
            "RCC, low grade unclassified",
        ),
        "renal cell carcinoma, NOS",
    ),
    (
        (
            "clear cell papillary",
            "RCC with clear cell and papillary feature",
            "clear cell papillary renal tumor",
            "clear cell and papillary",
        ),
        "clear cell, papillary",
    ),
    (("multiloculated cyst", "multilocular cyst"), "multilocular cystic"),
    (("clear cell, granular variant",), "clear cell"),
)

for f in files:
    # Append mode: attributes are edited in place.
    with h5py.File(f, "a") as hdf:
        for variants, canonical in PATHOLOGY_RULES:
            if hdf.attrs["pathology"] in variants:
                hdf.attrs["pathology"] = canonical


### debugging

In [1]:
# Load an older pickled PID list and show every entry from index 564 onward,
# first as a list and then one entry per line.
with open('pids_reg_label_09142023.txt', 'rb') as f:
    test = pickle.load(f)

remaining = test[564:]
print(remaining)
for entry in remaining:
    print(entry)

FileNotFoundError: [Errno 2] No such file or directory: 'pids_reg_label_09142023.txt'

In [None]:
# Keep only the PIDs that have a row in the filtered-features table (for now).
# Original note: "maybe? 652 total, 457 matched".
df_labels = pd.read_csv("/data/larson2/RCC_dl/RCC_classification/metadata/filtered_features.csv")
print(len(df_labels))
print(df_labels.head())

has_pid = df_labels["image_accession"].isin(pids)
df_labels_pids = df_labels.loc[has_pid]
print(len(df_labels_pids))
updated_pids = df_labels_pids["image_accession"].tolist()
print(len(updated_pids))

# Tumor type of the current `pid` (whatever value it holds from an earlier cell).
pid_rows = df_labels_pids.loc[df_labels_pids['image_accession'] == pid]
print(pid_rows["tumor_type"].values[0])

In [None]:
# Convert one NIfTI volume of one exam into a standalone HDF5 file stored
# next to it, holding a single dataset named "image".
data_path = "/data/larson2/RCC_dl/data2022/"
pid = "YEh6FQv9h9"
fname = 'portven_reg.nii.gz'

image = nib.load(os.path.join(data_path,pid,fname))
image_np = image.get_fdata()

# save image as hdf5; fname[:-7] drops the 7-character ".nii.gz" suffix
h5_fname = os.path.join(data_path,pid,fname[:-7] + '.hdf5')
# The context manager closes the file even if create_dataset raises.
with h5py.File(h5_fname, "w") as f:
    f.create_dataset("image", data=image_np)

In [None]:
# Show the attribute names and dataset keys of one exam's HDF5 file.
# Opened via a context manager so the handle is released even if a print fails.
with h5py.File(os.path.join(data_path,'oo2Q4ig2bL','oo2Q4ig2bL.hdf5'), 'r') as f:
    print(f.attrs.keys())
    print(f.keys())

In [None]:

# Inspect the HDF5 file of the current `pid` and display one slice of the
# portven volume and of the mask.
import matplotlib.pyplot as plt

slice_idx = 40  # slice along the last axis shown in both figures

# Everything that reads from the file stays inside the with-block: image1 and
# image2 are dataset handles and are only readable while the file is open.
with h5py.File(os.path.join(data_path,pid,pid + '.hdf5'), 'r') as f:
    print(f.attrs.keys())
    print(f.keys())
    print(f.attrs['PID'])
    print(f.attrs['pathology_grade'])
    print(f.attrs['noncon_pixdim'])
    print(f.attrs['portven_pixdim'])
    print(f['noncon'])
    print(f['portven'])
    image1 = f['portven']
    image2 = f['mask']

    fig, (ax1) = plt.subplots(figsize=(20,10), ncols=1, nrows=1)

    img1 = ax1.imshow(image1[:, :, slice_idx], cmap="Greys_r", vmin=-200, vmax=800)
    ax1.set_title('Axial Slice 1')
    plt.show()

    fig, (ax1) = plt.subplots(figsize=(20,10), ncols=1, nrows=1)

    # The conversion step thresholds the mask to 0/1, so the -200..800 window
    # used for the image would render it as a uniform grey; use 0..1 instead.
    img1 = ax1.imshow(image2[:, :, slice_idx], cmap="Greys_r", vmin=0, vmax=1)
    ax1.set_title('Axial Slice 1')
    plt.show()

In [None]:
# Display one slice of the standalone delay_reg HDF5 volume of the current `pid`.
import matplotlib.pyplot as plt

fname = 'delay_reg.nii.gz'
# fname[:-7] drops the 7-character ".nii.gz" suffix. The file was previously
# opened and never closed; the context manager releases it, and the slice is
# read while the file is still open.
with h5py.File(os.path.join(data_path,pid,fname[:-7] + '.hdf5'), 'r') as f:
    print(f['image'])
    image1 = f["image"]

    fig, (ax1) = plt.subplots(figsize=(20,10), ncols=1, nrows=1)

    img1 = ax1.imshow(image1[:, :, 40], cmap="Greys_r", vmin=-200, vmax=800)
    ax1.set_title('Axial Slice 1')
    plt.show()

In [None]:
# Load the uncropped noncon volume of one exam and read its voxel data.
noncon_path = "/data/larson2/RCC_dl/data2022/wO8XGXOZvV/noncon.nii.gz"
image = nib.load(noncon_path)
image_np = image.get_fdata()

In [None]:
# Debug setup: read the registered/stacked PID list into a one-column
# dataframe and define the paths, file names and slice bounds used below.
df_labels_pids = pd.read_csv(
    "./pids_registered_stacked_7_17_24.csv",
    header=None,
    names=["PID"],
)
print(df_labels_pids.head())

save_dir ="/data/larson2/RCC_dl/hdf5_dir/"
data_path = "/data/larson2/RCC_dl/data2022/"
phase_fname = {
    "noncon": "noncon_cropped.nii.gz",
    "arterial": "arterial_reg.nii.gz",
    "delay": "delay_reg.nii.gz",
    "portven": "portven_reg.nii.gz",
}

#updated_pids = ["ONJbX3HplS", "cOl6ZUqTXr", "oo2Q4ig2bL"] #to test
# Bounds of the PID slice to process.
istart, iend = 0, 5

for pid in tqdm(df_labels_pids.PID[istart:iend]):
    print(pid)