In [1]:
import numpy as np
import pandas as pd
import SimpleITK as sitk
from tqdm import tqdm
import scipy.ndimage
import cv2
import os

import sys

# Get the home directory
home_directory = os.path.expanduser('~')

# Append the parent directory to the Python path
sys.path.append(os.path.join(home_directory, 'VSCode', 'UCAN-PET-CT-image-data-handling-pipeline'))

from Utils import utils

def display_full(x):
    with pd.option_context("display.max_rows", None,
                           "display.max_columns", None,
                           "display.width", 20000,
                           "display.max_colwidth", None,
                           ):
        print(x)

collages_path = "/media/andres/T7 Shield1/UCAN_project/collages/"
df_of_reshaped_projections = pd.read_excel("/media/andres/T7 Shield1/UCAN_project/df_of_reshaped_projections.xlsx")
df_of_reshaped_projections.head()

df_of_reshaped_projections["scan_date"] = df_of_reshaped_projections["scan_date"].astype(str)
df_of_reshaped_projections["unique_pat_ID_scan_date"] = df_of_reshaped_projections["patient_ID"] + "_" + df_of_reshaped_projections["scan_date"]
unique_patient = np.unique(df_of_reshaped_projections["unique_pat_ID_scan_date"])

for scan_date in tqdm(unique_patient):
    temp = df_of_reshaped_projections[df_of_reshaped_projections["unique_pat_ID_scan_date"] == scan_date]
    save_path = os.path.join(collages_path, str(temp["patient_ID"].iloc[0]), str(temp["scan_date"].iloc[0]))

    if not os.path.exists(save_path):
        os.makedirs(save_path)

    SUV_MIP_list = []
    SUV_bone_list = []
    SUV_lean_list = []
    SUV_adipose_list = []
    SUV_air_list = []

    CT_MIP_list = []
    CT_bone_list = []
    CT_lean_list = []
    CT_adipose_list = []
    CT_air_list = []

    for idx, row in temp.iterrows():
        SUV_MIP_list.append(np.load(row["SUV_MIP"]))
        SUV_bone_list.append(np.load(row["SUV_bone"]))
        SUV_lean_list.append(np.load(row["SUV_lean"]))
        SUV_adipose_list.append(np.load(row["SUV_adipose"]))
        SUV_air_list.append(np.load(row["SUV_air"]))
        
        CT_MIP_list.append(np.load(row["CT_MIP"]))
        CT_bone_list.append(np.load(row["CT_bone"]))
        CT_lean_list.append(np.load(row["CT_lean"]))
        CT_adipose_list.append(np.load(row["CT_adipose"]))
        CT_air_list.append(np.load(row["CT_air"]))
    
    SUV_MIP = np.concatenate((SUV_MIP_list[0], SUV_MIP_list[1]), axis=1)
    np.save(os.path.join(save_path, "SUV_MIP.npy"), SUV_MIP)

    SUV_bone = np.concatenate((SUV_bone_list[0], SUV_bone_list[1]), axis=1)
    np.save(os.path.join(save_path, "SUV_bone.npy"), SUV_bone)

    SUV_lean = np.concatenate((SUV_lean_list[0], SUV_lean_list[1]), axis=1)
    np.save(os.path.join(save_path, "SUV_lean.npy"), SUV_lean)

    SUV_adipose = np.concatenate((SUV_adipose_list[0], SUV_adipose_list[1]), axis=1)
    np.save(os.path.join(save_path, "SUV_adipose.npy"), SUV_adipose)

    SUV_air = np.concatenate((SUV_air_list[0], SUV_air_list[1]), axis=1)
    np.save(os.path.join(save_path, "SUV_air.npy"), SUV_air)

    CT_MIP = np.concatenate((CT_MIP_list[0], CT_MIP_list[1]), axis=1)
    np.save(os.path.join(save_path, "CT_MIP.npy"), CT_MIP)

    CT_bone = np.concatenate((CT_bone_list[0], CT_bone_list[1]), axis=1)
    np.save(os.path.join(save_path, "CT_bone.npy"), CT_bone)

    CT_lean = np.concatenate((CT_lean_list[0], CT_lean_list[1]), axis=1)
    np.save(os.path.join(save_path, "CT_lean.npy"), CT_lean)

    CT_adipose = np.concatenate((CT_adipose_list[0], CT_adipose_list[1]), axis=1)
    np.save(os.path.join(save_path, "CT_adipose.npy"), CT_adipose)

    CT_air = np.concatenate((CT_air_list[0], CT_air_list[1]), axis=1)
    np.save(os.path.join(save_path, "CT_air.npy"), CT_air)

100%|██████████| 1823/1823 [06:13<00:00,  4.88it/s]


In [64]:
df_of_collages = pd.DataFrame(columns=["patient_ID", "scan_date", "SUV_MIP", "SUV_bone", "SUV_lean", "SUV_adipose", "SUV_air"])
for patient_ID in tqdm(sorted(os.listdir(collages_path))):
    for scan_date in sorted(os.listdir(os.path.join(collages_path, patient_ID))):
        for angle in ["-90.0", "0.0"]:
            SUV_MIP_path = os.path.join(collages_path, patient_ID, scan_date, "SUV_MIP" + ".npy")
            SUV_bone_path = os.path.join(collages_path, patient_ID, scan_date, "SUV_bone" + ".npy")
            SUV_lean_path = os.path.join(collages_path, patient_ID, scan_date, "SUV_lean" + ".npy")
            SUV_adipose_path = os.path.join(collages_path, patient_ID, scan_date, "SUV_adipose" + ".npy")
            SUV_air_path = os.path.join(collages_path, patient_ID, scan_date, "SUV_air" + ".npy")
            df_temp = pd.DataFrame({"patient_ID": [patient_ID], "scan_date": [scan_date], "SUV_MIP": [SUV_MIP_path], "SUV_bone": [SUV_bone_path], "SUV_lean": [SUV_lean_path], "SUV_adipose": [SUV_adipose_path], "SUV_air": [SUV_air_path]})
            df_of_collages = pd.concat([df_of_collages, df_temp], ignore_index=True)

df_of_collages["CT_MIP"] = df_of_collages["SUV_MIP"]
df_of_collages["CT_bone"] = df_of_collages["SUV_bone"]
df_of_collages["CT_lean"] = df_of_collages["SUV_lean"]
df_of_collages["CT_adipose"] = df_of_collages["SUV_adipose"]
df_of_collages["CT_air"] = df_of_collages["SUV_air"]

df_of_collages["CT_MIP"] = df_of_collages["CT_MIP"].str.replace("SUV_MIP", "CT_MIP")
df_of_collages["CT_bone"] = df_of_collages["CT_bone"].str.replace("SUV_bone", "CT_bone")
df_of_collages["CT_lean"] = df_of_collages["CT_lean"].str.replace("SUV_lean", "CT_lean")
df_of_collages["CT_adipose"] = df_of_collages["CT_adipose"].str.replace("SUV_adipose", "CT_adipose")
df_of_collages["CT_air"] = df_of_collages["CT_air"].str.replace("SUV_air", "CT_air")


df_of_collages = df_of_collages[["patient_ID", "scan_date", "SUV_MIP", "CT_MIP", "SUV_bone", "CT_bone", "SUV_lean", "CT_lean", "SUV_adipose", "CT_adipose", "SUV_air", "CT_air"]]
df_of_collages = df_of_collages.drop_duplicates()
df_of_collages.to_excel("/media/andres/T7 Shield1/UCAN_project/df_of_collages.xlsx", index=False)

  0%|          | 0/1076 [00:00<?, ?it/s]

100%|██████████| 1076/1076 [00:02<00:00, 523.67it/s]


In [124]:
# The following part of the code it is used to find arrays that might have NaN values
df_of_collages = pd.read_excel("/media/andres/T7 Shield1/UCAN_project/df_of_collages.xlsx")
temp = df_of_collages.copy()
temp['incorrect_projection'] = 'False'

for index, row in temp.iterrows():
    arr = np.load(row["SUV_MIP"])
    if np.isnan(arr).any():
        temp.at[index, 'incorrect_projection'] = 'True'
        print(row["SUV_MIP"])
        continue

    arr = np.load(row["SUV_bone"])
    if np.isnan(arr).any():
        temp.at[index, 'incorrect_projection'] = 'True'
        print(row["SUV_bone"])
        continue
        
    arr = np.load(row["SUV_lean"])
    if np.isnan(arr).any():
        temp.at[index, 'incorrect_projection'] = 'True'
        print(row["SUV_lean"])
        continue
        
    arr = np.load(row["SUV_adipose"])
    if np.isnan(arr).any():
        temp.at[index, 'incorrect_projection'] = 'True'
        print(row["SUV_adipose"])
        continue
        
    arr = np.load(row["SUV_air"])
    if np.isnan(arr).any():
        temp.at[index, 'incorrect_projection'] = 'True'
        print(row["SUV_air"])
        continue
        
    arr = np.load(row["CT_MIP"])
    if np.isnan(arr).any():
        temp.at[index, 'incorrect_projection'] = 'True'
        print(row["CT_MIP"])
        continue
        
    arr = np.load(row["CT_bone"])
    if np.isnan(arr).any():
        temp.at[index, 'incorrect_projection'] = 'True'
        print(row["CT_bone"])
        continue
        
    arr = np.load(row["CT_lean"])
    if np.isnan(arr).any():
        temp.at[index, 'incorrect_projection'] = 'True'
        print(row["CT_lean"])
        continue
        
    arr = np.load(row["CT_adipose"])
    if np.isnan(arr).any():
        temp.at[index, 'incorrect_projection'] = 'True'
        print(row["CT_adipose"])
        continue
        
    arr = np.load(row["CT_air"])
    if np.isnan(arr).any():
        temp.at[index, 'incorrect_projection'] = 'True'
        print(row["CT_air"])
        continue

/media/andres/T7 Shield1/UCAN_project/collages/npr207978513481/20200310/SUV_bone.npy
/media/andres/T7 Shield1/UCAN_project/collages/npr209568543213/20180703/SUV_MIP.npy
/media/andres/T7 Shield1/UCAN_project/collages/npr272766092791/20201218/SUV_MIP.npy
/media/andres/T7 Shield1/UCAN_project/collages/npr313844671745/20160627/SUV_bone.npy
/media/andres/T7 Shield1/UCAN_project/collages/npr844745391915/20180316/SUV_bone.npy
/media/andres/T7 Shield1/UCAN_project/collages/npr895431533610/20170522/SUV_MIP.npy
/media/andres/T7 Shield1/UCAN_project/collages/npr925249119000/20181106/SUV_MIP.npy
/media/andres/T7 Shield1/UCAN_project/collages/npr988864921432/20210721/SUV_bone.npy
/media/andres/T7 Shield1/UCAN_project/collages/npr988864921432/20220125/SUV_bone.npy
/media/andres/T7 Shield1/UCAN_project/collages/npr988864921432/20220428/SUV_bone.npy


In [125]:
df_with_nan_arrays = temp[temp['incorrect_projection'] == 'True']
df_with_nan_arrays = df_with_nan_arrays.drop_duplicates()
df_with_nan_arrays = df_with_nan_arrays.drop(columns=["incorrect_projection"])
df_with_nan_arrays.to_excel("/media/andres/T7 Shield1/UCAN_project/df_of_arrays_with_nan_values.xlsx", index=False)
df_of_collages_without_nan_arrays = df_of_collages[~df_of_collages.patient_ID.isin(df_with_nan_arrays.patient_ID)]
print(df_of_collages_without_nan_arrays.shape)


(1801, 12)


In [120]:
# df_of_collages_without_nan_arrays = pd.merge(df_of_collages, df_with_nan_arrays, how="outer", left_on=["patient_ID", "scan_date"], right_on=["patient_ID", "scan_date"], indicator=False,  suffixes=["_l","_r"])
# df_of_collages_without_nan_arrays = df_of_collages_without_nan_arrays.drop(columns=["incorrect_projection"])
# df_of_collages_without_nan_arrays.to_excel("/media/andres/T7 Shield1/UCAN_project/df_of_collages_without_nan_arrays.xlsx", index=False)

In [129]:
df_of_collages_without_nan_arrays = pd.read_excel("/media/andres/T7 Shield1/UCAN_project/collages_data_paths.xlsx")
metadata = pd.read_excel("/media/andres/T7 Shield1/UCAN_project/Finalized_dataset_1805_exams_with_Age.xlsx")
metadata = metadata[["npr", "scan_date", "patient_age"]]
metadata.drop_duplicates(inplace=True)
collages_dataset_with_age = pd.merge(df_of_collages_without_nan_arrays, metadata, how="inner", left_on=["patient_ID", "scan_date"], right_on=["npr", "scan_date"], suffixes=["_l","_r"])

collages_dataset_with_age = collages_dataset_with_age.drop(columns=["npr", "unique_pat_ID_scan_date"])
collages_dataset_with_age.to_excel("/media/andres/T7 Shield1/UCAN_project/dataset_for_model_training.xlsx", index=False)

In [130]:
collages_dataset_with_age.head()

Unnamed: 0,patient_ID,scan_date,SUV_MIP,CT_MIP,SUV_bone,CT_bone,SUV_lean,CT_lean,SUV_adipose,CT_adipose,SUV_air,CT_air,patient_age
0,lpr385705046400,20140313,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,33
1,lpr415675513429,20190201,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,32
2,lpr415675513429,20190320,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,33
3,lpr884752331885,20181116,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,69
4,npr100169878450,20130412,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,/media/andres/T7 Shield1/UCAN_project/collages...,68
