In [None]:
import os
import glob
import re
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split


## Load Data
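Before running, change the hard-coded dataset paths (image folders, mask folders and clinical spreadsheets) in `prepare_data_all` to match your own environment.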

In [None]:
# Code for TCIA dataset
def PV_raw_TCIA(image_path):
    """Return the sorted paths of all ``*_PV.nii.gz`` files inside ``image_path``.

    Args:
        image_path: Directory that holds the TCIA image volumes.

    Returns:
        A list of matching file paths in ascending lexicographic order
        (empty when nothing matches).
    """
    search_pattern = os.path.join(image_path, "*_PV.nii.gz")
    matches = glob.glob(search_pattern)
    matches.sort()
    return matches

def PV_mask_TCIA(mask_path):
    """Return the sorted paths of all ``*_PV.nii.gz`` files inside ``mask_path``.

    Args:
        mask_path: Directory that holds the TCIA mask volumes.

    Returns:
        A list of matching file paths in ascending lexicographic order
        (empty when nothing matches).
    """
    search_pattern = os.path.join(mask_path, "*_PV.nii.gz")
    matches = glob.glob(search_pattern)
    matches.sort()
    return matches

# Code for OP dataset
def PV_raw_OP(image_path):
    """Return the sorted paths of all ``*.nii.gz`` files inside ``image_path``.

    Args:
        image_path: Directory that holds the OP image volumes.

    Returns:
        A list of matching file paths in ascending lexicographic order
        (empty when nothing matches).
    """
    search_pattern = os.path.join(image_path, "*.nii.gz")
    matches = glob.glob(search_pattern)
    matches.sort()
    return matches

def PV_mask_OP(mask_path):
    """Return the sorted paths of all ``*.nii.gz`` files inside ``mask_path``.

    Args:
        mask_path: Directory that holds the OP mask volumes.

    Returns:
        A list of matching file paths in ascending lexicographic order
        (empty when nothing matches).
    """
    search_pattern = os.path.join(mask_path, "*.nii.gz")
    matches = glob.glob(search_pattern)
    matches.sort()
    return matches

def load_BCLC_label_OP(clinical_data_path):
    """Load OP case IDs and their integer BCLC labels from the clinical spreadsheet.

    Rows are dropped when:
      * ``BCLC`` is missing, ``'Pending'`` or ``'D'``;
      * the case identifier contains ``OP_0117``, ``OP_0277`` or ``OP_0003``;
      * the case identifier is exactly ``'OP_0093'``;
      * the case identifier is missing / not a string, or contains no
        ``OP_<digits>`` token (such a row cannot be matched to a file, and
        keeping its label would misalign ``ids`` and ``labels``).

    Args:
        clinical_data_path: Path of the Excel workbook to read.

    Returns:
        ids: list of ``'OP_<digits>'`` strings, one per retained row, in
            spreadsheet order.
        labels: ``np.ndarray`` of the same length with BCLC stages mapped as
            ``'0'``/``'A'`` -> 0, ``'B'`` -> 1, ``'C'`` -> 2. Values outside
            that mapping are passed through unchanged.
    """
    id_column = 'OP_C+P_Tumor識別碼'

    clinical_df = pd.read_excel(clinical_data_path, sheet_name='202211112RINC建模申請-上繳用')
    clinical_df = clinical_df.dropna(subset=['BCLC'])
    clinical_df = clinical_df[~clinical_df['BCLC'].isin(['Pending', 'D'])]

    # Cases filtered out of the clinical data (substring match on the identifier).
    excluded_pattern = re.compile(r'OP_0117|OP_0277|OP_0003')
    id_pattern = re.compile(r'OP_(\d+)')

    ids = []
    keep_row = []
    for raw_id in clinical_df[id_column]:
        match = None
        # Non-string identifiers (e.g. NaN from an empty cell) are skipped
        # instead of raising inside the regex / boolean negation.
        if (
            isinstance(raw_id, str)
            and raw_id != 'OP_0093'  # remove OP_0093 (exact identifier only)
            and not excluded_pattern.search(raw_id)
        ):
            match = id_pattern.search(raw_id)
        keep_row.append(match is not None)
        if match is not None:
            ids.append(match.group(0))

    # Filter the frame with the same mask so labels stay aligned with ``ids``.
    clinical_df = clinical_df[np.array(keep_row, dtype=bool)]

    mapping = {'0': 0, 'A': 0, 'B': 1, 'C': 2}
    # Plain dict lookup instead of Series.replace: same pass-through semantics
    # for unmapped values, without relying on pandas' deprecated silent
    # downcasting of the replaced object column.
    labels = np.array([mapping.get(stage, stage) for stage in clinical_df['BCLC']])

    return ids, labels

def _print_dataset_summary(title, files_name, files, images, masks, labels):
    """Print file counts and the per-label distribution for one dataset."""
    print(title)
    print(files_name + ": ", len(files))
    print("PV_images: ", len(images))
    print("PV_masks: ", len(masks))

    unique_labels, label_counts = np.unique(labels, return_counts=True)
    for label, count in zip(unique_labels, label_counts):
        print(f"Label: {label}, Count: {count}")

    total_count = np.sum(label_counts)
    print("Total count of labels:", total_count)


def prepare_data_all(
    tcia_image_path="/home/hpyu/MOHW/test/TCIA_image_PV",
    tcia_mask_path="/home/hpyu/MOHW/test/TCIA_results_phase_PV",
    tcia_clinical_path="/home/hpyu/MOHW/data/HCC-TACE-Seg_clinical_data-V2.xlsx",
    op_image_path="/home/hpyu/MOHW/data/OP_C+P_nifti",
    op_mask_path="/home/hpyu/MOHW/data/OP_C+P_nnUnet",
    op_clinical_path="/home/hpyu/MOHW/data/OP_申請建模_1121110_20231223.xlsx",
):
    """Build the combined TCIA + OP list of image/mask/label records.

    TODO: change the default paths to your own paths (or pass them explicitly).

    Args:
        tcia_image_path: Folder with TCIA ``<TCIA_ID>_PV.nii.gz`` images.
        tcia_mask_path: Folder with TCIA ``<TCIA_ID>_PV.nii.gz`` masks.
        tcia_clinical_path: TCIA clinical workbook (sheet ``'data table'``).
        op_image_path: Folder with OP ``*.nii.gz`` images.
        op_mask_path: Folder with OP ``*.nii.gz`` masks.
        op_clinical_path: OP clinical workbook read by ``load_BCLC_label_OP``.

    Returns:
        train_files: list of ``{"PVimg", "PVmask", "label"}`` dicts, TCIA cases
            first, then OP cases.
        labels: ``np.ndarray`` of labels aligned one-to-one with ``train_files``.
    """
    import warnings

    # ------------------------------ TCIA ------------------------------
    # Search the file name only, so a directory called e.g. "HCC_2023" cannot
    # be mistaken for a case ID; files without an ID are skipped instead of
    # raising on "HCC_" + None.
    tcia_id_pattern = re.compile(r'HCC_(\d+)')
    tcia_ids = []
    for image in PV_raw_TCIA(tcia_image_path):
        match = tcia_id_pattern.search(os.path.basename(image))
        if match:
            tcia_ids.append(match.group(0))

    clinical_df = pd.read_excel(tcia_clinical_path, sheet_name='data table')
    keep = clinical_df['TCIA_ID'].isin(tcia_ids) & (clinical_df['BCLC'] != 'Stage-D')
    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice of the original (SettingWithCopyWarning).
    clinical_df = clinical_df[keep].copy()
    clinical_df['PVimg_path'] = clinical_df['TCIA_ID'].apply(
        lambda x: os.path.join(tcia_image_path, x + "_PV.nii.gz"))
    # NOTE(review): mask paths are derived from TCIA_ID and are not checked
    # for existence here - confirm every retained case has a mask on disk.
    clinical_df['PVmask_path'] = clinical_df['TCIA_ID'].apply(
        lambda x: os.path.join(tcia_mask_path, x + "_PV.nii.gz"))

    mapping = {'Stage-A': 0, 'Stage-B': 1, 'Stage-C': 2}
    # Dict lookup with pass-through for unmapped values (same semantics as
    # Series.replace, without its deprecated silent downcasting).
    labels = np.array([mapping.get(stage, stage) for stage in clinical_df['BCLC']])
    PV_images = np.array(clinical_df['PVimg_path'])
    PV_masks = np.array(clinical_df['PVmask_path'])
    train_files = [
        {"PVimg": PV, "PVmask": PV_mask, "label": label}
        for PV, PV_mask, label in zip(PV_images, PV_masks, labels)
    ]

    _print_dataset_summary("--------------TCIA dataset--------------",
                           "train_files", train_files, PV_images, PV_masks, labels)

    # ------------------------------- OP -------------------------------
    ids, op_labels = load_BCLC_label_OP(op_clinical_path)
    op_labels = np.asarray(op_labels)
    op_image_files = PV_raw_OP(op_image_path)
    op_mask_files = PV_mask_OP(op_mask_path)

    # Pair image, mask and label per case ID. Zipping three independently
    # filtered/sorted lists only stays aligned if the spreadsheet order equals
    # the file-name sort order AND no file is missing; otherwise labels are
    # silently attached to the wrong case.
    PV_images, PV_masks, paired = [], [], []
    for id_ in ids:
        image_hits, mask_hits = [], []
        if id_ is not None:
            image_hits = [p for p in op_image_files if id_ in os.path.basename(p)]
            # Mask names are matched on the ID without its "OP" prefix.
            mask_hits = [p for p in op_mask_files if id_[2:] in os.path.basename(p)]
        has_pair = bool(image_hits) and bool(mask_hits)
        paired.append(has_pair)
        if not has_pair:
            warnings.warn(f"Skipping OP case {id_!r}: no matching image and/or mask file found")
            continue
        if len(image_hits) > 1 or len(mask_hits) > 1:
            warnings.warn(f"OP case {id_!r} matches several files; using the first in sorted order")
        PV_images.append(image_hits[0])
        PV_masks.append(mask_hits[0])

    # Keep only labels of cases that were actually paired with files.
    labels_2 = op_labels[np.array(paired, dtype=bool)]

    train_files_2 = [
        {"PVimg": PV, "PVmask": PV_mask, "label": label}
        for PV, PV_mask, label in zip(PV_images, PV_masks, labels_2)
    ]

    _print_dataset_summary("--------------OP dataset--------------",
                           "train_files_2", train_files_2, PV_images, PV_masks, labels_2)

    # ----------------------------- combine -----------------------------
    train_files.extend(train_files_2)
    labels = np.concatenate((labels, labels_2), axis=0)

    return train_files, labels


# Build the combined TCIA + OP record list and its label array; the call also
# prints the per-dataset file counts and label distribution shown below.
train_files, labels = prepare_data_all()

--------------TCIA dataset--------------
train_files:  99
PV_images:  99
PV_masks:  99
Label: 0, Count: 11
Label: 1, Count: 23
Label: 2, Count: 65
Total count of labels: 99
--------------OP dataset--------------
train_files_2:  145
PV_images:  145
PV_masks:  145
Label: 0, Count: 76
Label: 1, Count: 55
Label: 2, Count: 14
Total count of labels: 145


In [None]:
# Shuffled 60/40 split of the records, stratified on the labels and seeded
# (random_state=8) for reproducibility. The label halves of the split are
# discarded because every record dict already carries its own "label".
X_train, X_test, _, _ = train_test_split(train_files, labels, shuffle=True, test_size=0.4, random_state=8, stratify=labels)