# Original code from thesis
----
Here we'll break down the lines of code for the assignment

## 1. Imports

Nothing fancy. Pandas, numpy, sci-kit learn and torch for ml models. Also some
utilities for file management

In [1]:
import os
import glob
import re
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split

## 2. Constant declaration

Remember to change the path to your own path. There are basically two main groups:

- TCIA
- OP

Each group should have:

- Original image directory
- Mask image directory
- Excel file with the medical information

Please modify constants as needed

In [2]:
BASE_PATH = 'Data/'
IMG_SUFFIX = '_PV.nii.gz'
WILDCARD_IMG = '*' + IMG_SUFFIX

# For the TCIA LOCATIONS
TCIA_LOCATION = BASE_PATH + 'TCIA/'
TCIA_IMG_PATH = TCIA_LOCATION + 'TCIA_image_PV/'
TCIA_RESULTS_PATH = TCIA_LOCATION + 'TCIA_results_phase_PV/'
TCIA_EXCEL = TCIA_LOCATION + 'HCC-TACE-Seg_clinical_data-V2.xlsx'

# For the OP locations
OP_LOCATION = BASE_PATH + 'OP/'
NIFTI_PATH = OP_LOCATION + 'OP_C+P_nifti'
NNU_NET_PATH = OP_LOCATION + 'OP_C+P_nnUnet'
OP_EXCEL = OP_LOCATION + 'OP_申請建模_1121110_20231223.xlsx'

## 3. File preparation

### TCIA handling

First we'll focus on the TCIA images

In [3]:
# Code for TCIA dataset
def PV_raw_TCIA(image_path):
    PV_images = sorted(glob.glob(os.path.join(image_path, WILDCARD_IMG)))
    return PV_images

def PV_mask_TCIA(mask_path):
    PV_masks = sorted(glob.glob(os.path.join(mask_path,  WILDCARD_IMG)))
    return PV_masks

In [4]:
PV_images = PV_raw_TCIA(TCIA_IMG_PATH)
PV_masks = PV_mask_TCIA(TCIA_RESULTS_PATH)

PV_images[:3]

['Data/TCIA/TCIA_image_PV/HCC_001_PV.nii.gz',
 'Data/TCIA/TCIA_image_PV/HCC_002_PV.nii.gz',
 'Data/TCIA/TCIA_image_PV/HCC_003_PV.nii.gz']

In [5]:
PV_masks[:3]

['Data/TCIA/TCIA_results_phase_PV/HCC_001_PV.nii.gz',
 'Data/TCIA/TCIA_results_phase_PV/HCC_002_PV.nii.gz',
 'Data/TCIA/TCIA_results_phase_PV/HCC_003_PV.nii.gz']

In [6]:
clinical_df = pd.read_excel(TCIA_EXCEL, sheet_name='data table')

clinical_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105 entries, 0 to 104
Data columns (total 57 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   TCIA_ID                         105 non-null    object 
 1   Interval_BL                     105 non-null    int64  
 2   Interval_FU                     97 non-null     float64
 3   TTP                             105 non-null    float64
 4   Death_1_StillAliveorLostToFU_0  105 non-null    int64  
 5   Censored_0_progressed_1         105 non-null    int64  
 6   OS                              105 non-null    float64
 7   hepatitis                       105 non-null    object 
 8   age                             105 non-null    int64  
 9   agegp                           105 non-null    object 
 10  Sex                             105 non-null    int64  
 11  Smoking                         105 non-null    int64  
 12  Alcohol                         105 

In [7]:
def extract_number(string):
        match = re.search(r'HCC_(\d+)', string)
        if match:
            return match.group(1)
        return None
numbers_list = ["HCC_"+extract_number(image) for image in PV_images]

numbers_list[:5]

['HCC_001', 'HCC_002', 'HCC_003', 'HCC_004', 'HCC_005']

In [8]:
before = len(clinical_df)
clinical_df = clinical_df[clinical_df['TCIA_ID'].isin(numbers_list)]

print('Removed %d items' % (before - len(clinical_df)))

Removed 4 items


In [9]:
clinical_df['PVimg_path'] = clinical_df['TCIA_ID'].apply(lambda x: os.path.join(TCIA_IMG_PATH, x+"_PV.nii.gz"))
clinical_df['PVmask_path'] = clinical_df['TCIA_ID'].apply(lambda x: os.path.join(TCIA_RESULTS_PATH, x+"_PV.nii.gz"))

clinical_df[['TCIA_ID', 'PVimg_path', 'PVmask_path']].head()

Unnamed: 0,TCIA_ID,PVimg_path,PVmask_path
0,HCC_001,Data/TCIA/TCIA_image_PV/HCC_001_PV.nii.gz,Data/TCIA/TCIA_results_phase_PV/HCC_001_PV.nii.gz
1,HCC_002,Data/TCIA/TCIA_image_PV/HCC_002_PV.nii.gz,Data/TCIA/TCIA_results_phase_PV/HCC_002_PV.nii.gz
2,HCC_003,Data/TCIA/TCIA_image_PV/HCC_003_PV.nii.gz,Data/TCIA/TCIA_results_phase_PV/HCC_003_PV.nii.gz
3,HCC_004,Data/TCIA/TCIA_image_PV/HCC_004_PV.nii.gz,Data/TCIA/TCIA_results_phase_PV/HCC_004_PV.nii.gz
4,HCC_005,Data/TCIA/TCIA_image_PV/HCC_005_PV.nii.gz,Data/TCIA/TCIA_results_phase_PV/HCC_005_PV.nii.gz


In [10]:
before = len(clinical_df)
clinical_df = clinical_df[clinical_df['BCLC'] != 'Stage-D']

print('Removed %d Stage-D items' % (before - len(clinical_df)))

Removed 2 Stage-D items


In [11]:
mapping = {'Stage-A': 0, 'Stage-B': 1, 'Stage-C': 2}
labels = np.array(clinical_df['BCLC'].replace(mapping))

labels[:5]

  labels = np.array(clinical_df['BCLC'].replace(mapping))


array([0, 2, 2, 1, 2])

In [12]:
PV_images = np.array(clinical_df['PVimg_path'])
PV_masks = np.array(clinical_df['PVmask_path'])

pd.DataFrame({'image': PV_images, 'mask': PV_masks}).head()

Unnamed: 0,image,mask
0,Data/TCIA/TCIA_image_PV/HCC_001_PV.nii.gz,Data/TCIA/TCIA_results_phase_PV/HCC_001_PV.nii.gz
1,Data/TCIA/TCIA_image_PV/HCC_002_PV.nii.gz,Data/TCIA/TCIA_results_phase_PV/HCC_002_PV.nii.gz
2,Data/TCIA/TCIA_image_PV/HCC_003_PV.nii.gz,Data/TCIA/TCIA_results_phase_PV/HCC_003_PV.nii.gz
3,Data/TCIA/TCIA_image_PV/HCC_004_PV.nii.gz,Data/TCIA/TCIA_results_phase_PV/HCC_004_PV.nii.gz
4,Data/TCIA/TCIA_image_PV/HCC_005_PV.nii.gz,Data/TCIA/TCIA_results_phase_PV/HCC_005_PV.nii.gz


In [13]:
train_files = [
        {"PVimg": PV, "PVmask": PV_mask ,"label": label}
        for PV, PV_mask, label in zip(PV_images, PV_masks, labels)
]

train_files[:3]

[{'PVimg': 'Data/TCIA/TCIA_image_PV/HCC_001_PV.nii.gz',
  'PVmask': 'Data/TCIA/TCIA_results_phase_PV/HCC_001_PV.nii.gz',
  'label': 0},
 {'PVimg': 'Data/TCIA/TCIA_image_PV/HCC_002_PV.nii.gz',
  'PVmask': 'Data/TCIA/TCIA_results_phase_PV/HCC_002_PV.nii.gz',
  'label': 2},
 {'PVimg': 'Data/TCIA/TCIA_image_PV/HCC_003_PV.nii.gz',
  'PVmask': 'Data/TCIA/TCIA_results_phase_PV/HCC_003_PV.nii.gz',
  'label': 2}]

In [14]:
print("--------------TCIA dataset--------------")
print("train_files: ",len(train_files))
print("PV_images: ",len(PV_images))
print("PV_masks: ",len(PV_masks))

--------------TCIA dataset--------------
train_files:  99
PV_images:  99
PV_masks:  99


In [15]:
unique_labels, label_counts = np.unique(labels, return_counts=True)

pd.DataFrame({'label': unique_labels, 'count': label_counts})

Unnamed: 0,label,count
0,0,11
1,1,23
2,2,65


In [16]:
total_count = np.sum(label_counts)
print("Total count of labels:", total_count)

Total count of labels: 99


### OP Handling DO NOT EXECUTE, STILL DEBUGGING

In [17]:
# Code for OP dataset
def PV_raw_OP(image_path):
    PV_images = sorted(glob.glob(os.path.join(image_path, "*.nii.gz")))
    return PV_images

def PV_mask_OP(mask_path):
    PV_masks = sorted(glob.glob(os.path.join(mask_path,  "*.nii.gz")))
    return PV_masks

In [18]:
def load_BCLC_label_OP(clinical_data_path):
    clinical_df = pd.read_excel(clinical_data_path, sheet_name='202211112RINC建模申請-上繳用')
    clinical_df = clinical_df.dropna(subset=['BCLC'])
    clinical_df = clinical_df[clinical_df['BCLC'] != 'Pending']
    clinical_df = clinical_df[clinical_df['BCLC'] != 'D']
    # 濾掉clinical_df資料
    condition = clinical_df['OP_C+P_Tumor識別碼'].str.contains('OP_0117|OP_0277|OP_0003')
    clinical_df = clinical_df[~condition]

    # remove OP_0093
    clinical_df = clinical_df[clinical_df['OP_C+P_Tumor識別碼'] != 'OP_0093']

    ids = np.array(clinical_df['OP_C+P_Tumor識別碼'])
    pattern = re.compile(r'OP_(\d+)')
    ids = [re.search(pattern, s).group(0) if re.search(pattern, s) else None for s in ids]

    mapping = {'0': 0, 'A': 0, 'B': 1, 'C': 2}
    labels = np.array(clinical_df['BCLC'].replace(mapping))

    return ids, labels

ids, labels_2 = load_BCLC_label_OP(OP_EXCEL)
PV_images = [path for path in PV_raw_OP(NIFTI_PATH) if any(id_ in path for id_ in ids)]
PV_masks = [path for path in PV_mask_OP(NNU_NET_PATH) if any(id_[2:] in path for id_ in ids)]

train_files_2 = [
{"PVimg": PV, "PVmask": PV_mask ,"label": label}
for PV, PV_mask, label in zip(PV_images, PV_masks, labels_2)
]

#########################################
print("--------------OP dataset--------------")
print("train_files_2: ",len(train_files_2))
print("PV_images: ",len(PV_images))
print("PV_masks: ",len(PV_masks))

unique_labels, label_counts = np.unique(labels_2, return_counts=True)
for label, count in zip(unique_labels, label_counts):
    print(f"Label: {label}, Count: {count}")

total_count = np.sum(label_counts)
print("Total count of labels:", total_count)

# train_files.extend(train_files_2)
# labels = np.concatenate((labels, labels_2), axis=0)

--------------OP dataset--------------
train_files_2:  3
PV_images:  59
PV_masks:  3
Label: 0, Count: 76
Label: 1, Count: 55
Label: 2, Count: 14
Total count of labels: 145


  labels = np.array(clinical_df['BCLC'].replace(mapping))


## 4. Training

In [19]:
train_files[0]

{'PVimg': 'Data/TCIA/TCIA_image_PV/HCC_001_PV.nii.gz',
 'PVmask': 'Data/TCIA/TCIA_results_phase_PV/HCC_001_PV.nii.gz',
 'label': 0}

In [20]:
labels[0]

0

In [21]:
train_test_split(
    train_files,
    labels,
    shuffle=True,
    test_size=0.4,
    random_state=8,
    stratify=labels
)

[[{'PVimg': 'Data/TCIA/TCIA_image_PV/HCC_001_PV.nii.gz',
   'PVmask': 'Data/TCIA/TCIA_results_phase_PV/HCC_001_PV.nii.gz',
   'label': 0},
  {'PVimg': 'Data/TCIA/TCIA_image_PV/HCC_015_PV.nii.gz',
   'PVmask': 'Data/TCIA/TCIA_results_phase_PV/HCC_015_PV.nii.gz',
   'label': 1},
  {'PVimg': 'Data/TCIA/TCIA_image_PV/HCC_020_PV.nii.gz',
   'PVmask': 'Data/TCIA/TCIA_results_phase_PV/HCC_020_PV.nii.gz',
   'label': 2},
  {'PVimg': 'Data/TCIA/TCIA_image_PV/HCC_045_PV.nii.gz',
   'PVmask': 'Data/TCIA/TCIA_results_phase_PV/HCC_045_PV.nii.gz',
   'label': 2},
  {'PVimg': 'Data/TCIA/TCIA_image_PV/HCC_075_PV.nii.gz',
   'PVmask': 'Data/TCIA/TCIA_results_phase_PV/HCC_075_PV.nii.gz',
   'label': 2},
  {'PVimg': 'Data/TCIA/TCIA_image_PV/HCC_009_PV.nii.gz',
   'PVmask': 'Data/TCIA/TCIA_results_phase_PV/HCC_009_PV.nii.gz',
   'label': 2},
  {'PVimg': 'Data/TCIA/TCIA_image_PV/HCC_006_PV.nii.gz',
   'PVmask': 'Data/TCIA/TCIA_results_phase_PV/HCC_006_PV.nii.gz',
   'label': 2},
  {'PVimg': 'Data/TCIA/TCIA

In [22]:
X_train, X_test, _, _ = train_test_split(train_files, labels, shuffle=True, test_size=0.4, random_state=8, stratify=labels)

In [24]:
X_train[:5]

[{'PVimg': 'Data/TCIA/TCIA_image_PV/HCC_001_PV.nii.gz',
  'PVmask': 'Data/TCIA/TCIA_results_phase_PV/HCC_001_PV.nii.gz',
  'label': 0},
 {'PVimg': 'Data/TCIA/TCIA_image_PV/HCC_015_PV.nii.gz',
  'PVmask': 'Data/TCIA/TCIA_results_phase_PV/HCC_015_PV.nii.gz',
  'label': 1},
 {'PVimg': 'Data/TCIA/TCIA_image_PV/HCC_020_PV.nii.gz',
  'PVmask': 'Data/TCIA/TCIA_results_phase_PV/HCC_020_PV.nii.gz',
  'label': 2},
 {'PVimg': 'Data/TCIA/TCIA_image_PV/HCC_045_PV.nii.gz',
  'PVmask': 'Data/TCIA/TCIA_results_phase_PV/HCC_045_PV.nii.gz',
  'label': 2},
 {'PVimg': 'Data/TCIA/TCIA_image_PV/HCC_075_PV.nii.gz',
  'PVmask': 'Data/TCIA/TCIA_results_phase_PV/HCC_075_PV.nii.gz',
  'label': 2}]

In [25]:
X_test[:5]

[{'PVimg': 'Data/TCIA/TCIA_image_PV/HCC_079_PV.nii.gz',
  'PVmask': 'Data/TCIA/TCIA_results_phase_PV/HCC_079_PV.nii.gz',
  'label': 1},
 {'PVimg': 'Data/TCIA/TCIA_image_PV/HCC_016_PV.nii.gz',
  'PVmask': 'Data/TCIA/TCIA_results_phase_PV/HCC_016_PV.nii.gz',
  'label': 2},
 {'PVimg': 'Data/TCIA/TCIA_image_PV/HCC_093_PV.nii.gz',
  'PVmask': 'Data/TCIA/TCIA_results_phase_PV/HCC_093_PV.nii.gz',
  'label': 2},
 {'PVimg': 'Data/TCIA/TCIA_image_PV/HCC_049_PV.nii.gz',
  'PVmask': 'Data/TCIA/TCIA_results_phase_PV/HCC_049_PV.nii.gz',
  'label': 2},
 {'PVimg': 'Data/TCIA/TCIA_image_PV/HCC_071_PV.nii.gz',
  'PVmask': 'Data/TCIA/TCIA_results_phase_PV/HCC_071_PV.nii.gz',
  'label': 1}]