# Original code from thesis
----
Here we'll break down the lines of code for the assignment

## 1. Imports

Nothing fancy. Pandas, numpy, sci-kit learn and torch for ml models. Also some
utilities for file management

In [1]:
import os
import glob
import re
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split

## 2. Constant declaration

Remember to customize the path to your own routes. There are basically two main groups:

- TCIA
- OP

Each group should have:

- Original image directory
- Mask image directory
- Excel file with the medical information

Please modify constants as needed

**NOTE**: The github files doesn't include the images, as there are way too heavy.
Please download them separately.

In [2]:
BASE_PATH = 'Data/'

# For the TCIA LOCATIONS
TCIA_WILDCARD = '*_PV.nii.gz'
TCIA_LOCATION = BASE_PATH + 'TCIA/'
TCIA_IMG_PATH = TCIA_LOCATION + 'TCIA_image_PV/'
TCIA_RESULTS_PATH = TCIA_LOCATION + 'TCIA_results_phase_PV/'
TCIA_EXCEL = TCIA_LOCATION + 'HCC-TACE-Seg_clinical_data-V2.xlsx'

# For the OP locations
OP_LOCATION = BASE_PATH + 'OP/'
NIFTI_PATH = OP_LOCATION + 'OP_C+P_nifti'
NNU_NET_PATH = OP_LOCATION + 'OP_C+P_nnUnet'
OP_EXCEL = OP_LOCATION + 'OP_申請建模_1121110_20231223.xlsx'
OP_WILDCARD = '*_VENOUS_PHASE.nii.gz'
OP_SEG_WILDCARD = '*_VENOUS_PHASE_seg.nii.gz'

## 3. File preparation

### TCIA handling

First we'll focus on the TCIA images

In [3]:
# Code for TCIA dataset
def get_files(image_path: str, wildcard: str) -> list:
    '''Gets all the files in a directory matching a wildcard

    Params
    -----
    `image_path`: str
        The directory where all the images are stored
    `wildcard`: str
        The wildcard to match the files

    Returns
    ----
    `list`: A sorted list of strings representing the paths of all the matched elements
    '''
    return sorted(
        glob.glob(
            os.path.join(
                image_path,
                wildcard
            )
        )
    )

Get the paths of the existing files in both images and masks directories

In [4]:
tcia_images = get_files(TCIA_IMG_PATH, TCIA_WILDCARD)
tcia_masks = get_files(TCIA_RESULTS_PATH, TCIA_WILDCARD)

print('Images: ', len(tcia_images))
tcia_images[:3]

Images:  101


['Data/TCIA/TCIA_image_PV/HCC_001_PV.nii.gz',
 'Data/TCIA/TCIA_image_PV/HCC_002_PV.nii.gz',
 'Data/TCIA/TCIA_image_PV/HCC_003_PV.nii.gz']

In [5]:
print('Masks: ', len(tcia_masks))
tcia_masks[:3]

Masks:  101


['Data/TCIA/TCIA_results_phase_PV/HCC_001_PV.nii.gz',
 'Data/TCIA/TCIA_results_phase_PV/HCC_002_PV.nii.gz',
 'Data/TCIA/TCIA_results_phase_PV/HCC_003_PV.nii.gz']

Read the excel file with the complementary information of the scans

In [6]:
tcia_df = pd.read_excel(TCIA_EXCEL, sheet_name='data table')

tcia_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105 entries, 0 to 104
Data columns (total 57 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   TCIA_ID                         105 non-null    object 
 1   Interval_BL                     105 non-null    int64  
 2   Interval_FU                     97 non-null     float64
 3   TTP                             105 non-null    float64
 4   Death_1_StillAliveorLostToFU_0  105 non-null    int64  
 5   Censored_0_progressed_1         105 non-null    int64  
 6   OS                              105 non-null    float64
 7   hepatitis                       105 non-null    object 
 8   age                             105 non-null    int64  
 9   agegp                           105 non-null    object 
 10  Sex                             105 non-null    int64  
 11  Smoking                         105 non-null    int64  
 12  Alcohol                         105 

In [7]:
def extract_number(pattern, string: str) -> str:
    '''Extracts the number in the given string.
    If the given string is `HCC_001_PV.nii.gz` and the pattern is `r'HCC_(\d+)'`,
    this function will return `001`

    Params
    ---
    `pattern`: `regex`
        The regular expression to match the string to
    `string`: `str`
        The string from which the pattern will be extracted

    Returns
    ---
    `str`: The extracted string
    '''
    match = re.search(pattern, string)
    if match:
        return match.group(1)
    return None

Extract the numbers of the files present in the directory

`Data/TCIA/TCIA_image_PV/HCC_001_PV.nii.gz` ->       `HCC_001`

In [8]:
numbers_list = ["HCC_" + extract_number(r'HCC_(\d+)', image) for image in tcia_images]

print('Length: ', len(numbers_list))
numbers_list[:5]

Length:  101


['HCC_001', 'HCC_002', 'HCC_003', 'HCC_004', 'HCC_005']

Remove the elements of the excel that has no image in the file system

In [10]:
before = len(tcia_df)
tcia_df = tcia_df[tcia_df['TCIA_ID'].isin(numbers_list)]

print('Removed %d items' % (before - len(tcia_df)))

Removed 0 items


Includes the image and the masks in the dataframe

In [11]:
tcia_df['PVimg_path'] = tcia_df['TCIA_ID'].apply(lambda x: os.path.join(TCIA_IMG_PATH, x+"_PV.nii.gz"))
tcia_df['PVmask_path'] = tcia_df['TCIA_ID'].apply(lambda x: os.path.join(TCIA_RESULTS_PATH, x+"_PV.nii.gz"))

tcia_df[['TCIA_ID', 'PVimg_path', 'PVmask_path']].head()

Unnamed: 0,TCIA_ID,PVimg_path,PVmask_path
0,HCC_001,Data/TCIA/TCIA_image_PV/HCC_001_PV.nii.gz,Data/TCIA/TCIA_results_phase_PV/HCC_001_PV.nii.gz
1,HCC_002,Data/TCIA/TCIA_image_PV/HCC_002_PV.nii.gz,Data/TCIA/TCIA_results_phase_PV/HCC_002_PV.nii.gz
2,HCC_003,Data/TCIA/TCIA_image_PV/HCC_003_PV.nii.gz,Data/TCIA/TCIA_results_phase_PV/HCC_003_PV.nii.gz
3,HCC_004,Data/TCIA/TCIA_image_PV/HCC_004_PV.nii.gz,Data/TCIA/TCIA_results_phase_PV/HCC_004_PV.nii.gz
4,HCC_005,Data/TCIA/TCIA_image_PV/HCC_005_PV.nii.gz,Data/TCIA/TCIA_results_phase_PV/HCC_005_PV.nii.gz


As stated in the thesis, the **Stage-D** will be discarded

In [12]:
before = len(tcia_df)
tcia_df = tcia_df[tcia_df['BCLC'] != 'Stage-D']

print('Removed %d Stage-D items' % (before - len(tcia_df)))

Removed 2 Stage-D items


Now let's map the stages to a numerical coding:

* Stage-A -> 0
* Stage-B  -> 1
* Stage-C -> 2

In [13]:
mapping = {'Stage-A': 0, 'Stage-B': 1, 'Stage-C': 2}

pd.set_option('future.no_silent_downcasting', True)
labels = np.array(
    tcia_df['BCLC'].replace(mapping)
)

labels[:5]

array([0, 2, 2, 1, 2], dtype=object)

Extract the image and mask paths into independent arrays (idk why)

In [14]:
PV_images = np.array(tcia_df['PVimg_path'])
PV_masks = np.array(tcia_df['PVmask_path'])

pd.DataFrame({'image': PV_images, 'mask': PV_masks}).head()

Unnamed: 0,image,mask
0,Data/TCIA/TCIA_image_PV/HCC_001_PV.nii.gz,Data/TCIA/TCIA_results_phase_PV/HCC_001_PV.nii.gz
1,Data/TCIA/TCIA_image_PV/HCC_002_PV.nii.gz,Data/TCIA/TCIA_results_phase_PV/HCC_002_PV.nii.gz
2,Data/TCIA/TCIA_image_PV/HCC_003_PV.nii.gz,Data/TCIA/TCIA_results_phase_PV/HCC_003_PV.nii.gz
3,Data/TCIA/TCIA_image_PV/HCC_004_PV.nii.gz,Data/TCIA/TCIA_results_phase_PV/HCC_004_PV.nii.gz
4,Data/TCIA/TCIA_image_PV/HCC_005_PV.nii.gz,Data/TCIA/TCIA_results_phase_PV/HCC_005_PV.nii.gz


Build a new list composed of the original image, the mask and the label

In [15]:
tcia_train = [
        {"PVimg": PV, "PVmask": PV_mask ,"label": label}
        for PV, PV_mask, label in zip(PV_images, PV_masks, labels)
]
print(len(tcia_train))
tcia_train[0]

99


{'PVimg': 'Data/TCIA/TCIA_image_PV/HCC_001_PV.nii.gz',
 'PVmask': 'Data/TCIA/TCIA_results_phase_PV/HCC_001_PV.nii.gz',
 'label': 0}

In [17]:
print("--------------TCIA dataset--------------")
print("train_files: ",len(tcia_train))
print("PV_images: ",len(PV_images))
print("PV_masks: ",len(PV_masks))

--------------TCIA dataset--------------
train_files:  99
PV_images:  99
PV_masks:  99


Check the weighting

In [18]:
unique_labels, label_counts = np.unique(labels, return_counts=True)

pd.DataFrame({'label': unique_labels, 'count': label_counts})

Unnamed: 0,label,count
0,0,11
1,1,23
2,2,65


In [19]:
total_count = np.sum(label_counts)
print("Total count of labels:", total_count)

Total count of labels: 99


### OP Handling

Let's see what information the excel file has

In [20]:
op_df = pd.read_excel(OP_EXCEL, sheet_name='202211112RINC建模申請-上繳用')
print("%d elements" % op_df.shape[0])

op_df.head()

200 elements


Unnamed: 0,編號,OP_C+P_Tumor識別碼,"Survial(0:alive,1:death)",Expire Date,Last OPD,OS day,CT date,OP Date,Recurrent Date,Recurrent Span / days,...,Pre-Op\nCRE mg/dL,Pre-Op\nPT /sec,Pre-Op\nPT INR,Pre-Op\nAFP ng/mL,number,size,Ishak,腫瘤分級,血管侵犯有(1)跟沒有(0)（detection）,BCLC
0,1,OP_0001,0,,2022/11/15,2519,2015/12/2,2015/12/23,,2519,...,1.7,9.5,0.9,3.37,1,8.9,F1,2,0,
1,2,OP_0003,0,,2022/11/18,2508,2015/12/30,2016/1/6,,2508,...,1.0,11.6,1.1,4.53,1,3.0,F2,2,1,
2,3,OP_0004,0,,2022/10/18,2493,2015/11/12,2015/12/21,2021-12-29 00:00:00,2200,...,1.0,10.3,0.97,3.78,1,13.0,F0,3,0,
3,4,OP_0005,0,,2022/11/1,2493,2015/11/9,2016/1/4,2019-03-15 00:00:00,1166,...,0.9,10.0,0.94,72.86,1,1.6,F1,3,0,
4,5,OP_0006,0,,2022/11/15,2498,2015/12/11,2016/1/13,,2498,...,0.7,10.1,0.95,105.84,1,2.6,F4,3,0,


In [22]:
op_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 27 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   編號                         200 non-null    int64  
 1   OP_C+P_Tumor識別碼            200 non-null    object 
 2   Survial(0:alive,1:death)   200 non-null    int64  
 3   Expire Date                41 non-null     object 
 4   Last OPD                   159 non-null    object 
 5   OS day                     200 non-null    int64  
 6   CT date                    200 non-null    object 
 7   OP Date                    200 non-null    object 
 8   Recurrent Date             107 non-null    object 
 9   Recurrent Span / days      200 non-null    int64  
 10  RFS                        200 non-null    float64
 11  Recurrence                 200 non-null    int64  
 12  Pre-Op
ALT U/L             200 non-null    int64  
 13  Pre-Op
AST  U/L            199 non-null    float64

As stated before, we don't want *Stage-D* or *pending* cases

In [23]:
original = len(op_df)
# Remove cases without diagnosis
op_df = op_df.dropna(subset=['BCLC'])
# Remove cases with unknown and D diagnoses
op_df = op_df[op_df['BCLC'] != 'Pending']
op_df = op_df[op_df['BCLC'] != 'D']

print('Removed %d cases' % (original - len(op_df)))

Removed 52 cases


Remove the following items:

- 117
- 277
- 003
- 093

In [26]:
# Remove some rows 
# idk why
condition = op_df['OP_C+P_Tumor識別碼'].str.contains('OP_0117|OP_0277|OP_0003')

print('Mask:')
condition.value_counts()

Mask:


OP_C+P_Tumor識別碼
False    146
True       2
Name: count, dtype: int64

In [34]:
original = len(op_df)

op_df = op_df[~condition]
op_df = op_df[op_df['OP_C+P_Tumor識別碼'] != 'OP_0093']

print("Removed %d elements" % (original - len(op_df)))

Removed 0 elements


  op_df = op_df[~condition]


In [43]:
op_df[['編號', 'OP_C+P_Tumor識別碼']].head()

Unnamed: 0,編號,OP_C+P_Tumor識別碼
49,50,OP_0067
50,51,OP_0068
51,52,OP_0069
52,53,OP_0070
53,54,OP_0071


Extract the IDs of the excel file

In [47]:
ids = np.array(op_df['OP_C+P_Tumor識別碼'])

print(len(ids))
ids[:5]

145


array(['OP_0067', 'OP_0068', 'OP_0069', 'OP_0070', 'OP_0071'],
      dtype=object)

In [48]:
pattern = re.compile(r'OP_(\d+)')
ids = [re.search(pattern, s).group(0) if re.search(pattern, s) else None for s in ids]

print(f'Total ids: {len(ids)} with {ids.count(None)} ids discarded')

Total ids: 145 with 0 ids discarded


Map the stages to numerical labels

- 0 -> 0
- A -> 0
- B -> 1
- C -> 2

In [49]:
mapping = {'0': 0, 'A': 0, 'B': 1, 'C': 2}
labels_op = np.array(op_df['BCLC'].replace(mapping))

print(len(labels_op))
pd.DataFrame({'labels' : labels_op, 'stage': op_df['BCLC']}).head()

145


Unnamed: 0,labels,stage
49,0,A
50,0,A
51,1,B
52,0,A
53,1,B


extract the images and masks only if they have matching data in the excel

In [61]:
op_images = [path for path in get_files(NIFTI_PATH, OP_WILDCARD) if any(id_ in path for id_ in ids)]

print('Images found: ', len(op_images))
op_images[:5]

Images found:  145


['Data/OP/OP_C+P_nifti/OP_0067_VENOUS_PHASE.nii.gz',
 'Data/OP/OP_C+P_nifti/OP_0068_VENOUS_PHASE.nii.gz',
 'Data/OP/OP_C+P_nifti/OP_0069_VENOUS_PHASE.nii.gz',
 'Data/OP/OP_C+P_nifti/OP_0070_VENOUS_PHASE.nii.gz',
 'Data/OP/OP_C+P_nifti/OP_0071_VENOUS_PHASE.nii.gz']

In [54]:
op_masks = [path for path in get_files(NNU_NET_PATH, OP_SEG_WILDCARD) if any(id_[2:] in path for id_ in ids)]

print('Masks found: ', len(op_masks))
op_masks[:5]

Masks found:  145


['Data/OP/OP_C+P_nnUnet/OP_0067_VENOUS_PHASE_seg.nii.gz',
 'Data/OP/OP_C+P_nnUnet/OP_0068_VENOUS_PHASE_seg.nii.gz',
 'Data/OP/OP_C+P_nnUnet/OP_0069_VENOUS_PHASE_seg.nii.gz',
 'Data/OP/OP_C+P_nnUnet/OP_0070_VENOUS_PHASE_seg.nii.gz',
 'Data/OP/OP_C+P_nnUnet/OP_0071_VENOUS_PHASE_seg.nii.gz']

Build the train object

In [67]:
train_files_2 = [
    {"PVimg": PV, "PVmask": PV_mask ,"label": label}
    for PV, PV_mask, label in zip(op_images, op_masks, labels_op)
]

print(len(train_files_2))
train_files_2[0]

145


{'PVimg': 'Data/OP/OP_C+P_nifti/OP_0067_VENOUS_PHASE.nii.gz',
 'PVmask': 'Data/OP/OP_C+P_nnUnet/OP_0067_VENOUS_PHASE_seg.nii.gz',
 'label': 0}

In [68]:
print("--------------OP dataset--------------")
print("train_files_2: ",len(train_files_2))
print("PV_images: ",len(op_images))
print("PV_masks: ",len(op_masks))

--------------OP dataset--------------
train_files_2:  145
PV_images:  145
PV_masks:  145


In [70]:
unique_labels, label_counts = np.unique(labels_op, return_counts=True)
for label, count in zip(unique_labels, label_counts):
    print(f"Label: {label}, Count: {count}")

Label: 0, Count: 76
Label: 1, Count: 55
Label: 2, Count: 14


In [71]:
total_count = np.sum(label_counts)
print("Total count of labels:", total_count)

Total count of labels: 145


In [73]:
train_files.extend(train_files_2)
labels = np.concatenate((labels, labels_2), axis=0)

## 4. Training

In [74]:
train_files[0]

{'PVimg': 'Data/OP/OP_C+P_nifti/OP_0067_VENOUS_PHASE.nii.gz',
 'PVmask': 'Data/OP/OP_C+P_nnUnet/OP_0067_VENOUS_PHASE_seg.nii.gz',
 'label': 0}

In [61]:
labels[0]

0

In [75]:
train_test_split(
    train_files_2,
    labels,
    shuffle=True,
    test_size=0.4,
    random_state=8,
    stratify=labels
)

[[{'PVimg': 'Data/OP/OP_C+P_nifti/OP_0167_VENOUS_PHASE.nii.gz',
   'PVmask': 'Data/OP/OP_C+P_nnUnet/OP_0167_VENOUS_PHASE_seg.nii.gz',
   'label': 1},
  {'PVimg': 'Data/OP/OP_C+P_nifti/OP_0142_VENOUS_PHASE.nii.gz',
   'PVmask': 'Data/OP/OP_C+P_nnUnet/OP_0142_VENOUS_PHASE_seg.nii.gz',
   'label': 0},
  {'PVimg': 'Data/OP/OP_C+P_nifti/OP_0230_VENOUS_PHASE.nii.gz',
   'PVmask': 'Data/OP/OP_C+P_nnUnet/OP_0230_VENOUS_PHASE_seg.nii.gz',
   'label': 0},
  {'PVimg': 'Data/OP/OP_C+P_nifti/OP_0140_VENOUS_PHASE.nii.gz',
   'PVmask': 'Data/OP/OP_C+P_nnUnet/OP_0140_VENOUS_PHASE_seg.nii.gz',
   'label': 1},
  {'PVimg': 'Data/OP/OP_C+P_nifti/OP_0102_VENOUS_PHASE.nii.gz',
   'PVmask': 'Data/OP/OP_C+P_nnUnet/OP_0102_VENOUS_PHASE_seg.nii.gz',
   'label': 0},
  {'PVimg': 'Data/OP/OP_C+P_nifti/OP_0150_VENOUS_PHASE.nii.gz',
   'PVmask': 'Data/OP/OP_C+P_nnUnet/OP_0150_VENOUS_PHASE_seg.nii.gz',
   'label': 1},
  {'PVimg': 'Data/OP/OP_C+P_nifti/OP_0264_VENOUS_PHASE.nii.gz',
   'PVmask': 'Data/OP/OP_C+P_nnUnet/

In [76]:
X_train, X_test, _, _ = train_test_split(train_files, labels, shuffle=True, test_size=0.4, random_state=8, stratify=labels)

In [77]:
X_train[:5]

[{'PVimg': 'Data/OP/OP_C+P_nifti/OP_0167_VENOUS_PHASE.nii.gz',
  'PVmask': 'Data/OP/OP_C+P_nnUnet/OP_0167_VENOUS_PHASE_seg.nii.gz',
  'label': 1},
 {'PVimg': 'Data/OP/OP_C+P_nifti/OP_0142_VENOUS_PHASE.nii.gz',
  'PVmask': 'Data/OP/OP_C+P_nnUnet/OP_0142_VENOUS_PHASE_seg.nii.gz',
  'label': 0},
 {'PVimg': 'Data/OP/OP_C+P_nifti/OP_0230_VENOUS_PHASE.nii.gz',
  'PVmask': 'Data/OP/OP_C+P_nnUnet/OP_0230_VENOUS_PHASE_seg.nii.gz',
  'label': 0},
 {'PVimg': 'Data/OP/OP_C+P_nifti/OP_0140_VENOUS_PHASE.nii.gz',
  'PVmask': 'Data/OP/OP_C+P_nnUnet/OP_0140_VENOUS_PHASE_seg.nii.gz',
  'label': 1},
 {'PVimg': 'Data/OP/OP_C+P_nifti/OP_0102_VENOUS_PHASE.nii.gz',
  'PVmask': 'Data/OP/OP_C+P_nnUnet/OP_0102_VENOUS_PHASE_seg.nii.gz',
  'label': 0}]

In [78]:
X_test[:5]

[{'PVimg': 'Data/OP/OP_C+P_nifti/OP_0112_VENOUS_PHASE.nii.gz',
  'PVmask': 'Data/OP/OP_C+P_nnUnet/OP_0112_VENOUS_PHASE_seg.nii.gz',
  'label': 1},
 {'PVimg': 'Data/OP/OP_C+P_nifti/OP_0132_VENOUS_PHASE.nii.gz',
  'PVmask': 'Data/OP/OP_C+P_nnUnet/OP_0132_VENOUS_PHASE_seg.nii.gz',
  'label': 1},
 {'PVimg': 'Data/OP/OP_C+P_nifti/OP_0159_VENOUS_PHASE.nii.gz',
  'PVmask': 'Data/OP/OP_C+P_nnUnet/OP_0159_VENOUS_PHASE_seg.nii.gz',
  'label': 1},
 {'PVimg': 'Data/OP/OP_C+P_nifti/OP_0095_VENOUS_PHASE.nii.gz',
  'PVmask': 'Data/OP/OP_C+P_nnUnet/OP_0095_VENOUS_PHASE_seg.nii.gz',
  'label': 1},
 {'PVimg': 'Data/OP/OP_C+P_nifti/OP_0267_VENOUS_PHASE.nii.gz',
  'PVmask': 'Data/OP/OP_C+P_nnUnet/OP_0267_VENOUS_PHASE_seg.nii.gz',
  'label': 1}]