In [None]:
import os
import SimpleITK as sitk
import numpy as np

from classification.dataloaders.data_process_func import read_file_list

In [None]:
def get_bound_coordinate(file, pad=0):
    '''
    Return the bounding box of the non-zero region of a mask, enlarged by pad.

    Both corners are INCLUSIVE indices and are clipped to the array extent,
    so a slice covering the whole box is ``min[i]:max[i] + 1`` on axis i.

    :param file: groundtruth mask (any array-like; non-zero entries are foreground)
    :param pad: margin added on each side of the box. An int is applied to
                every axis of the mask; a sequence gives the margin per axis
                (axis i is padded by pad[i]).
    :return: bound: [min,max], two lists holding one index per axis
    :raises ValueError: if the mask has no non-zero element
    '''
    file = np.asarray(file)
    if isinstance(pad, (int, np.integer)):
        # Broadcast a scalar margin to however many axes the mask has.
        pad = [pad] * file.ndim
    file_size = file.shape
    nonzeropoint = np.asarray(np.nonzero(file))  # ndim * n
    if nonzeropoint.shape[1] == 0:
        # np.max/np.min on an empty array fail with an unhelpful message;
        # report the real cause instead.
        raise ValueError('mask has no non-zero element, cannot compute its bound')
    maxpoint = nonzeropoint.max(axis=1).tolist()
    minpoint = nonzeropoint.min(axis=1).tolist()
    for axis, margin in enumerate(pad):
        # Clip so the padded box never leaves the array.
        maxpoint[axis] = min(maxpoint[axis] + margin, file_size[axis] - 1)
        minpoint[axis] = max(minpoint[axis] - margin, 0)
    return [minpoint, maxpoint]

## Data Preparation

Before cropping the target plain CT scan around your target organ, please ensure your data is prepared as follows:

1. **Standardize the Dataset:**  
   Use the script `preprocess/NifitiStandard.py` to standardize your CT scans. Make sure to prepare both tumor and non-tumor scans.

2. **Segment the Organs in CT:**  
   Use [TotalSegmentator](https://github.com/wasserth/TotalSegmentator) to segment your CT scans. This will provide you with segmentation maps from which you can identify the value corresponding to your target organ (for example, `4` for the gallbladder in TotalSegmentator v2).

3. **Format Your Data:**  
   Organize your dataset into separate folders for each scan (e.g., `001`). Each folder should contain:
   - `image.nii.gz`: The plain CT scan.
   - `label.nii.gz`: The corresponding organ segmentation file.

   For example, your directory structure should look like this:
    ```plaintext
    tumor_ct_root/
        001/
            image.nii.gz
            label.nii.gz
        002/
            image.nii.gz
            label.nii.gz
        ...
    ```

In [None]:
'''
Crop the plain CT tumor data around the target organ with padding 32 pixels
'''

tumor_ct_root = '/path/to/your/plainct/tumor/data'
save_root = '/path/to/save/cropped/plainct/tumor/data'
target_organ_value = 4 # change it to your target organ value
targrt_organ_value = target_organ_value # misspelled legacy name, kept in case other cells use it
crop_pad = [32, 32, 32] # margin in voxels, in array axis order

# Sorted so the processing order (and the log) is the same on every run.
fold_ls = sorted(os.listdir(tumor_ct_root))
os.makedirs(save_root, exist_ok=True)

for fold in fold_ls:
    img_path = os.path.join(tumor_ct_root, fold, 'image.nii.gz')
    label_path = os.path.join(tumor_ct_root, fold, 'label.nii.gz')

    # os.listdir also returns stray files / incomplete cases; skip them
    # instead of letting ReadImage abort the whole run.
    if not (os.path.isfile(img_path) and os.path.isfile(label_path)):
        print('Skip', fold, '(image.nii.gz or label.nii.gz not found)')
        continue

    img_nii = sitk.ReadImage(img_path)
    label_nii = sitk.ReadImage(label_path)

    img_array = sitk.GetArrayFromImage(img_nii)
    label_array = sitk.GetArrayFromImage(label_nii)
    if img_array.shape != label_array.shape:
        # The same index box is applied to both arrays, so they must share a grid.
        print('Skip', fold, '(image and label shapes differ)')
        continue
    label_array = 1*(label_array==target_organ_value)

    # Without any organ voxel there is no bound to crop around.
    if not label_array.any():
        print('Skip', fold, '(target organ value not present in label)')
        continue

    # Get the non-zero bound of the label array
    minpoint, maxpoint = get_bound_coordinate(label_array, pad=crop_pad)

    # Crop the image and label array.
    # maxpoint is an inclusive index, so the slice has to end at max + 1.
    crop_slices = tuple(slice(lo, hi + 1) for lo, hi in zip(minpoint, maxpoint))
    crop_img_array = img_array[crop_slices]
    crop_label_array = label_array[crop_slices]

    # Physical position of the first cropped voxel. SimpleITK indexes images
    # as (x, y, z) while the numpy arrays are (z, y, x), hence the reversal.
    crop_origin = img_nii.TransformIndexToPhysicalPoint([int(v) for v in reversed(minpoint)])

    # Convert the array to nii, carrying over the geometry of the source image
    crop_img_nii = sitk.GetImageFromArray(crop_img_array)
    crop_label_nii = sitk.GetImageFromArray(crop_label_array)
    for crop_nii in (crop_img_nii, crop_label_nii):
        crop_nii.SetDirection(img_nii.GetDirection())
        crop_nii.SetSpacing(img_nii.GetSpacing())
        crop_nii.SetOrigin(crop_origin)

    # Save the nii
    image_save_path = os.path.join(save_root, fold, 'image.nii.gz')
    label_save_path = os.path.join(save_root, fold, 'label.nii.gz')
    os.makedirs(os.path.dirname(image_save_path), exist_ok=True)
    sitk.WriteImage(crop_img_nii, image_save_path)
    sitk.WriteImage(crop_label_nii, label_save_path)
    print('Save', image_save_path)

## 5-fold split

Before 5-fold splitting, please prepare two txt files: `path/to/your/plainct/image.txt` and `path/to/your/plainct/label.txt`.

- `path/to/your/plainct/image.txt`: Contains the file path of each image.
- `path/to/your/plainct/label.txt`: Contains the label for each image, where `1` indicates a tumor and `0` indicates no tumor.

### File Format Example

#### image.txt
Each line contains the file path of an image, for example:

```imagefile
    /path/to/image1.nii.gz
    /path/to/image2.nii.gz
    ...
```

#### label.txt
Each line contains a label [0(non-tumor), 1(tumor)] corresponding to the image in `image.txt`, for example:

```labelfile
    0
    1
    ...
```

In [None]:

'''
Now we split the data into 5 folds for cross-validation, with each class evenly distributed across the folds
'''

import os
import random

def read_file_list(filepath):
    """Return the lines of a text file, whitespace-stripped, skipping blank ones."""
    entries = []
    with open(filepath, 'r') as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            # Lines that are empty or whitespace-only carry no entry.
            if cleaned:
                entries.append(cleaned)
    return entries

# --------------------------
# 1) Read the original image.txt and label.txt
# --------------------------
data_file_txt = read_file_list('path/to/your/plainct/image.txt')
label_file_txt = read_file_list('path/to/your/plainct/label.txt')

# The two files are paired line by line; a length mismatch would silently
# attach labels to the wrong images (or crash much later with an IndexError).
if len(data_file_txt) != len(label_file_txt):
    raise ValueError(
        f'image.txt has {len(data_file_txt)} entries but label.txt has '
        f'{len(label_file_txt)}; they must match line by line'
    )

# Define the directory to save the 5-fold files
fold_save_root = 'config/data/PlainCT/yourdata'
os.makedirs(fold_save_root, exist_ok=True)

fold_num = 5

# Fixed seed so the split can be regenerated exactly; set to None for a
# different random split on every run.
split_seed = 0
rng = random.Random(split_seed)

# --------------------------
# 2) Separate and shuffle indices based on class labels
# --------------------------
# Classes are discovered from label.txt, so this works for the binary
# (0 / 1) case as well as for multi-class labels.
class_indices = {}
for idx, label_str in enumerate(label_file_txt):
    class_indices.setdefault(int(label_str), []).append(idx)

# Create placeholders for each fold
folds_by_class = {
    label: [[] for _ in range(fold_num)] for label in class_indices
}

# Distribute the samples of each class into each fold in a round-robin manner.
# Classes are visited in sorted order so the result depends only on the seed.
for label in sorted(class_indices):
    indices = class_indices[label]
    if len(indices) < fold_num:
        print(f"Warning: class {label} has only {len(indices)} sample(s); "
              f"some of the {fold_num} folds will not contain it")
    rng.shuffle(indices)
    for i, sample_idx in enumerate(indices):
        folds_by_class[label][i % fold_num].append(sample_idx)

# --------------------------
# 3) Combine class-based folds to form the final folds
# --------------------------
fold_data = []
for i in range(fold_num):
    # Merge the indices of every class for this fold
    fold_indices = [
        sample_idx
        for label in sorted(folds_by_class)
        for sample_idx in folds_by_class[label][i]
    ]
    rng.shuffle(fold_indices)  # Optional: shuffle within this fold
    fold_data.append(fold_indices)

# --------------------------
# 4) For each fold, create a folder containing the training set and validation set
# --------------------------
for i in range(fold_num):
    # Validation indices are those in the current fold
    valid_indices = fold_data[i]
    # Training indices are the ones in all other folds
    train_indices = [
        sample_idx
        for j, other_fold in enumerate(fold_data) if j != i
        for sample_idx in other_fold
    ]

    # Create the directory for the current fold
    fold_dir = os.path.join(fold_save_root, f'fold_{i+1}')
    os.makedirs(fold_dir, exist_ok=True)

    # Write valid_image.txt / valid_label.txt and train_image.txt /
    # train_label.txt; line k of the image file pairs with line k of the
    # label file.
    for split_name, split_indices in (('valid', valid_indices), ('train', train_indices)):
        split_image_path = os.path.join(fold_dir, f'{split_name}_image.txt')
        split_label_path = os.path.join(fold_dir, f'{split_name}_label.txt')
        with open(split_image_path, 'w') as f_img, open(split_label_path, 'w') as f_lbl:
            for sample_idx in split_indices:
                f_img.write(data_file_txt[sample_idx] + '\n')
                f_lbl.write(label_file_txt[sample_idx] + '\n')

    # Print out basic info for this fold
    print(f"Fold {i+1}:")
    print(f"  Train set size: {len(train_indices)}")
    print(f"  Valid set size: {len(valid_indices)}")
    print(f"  Saved in: {fold_dir}\n")

# Optional: print indices for verification
for i, fold in enumerate(fold_data):
    print(f"Fold {i+1} indices: {fold}")



Now we have 5 folds in `config/data/PlainCT/yourdata`, and each fold contains `train_image.txt`, `train_label.txt`, `valid_image.txt`, `valid_label.txt`.