In [None]:
# %% [markdown]
# # LongiTumorSense Model Training
# **Training on MU-Glioma-Post Dataset**
# - Segmentation: nnUNet
# - Classification: 3D DenseNet
# - Survival: CoxPH Model

In [5]:
import nibabel as nib
import numpy  as np
from sklearn.model_selection import train_test_split
import torch
import os
import monai
from monai.data import Dataset ,DataLoader
from monai.transforms import ( Compose , LoadImaged , EnsureChannelFirstd, ScaleIntensityd,RandRotated,RandFlipd,RandZoomd,ToTensord)
from monai.networks.nets import DenseNet121,Unet
from monai.metrics import DiceMetric
from monai.losses import DiceLoss, FocalLoss
import wandb
import pandas as pd
from lifelines import CoxPHFitter


In [1]:
import torch

In [2]:
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

print(f"Using {device} device.")


Using cpu device.


In [7]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Data Clearner and saved into the drive**

In [None]:
import os
import shutil


raw_root = "/content/drive/My Drive/MU-Glioma-Post"

output_root="/content/drive/My Drive/clean_data"



imagesTr = os.path.join(output_root, "imagesTr")
labelsTr = os.path.join(output_root, "labelsTr")
os.makedirs(os.path.join(output_root, "imagesTs"), exist_ok=True)
os.makedirs(imagesTr, exist_ok=True)
os.makedirs(labelsTr, exist_ok=True)

print("Raw dataset path:", raw_root)
print("nnU-Net dataset path:", output_root)


progress_file = os.path.join(output_root, "converted_cases.txt")

if os.path.exists(progress_file):
    with open(progress_file, "r") as f:
        converted_cases = set(line.strip() for line in f)
else:
    converted_cases = set()
print(f"Found {len(converted_cases)} cases already processed.")

Raw dataset path: /content/drive/My Drive/MU-Glioma-Post
nnU-Net dataset path: /content/drive/My Drive/clean_data
Found 4 cases already processed.


In [None]:
def is_nifti(fname):
  return fname.endswith(".nii") or fname.endswith(".nii.gz")

In [None]:
mod_priority=[
      't1c','t1gd','t1ce',  # contrast-enhanced T1 variants
    't1n','t1',           # native T1
    'flair','t2f','t2flair','t2w','t2' # T2 /flair variants
]

In [None]:
def file_priority(fname):
  lf=fname.lower()
  for i,k in enumerate(mod_priority):
    if k in lf:
      return i
  return len(mod_priority) + hash(lf) % 1000

In [None]:
import re
import os
import shutil
from tqdm import tqdm
canonical_modalities = None




skipped = []
new_cases_count = 0





total_timepoints = sum(
    1 for p in sorted(os.listdir(raw_root))
    if os.path.isdir(os.path.join(raw_root, p))
    for tp in sorted(os.listdir(os.path.join(raw_root, p)))
    if os.path.isdir(os.path.join(raw_root, p, tp))
)







with tqdm(total=total_timepoints, desc="Processing cases") as pbar:
    for patient_id in sorted(os.listdir(raw_root)):
        patient_path = os.path.join(raw_root, patient_id)
        if not os.path.isdir(patient_path):
            pbar.update(1)
            continue






        for tp in sorted(os.listdir(patient_path)):
            tp_path = os.path.join(patient_path, tp)
            if not os.path.isdir(tp_path):
                pbar.update(1)
                continue



            tp_clean = re.sub(r"\s+", "_", tp)
            tp_clean = re.sub(r"[^A-Za-z0-9_-]", "_", tp_clean)
            case_id = f"{patient_id}_{tp_clean}"



            if case_id in converted_cases:
                pbar.update(1)
                continue


            files = [f for f in os.listdir(tp_path) if is_nifti(f)]
            if not files:
                skipped.append((patient_id, tp, "no nifti files"))
                pbar.update(1)
                continue



            label_candidates = [f for f in files if any(x in f.lower() for x in ["mask", "tumor", "seg", "label"])]
            if len(label_candidates) == 0:
                skipped.append((patient_id, tp, "no label found"))
                pbar.update(1)
                continue


            label_file = label_candidates[0]



            image_files = [f for f in files if f != label_file]
            if len(image_files) == 0:
                skipped.append((patient_id, tp, "no image files"))
                pbar.update(1)
                continue



            image_files_sorted = sorted(image_files, key=file_priority)
            if canonical_modalities is None:
                canonical_modalities = image_files_sorted.copy()
                print("\nDetected modality order (from first sample)")
                for idx, nm in enumerate(canonical_modalities):
                    print(f"{idx}: {nm}")
                print("If this order is wrong adjust mod_priority list in the script.")


            else:
                if len(image_files_sorted) != len(canonical_modalities):
                    skipped.append(
                        (patient_id, tp, f"modality count mismatch {len(image_files_sorted)} vs {len(canonical_modalities)}")
                    )
                    pbar.update(1)
                    continue



            for i, fname in enumerate(image_files_sorted):
                src = os.path.join(tp_path, fname)
                destination = os.path.join(imagesTr, f"{case_id}_{i:04d}.nii.gz")
                shutil.copy(src, destination)

            shutil.copy2(os.path.join(tp_path, label_file), os.path.join(labelsTr, f"{case_id}.nii.gz"))



            converted_cases.add(case_id)
            with open(progress_file, "a") as f:
                f.write(case_id + "\n")

            new_cases_count += 1
            pbar.update(1)

print(f"\nConversion finished. {len(converted_cases)} total cases processed so far.")
if skipped:
    print(f"{len(skipped)} timepoints skipped (see sample):")
    for s in skipped[:10]:
        print(" ", s)

print(f"imagesTr files: {len(os.listdir(imagesTr))}, labelsTr files: {len(os.listdir(labelsTr))}")
print(f"Newly processed this run: {new_cases_count}")

**Get clean data from drive into local colab for further processing**

In [None]:
from tqdm import tqdm
import os
import shutil


drive_clean_path = "/content/drive/MyDrive/clean_data"
local_clean_path = "/content/clean_data_local"

os.makedirs(local_clean_path,exist_ok=True)


all_files=[]

for root,dirs,files in os.walk(drive_clean_path):
    for file in files:
      source_file=os.path.join(root,file)
      relative_path=os.path.relpath(source_file,drive_clean_path)
      destination_file=os.path.join(local_clean_path,relative_path)
      all_files.append((source_file, destination_file))


remaining_files=[]
for source_file,destination_file in all_files:
    if os.path.exists(destination_file)and os.path.getsize(destination_file)==os.path.getsize(source_file):
       continue
    remaining_files.append((source_file,destination_file))

for source_file,destination_file in tqdm(remaining_files, desc="copying files", unit="files"):
    os.makedirs(os.path.dirname(destination_file),exist_ok=True)
    shutil.copy2(source_file , destination_file)

print(f"copy complete!{len(all_files)-len(remaining_files)} files already exists,{len(remaining_files)} new files copied")
print(" Clean dataset loaded from Drive.")


copying files:   1%|          | 25/2972 [00:16<32:58,  1.49files/s]

**This is for checking the length of file for each imageTr and labelTr**

In [6]:
imagesTr_path=os.path.join(local_clean_path,"imagesTr")
labelsTr_path=os.path.join(local_clean_path,"labelsTr")

length_imageTr=len([f for f in os.listdir(imagesTr_path) if os.path.isfile(os.path.join(imagesTr_path,f))])
length_labelsTr=len([f for f in os.listdir(labelsTr_path) if os.path.isfile(os.path.join(labelsTr_path,f))])

print(f" imageTr files:{length_imageTr}")
print(f" labelsTr files:{length_labelsTr}")

 imageTr files:2376
 labelsTr files:594


**After Disconnect:**

In [None]:
!pip install monai torch torchvision nnunet pyradiomics lifelines pydicom nibabel wandb -q

**Install a Python package directly from its GitHub source code, not from the normal package store (PyPI).”**

**This function loads an MRI file, converts it to a NumPy array, and scales all values to between 0 and 1 for easier analysis.**

In [None]:
!pip install git+https://github.com/MIC-DKFZ/nnUNet.git

In [8]:
import nibabel as nib
import numpy  as np

def load_and_preprocess(patient_path):
    img = nib.load(patient_path)
    data = img.get_fdata()
    data = (data - np.min(data)) / (np.max(data) - np.min(data))
    return data

**Renames the file into nnuNet naming style**

**Copy and rename image files**

**Copy and rename label files**


In [9]:
import os
import re
import shutil
from glob import glob
from tqdm import tqdm

source_images = "/content/drive/MyDrive/clean_data/imagesTr"
source_labels = "/content/drive/MyDrive/clean_data/labelsTr"

destination_imagesTr = "/content/clean_data_local/imagesTr"
destination_labelsTr = "/content/clean_data_local/labelsTr"

os.makedirs(destination_imagesTr, exist_ok=True)
os.makedirs(destination_labelsTr, exist_ok=True)

image_files = glob(os.path.join(source_images, "*.nii.gz"))
label_files = glob(os.path.join(source_labels, "*.nii.gz"))



print(f"Copying {len(image_files)} image files...")


for scan_path in tqdm(image_files, desc="Images copied", unit="file"):
    filename = os.path.basename(scan_path)
    match = re.match(r"(PatientID_\d+)_Timepoint_(\d+)_(\d{4})\.nii\.gz", filename)
    if match:
        patient_id, timepoint, modality_idx = match.groups()
        case_id = f"{patient_id}_Timepoint_{timepoint}"
        destination_path = os.path.join(destination_imagesTr, f"{case_id}_{modality_idx}.nii.gz")
        if scan_path != destination_path:
            shutil.copy(scan_path, destination_path)

print(f"Copying {len(label_files)} label files...")


for label_path in tqdm(label_files, desc="Labels copied", unit="file"):
    filename = os.path.basename(label_path)
    match = re.match(r"(PatientID_\d+)_Timepoint_(\d+)\.nii\.gz", filename)
    if match:
        patient_id, timepoint = match.groups()
        case_id = f"{patient_id}_Timepoint_{timepoint}"
        dst_path = os.path.join(destination_labelsTr, f"{case_id}.nii.gz")
        if label_path != dst_path:
            shutil.copy(label_path, dst_path)

print(f"Total copied {len(os.listdir(destination_imagesTr))} scans to {destination_imagesTr}")
print(f"Total copied {len(os.listdir(destination_labelsTr))} labels to {destination_labelsTr}")


Copying 2376 image files...


Images copied: 100%|██████████| 2376/2376 [02:47<00:00, 14.22file/s]


Copying 594 label files...


Labels copied: 100%|██████████| 594/594 [02:35<00:00,  3.82file/s]

Total copied 2376 scans to /content/clean_data_local/imagesTr
Total copied 594 labels to /content/clean_data_local/labelsTr





**This code creates a dataset.json file that describes your medical imaging dataset for nnU-Net.**

In [12]:
import os
import json
import re


output_root = "/content/clean_data_local"
imagesTr_path = os.path.join(output_root, "imagesTr")
labelsTr_path = os.path.join(output_root, "labelsTr")


num_cases = len([f for f in os.listdir(labelsTr_path) if f.endswith(".nii.gz")])


first_case_files = sorted([f for f in os.listdir(imagesTr_path) if f.endswith(".nii.gz")])
modality_count = len(set([re.search(r'_(\d{4})\.nii\.gz$', f).group(1) for f in first_case_files]))


dataset_json = {
    "name": "MU-Glioma-Post",
    "description": "Post-operative glioma segmentation",
    "reference": "Your reference here",
    "licence": "Your license here",
    "release": "1.0",
    "modality": {str(i): f"MRI_modality_{i}" for i in range(modality_count)},
    "labels": {
        "0": "background",
        "1": "tumor"
    },
    "numTraining": num_cases,
    "file_ending": ".nii.gz"
}


with open(os.path.join(output_root, "dataset.json"), 'w') as f:
    json.dump(dataset_json, f, indent=4)

print(f"dataset.json created at: {os.path.join(output_root, 'dataset.json')}")
print(json.dumps(dataset_json, indent=4))


dataset.json created at: /content/clean_data_local/dataset.json
{
    "name": "MU-Glioma-Post",
    "description": "Post-operative glioma segmentation",
    "reference": "Your reference here",
    "licence": "Your license here",
    "release": "1.0",
    "modality": {
        "0": "MRI_modality_0",
        "1": "MRI_modality_1",
        "2": "MRI_modality_2",
        "3": "MRI_modality_3"
    },
    "labels": {
        "0": "background",
        "1": "tumor"
    },
    "numTraining": 594,
    "file_ending": ".nii.gz"
}


In [None]:
import wandb

# Print your default W&B username (entity)
print("Your W&B username:", wandb.Api().default_entity)

# Alternative: Check after login
wandb.login()
print("Logged in as:", wandb.Api().default_entity)

In [None]:
wandb.init(project="LongiTumorSense",entity="numl-f21-35629-numl")

In [44]:
import os
from sklearn.model_selection import train_test_split

def prepare_dataset(imagesTr, labelsTr, test_size=0.2):

    image_files = [f for f in os.listdir(imagesTr) if f.endswith(".nii.gz")]
    case_ids = sorted(list(set("_".join(f.split("_")[:-1]) for f in image_files)))

    print(f"Found {len(case_ids)} unique cases.")

    # Split into train and test
    train_cases, test_cases = train_test_split(case_ids, test_size=test_size, random_state=42)

    missing_labels = []

    def build_file_list(cases):
        file_list = []
        for case_id in cases:
            # Build list of all 4 modalities for this case
            modalities = [
                os.path.join(imagesTr, f"{case_id}_0000.nii.gz"),  # FLAIR
                os.path.join(imagesTr, f"{case_id}_0001.nii.gz"),  # T1
                os.path.join(imagesTr, f"{case_id}_0002.nii.gz"),  # T1ce
                os.path.join(imagesTr, f"{case_id}_0003.nii.gz")   # T2
            ]
            label_path = os.path.join(labelsTr, f"{case_id}.nii.gz")

            if not os.path.exists(label_path):
                missing_labels.append(case_id)
                continue

            file_list.append({
                "image": modalities,
                "label": label_path,
                "name": case_id
            })
        return file_list

    train_files = build_file_list(train_cases)
    test_files = build_file_list(test_cases)

    print(f"Length of training dataset: {len(train_files)}")
    print(f"Length of validation dataset: {len(test_files)}")

    if missing_labels:
        print(f"Missing labels for {len(missing_labels)} cases: {missing_labels[:10]}{'...' if len(missing_labels) > 10 else ''}")

    return train_files, test_files


In [45]:

train_files, test_files = prepare_dataset(
    "/content/clean_data_local/nnUNet_raw_data/Task001_Glioma/imagesTr",
    "/content/clean_data_local/nnUNet_raw_data/Task001_Glioma/labelsTr"
)


Found 594 unique cases.
Length of training dataset: 475
Length of validation dataset: 119


In [46]:
transform_basic=Compose([
    LoadImaged(keys=["image"]),
    EnsureChannelFirstd(keys=["image"]),
    ToTensord(keys=["image"])

])

In [47]:
batch_size=4
train_dataset_basic=Dataset(data=train_files,transform=transform_basic)
dataset_loader_basic=DataLoader(train_dataset_basic,batch_size=batch_size,shuffle=True)
batch_shape=next(iter(dataset_loader_basic))["image"].shape
print("Getting batches of shape:",batch_shape)

Getting batches of shape: torch.Size([4, 4, 240, 240, 155])


In [55]:
def get_mean_std(dataset_loader_basic):
  """ Computes the mean and std of image data.
  Input :a DataLoader producing tesnors of shpae [batch_size, channesl , pixels_x, pixel_y]
  Output: the mean of each channel as a tensor , the standard deviaton of each channel as a tensor
  formatted as a tuple ( means[channels], std[channels])
  """

  channels_sum,channels_squared_sum, num_batches=0,0,0
  for batch_shape in tqdm(dataset_loader_basic, desc="Computing mean and std", leave=False):
      data = batch_shape["image"]
      channels_sum += torch.mean(data,dim=[0,2,3])
      channels_squared_sum+=torch.mean(data**2,dim=[0,2,3])
      num_batches+=1

  mean=channels_sum/num_batches
  std=(channels_squared_sum/num_batches-mean**2)**0.5

  return mean,std

In [1]:

mean, std = get_mean_std(train_dataset_basic)
print("Mean:", mean)
print("Std:", std)

NameError: name 'get_mean_std' is not defined

In [None]:
train_trainsforms=Compose([
    LoadImaged(keys=["image","label"]),
    EnsureChannelFirstd(keys=["image", "label"]),
    ScaleIntensityd(keys=["image"]),
    RandRotated(keys=["image", "label"],range_x=0.3,prob=0.5),
    RandFlipd(keys=["image", "label"],prob=0.5),
    RandZoomd(keys=["image", "label"],min_zoom=0.9,max_zoom=1.1,prob=0.5),
    NormalizeIntensityd(keys=["image"],subtrahend=mean,divisor=std),
    ToTensord(keys=["image", "label"]),

])


val_transforms = Compose([
    LoadImaged(keys=["image", "label"]),
    EnsureChannelFirstd(keys=["image", "label"]),
    ScaleIntensityd(keys=["image"]),
    NormalizeIntensityd(keys=["image"], subtrahend=mean, divisor=std),
    ToTensord(keys=["image", "label"])
])


In [None]:
train_dataset_norm =Dataset.ImageFolder(data=train_files,transform=train_trainsforms)
dataset_loader_norm=DataLoader(train_dataset_norm,batch_size=batch_size,shuffle=True)
batch_shape=next(iter(dataset_loader_norm))["image"].shape
print("Getting batches of shape:",batch_shape)
print(train_dataset_norm)

In [None]:
batch_size=2
test_dataset_norm=Dataset.ImageFolder(data=test_files,transform=val_transforms)
dataset_loader_test_norm=DataLoader(test_dataset_norm,batch_size=batch_size,shuffle=False)
batch_shape=next(iter(dataset_loader_test_norm))["image"].shape
print("Getting batches of shape:",batch_shape)
print(type(test_dataset_norm))

In [None]:

norm_mean, norm_std = get_mean_std(dataset_loader_norm)

print(f"Mean: {norm_mean}")
print(f"Standard deviation: {norm_std}")


In [None]:
norm_mean, norm_std = get_mean_std(dataset_loader_test_norm)

print(f"Mean: {norm_mean}")
print(f"Standard deviation: {norm_std}")

In [None]:
train_loader=DataLoader(train_ds,batch_size=4,shuffle=True)
val_loader=DataLoader(val_ds,batch_size=2,shuffle=False)
print(type(train_loader))
print(type(val_loader))

<class 'monai.data.dataloader.DataLoader'>
<class 'monai.data.dataloader.DataLoader'>


In [None]:
train_files = random_split(train_dataset_norm ,[0.8])

length_train = len(train_dataset_norm)
length_dataset = len(train_dataset_norm)
percent_train = np.round(100 * length_train / length_dataset, 2)

print(f"Train data is {percent_train}% of full data")


In [None]:
test_files = random_split(test_dataset_norm,[0.2])
length_test = len(val_dataset)
length_dataset = len(test_dataset_norm)
percent_test = np.round(100 * length_test / length_dataset, 2)
print(f"Our Test data is {percent_test}% of full data")

In [None]:
import torch


**# Convert dataset to nnUNet format**

In [None]:
import os
os.environ['nnUNet_raw_data_base'] = '/content/clean_data_local/nnUNet_raw_data'
os.environ['nnUNet_preprocessed'] = '/content/nnUNet_preprocessed'
os.environ['RESULTS_FOLDER'] = '/content/nnUNet_results'


os.makedirs('/content/clean_data_local/nnUNet_raw_data', exist_ok=True)
os.makedirs('/content/nnUNet_preprocessed', exist_ok=True)
os.makedirs('/content/nnUNet_results', exist_ok=True)

print("nnUNet_raw_data_base =", os.environ['nnUNet_raw_data_base'])
print("nnUNet_preprocessed =", os.environ['nnUNet_preprocessed'])
print("RESULTS_FOLDER =", os.environ['RESULTS_FOLDER'])

In [None]:
!mkdir -p /content/clean_data_local/nnUNet_raw_data/Task001_Glioma

In [None]:
!mv /content/clean_data_local/imagesTr /content/clean_data_local/nnUNet_raw_data/Task001_Glioma/
!mv /content/clean_data_local/labelsTr /content/clean_data_local/nnUNet_raw_data/Task001_Glioma/
!mv /content/clean_data_local/dataset.json /content/clean_data_local/nnUNet_raw_data/Task001_Glioma/


In [None]:
!rm -rf /content/nnUNet_preprocessed/*


In [None]:
import os
os.environ['nnUNet_raw_data_base'] = '/content/clean_data_local'



In [None]:
import json

task_ids_path = "/content/clean_data_local/nnunet_task_ids.json"

task_ids = {
    "1": "Task001_Glioma"
}

with open(task_ids_path, "w") as f:
    json.dump(task_ids, f)

print(f"Created {task_ids_path}")


Created /content/clean_data_local/nnunet_task_ids.json


In [None]:
!nnUNet_plan_and_preprocess -t 1 --verify_dataset_integrity









Please cite the following paper when using nnUNet:

Isensee, F., Jaeger, P.F., Kohl, S.A.A. et al. "nnU-Net: a self-configuring method for deep learning-based biomedical image segmentation." Nat Methods (2020). https://doi.org/10.1038/s41592-020-01008-z


If you have questions or suggestions, feel free to open an issue at https://github.com/MIC-DKFZ/nnUNet

Traceback (most recent call last):
  File "/usr/local/bin/nnUNet_plan_and_preprocess", line 8, in <module>
    sys.exit(main())
             ^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/nnunet/experiment_planning/nnUNet_plan_and_preprocess.py", line 105, in main
    verify_dataset_integrity(join(nnUNet_raw_data, task_name))
  File "/usr/local/lib/python3.11/dist-packages/nnunet/preprocessing/sanity_checks.py", line 105, in verify_dataset_integrity
    training_cases = dataset['training']
                     ~~~~~~~^^^^^^^^^^^^
KeyError: 'training'


In [None]:
!nnUNet_train 3d_fullres nnUNetTrainerV2 Task001_Glioma 0 --npz

/bin/bash: line 1: nnUNet_train: command not found
