In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#Imports
import os
import tarfile
from glob import glob
import shutil
import SimpleITK as sitk
import pandas as pd
import radiomics
from radiomics import featureextractor
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import silhouette_score
import pickle
import json

In [None]:
# Paths
# /kaggle/input is mounted read-only, so it may only be used as a source.
# Everything this notebook creates (extracted subjects, nnU-Net folders) must
# live under /kaggle/working.
input_dir = "/kaggle/input/brats-2021-task1"
working_dir = "/kaggle/working"
tar_path = os.path.join(input_dir, "BraTS2021_Training_Data.tar")
partial_dir = os.path.join(working_dir, "BraTS2021_Training_Data_part100")
imagesTr_dir = "/kaggle/working/nnUNet_raw/Dataset001_GLI/imagesTr"
labelsTr_dir = "/kaggle/working/nnUNet_raw/Dataset001_GLI/labelsTr"
dataset_json_path = "/kaggle/working/nnUNet_raw/Dataset001_GLI/dataset.json"

# Parameters
# Maps each modality suffix in the source file names to its nnU-Net channel index.
modality_indices = {"t1": 0, "t1ce": 1, "t2": 2, "flair": 3}
n_clusters_range = range(2, 10)  # candidate k values for KMeans
n_pca_components = 20            # PCA dimensionality used before clustering
n_folds = 10                     # number of stratified cross-validation folds
random_state = 42                # seed shared by PCA, KMeans and the fold split

In [None]:
# Step 1: Extract the first 100 subjects from the tar archive
# ----------------------
os.makedirs(partial_dir, exist_ok=True)
with tarfile.open(tar_path, "r:") as archive:
    all_members = archive.getmembers()
    # The first path component of every member is taken as its subject folder;
    # keep the 100 alphabetically-first ones.
    top_level_names = {member.name.split('/')[0] for member in all_members}
    subject_names = sorted(top_level_names)[:100]
    members_to_extract = [
        member for member in all_members
        if member.name.split('/')[0] in subject_names
    ]
    archive.extractall(path=partial_dir, members=members_to_extract)

In [None]:
# ----------------------
# Step 2: Organize nnU-Net structure
# ----------------------
# BraTS 2021 segmentations are expected to use the label values 0/1/2/4
# (necrotic core = 1, edema = 2, enhancing tumor = 4). The dataset.json written
# later in this notebook declares consecutive labels with
# whole_tumor = [1, 2, 3], tumor_core = [2, 3], enhancing_tumor = 3, i.e.
# edema = 1, necrotic core = 2, enhancing = 3. A verbatim copy of the
# segmentation would therefore not match dataset.json, so labels are remapped.
BRATS_TO_NNUNET_LABELS = {0: 0, 1: 2, 2: 1, 4: 3}


def convert_brats_segmentation(src_path, dst_path):
    """Write a copy of a BraTS segmentation with labels remapped for nnU-Net.

    Args:
        src_path: Path of the original ``*_seg.nii.gz`` file.
        dst_path: Path of the remapped label file to create.

    Raises:
        ValueError: If the source contains a label value that is not a key of
            ``BRATS_TO_NNUNET_LABELS`` (e.g. the file was already converted).
    """
    seg_img = sitk.ReadImage(src_path)
    seg = sitk.GetArrayFromImage(seg_img)

    unexpected = set(np.unique(seg).tolist()) - set(BRATS_TO_NNUNET_LABELS)
    if unexpected:
        raise ValueError(
            f"Unexpected label values {sorted(unexpected)} in {src_path}"
        )

    # Build the output from the *original* array so the 1 <-> 2 swap cannot
    # overwrite itself.
    remapped = np.zeros(seg.shape, dtype=np.uint8)
    for brats_value, nnunet_value in BRATS_TO_NNUNET_LABELS.items():
        remapped[seg == brats_value] = nnunet_value

    out_img = sitk.GetImageFromArray(remapped)
    out_img.CopyInformation(seg_img)  # keep spacing / origin / direction
    sitk.WriteImage(out_img, dst_path)


os.makedirs(imagesTr_dir, exist_ok=True)
os.makedirs(labelsTr_dir, exist_ok=True)

for subject in sorted(os.listdir(partial_dir)):
    pid = subject
    subj_path = os.path.join(partial_dir, subject)
    if not os.path.isdir(subj_path):
        # Ignore stray files next to the subject folders.
        continue
    # Copy modalities, renaming <pid>_<modality> to <pid>_<4-digit channel index>
    for mod, idx in modality_indices.items():
        src = os.path.join(subj_path, f"{pid}_{mod}.nii.gz")
        dst = os.path.join(imagesTr_dir, f"{pid}_{idx:04d}.nii.gz")
        shutil.copy(src, dst)
    # Convert label
    convert_brats_segmentation(
        os.path.join(subj_path, f"{pid}_seg.nii.gz"),
        os.path.join(labelsTr_dir, f"{pid}.nii.gz"),
    )

In [None]:
# ----------------------
# Step 3: Radiomics Extraction Class
# ----------------------
class RadiomicsExtractor:
    """Extract one row of PyRadiomics features per subject.

    For every ``<pid>.nii.gz`` in ``labels_dir`` the image
    ``<pid>_<modality_idx as 4 digits>.nii.gz`` is read from ``images_dir`` and
    passed, together with the mask, to a ``RadiomicsFeatureExtractor``.

    Args:
        images_dir: Directory holding the channel images.
        labels_dir: Directory holding the segmentation masks.
        params: Keyword settings forwarded to ``RadiomicsFeatureExtractor``;
            defaults to ``{"binWidth": 25}``.
        modality_idx: Channel index of the image to analyse (3 = FLAIR in this
            notebook's ``modality_indices``).
        whole_tumor_mask: When True (default) every non-zero voxel of the
            segmentation is used as the region of interest. When False the mask
            is passed unchanged and PyRadiomics selects its configured label
            (1 unless overridden in ``params``), which raises for any subject
            that lacks that label.
    """

    def __init__(self, images_dir, labels_dir, params=None, modality_idx=3,
                 whole_tumor_mask=True):
        self.images_dir = images_dir
        self.labels_dir = labels_dir
        self.modality_idx = modality_idx  # e.g., flair (index 3)
        self.whole_tumor_mask = whole_tumor_mask
        self.params = params or {"binWidth": 25}
        self.extractor = featureextractor.RadiomicsFeatureExtractor(**self.params)

    def extract(self):
        """Run the extractor on every subject and return the feature table.

        Returns:
            pandas.DataFrame with a ``PatientID`` column followed by one float
            column per radiomics feature. ``diagnostics_*`` entries returned by
            PyRadiomics (versions, settings, hashes, ...) are omitted because
            they are not numeric features.
        """
        records = []
        # Sort so the row order (and everything derived from it, such as the
        # clustering and the folds) does not depend on filesystem ordering.
        label_paths = sorted(glob(os.path.join(self.labels_dir, "*.nii.gz")))
        for label_path in label_paths:
            pid = os.path.basename(label_path).replace(".nii.gz", "")
            img_path = os.path.join(
                self.images_dir, f"{pid}_{self.modality_idx:04d}.nii.gz"
            )
            img = sitk.ReadImage(img_path)
            mask = sitk.ReadImage(label_path)
            if self.whole_tumor_mask:
                # Collapse all tumor sub-regions into a single label (1).
                mask = sitk.Cast(mask > 0, sitk.sitkUInt8)

            result = self.extractor.execute(img, mask)

            record = {"PatientID": pid}
            for name, value in result.items():
                if name.startswith("diagnostics_"):
                    continue
                try:
                    record[name] = float(value)
                except (TypeError, ValueError):
                    # Not a scalar feature value; leave it out of the table.
                    continue
            records.append(record)
        return pd.DataFrame(records)

# Build the extractor for the nnU-Net image/label folders, run it, and persist
# the resulting feature table as CSV (without the index column).
rad_extractor = RadiomicsExtractor(images_dir=imagesTr_dir, labels_dir=labelsTr_dir)
df_features = rad_extractor.extract()
df_features.to_csv(path_or_buf="gli_radiomics_partial100.csv", index=False)

In [None]:
# ----------------------
# Step 4: Stratified Splitter Class
# ----------------------
class StratifiedSplitter:
    """Cluster subjects on their features and write cluster-stratified folds.

    Pipeline: standardise features -> PCA -> KMeans for every candidate ``k``
    (the ``k`` with the highest silhouette score wins) -> ``StratifiedKFold``
    using the cluster label as the stratification target.

    Args:
        n_clusters_range: Iterable of candidate cluster counts.
        n_pca_components: Number of PCA components to keep.
        n_folds: Number of cross-validation folds.
        random_state: Seed for PCA, KMeans and the fold shuffling.
    """

    # Columns that identify a row or hold a previous result; never features.
    _NON_FEATURE_COLUMNS = ("PatientID", "Cluster")

    def __init__(self, n_clusters_range, n_pca_components, n_folds, random_state=0):
        self.n_clusters_range = n_clusters_range
        self.n_pca_components = n_pca_components
        self.n_folds = n_folds
        self.random_state = random_state
        self.scaler = StandardScaler()
        self.pca = PCA(n_components=self.n_pca_components, random_state=self.random_state)
        self.kmeans = None
        self.best_k = None

    @staticmethod
    def _as_scalar(value):
        """Unwrap 0-d numpy arrays / numpy scalars into plain Python numbers."""
        if isinstance(value, (np.ndarray, np.generic)) and np.ndim(value) == 0:
            return value.item()
        return value

    @staticmethod
    def _feature_matrix(df):
        """Return the purely numeric, finite feature columns of ``df`` as floats.

        ``PatientID``, ``Cluster`` and ``diagnostics_*`` columns are excluded,
        remaining values are coerced to numbers, and any column that still
        contains a missing or infinite value is dropped. ``df`` is not modified.

        Raises:
            ValueError: If no usable feature column remains.
        """
        excluded = [
            col for col in df.columns
            if col in StratifiedSplitter._NON_FEATURE_COLUMNS
            or str(col).startswith("diagnostics_")
        ]
        candidates = df.drop(columns=excluded)
        numeric = {
            col: pd.to_numeric(
                candidates[col].map(StratifiedSplitter._as_scalar), errors="coerce"
            )
            for col in candidates.columns
        }
        features = pd.DataFrame(numeric, index=df.index)
        features = features.replace([np.inf, -np.inf], np.nan).dropna(axis=1, how="any")
        if features.shape[1] == 0:
            raise ValueError("No usable numeric feature columns found.")
        return features.astype(float)

    def fit_cluster(self, df):
        """Cluster the rows of ``df`` and pick ``k`` by silhouette score.

        Args:
            df: Feature table with a ``PatientID`` column.

        Returns:
            ``(df_clustered, X_pca)``: a copy of ``df`` with an added
            ``Cluster`` column, and the PCA-projected feature array.

        Raises:
            ValueError: If there are fewer than 3 rows or no candidate ``k``
                yields a valid clustering.
        """
        features = self._feature_matrix(df)
        n_samples, n_features = features.shape
        if n_samples < 3:
            raise ValueError("At least 3 subjects are required for clustering.")

        X_scaled = self.scaler.fit_transform(features)

        # An integer component count cannot exceed min(n_samples, n_features).
        if isinstance(self.n_pca_components, (int, np.integer)):
            n_components = min(int(self.n_pca_components), n_samples, n_features)
            self.pca.set_params(n_components=n_components)
        X_pca = self.pca.fit_transform(X_scaled)

        # Grid search for optimal clusters
        best_score = float("-inf")
        best_labels = None
        self.best_k = None
        self.kmeans = None
        for k in self.n_clusters_range:
            # silhouette_score needs 2 <= number of labels <= n_samples - 1
            if not 2 <= k <= n_samples - 1:
                continue
            km = KMeans(n_clusters=k, random_state=self.random_state)
            labels = km.fit_predict(X_pca)
            if len(set(labels.tolist())) < 2:
                continue
            score = silhouette_score(X_pca, labels)
            if score > best_score:
                best_score = score
                best_labels = labels
                self.best_k = k
                # Keep the winning fitted model; no refit is needed.
                self.kmeans = km

        if self.best_k is None:
            raise ValueError("No valid cluster count in n_clusters_range for this data.")

        df_clustered = df.copy()
        df_clustered["Cluster"] = best_labels
        return df_clustered, X_pca

    def create_folds(self, df):
        """Write ``fold_XX_train.txt`` / ``fold_XX_val.txt`` into the working dir.

        Args:
            df: Table with ``PatientID`` and ``Cluster`` columns, as returned
                by ``fit_cluster``. Each file lists one patient ID per line.
        """
        y = df["Cluster"].to_numpy()
        pids = df["PatientID"].astype(str).to_numpy()
        skf = StratifiedKFold(n_splits=self.n_folds, shuffle=True, random_state=self.random_state)

        for fold, (tr, va) in enumerate(skf.split(pids, y)):
            for subset, indices in (("train", tr), ("val", va)):
                with open(f"fold_{fold:02d}_{subset}.txt", "w") as f:
                    f.write("\n".join(pids[indices]))
        print(f"✅ Created {self.n_folds} stratified folds (k={self.best_k} clusters).")

In [None]:
# ----------------------
# Step 5: Cluster the radiomics features and write the stratified folds
# ----------------------
# Use the values from the parameter cell rather than repeating the literals.
splitter = StratifiedSplitter(
    n_clusters_range=n_clusters_range,
    n_pca_components=n_pca_components,
    n_folds=n_folds,
    random_state=random_state,
)
# fit_cluster returns (clustered DataFrame, PCA-projected features); only the
# DataFrame is needed to build the folds.
df_clustered, X_pca = splitter.fit_cluster(df_features)
splitter.create_folds(df_clustered)

In [None]:
# Collect the fold_XX_{train,val}.txt files into nnU-Net's splits_final.json.
json_path = "/kaggle/working/nnUNet_preprocessed/Dataset001_GLI/splits_final.json"


def _read_ids(path):
    """Return the non-empty, whitespace-stripped lines of ``path``."""
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]


split_list = [
    {
        'train': _read_ids(f"fold_{fold:02d}_train.txt"),
        'val': _read_ids(f"fold_{fold:02d}_val.txt"),
    }
    for fold in range(n_folds)
]

# The preprocessed folder is only created by nnU-Net's preprocessing step,
# which runs later in this notebook, so make sure it exists before writing.
os.makedirs(os.path.dirname(json_path), exist_ok=True)
with open(json_path, "w") as f:
    json.dump(split_list, f, indent=4)

print(f"✅ Saved custom split file to: {json_path}")


**nnU-Net**

In [None]:
# dataset.json file
# Count the training cases from the image files, which are named
# <patientID>_<4-digit channel>.nii.gz. Patient IDs themselves contain an
# underscore (e.g. BraTS2021_00000), so only the trailing channel suffix is
# stripped.
file_ending = ".nii.gz"
# `glob` is imported as a function (`from glob import glob`), so call it directly.
filenames = sorted(glob(os.path.join(imagesTr_dir, f"*{file_ending}")))
patient_ids = {
    os.path.basename(f)[: -len(file_ending)].rsplit("_", 1)[0] for f in filenames
}
num_training = len(patient_ids)

# Construct dataset.json content
dataset_json = {
    "channel_names": {
        "0": "T1",
        "1": "T1CE",
        "2": "T2",
        "3": "FLAIR"
    },
    # Region-based labels: each region is the union of the listed label values.
    "labels": {
        "background": 0,
        "whole_tumor": [1, 2, 3],
        "tumor_core": [2, 3],
        "enhancing_tumor": 3
    },
    "regions_class_order": [1, 2, 3],
    "numTraining": num_training,
    "file_ending": file_ending,
    "overwrite_image_reader_writer": "SimpleITKIO"
}

# Ensure directory exists
os.makedirs(os.path.dirname(dataset_json_path), exist_ok=True)

# Write to file
with open(dataset_json_path, "w") as f:
    json.dump(dataset_json, f, indent=4)

dataset_json_path


In [None]:
git clone https://github.com/MIC-DKFZ/nnUNet.git
cd nnUNet
pip install -e .

In [None]:
# 6.1) Set nnU-Net environment variables
# `export` is shell syntax and would not persist across cells; setting
# os.environ makes the values visible to every later `!command`.
# nnU-Net v2 reads `nnUNet_raw` (`nnUNet_raw_data_base` is the v1 name).
os.environ["nnUNet_raw"] = "/kaggle/working/nnUNet_raw"
os.environ["nnUNet_preprocessed"] = "/kaggle/working/nnUNet_preprocessed"
os.environ["nnUNet_results"] = "/kaggle/working/nnUNet_results"

In [None]:
#Install Hidden Layer
pip install --upgrade git+https://github.com/FabianIsensee/hiddenlayer.git

In [None]:
# Plan & preprocess 
nnUNetv2_plan_and_preprocess -d Dataset001_GLI --verify_dataset_integrity

In [None]:
# Train all folds for 3d_fullres nnU-Net v2

nnUNetv2_train Dataset001_GLI 3d_fullres all --epochs 100 --npz --val best --c