# To create a new dataset:

In [10]:
import os
import numpy as np
import nibabel as nib
import random
import csv
import matplotlib.pyplot as plt
import pandas as pd
import glob


In [3]:
import subprocess

# Run the z_mount_usb.sh helper script (presumably mounts the drive that holds
# the dataset). check=True turns a non-zero exit status into CalledProcessError.
try:
    subprocess.run(['bash', 'z_mount_usb.sh'], check=True)
except subprocess.CalledProcessError as mount_error:
    print("Mount failed:", mount_error)
else:
    # Only reached when the script exited with status 0.
    print("Mount succeeded via bash script!")


Mount succeeded via bash script!


In [4]:
def get_paths(patient_id):
    """Build the image and mask file paths for one patient.

    The patient folder is ``BRA_TS_ROOT`` followed by the patient id
    zero-padded to 3 digits; both files inside it are named
    ``BraTS20_Training_<id>_<type><EXT>`` with ``<type>`` being
    ``IMAGE_TYPE`` for the image and ``MASK_TYPE`` for the mask.

    Args:
        patient_id: Integer patient number.

    Returns:
        Tuple ``(img_path, mask_path)``.
    """
    padded_id = format(patient_id, "03d")  # ensure 3 digits
    patient_dir = "{}{}".format(BRA_TS_ROOT, padded_id)
    img_path, mask_path = (
        os.path.join(
            patient_dir,
            "BraTS20_Training_{}_{}{}".format(padded_id, volume_type, EXT),
        )
        for volume_type in (IMAGE_TYPE, MASK_TYPE)
    )
    return img_path, mask_path


def load_volume(image_path, mask_path):
    """Load an image volume and its mask volume with the first two axes swapped.

    Args:
        image_path: Path of the image file, opened with ``nib.load``.
        mask_path: Path of the mask file, opened with ``nib.load``.

    Returns:
        Tuple ``(img, mask)`` of the arrays returned by ``get_fdata()``, each
        transposed with axis order ``(1, 0, 2)``.
    """
    # NOTE(review): the original comment gives the raw shape as (240,240,155);
    # not verified here.
    img, mask = (nib.load(path).get_fdata() for path in (image_path, mask_path))
    # Swap rows and columns, keep the depth axis last.
    axis_order = (1, 0, 2)
    return img.transpose(axis_order), mask.transpose(axis_order)


def compute_tumor_stats(mask_slice, tumor_labels=None):
    """
    Measure how much of a 2-D mask slice is tumor and where the tumor sits.

    Args:
        mask_slice: 2-D array of segmentation labels.
        tumor_labels: Label values counted as tumor. When omitted, the
            module-level ``TUMOR_LABELS`` is used.

    Returns:
      - fraction of the slice's pixels that carry a tumor label
        (tumor pixels / all pixels, i.e. a value in [0, 1], not scaled by 100)
      - centroid (x,y) of those pixels as floats (mean column, mean row),
        or None when the slice contains no tumor pixels
    """
    if tumor_labels is None:
        tumor_labels = TUMOR_LABELS

    h, w = mask_slice.shape
    total_pixels = h * w
    if total_pixels == 0:
        # Zero-size slice: nothing to measure, and avoids dividing by zero.
        return 0.0, None

    tumor_mask = np.isin(mask_slice, tumor_labels)
    tumor_pixels = np.sum(tumor_mask)
    percent = tumor_pixels / total_pixels

    if tumor_pixels == 0:
        return percent, None

    # np.where returns (row indices, column indices): rows are y, columns are x.
    ys, xs = np.where(tumor_mask)
    return percent, (float(np.mean(xs)), float(np.mean(ys)))


def classify_position_loose(centroid):
    """Bucket a centroid into a coarse region of a 240x240 slice.

    A band of 20% of the slice width (48 px) along each edge defines the
    outer regions. The horizontal bands are tested first, so a centroid in a
    corner is reported as "left" or "right".

    Args:
        centroid: ``(cx, cy)`` pair, or None.

    Returns:
        "left", "right", "top", "bottom" or "center"; None when ``centroid``
        is None.
    """
    if centroid is None:
        return None

    cx, cy = centroid
    H, W = 240, 240

    # 20% margins instead of 25% -> larger variety
    band = 0.20 * W   # = 48px
    far_x = W - band
    far_y = H - band

    if cx < band:
        region = "left"
    elif cx > far_x:
        region = "right"
    elif cy < band:
        region = "top"
    elif cy > far_y:
        region = "bottom"
    else:
        region = "center"
    return region



def save_slice(patient_id, slice_idx, brain_slice, mask_slice):
    """
    Write one brain slice and its mask as .npy and .png files.

    Files are named ``<patient_id as 3 digits>_<slice_idx>`` with a ``_mask``
    suffix for the mask; .npy files go to ``OUTPUT_NPY_DIR`` and .png files to
    ``OUTPUT_PNG_DIR``. Both directories are created when missing.
    """
    # ensure dirs
    for directory in (OUTPUT_NPY_DIR, OUTPUT_PNG_DIR):
        os.makedirs(directory, exist_ok=True)

    stem = "{:03d}_{}".format(patient_id, slice_idx)

    for suffix, array in (("", brain_slice), ("_mask", mask_slice)):
        np.save(os.path.join(OUTPUT_NPY_DIR, stem + suffix + ".npy"), array)

    # Optional PNGs (comment out if not needed)
    import matplotlib.pyplot as plt
    png_jobs = (
        ("", brain_slice, "gray"),
        ("_mask", mask_slice, "hot"),
    )
    for suffix, array, colormap in png_jobs:
        plt.imsave(os.path.join(OUTPUT_PNG_DIR, stem + suffix + ".png"), array, cmap=colormap)


In [5]:
def _tumor_centroid(mask_slice):
    """Return the (x, y) centroid of the TUMOR_LABELS pixels of a 2-D slice."""
    ys, xs = np.where(np.isin(mask_slice, TUMOR_LABELS))
    return np.array([np.mean(xs), np.mean(ys)])


def build_csv_simple(num_slices_per_patient=3, patient_range=range(1, 370)):
    """
    Build the slice-selection CSV, picking slices per patient by tumor size.

    For each patient whose image and mask files exist, the slices holding at
    least one ``TUMOR_LABELS`` pixel are ranked by tumor pixel count, largest
    first (n = number of such slices), and these are picked:
        - rank 0: the largest tumor slice
        - rank int(0.65 * (n - 1))
        - rank int(0.25 * (n - 1))
        - only when ``num_slices_per_patient == 4``: the tumor slice whose
          centroid is farthest from the largest slice's centroid
    Ranks count down from the largest slice, so the 0.65 pick is a smaller
    tumor than the 0.25 pick.

    Picks that land on the same slice (possible when n is small) are written
    once, so a patient can contribute fewer than ``num_slices_per_patient``
    rows. Patients with fewer than 3 tumor slices, missing files, or volumes
    that fail to load are skipped.

    Writes ONLY a CSV (no npy, no png): ``OUTPUT_CSV`` is overwritten with a
    ``Patient,SliceIndex`` header and then appended to patient by patient.

    Args:
        num_slices_per_patient: Maximum rows written per patient; the value 4
            additionally enables the centroid-distance pick.
        patient_range: Iterable of integer patient ids to scan.
    """

    # Start fresh
    with open(OUTPUT_CSV, "w", newline="") as f:
        csv.writer(f).writerow(["Patient", "SliceIndex"])

    print("Building simple, size-based dataset...")

    for pid in patient_range:
        img_path, mask_path = get_paths(pid)
        print(img_path, mask_path)
        if not (os.path.exists(img_path) and os.path.exists(mask_path)):
            continue
        try:
            brain, mask = load_volume(img_path, mask_path)
        except Exception as exc:
            # Best-effort scan: report and move on. Catching Exception (not a
            # bare except) lets Ctrl-C / kernel interrupt still stop the run.
            print(f"Skipping patient {pid:03d} (load error): {exc}")
            continue

        depth = brain.shape[2]

        # Tumor pixel count of every slice in one vectorized pass
        tumor_counts = np.isin(mask[:, :, :depth], TUMOR_LABELS).sum(axis=(0, 1))
        sizes = [(s, int(count)) for s, count in enumerate(tumor_counts) if count > 0]

        if len(sizes) < 3:
            continue  # not enough tumor slices -> skip patient

        # Sort by tumor size descending (stable: ties keep slice order)
        sizes.sort(key=lambda item: item[1], reverse=True)

        last_rank = len(sizes) - 1
        indexes = [
            sizes[0][0],                       # largest slice
            sizes[int(0.65 * last_rank)][0],
            sizes[int(0.25 * last_rank)][0],
        ]

        # Optional: Add a location-diversity slice
        if num_slices_per_patient == 4:
            first_centroid = _tumor_centroid(mask[:, :, sizes[0][0]])
            distances = [
                (np.linalg.norm(_tumor_centroid(mask[:, :, s]) - first_centroid), s)
                for s, _ in sizes
            ]
            # pick slice with farthest centroid (ties -> highest slice index)
            indexes.append(max(distances)[1])

        # For small n the rank picks can collide (e.g. n == 3 gives rank 0
        # twice); drop repeats, keeping first occurrence, so the CSV never
        # lists the same slice twice.
        indexes = list(dict.fromkeys(indexes))

        # Write to CSV
        with open(OUTPUT_CSV, "a", newline="") as f:
            csv.writer(f).writerows(
                [f"{pid:03d}", s] for s in indexes[:num_slices_per_patient]
            )

        del brain, mask

    print(f"\nDone. CSV written to {OUTPUT_CSV}")


In [6]:
def export_from_csv(csv_path, out_npy="training_set_npy", out_png="training_set_png"):
    """Export every slice listed in a CSV as .npy and .png files.

    The CSV must have ``Patient`` and ``SliceIndex`` columns. Rows are grouped
    by patient so each volume is loaded once; for every listed slice the
    image and mask are written to ``out_npy`` (as ``<pid>_<s>.npy`` and
    ``<pid>_<s>_mask.npy``) and to ``out_png`` (same names, ``.png``).

    Args:
        csv_path: Path of the CSV to read.
        out_npy: Directory for the .npy files; created when missing.
        out_png: Directory for the .png files; created when missing.
    """
    for directory in (out_npy, out_png):
        os.makedirs(directory, exist_ok=True)

    with open(csv_path, "r") as f:
        records = list(csv.DictReader(f))

    # patient id -> slice indexes, in CSV order
    patient_slices = {}
    for record in records:
        patient = int(record["Patient"])
        slice_index = int(record["SliceIndex"])
        if patient not in patient_slices:
            patient_slices[patient] = []
        patient_slices[patient].append(slice_index)

    print("Exporting slices...")

    for pid in patient_slices:
        img_path, mask_path = get_paths(pid)
        brain, mask = load_volume(img_path, mask_path)

        for s in patient_slices[pid]:
            stem = "{:03d}_{}".format(pid, s)
            npy_stem = os.path.join(out_npy, stem)
            png_stem = os.path.join(out_png, stem)

            np.save(npy_stem + ".npy", brain[:, :, s])
            np.save(npy_stem + "_mask.npy", mask[:, :, s])

            plt.imsave(png_stem + ".png", brain[:, :, s], cmap="gray")
            plt.imsave(png_stem + "_mask.png", mask[:, :, s], cmap="hot")

        # Release the volumes before loading the next patient.
        del brain, mask

    print("Finished exporting.")


In [None]:
# Configuration read (as module-level names) by the functions defined above.

# Prefix of every patient folder: get_paths() appends the patient id
# zero-padded to 3 digits to this string.
BRA_TS_ROOT = "/mnt/d/MICCAI_BraTS_2020/BraTS20_Training/BraTS20_Training_"
# Filename parts used by get_paths(): BraTS20_Training_<id>_<type><EXT>,
# with <type> = IMAGE_TYPE for the image and MASK_TYPE for the mask.
IMAGE_TYPE = "t1ce"
MASK_TYPE = "seg"
EXT = ".nii"

# Output locations: save_slice() writes into the two directories,
# build_csv_simple() writes the CSV.
OUTPUT_NPY_DIR = "training_set_npy"
OUTPUT_PNG_DIR = "training_set_png"
OUTPUT_CSV = "SET.csv"

# Tumor core = labels 1 and 4
# (mask values matched with np.isin wherever tumor pixels are counted)
TUMOR_LABELS = [1, 4]

# NOTE(review): not referenced by any code visible in this notebook.
MIN_TUMOR_PERCENT = 0.005   # 0.5%
# Scans patient ids 1..369 and overwrites OUTPUT_CSV.
build_csv_simple(num_slices_per_patient=3, patient_range=range(1, 370))


In [None]:
# after CSV is built:
# Keep only the SET.csv rows whose (Patient, SliceIndex) pair does NOT appear
# in testing_set.csv; the surviving rows stay in their original order.
# Everything is read as strings, and patient ids are zero-padded to 3 digits
# on both sides so that e.g. "7" and "007" compare equal.
key_columns = ["Patient", "SliceIndex"]


def pad(x):
    """Format a patient id as a zero-padded 3-digit string."""
    return f"{int(x):03d}"


df = pd.read_csv("SET.csv", dtype=str)
df_testing = pd.read_csv("testing_set.csv", dtype=str)

df["Patient"] = df["Patient"].map(pad)
df_testing["Patient"] = df_testing["Patient"].map(pad)

candidate_keys = df.set_index(key_columns).index
testing_keys = df_testing.set_index(key_columns).index
in_testing = candidate_keys.isin(testing_keys)

df = df[~in_testing]
df.to_csv("SET_no_testing_overlap.csv", index=False)

## Run this to create a new dataset that does not overlap with the testing set

In [11]:
# Number of slices to draw for the training set.
n = 200

# dtype=str keeps the zero-padded patient ids ("001") intact in the output.
df = pd.read_csv("SET_no_testing_overlap.csv", dtype=str)

# Fixed random_state makes the draw reproducible for the same input file.
# The columns are strings, so sort on their integer values: a plain string
# sort would put slice "100" before slice "77".
df = df.sample(n=n, random_state=42).sort_values(
    by=["Patient", "SliceIndex"],
    key=lambda column: column.astype(int),
)

df.to_csv(f"training_set_{n}.csv", index=False)
print(f"Sampling done. {n} slices saved to training_set_{n}.csv")

Sampling done. 200 slices saved to training_set_200.csv


## Run to create npy and png:

In [17]:
folder = "/home/martina/codi2/4year/tfg/training_set_200_npy"


def _min_max_over_files(paths):
    """Return (smallest value, largest value) over the arrays stored at paths."""
    lows, highs = [], []
    for path in paths:
        arr = np.load(path)
        lows.append(arr.min())
        highs.append(arr.max())
    return min(lows), max(highs)


# Compute global min/max using only IMAGE files (exclude masks).
# The "_mask" test is applied to the file name only, so a directory whose
# name happens to contain "_mask" cannot filter out every image.
image_paths = [
    p for p in glob.glob(os.path.join(folder, "*.npy"))
    if "_mask" not in os.path.basename(p)
]

if not image_paths:
    # Fail with a clear message instead of min() choking on an empty list.
    raise ValueError(f"No image .npy files found in {folder!r}")

global_min, global_max = _min_max_over_files(image_paths)

print("Global min/max:", global_min, global_max)

# Normalize all images to [0, 1] with the shared min/max.
# WARNING: files are overwritten in place. If this loop is interrupted, the
# folder holds a mix of raw and normalized images; re-export the slices before
# running the cell again, otherwise the already-normalized files get rescaled
# by the raw images' range.
value_range = global_max - global_min + 1e-8  # epsilon avoids division by zero
for path in image_paths:
    img = np.load(path).astype(np.float32)

    img_norm = np.clip((img - global_min) / value_range, 0, 1)

    np.save(path, img_norm)

# Verify
min_after, max_after = _min_max_over_files(image_paths)

print("After normalization:", min_after, max_after)


Global min/max: 0.0 17114.0
After normalization: 0.0 1.0


0.0 17114.0
