# Preprocessing

## 0. Imports

In [1]:
# Core
import os
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Image processing
import cv2
from PIL import Image

# Machine learning / deep learning
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split

# Utility
import zipfile
import shutil
import pathlib

2025-10-24 13:58:01.112466: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-10-24 13:58:01.115682: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-10-24 13:58:01.134413: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-10-24 13:58:01.182801: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-10-24 13:58:01.275320: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registe

## 1. Data Loading from Kaggle

In [2]:
# Print the working directory; the relative "../../data" paths used below resolve against it.
!pwd

/home/kieren/code/KierenElijaSchmidt/cs_project/notebooks/exploration


In [3]:
# Install the Kaggle CLI. %pip (unlike !pip) always installs into the
# environment of the running kernel, so the package is importable/usable here.
%pip install --quiet kaggle

In [4]:
# Put the Kaggle API token where the CLI looks for it (~/.kaggle/kaggle.json).
!mkdir -p ~/.kaggle
# Only move the token while it is still in the working directory, so re-running
# this cell after the first setup does not fail with "cannot stat 'kaggle.json'".
!test -f kaggle.json && mv kaggle.json ~/.kaggle/
# The token is a credential: make it readable by the current user only.
!chmod 600 ~/.kaggle/kaggle.json
!ls -l ~/.kaggle

mv: cannot stat 'kaggle.json': No such file or directory
total 4
-rw------- 1 kieren kieren 74 Oct 24 10:50 kaggle.json


In [5]:
# Search Kaggle for candidate datasets matching "brain tumor".
!kaggle datasets list -s "brain tumor"

ref                                                       title                                                    size  lastUpdated                 downloadCount  voteCount  usabilityRating  
--------------------------------------------------------  -------------------------------------------------  ----------  --------------------------  -------------  ---------  ---------------  
jakeshbohaju/brain-tumor                                  Brain Tumor                                          14629270  2020-07-26 01:52:35.323000          24390        323  0.9411765        
masoudnickparvar/brain-tumor-mri-dataset                  Brain Tumor MRI Dataset                             155791278  2021-09-24 12:43:45.510000         150504       1023  0.875            
sartajbhuvaji/brain-tumor-classification-mri              Brain Tumor Classification (MRI)                     91002358  2025-08-12 15:45:16.840000          88882        788  0.875            
jillanisofttech/brain-tumor        

In [6]:
# Download the dataset archive into the shared data directory.
!mkdir -p ../../data
# Skip the download when the extracted dataset folder already exists, so
# re-running the notebook does not fetch the archive again.
!test -d ../../data/brain-tumor-mri-dataset || kaggle datasets download -d masoudnickparvar/brain-tumor-mri-dataset -p ../../data


Dataset URL: https://www.kaggle.com/datasets/masoudnickparvar/brain-tumor-mri-dataset
License(s): CC0-1.0
Downloading brain-tumor-mri-dataset.zip to ../../data
 71%|████████████████████████████▎           | 105M/149M [00:00<00:00, 1.10GB/s]
100%|████████████████████████████████████████| 149M/149M [00:00<00:00, 1.10GB/s]


In [None]:
import zipfile, os

zip_path = "../../data/brain-tumor-mri-dataset.zip"
extract_path = "../../data/brain-tumor-mri-dataset"

# The archive is deleted after extraction, so on a re-run it is normally gone.
# Treat "archive missing but dataset folder present" as already done instead
# of failing when trying to open the deleted zip.
if os.path.isfile(zip_path):
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extract_path)
    os.remove(zip_path)  # free disk space once the contents are unpacked
    print("Dataset extracted to:", extract_path)
elif os.path.isdir(extract_path):
    print("Dataset already extracted at:", extract_path)
else:
    raise FileNotFoundError(
        f"Neither {zip_path} nor {extract_path} exists; download the dataset first."
    )

In [8]:
from pathlib import Path

# Walk the extracted dataset recursively and print every directory found.
root = Path("../../data/brain-tumor-mri-dataset")
subdirs = (entry for entry in root.rglob("*") if entry.is_dir())
for d in subdirs:
    print(d)

../../data/brain-tumor-mri-dataset/Training
../../data/brain-tumor-mri-dataset/Testing
../../data/brain-tumor-mri-dataset/Training/pituitary
../../data/brain-tumor-mri-dataset/Training/glioma
../../data/brain-tumor-mri-dataset/Training/notumor
../../data/brain-tumor-mri-dataset/Training/meningioma
../../data/brain-tumor-mri-dataset/Testing/pituitary
../../data/brain-tumor-mri-dataset/Testing/glioma
../../data/brain-tumor-mri-dataset/Testing/notumor
../../data/brain-tumor-mri-dataset/Testing/meningioma


In [None]:
import random, cv2
import matplotlib.pyplot as plt
from pathlib import Path

train_dir = Path("../../data/brain-tumor-mri-dataset/Training")

# Sort so the classes are listed (and plotted) in the same order on every run;
# iterdir() order depends on the filesystem.
classes = sorted(p.name for p in train_dir.iterdir() if p.is_dir())
print("Training classes:", classes)

# For every class: report the number of files and display one random sample.
for cls in classes:
    cls_dir = train_dir / cls
    # Count regular files only, so stray sub-folders are not reported as images.
    imgs = [p for p in cls_dir.glob("*") if p.is_file()]
    print(f"{cls}: {len(imgs)} images")
    if not imgs:
        continue

    img_path = random.choice(imgs)
    img = cv2.imread(str(img_path))
    if img is None:
        # cv2.imread signals an unreadable / non-image file by returning None;
        # handing that to cvtColor would raise an opaque OpenCV error.
        print(f"(warn) cannot read image: {img_path}")
        continue

    # OpenCV loads colour images as BGR, matplotlib expects RGB.
    plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    plt.title(f"Training - {cls}")
    plt.axis("off")
    plt.show()

## 2. Preprocessing

In [None]:
from pathlib import Path
import cv2
import numpy as np
import shutil

# Settings
SRC_ROOT = Path("../../data/brain-tumor-mri-dataset")    # extracted raw dataset
DST_ROOT = Path("../../data/brain-tumor-mri-preproc")    # preprocessed copy (rebuilt below)
TARGET_SIZE = (192, 192)                                 # size passed to the resize step
CLIP_PCT = (1, 99)  # percentile clipping for robust normalization
WRITE_EXT = ".png"                                       # suffix of the written files

# Verify the input BEFORE deleting anything: without this check a missing
# source would still wipe the previous preprocessing output and only fail later.
if not SRC_ROOT.is_dir():
    raise FileNotFoundError(f"Source dataset not found: {SRC_ROOT}")

# Clean/create destination so no files from an earlier run are left behind
if DST_ROOT.exists():
    shutil.rmtree(DST_ROOT)
DST_ROOT.mkdir(parents=True, exist_ok=True)

def load_grayscale(path: Path) -> np.ndarray:
    """Load the image at ``path`` as a single-channel float32 array.

    The file is decoded by OpenCV in grayscale mode and the result is cast to
    float32 without any rescaling.

    Raises:
        ValueError: if OpenCV cannot decode the file (``cv2.imread`` returned None).
    """
    gray = cv2.imread(str(path), cv2.IMREAD_GRAYSCALE)
    if gray is not None:
        return gray.astype(np.float32)
    raise ValueError(f"Could not read {path}")

def downscale(img: np.ndarray, size=(192, 192)) -> np.ndarray:
    """Resize ``img`` to ``size`` using area interpolation.

    ``size`` is handed to ``cv2.resize`` unchanged as the target size.
    INTER_AREA is used because it is the interpolation suited to shrinking.
    """
    resized = cv2.resize(img, size, interpolation=cv2.INTER_AREA)
    return resized

def normalize(img: np.ndarray, clip_pct=(1, 99), mode="minmax") -> np.ndarray:
    """
    Per-image intensity normalization with percentile clipping.

    Steps:
    1. Clip the image to the ``clip_pct`` (low, high) percentiles of its own values.
    2. Rescale according to ``mode``:
       - "minmax": map the clipped range linearly onto [0, 1]
         (an image whose two percentiles coincide becomes all zeros).
       - "zscore": standardize (mean 0, std 1), then min-max rescale the
         standardized values; a small epsilon guards both divisions.

    Raises:
        ValueError: if ``mode`` is neither "minmax" nor "zscore".
    """
    lower, upper = np.percentile(img, clip_pct)
    clipped = np.clip(img, lower, upper)

    if mode not in ("minmax", "zscore"):
        raise ValueError("mode must be 'minmax' or 'zscore'")

    if mode == "minmax":
        # Avoid dividing by zero for flat images.
        span = (upper - lower) if upper > lower else 1.0
        return (clipped - lower) / span

    # mode == "zscore"
    standardized = (clipped - clipped.mean()) / (clipped.std() + 1e-8)
    return (standardized - standardized.min()) / (
        standardized.max() - standardized.min() + 1e-8
    )

def process_one(src_path: Path, dst_path: Path):
    """Process single image: load, downscale, normalize, save.

    The result is written as an 8-bit image to ``dst_path`` with its suffix
    replaced by ``WRITE_EXT``; missing parent folders are created.

    Raises:
        ValueError: propagated from ``load_grayscale`` when the source is unreadable.
        IOError: if OpenCV reports that the output file could not be written.
    """
    img = load_grayscale(src_path)
    img = downscale(img, TARGET_SIZE)
    img = normalize(img, CLIP_PCT, "minmax")
    # The min-max normalized image is in [0, 1]; rescale to the 8-bit range for storage.
    to_save = (img * 255.0).round().astype(np.uint8)
    out_path = dst_path.with_suffix(WRITE_EXT)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # cv2.imwrite reports failure through its return value instead of raising,
    # so an unchecked call would silently drop images.
    if not cv2.imwrite(str(out_path), to_save):
        raise IOError(f"Failed to write {out_path}")

# Process Training/ and Testing/ keeping structure
# Accept the same extensions as the other cells of this notebook (incl. TIFF).
valid_exts = {".png", ".jpg", ".jpeg", ".bmp", ".tif", ".tiff"}
n_processed = 0
for split in ["Training", "Testing"]:
    # sorted() makes the processing order independent of the filesystem listing order
    for cls_dir in sorted((SRC_ROOT / split).iterdir()):
        if not cls_dir.is_dir():
            continue
        for f in sorted(cls_dir.iterdir()):
            if f.suffix.lower() in valid_exts:
                # Mirror the source layout (<split>/<class>/<file>) under DST_ROOT
                rel = f.relative_to(SRC_ROOT)
                out = DST_ROOT / rel
                process_one(f, out)
                n_processed += 1

print("Preprocessing complete")
print("Source:", SRC_ROOT)
print("Output:", DST_ROOT)
# A count of 0 would mean nothing matched valid_exts - make that visible.
print("Images processed:", n_processed)

In [11]:
# --- (optional) small robustness tweaks ---
# File suffixes (lower-case, including the dot) that are treated as input images;
# compared against `Path.suffix.lower()` of each candidate file.
valid_exts = {".png", ".jpg", ".jpeg", ".bmp", ".tif", ".tiff"}

def process_one(src_path: Path, dst_path: Path):
    """Preprocess one image and store it with the ``WRITE_EXT`` suffix.

    Pipeline: grayscale load -> resize to ``TARGET_SIZE`` -> percentile-clipped
    min-max normalization -> conversion to 8-bit -> write to disk. The parent
    folder of the output is created when missing.

    Raises:
        IOError: if ``cv2.imwrite`` reports that the file was not written.
    """
    gray = load_grayscale(src_path)
    pixels = normalize(downscale(gray, TARGET_SIZE), CLIP_PCT, "minmax")
    # Normalized values are in [0, 1]; stretch to 0..255 before the uint8 cast.
    encoded = (pixels * 255.0).round().astype(np.uint8)

    target = dst_path.with_suffix(WRITE_EXT)  # always use the configured suffix
    target.parent.mkdir(parents=True, exist_ok=True)
    if cv2.imwrite(str(target), encoded):
        return
    raise IOError(f"Failed to write {target}")

In [None]:
import random
import matplotlib.pyplot as plt

# Locations compared by the visual check: raw images vs. their preprocessed copies.
SRC_ROOT = Path("../../data/brain-tumor-mri-dataset")
DST_ROOT = Path("../../data/brain-tumor-mri-preproc")
# File suffixes (lower-case, including the dot) accepted as source images.
valid_exts = {".png", ".jpg", ".jpeg", ".bmp", ".tif", ".tiff"}

def show_before_after(split: str, n_per_class: int = 2):
    """
    For each class in split, show original vs preprocessed side-by-side

    Args:
        split: name of the sub-folder of ``SRC_ROOT`` / ``DST_ROOT`` to inspect.
        n_per_class: number of randomly chosen images to display per class
            (capped at the number of available images).

    Images that cannot be read on either side are reported and skipped.
    """
    split_src = SRC_ROOT / split
    # A missing split folder is reported like an empty one instead of letting
    # iterdir() raise FileNotFoundError. Sorting gives a stable class order.
    if split_src.is_dir():
        classes = sorted(d for d in split_src.iterdir() if d.is_dir())
    else:
        classes = []
    if not classes:
        print(f"No classes found under {split_src}")
        return

    for cls_dir in classes:
        files = sorted(f for f in cls_dir.iterdir() if f.suffix.lower() in valid_exts)
        if not files:
            print(f"(skip) no images in {cls_dir}")
            continue

        picks = random.sample(files, k=min(n_per_class, len(files)))
        for src in picks:
            # The preprocessed copy mirrors the source layout but is always a .png
            rel = src.relative_to(SRC_ROOT)
            dst = (DST_ROOT / rel).with_suffix(".png")

            orig = cv2.imread(str(src), cv2.IMREAD_GRAYSCALE)
            proc = cv2.imread(str(dst), cv2.IMREAD_GRAYSCALE)

            if orig is None:
                print(f"(warn) cannot read original: {src}")
                continue
            if proc is None:
                print(f"(warn) cannot read processed: {dst}")
                continue

            fig, (ax_orig, ax_proc) = plt.subplots(1, 2, figsize=(8, 3))
            fig.suptitle(f"{split} / {cls_dir.name} - {src.name}", y=1.02)

            ax_orig.imshow(orig, cmap="gray")
            ax_orig.set_title("Original")
            ax_orig.axis("off")

            ax_proc.imshow(proc, cmap="gray")
            ax_proc.set_title("Preprocessed")
            ax_proc.axis("off")

            fig.tight_layout()
            plt.show()
            # Release the figure; many figures are created in this loop.
            plt.close(fig)

# Run the visual check on both splits; each header is printed before its figures.
for header, split_name in (
    ("Visual check - Training set:", "Training"),
    ("\nVisual check - Test set:", "Testing"),
):
    print(header)
    show_before_after(split_name, n_per_class=2)

## 3. First CNN as Baseline

In [None]:
from tensorflow import keras
from tensorflow.keras import layers
from pathlib import Path

# Preprocessed training images, one sub-folder per class
train_dir = Path("../../data/brain-tumor-mri-preproc/Training")

# Dataset setup
IMG_SIZE = (192, 192)
BATCH_SIZE = 32
SEED = 1337
VAL_SPLIT = 0.2

# Both subsets are carved out of the same folder, so they share every loader
# setting (in particular the split fraction and the seed).
split_kwargs = dict(
    validation_split=VAL_SPLIT,
    seed=SEED,
    image_size=IMG_SIZE,
    color_mode="grayscale",
    batch_size=BATCH_SIZE,
)

train_ds = keras.utils.image_dataset_from_directory(
    train_dir, subset="training", **split_kwargs
)
val_ds = keras.utils.image_dataset_from_directory(
    train_dir, subset="validation", **split_kwargs
)

print("Classes:", train_ds.class_names)

# Build CNN model: three Conv/MaxPool stages followed by a small dense head.
inputs = keras.Input(shape=IMG_SIZE + (1,))

# The preprocessed images are stored as 8-bit PNGs, so the dataset feeds pixel
# values in the 0..255 range. Scale them to [0, 1] inside the model, so that
# training, evaluation and any exported copy all apply the same input scaling.
x = layers.Rescaling(1.0 / 255)(inputs)

x = layers.Conv2D(32, 3, padding="same", activation="relu")(x)
x = layers.MaxPool2D()(x)

x = layers.Conv2D(64, 3, padding="same", activation="relu")(x)
x = layers.MaxPool2D()(x)

x = layers.Conv2D(128, 3, padding="same", activation="relu")(x)
x = layers.MaxPool2D()(x)

x = layers.Flatten()(x)
x = layers.Dense(128, activation="relu")(x)
# One softmax output per class folder found by the training dataset
outputs = layers.Dense(len(train_ds.class_names), activation="softmax")(x)

model = keras.Model(inputs, outputs)
model.summary()

# Compile & train
compile_options = {
    "optimizer": "adam",
    "loss": "sparse_categorical_crossentropy",
    "metrics": ["accuracy"],
}
model.compile(**compile_options)

# Fit on the training subset, reporting validation metrics after each epoch.
history = model.fit(train_ds, validation_data=val_ds, epochs=10)

In [None]:
from tensorflow import keras
from pathlib import Path

# Preprocessed hold-out images, one sub-folder per class
test_dir = Path("../../data/brain-tumor-mri-preproc/Testing")

IMG_SIZE = (192, 192)
BATCH_SIZE = 32

# Load the test dataset with the same image settings as training, unshuffled
test_ds = keras.utils.image_dataset_from_directory(
    test_dir,
    shuffle=False,
    color_mode="grayscale",
    image_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
)

# Evaluate: the model was compiled with one metric, so evaluate() yields
# the loss followed by the accuracy.
test_metrics = model.evaluate(test_ds)
test_loss, test_acc = test_metrics
print(f"Test Accuracy: {test_acc:.4f}")
print(f"Test Loss: {test_loss:.4f}")

In [None]:
import tensorflow as tf

# Output locations (relative to the notebook's working directory)
export_dir = "brain_tumor_cnn_export"
tflite_path = "brain_tumor_cnn.tflite"

# Step 1: export the trained model as a SavedModel (for inference)
model.export(export_dir)

# Step 2: convert that SavedModel to a TFLite flatbuffer
converter = tf.lite.TFLiteConverter.from_saved_model(export_dir)
tflite_model = converter.convert()

# Step 3: write the converted model to disk
with open(tflite_path, mode="wb") as tflite_file:
    tflite_file.write(tflite_model)

print(f"TFLite model saved to: {tflite_path}")

In [None]:
import random
import cv2
import matplotlib.pyplot as plt
from pathlib import Path

# Test set folder
test_dir = Path("../../data/brain-tumor-mri-preproc/Testing")

# List all class folders (sorted for a stable order)
classes = sorted(d for d in test_dir.iterdir() if d.is_dir())

# Keep only classes that actually contain PNG files: random.choice() raises an
# uninformative IndexError on an empty list.
png_by_class = {d: sorted(d.glob("*.png")) for d in classes}
png_by_class = {d: files for d, files in png_by_class.items() if files}
if not png_by_class:
    raise FileNotFoundError(f"No .png images found under {test_dir}")

# Choose random class & image
cls = random.choice(list(png_by_class))
imgs = png_by_class[cls]
img_path = random.choice(imgs)

# Load image (cv2.imread returns None instead of raising when it cannot decode)
img = cv2.imread(str(img_path), cv2.IMREAD_GRAYSCALE)
if img is None:
    raise ValueError(f"Could not read {img_path}")

print(f"Example test image: {img_path}")

# Display
plt.imshow(img, cmap='gray')
plt.title(f"{cls.name} - {img_path.name}")
plt.axis("off")
plt.show()

# Save for frontend (cv2.imwrite reports failure via its return value)
example_export = "example_test_image.png"
if not cv2.imwrite(example_export, img):
    raise IOError(f"Failed to write {example_export}")
print(f"Exported to: {example_export}")