# Kaggle - Brain Tumor MRI Dataset

You can find the dataset and some information about it on the [Kaggle page](https://www.kaggle.com/datasets/masoudnickparvar/brain-tumor-mri-dataset).

For details on the steps below, please see the documentation in the *docs* directory.

## General part

In [None]:
# Sanity check: show which TensorFlow version this kernel runs.
import tensorflow as tf
print(tf.__version__)

In [None]:
import os
from pathlib import Path
import numpy as np

from sklearn.model_selection import train_test_split

# Hide every CUDA device so TensorFlow falls back to the CPU.
# NOTE(review): TensorFlow was already imported in the previous cell; this
# presumably still takes effect as long as no GPU has been initialised yet,
# but confirm with the device list printed below (restart the kernel if a GPU
# still shows up).
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # ignore GPU
import tensorflow as tf
# Print the physical devices TensorFlow reports.
print("TF utilise :", tf.config.list_physical_devices())

In [None]:
# Path management
# `ipynbname` is used below but was never imported in the notebook, which made
# this cell fail with a NameError; import it here so the cell is self-contained.
import ipynbname

NOTEBOOK_PATH = Path(ipynbname.path())
# The project root is taken to be two directory levels above the notebook file.
PROJECT_ROOT = NOTEBOOK_PATH.resolve().parents[1]
RAW_DIR = PROJECT_ROOT / "data" / "raw"
PREP_DIR = PROJECT_ROOT / "data" / "processed"

CLASSES = ["notumor", "glioma", "meningioma", "pituitary"]

# Parameters
IMG_SIZE = 260
SEED = 42
# (train, validation) fractions; test data always live in a separate directory.
SPLIT = [0.7, 0.30]

## Preprocessing

### Split train/validation

In [None]:
# 1. Collect every image under `training/`, labelled by its class sub-directory.
# pandas is needed for the DataFrame below but was never imported in the
# notebook; import it here so the cell is self-contained.
import pandas as pd

# Resolve the folder from RAW_DIR instead of a path relative to the current
# working directory; kept as a str so later string-based path code still works.
train_dir = str(RAW_DIR / "training")
all_paths = []

# Sorted listings keep the row order -- and therefore the seeded split below --
# reproducible across machines (os.listdir returns entries in arbitrary order).
for class_name in sorted(os.listdir(train_dir)):  # e.g. glioma, meningioma, ...
    folder = os.path.join(train_dir, class_name)
    if not os.path.isdir(folder):
        # Skip stray files (e.g. .DS_Store) that are not class directories.
        continue

    for img_name in sorted(os.listdir(folder)):
        all_paths.append((os.path.join(folder, img_name), class_name))

df_train = pd.DataFrame(all_paths, columns=["filepath", "label"])

# 2. Stratified split -> train + validation.
# Uses the notebook-level SPLIT / SEED parameters rather than separate
# hard-coded values, so the configuration cell is the single source of truth.
train_df, val_df = train_test_split(
    df_train,
    test_size=SPLIT[1],
    stratify=df_train["label"],
    random_state=SEED,
)

print(train_df.shape, val_df.shape)

### Calculate clipping bounds

### Pipeline

In [None]:
# The EDA version has been changed to be compatible with TensorFlow
def crop_black_background_tf(img, thresh=10):
    """
    Crop the black border around an image.

    The image is reduced to a single intensity channel, every pixel strictly
    greater than `thresh` is treated as content, and the image is cropped to
    the bounding box of those pixels.

    Args:
        img: tf.Tensor of shape [H, W, C] or [H, W].
        thresh: intensity threshold, expressed in the same units as `img`.
            The default of 10 only makes sense for 0-255 valued images; pass a
            rescaled value (e.g. 10 / 255) for images scaled to [0, 1].

    Returns:
        The cropped image with the same rank as the input, or the input image
        unchanged when no pixel exceeds `thresh` (e.g. an all-black image).
    """
    img = tf.convert_to_tensor(img)

    # Give a 2-D image a temporary channel axis so that a single code path
    # handles both documented input ranks (the slicing below needs 3 axes).
    is_2d = img.shape.rank == 2
    if is_2d:
        img = img[..., tf.newaxis]

    # Convert to grayscale if necessary
    if img.shape[-1] == 3:
        img_gray = tf.image.rgb_to_grayscale(img)
    else:
        img_gray = img

    # Boolean mask of content pixels, taken from the first channel
    mask = img_gray[:, :, 0] > thresh

    # (row, col) coordinates of every content pixel
    coords = tf.where(mask)

    def crop():
        y0 = tf.reduce_min(coords[:, 0])
        x0 = tf.reduce_min(coords[:, 1])
        y1 = tf.reduce_max(coords[:, 0])
        x1 = tf.reduce_max(coords[:, 1])
        # +1 because the max coordinates are inclusive
        return img[y0:y1 + 1, x0:x1 + 1, :]

    def no_crop():
        return img

    # Guard against an empty mask: the min/max reductions are only meaningful
    # when at least one content pixel exists.
    cropped = tf.cond(tf.shape(coords)[0] > 0, crop, no_crop)

    # Drop the temporary channel axis again for 2-D inputs
    return cropped[..., 0] if is_2d else cropped

In [None]:
def preprocess_image_tf(path, label, low_clip, high_clip, target_size=(260, 260)):
    """
    Full TF preprocessing pipeline for one image:
    - read and decode the file as a 3-channel image
    - convert to float32 in [0, 1]
    - crop the black background
    - resize to `target_size`
    - z-score normalize (image-wise)
    - clip to [low_clip, high_clip]

    Args:
        path: path of the image file to load.
        label: label associated with the image; returned unchanged.
        low_clip: lower clipping bound, applied to the z-scored image.
        high_clip: upper clipping bound, applied to the z-scored image.
            (Presumably both bounds are percentiles computed elsewhere in the
            notebook -- confirm against the caller.)
        target_size: (height, width) of the output image.

    Returns:
        (img, label) where img is the preprocessed float32 image.
    """
    # 1. Read image
    img = tf.io.read_file(path)
    # NOTE(review): the TensorFlow docs state that decode_png also decodes
    # JPEG files; tf.io.decode_image is the documented cleaner alternative.
    img = tf.image.decode_png(img, channels=3)  # [H,W,3]

    # 2. Convert to float32 [0,1]
    img = tf.image.convert_image_dtype(img, tf.float32)

    # 3. Crop black background
    # The image is now scaled to [0, 1], so the threshold must be rescaled too:
    # a raw value of 10 would never be exceeded and the crop would silently
    # never happen.
    img = crop_black_background_tf(img, thresh=10.0 / 255.0)

    # 4. Resize
    img = tf.image.resize(img, target_size, method='area')

    # 5. Z-score normalize per image
    mean, variance = tf.nn.moments(img, axes=[0, 1, 2])
    std = tf.sqrt(variance)
    std = tf.maximum(std, 1e-6)  # guard against division by zero on flat images
    img = (img - mean) / std

    # 6. Clip
    img = tf.clip_by_value(img, low_clip, high_clip)

    return img, label