In [4]:
# In Cell 1, find this line:
!pip install -q scipy scikit-learn opencv-python-headless tqdm h5py

# Change it to this (just add 'mat73'):
!pip install -q scipy scikit-learn opencv-python-headless tqdm h5py mat73

In [5]:
import mat73  # <-- NEW IMPORT
import os
import cv2
import numpy as np
import tqdm

# --- Configuration ---
SOURCE_DIR = 'Downloaded_MAT_Files'
OUTPUT_DIR = 'Figshare_Images'
LABEL_MAP = {
    1: 'meningioma',
    2: 'glioma',
    3: 'pituitary'
}
# --- End Configuration ---

print(f"--- Starting .mat to .jpg Conversion (Using mat73) ---")
print(f"Source: {SOURCE_DIR}")
print(f"Output: {OUTPUT_DIR}")

os.makedirs(OUTPUT_DIR, exist_ok=True)
for class_name in LABEL_MAP.values():
    os.makedirs(os.path.join(OUTPUT_DIR, class_name), exist_ok=True)

patient_image_count = {}

for i in tqdm.tqdm(range(1, 3065)):
    mat_path = os.path.join(SOURCE_DIR, f"{i}.mat")

    if not os.path.exists(mat_path):
        continue

    try:
        # --- THIS IS THE FIX ---
        # We are using mat73.loadmat, NOT scipy.io.loadmat
        data = mat73.loadmat(mat_path)
        # --- END FIX ---

    except Exception as e:
        print(f"Error loading {mat_path}: {e}")
        continue

    cjdata = data.get('cjdata')
    if cjdata is None:
        print(f"Error: 'cjdata' not found in {mat_path}. Skipping.")
        continue

    # Extract data (mat73 loads structs as dicts, so no [0][0])
    try:
        label = int(cjdata['label'])
        pid = str(cjdata['PID'])
        image = cjdata['image'].astype(np.float32)
    except Exception as e:
        print(f"Error reading struct from {mat_path}: {e}")
        continue

    # --- Image Processing ---
    image_norm = cv2.normalize(image, None, 0, 255, cv2.NORM_MINMAX)
    image_8bit = np.uint8(image_norm)
    image_rgb = cv2.cvtColor(image_8bit, cv2.COLOR_GRAY2RGB)

    # --- File Naming ---
    class_name = LABEL_MAP[label]

    if pid not in patient_image_count:
        patient_image_count[pid] = 0
    patient_image_count[pid] += 1
    img_num = patient_image_count[pid]

    filename = f"patient_{pid}_img_{img_num:04d}.jpg"

    # --- Save the File ---
    save_path = os.path.join(OUTPUT_DIR, class_name, filename)
    cv2.imwrite(save_path, image_rgb)

print("\n" + "="*50)
print("Conversion Complete!")
print(f"All images saved in '{OUTPUT_DIR}' and sorted by class.")
print("="*50)

--- Starting .mat to .jpg Conversion (Using mat73) ---
Source: Downloaded_MAT_Files
Output: Figshare_Images


100%|██████████| 3064/3064 [00:37<00:00, 80.93it/s]


Conversion Complete!
All images saved in 'Figshare_Images' and sorted by class.





In [6]:
import os
import shutil
import json
from collections import defaultdict
from sklearn.model_selection import train_test_split

def create_patient_split(source_dir='Figshare_Images',
                        output_dir='Figshare_Dataset',
                        train_ratio=0.70,
                        val_ratio=0.15,
                        test_ratio=0.15,
                        random_state=42):
    """
    Creates a 3-way, patient-level, stratified split.
    """

    if os.path.exists(output_dir):
        print(f"Warning: Removing old directory: {output_dir}")
        shutil.rmtree(output_dir)

    assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 0.001, \
        "Ratios must sum to 1.0!"

    print("="*60)
    print("CREATING PATIENT-LEVEL SPLIT (70/15/15)")
    print("="*60)

    total_stats = {'Training': 0, 'Validation': 0, 'Testing': 0}

    for class_name in os.listdir(source_dir):
        class_path = os.path.join(source_dir, class_name)
        if not os.path.isdir(class_path):
            continue

        print(f"\nProcessing class: {class_name}")

        patient_images = defaultdict(list)
        for img_file in os.listdir(class_path):
            if not (img_file.endswith('.jpg') or img_file.endswith('.png')):
                continue

            try:
                patient_id = img_file.split('_')[1]
            except Exception as e:
                print(f"  Skipping malformed file: {img_file} ({e})")
                continue
            patient_images[patient_id].append(img_file)

        patients = list(patient_images.keys())
        total_images = sum(len(imgs) for imgs in patient_images.values())

        print(f"  Total patients: {len(patients)}")
        print(f"  Total images: {total_images}")

        train_patients, temp_patients = train_test_split(
            patients,
            test_size=(val_ratio + test_ratio),
            random_state=random_state,
            shuffle=True
        )

        val_patients, test_patients = train_test_split(
            temp_patients,
            test_size=test_ratio / (val_ratio + test_ratio),
            random_state=random_state,
            shuffle=True
        )

        print(f"  Train patients: {len(train_patients)}")
        print(f"  Val patients: {len(val_patients)}")
        print(f"  Test patients: {len(test_patients)}")

        splits = {
            'Training': train_patients,
            'Validation': val_patients,
            'Testing': test_patients
        }

        for split_name, patient_list in splits.items():
            split_dir = os.path.join(output_dir, split_name, class_name)
            os.makedirs(split_dir, exist_ok=True)

            image_count = 0
            for patient_id in patient_list:
                for img_file in patient_images[patient_id]:
                    src = os.path.join(class_path, img_file)
                    dst = os.path.join(split_dir, img_file)
                    shutil.copy2(src, dst)
                    image_count += 1

            print(f"  {split_name}: {image_count} images ({image_count/total_images*100:.1f}%)")
            total_stats[split_name] += image_count

    print("\n" + "="*60)
    print("SPLIT COMPLETE!")
    print("="*60)

    total_all = sum(total_stats.values())
    for split_name, count in total_stats.items():
        percentage = (count / total_all * 100) if total_all > 0 else 0
        print(f"{split_name}: {count} images ({percentage:.1f}%)")

    print(f"TOTAL: {total_all} images (Should be 3064)")
    print("="*60)
    print(f"✅ Your leak-proof dataset is ready at: {output_dir}")
    print("="*60)

# --- RUN THE SPLITTING SCRIPT ---
create_patient_split()

CREATING PATIENT-LEVEL SPLIT (70/15/15)

Processing class: pituitary
  Total patients: 62
  Total images: 930
  Train patients: 43
  Val patients: 9
  Test patients: 10
  Training: 648 images (69.7%)
  Validation: 130 images (14.0%)
  Testing: 152 images (16.3%)

Processing class: meningioma
  Total patients: 82
  Total images: 708
  Train patients: 57
  Val patients: 12
  Test patients: 13
  Training: 509 images (71.9%)
  Validation: 80 images (11.3%)
  Testing: 119 images (16.8%)

Processing class: glioma
  Total patients: 89
  Total images: 1426
  Train patients: 62
  Val patients: 13
  Test patients: 14
  Training: 1002 images (70.3%)
  Validation: 223 images (15.6%)
  Testing: 201 images (14.1%)

SPLIT COMPLETE!
Training: 2159 images (70.5%)
Validation: 433 images (14.1%)
Testing: 472 images (15.4%)
TOTAL: 3064 images (Should be 3064)
✅ Your leak-proof dataset is ready at: Figshare_Dataset


In [7]:
import os

def get_patient_ids_from_folder(folder_path):
    """
    Scans a directory (e.g., 'Training') and all its subfolders
    (e.g., 'glioma') to find all unique patient IDs.
    """
    patient_ids = set()

    if not os.path.exists(folder_path):
        print(f"Error: Folder not found: {folder_path}")
        return patient_ids

    for class_name in os.listdir(folder_path):
        class_path = os.path.join(folder_path, class_name)
        if os.path.isdir(class_path):
            for img_file in os.listdir(class_path):
                try:
                    patient_id = img_file.split('_')[1]
                    patient_ids.add(patient_id)
                except Exception:
                    print(f"  Could not parse ID from: {img_file}")
    return patient_ids

# --- Main Verification Logic ---
print("="*60)
print("RUNNING PATIENT-LEVEL DATA LEAKAGE CHECK...")
print("="*60)

DATASET_DIR = 'Figshare_Dataset'

print("Scanning Training set...")
train_patients = get_patient_ids_from_folder(os.path.join(DATASET_DIR, 'Training'))
print(f"Found {len(train_patients)} unique patients in Training set.\n")

print("Scanning Validation set...")
val_patients = get_patient_ids_from_folder(os.path.join(DATASET_DIR, 'Validation'))
print(f"Found {len(val_patients)} unique patients in Validation set.\n")

print("Scanning Testing set...")
test_patients = get_patient_ids_from_folder(os.path.join(DATASET_DIR, 'Testing'))
print(f"Found {len(test_patients)} unique patients in Testing set.\n")

# --- Leakage Analysis ---
print("="*60)
print("LEAKAGE ANALYSIS:")
print("="*60)

tv_leakage = train_patients.intersection(val_patients)
tt_leakage = train_patients.intersection(test_patients)
vt_leakage = val_patients.intersection(test_patients)

total_leakage = len(tv_leakage) + len(tt_leakage) + len(vt_leakage)

if total_leakage == 0:
    print("✅✅✅ SUCCESS! ✅✅✅")
    print("No patient-level data leakage found between any splits.")
else:
    print("❌❌❌ FAILURE! DATA LEAKAGE DETECTED! ❌❌❌")
    if len(tv_leakage) > 0:
        print(f"  Leakage (Train <-> Val): {len(tv_leakage)} patients. {tv_leakage}")
    if len(tt_leakage) > 0:
        print(f"  Leakage (Train <-> Test): {len(tt_leakage)} patients. {tt_leakage}")
    if len(vt_leakage) > 0:
        print(f"  Leakage (Val <-> Test): {len(vt_leakage)} patients. {vt_leakage}")

print("\n--- Verification Complete ---")

RUNNING PATIENT-LEVEL DATA LEAKAGE CHECK...
Scanning Training set...
Found 162 unique patients in Training set.

Scanning Validation set...
Found 34 unique patients in Validation set.

Scanning Testing set...
Found 37 unique patients in Testing set.

LEAKAGE ANALYSIS:
✅✅✅ SUCCESS! ✅✅✅
No patient-level data leakage found between any splits.

--- Verification Complete ---


In [9]:
# --- Step 1: Install 'imagehash' library ---
# This MUST be at the very top of the cell
!pip install -q imagehash

# --- Now we can import it ---
import glob
from PIL import Image
import imagehash
from collections import defaultdict
import tqdm

print("\n--- Step 2: Scanning for Duplicate Images ---")
all_image_paths = glob.glob('Figshare_Dataset/**/*.jpg', recursive=True)
hash_dict = defaultdict(list)

print(f"Calculating perceptual hashes for {len(all_image_paths)} images...")
for img_path in tqdm.tqdm(all_image_paths):
    try:
        img = Image.open(img_path)
        hash_val = imagehash.phash(img)
        hash_dict[hash_val].append(img_path)
    except Exception as e:
        print(f"Could not process {img_path}: {e}")

# --- Step 3: Reporting Duplicates ---
print("\n" + "="*60)
print("DUPLICATE IMAGE ANALYSIS:")
print("="*60)

duplicates_found = False
total_duplicate_files = 0
for hash_val, file_list in hash_dict.items():
    if len(file_list) > 1:
        duplicates_found = True
        print(f"\n❌ Found Visually Identical Images (Hash: {hash_val}):")
        for file_path in file_list:
            print(f"  -> {file_path}")
        total_duplicate_files += len(file_list)

if not duplicates_found:
    print("✅✅✅ SUCCESS! ✅✅✅")
    print("No visually identical duplicate images were found in the dataset.")
else:
    print("\n--- Summary ---")
    print(f"Found {total_duplicate_files} files that are visually identical.")

print("\n--- Verification Complete ---")

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/296.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m286.7/296.7 kB[0m [31m10.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.7/296.7 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h
--- Step 2: Scanning for Duplicate Images ---
Calculating perceptual hashes for 3064 images...


100%|██████████| 3064/3064 [00:11<00:00, 274.22it/s]


DUPLICATE IMAGE ANALYSIS:

❌ Found Visually Identical Images (Hash: d0610b6e2f3c3f31):
  -> Figshare_Dataset/Testing/pituitary/patient_103046_img_0003.jpg
  -> Figshare_Dataset/Testing/pituitary/patient_103046_img_0002.jpg

❌ Found Visually Identical Images (Hash: 975a7aa161966768):
  -> Figshare_Dataset/Testing/meningioma/patient_112552_img_0002.jpg
  -> Figshare_Dataset/Testing/meningioma/patient_112552_img_0001.jpg

❌ Found Visually Identical Images (Hash: d23c2de065c7368e):
  -> Figshare_Dataset/Testing/meningioma/patient_104281_img_0004.jpg
  -> Figshare_Dataset/Testing/meningioma/patient_104281_img_0005.jpg

❌ Found Visually Identical Images (Hash: 85366a7a38e131cf):
  -> Figshare_Dataset/Testing/glioma/patient_MR039473B_img_0002.jpg
  -> Figshare_Dataset/Testing/glioma/patient_MR039473B_img_0003.jpg

❌ Found Visually Identical Images (Hash: c23a3de131c7668e):
  -> Figshare_Dataset/Testing/glioma/patient_MR051586_img_0004.jpg
  -> Figshare_Dataset/Testing/glioma/patient_MR051586




In [10]:
from google.colab import drive
import time

print("Mounting Google Drive...")
drive.mount('/content/drive')

# This is the name of the file we will create in your Drive
ZIP_FILE_PATH = '/content/drive/MyDrive/Figshare_Dataset_Clean.zip'
print(f"Your dataset will be saved to: {ZIP_FILE_PATH}")

print("\nZipping the 'Figshare_Dataset' folder... (This may take 2-5 minutes)")
start_time = time.time()

# -r = recursive (for folders)
# -q = quiet (to hide the 3064 file names)
!zip -rq {ZIP_FILE_PATH} Figshare_Dataset

end_time = time.time()
print(f"\n✅ SUCCESS!")
print(f"File 'Figshare_Dataset_Clean.zip' is now saved in your Google Drive.")
print(f"Zipping took {int(end_time - start_time)} seconds.")

Mounting Google Drive...
Mounted at /content/drive
Your dataset will be saved to: /content/drive/MyDrive/Figshare_Dataset_Clean.zip

Zipping the 'Figshare_Dataset' folder... (This may take 2-5 minutes)

✅ SUCCESS!
File 'Figshare_Dataset_Clean.zip' is now saved in your Google Drive.
Zipping took 9 seconds.


In [11]:
from google.colab import files
import time

ZIP_FILE_NAME = 'Figshare_Dataset_Clean.zip'

print("Zipping the 'Figshare_Dataset' folder... (This may take 2-5 minutes)")
start_time = time.time()

!zip -rq {ZIP_FILE_NAME} Figshare_Dataset

end_time = time.time()
print(f"Zipping took {int(end_time - start_time)} seconds.")

print(f"\nStarting download for {ZIP_FILE_NAME}...")
print("⚠️ This may take a very long time and can fail for large files.")
files.download(ZIP_FILE_NAME)

Zipping the 'Figshare_Dataset' folder... (This may take 2-5 minutes)
Zipping took 6 seconds.

Starting download for Figshare_Dataset_Clean.zip...
⚠️ This may take a very long time and can fail for large files.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>