# ML Final Project

### File extraction from zip

In [1]:
import shutil
import os
import zipfile

In [2]:
# Start from a clean slate: drop any 'data' tree left over from an earlier run.
data_dir_present = os.path.exists('data')
if data_dir_present:
    shutil.rmtree('data')

In [3]:
# Directory holding the downloaded dataset archives (*.zip)
base_zip_path = 'data_zipped'

# Root directory that receives one sub-folder per extracted archive
target_base_path = 'data'
os.makedirs(target_base_path, exist_ok=True)

# Archives that could not be extracted; reported once the loop is done
failed_archives = []

if not os.path.isdir(base_zip_path):
    print(f"Error: The directory '{base_zip_path}' was not found.")
else:
    # Keep only the .zip files (the extension match is case-insensitive)
    all_files = os.listdir(base_zip_path)
    zip_files_to_extract = [f for f in all_files if f.lower().endswith('.zip')]

    if not zip_files_to_extract:
        print(f"No .zip files found in '{base_zip_path}'.")
    else:
        print(f"Found the following zip files to extract: {zip_files_to_extract}")

        for zip_filename in zip_files_to_extract:
            zip_file_path = os.path.join(base_zip_path, zip_filename)

            # Each archive is unpacked into a folder named after it (without .zip)
            archive_stem = os.path.splitext(zip_filename)[0]
            output_folder_path = os.path.join(target_base_path, archive_stem)
            os.makedirs(output_folder_path, exist_ok=True)

            try:
                with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
                    zip_ref.extractall(output_folder_path)
            except Exception as e:
                # Best effort: remember the failure and carry on with the other archives
                failed_archives.append(zip_filename)
                print(f"Error extracting '{zip_filename}': {e}")
            else:
                print(f"Successfully extracted '{zip_filename}' to '{output_folder_path}'")

        # Only claim success when every archive was actually extracted
        if failed_archives:
            print(f"Extraction finished with errors; failed archives: {failed_archives}")
        else:
            print("Extraction process complete.")

Found the following zip files to extract: ['1512427.zip', 'archive.zip', 'Brain Tumor data.zip']
Successfully extracted '1512427.zip' to 'data\1512427'
Successfully extracted 'archive.zip' to 'data\archive'
Successfully extracted 'Brain Tumor data.zip' to 'data\Brain Tumor data'
Extraction process complete.


The PMRAM dataset is divided into two folders: one with augmented data and one with raw data. We drop the augmented folder and augment the raw data ourselves in the data augmentation section.

In [4]:
# The dataset folder name appears twice in this path because unzipping in the
# previous step produced a duplicated, nested folder (checked: SOLVED).
# NOTE: this could break if somnath automatically unzips the files, although
# that is not expected to be the case.
augmented_data_dir = 'data/archive/PMRAM Bangladeshi Brain Cancer - MRI Dataset/PMRAM Bangladeshi Brain Cancer - MRI Dataset/Augmented Data'
shutil.rmtree(augmented_data_dir)

In [5]:
# Flatten the deeply nested PMRAM folder layout so later steps can use shorter paths.

# Where the raw data currently lives after extraction
deep_folder = 'data/archive/PMRAM Bangladeshi Brain Cancer - MRI Dataset/PMRAM Bangladeshi Brain Cancer - MRI Dataset/Raw Data/Raw'

# Where it should end up
new_base = 'data/PMRAM_Dataset/Raw Data'
os.makedirs(new_base, exist_ok=True)

# Relocate every entry of the nested folder into the new location
for entry in os.listdir(deep_folder):
    shutil.move(os.path.join(deep_folder, entry), os.path.join(new_base, entry))

# The old 'data/archive' tree is no longer needed once its contents are moved
shutil.rmtree('data/archive')


The Brain Tumor data database is already in JPG format and is divided into training/testing splits and classes; we will change the split ourselves later. 
For now we remove the duplicated folder and rename it to improve readability.


Database 1512427 contains various sub-folders, provided as 4 .zip files, each containing 766 slices, as described in the original README. We will unzip them and move them into a single folder.

In [6]:
# The nested archives are unpacked in place: source and target are the same folder
source_zip_dir = 'data/1512427'
target_extract_base_dir = 'data/1512427'

# Archives that failed to extract or delete; reported once the loop is done
failed_zip_files = []

if not os.path.isdir(source_zip_dir):
    print(f"Error: Source directory '{source_zip_dir}' not found.")
else:
    zip_files_found = [f for f in os.listdir(source_zip_dir) if f.lower().endswith('.zip')]

    if not zip_files_found:
        print(f"No .zip files found in '{source_zip_dir}'.")
    else:
        print(f"Found the following zip files to extract: {zip_files_found}")

        for zip_filename in zip_files_found:
            zip_file_full_path = os.path.join(source_zip_dir, zip_filename)

            # Each archive is unpacked into a folder named after it (without .zip)
            extraction_folder_name = os.path.splitext(zip_filename)[0]
            output_folder_path = os.path.join(target_extract_base_dir, extraction_folder_name)
            os.makedirs(output_folder_path, exist_ok=True)

            try:
                with zipfile.ZipFile(zip_file_full_path, 'r') as zip_ref:
                    zip_ref.extractall(output_folder_path)
                print(f"Successfully extracted '{zip_filename}' to '{output_folder_path}'")

                # The archive is only removed once its extraction succeeded
                os.remove(zip_file_full_path)
                print(f"Successfully deleted '{zip_filename}'")

            except Exception as e:
                # Best effort: remember the failure and carry on with the other archives
                failed_zip_files.append(zip_filename)
                print(f"Error during processing of '{zip_filename}': {e}")

        # Only claim success when every archive was extracted and deleted
        if failed_zip_files:
            print(f"\nUnzipping finished with errors; failed archives: {failed_zip_files}")
        else:
            print("\nUnzipping and deletion process complete.")


Found the following zip files to extract: ['brainTumorDataPublic_1-766.zip', 'brainTumorDataPublic_1533-2298.zip', 'brainTumorDataPublic_2299-3064.zip', 'brainTumorDataPublic_767-1532.zip']
Successfully extracted 'brainTumorDataPublic_1-766.zip' to 'data/1512427\brainTumorDataPublic_1-766'
Successfully deleted 'brainTumorDataPublic_1-766.zip'
Successfully extracted 'brainTumorDataPublic_1533-2298.zip' to 'data/1512427\brainTumorDataPublic_1533-2298'
Successfully deleted 'brainTumorDataPublic_1533-2298.zip'
Successfully extracted 'brainTumorDataPublic_2299-3064.zip' to 'data/1512427\brainTumorDataPublic_2299-3064'
Successfully deleted 'brainTumorDataPublic_2299-3064.zip'
Successfully extracted 'brainTumorDataPublic_767-1532.zip' to 'data/1512427\brainTumorDataPublic_767-1532'
Successfully deleted 'brainTumorDataPublic_767-1532.zip'

Unzipping and deletion process complete.


### Converting MAT files into JPG

In [7]:
import h5py
import numpy as np
from PIL import Image


# Folder that holds the four extracted sub-folders of .mat files
base_folder = 'data/1512427'

input_folders = [
    'brainTumorDataPublic_1-766',
    'brainTumorDataPublic_767-1532',
    'brainTumorDataPublic_1533-2298',
    'brainTumorDataPublic_2299-3064'
]

# Output base folder: one sub-folder per class is created below
output_folder = 'data/China_Dataset'
os.makedirs(output_folder, exist_ok=True)

# Label to class name mapping
label_map = {
    1: 'meningioma',
    2: 'glioma',
    3: 'pituitary'
}

# Convert every .mat file of every input folder into a JPG filed under its class
for input_folder_name in input_folders:
    input_folder_path = os.path.join(base_folder, input_folder_name)
    # Case-insensitive extension match, consistent with the .zip handling in this notebook
    mat_files = [name for name in os.listdir(input_folder_path) if name.lower().endswith('.mat')]

    for mat_file in mat_files:
        mat_file_path = os.path.join(input_folder_path, mat_file)

        # Load the image and its label from the 'cjdata' group of the .mat file
        with h5py.File(mat_file_path, 'r') as mat:
            # NOTE(review): the transpose presumably undoes MATLAB's column-major
            # storage order - confirm against the dataset README
            image = np.array(mat['cjdata']['image']).T
            label = int(np.array(mat['cjdata']['label'])[0][0])

        # Min-max normalize to 0..255
        image = image.astype(np.float64)
        lowest, highest = image.min(), image.max()
        if highest > lowest:
            image = ((image - lowest) / (highest - lowest) * 255).astype(np.uint8)
        else:
            # A constant-intensity slice has zero range; dividing by it would
            # produce NaN, so emit an all-black image instead
            image = np.zeros(image.shape, dtype=np.uint8)

        # Labels missing from label_map are filed under 'unknown'
        class_name = label_map.get(label, 'unknown')
        class_folder = os.path.join(output_folder, class_name)
        os.makedirs(class_folder, exist_ok=True)

        # Save the image as <original file name>.jpg
        base_name = os.path.splitext(mat_file)[0]
        output_path = os.path.join(class_folder, base_name + '.jpg')
        Image.fromarray(image).save(output_path)

# Remove the old folder containing the MAT files
shutil.rmtree(base_folder)


### Now all the datasets are in jpg format. 
We will now apply augmentation where needed and then move all the datasets into a single folder before starting model building.

#  

Defining the augmentation procedures

In [8]:
def rotate_image(image, angle):
    """Rotate an image about its center, keeping the original width and height.

    Args:
        image: Image array whose first two dimensions are (height, width).
        angle: Rotation angle in degrees, passed to cv2.getRotationMatrix2D
            (in OpenCV's convention positive angles rotate counter-clockwise).

    Returns:
        The rotated image; areas uncovered by the rotation are filled by
        reflecting the image border (cv2.BORDER_REFLECT).
    """
    height, width = image.shape[:2]
    rotation = cv2.getRotationMatrix2D((width // 2, height // 2), angle, 1.0)
    return cv2.warpAffine(image, rotation, (width, height), borderMode=cv2.BORDER_REFLECT)

def zoom_image(image, zoom_factor):
    """Zoom an image in or out while keeping its original width and height.

    The image is first resized by `zoom_factor`. When the factor is greater
    than 1 the enlarged image is center-cropped back to the original size;
    otherwise the shrunken image is padded back to the original size with a
    reflected border (cv2.BORDER_REFLECT).

    Args:
        image: Image array whose first two dimensions are (height, width).
        zoom_factor: Scale factor applied to both dimensions.

    Returns:
        The zoomed image with the same height and width as the input.
    """
    height, width = image.shape[:2]
    scaled_h = int(height * zoom_factor)
    scaled_w = int(width * zoom_factor)
    scaled = cv2.resize(image, (scaled_w, scaled_h))

    if zoom_factor > 1:
        # Zoom in: keep the central window of the enlarged image
        top = (scaled_h - height) // 2
        left = (scaled_w - width) // 2
        return scaled[top:top + height, left:left + width]

    # Zoom out: distribute the missing rows/columns around the shrunken image;
    # the bottom/right sides absorb any odd remainder
    top = (height - scaled_h) // 2
    left = (width - scaled_w) // 2
    bottom = height - scaled_h - top
    right = width - scaled_w - left
    return cv2.copyMakeBorder(
        scaled, top, bottom, left, right,
        borderType=cv2.BORDER_REFLECT
    )

def apply_light_blur(image, kernel_size, sigma):
    """Apply a Gaussian blur with a square kernel.

    Args:
        image: Image array to blur.
        kernel_size: Side length of the square kernel, used for both width
            and height.
        sigma: Value passed as the third positional argument of
            cv2.GaussianBlur (the Gaussian standard deviation).

    Returns:
        The blurred image as returned by cv2.GaussianBlur.
    """
    square_kernel = (kernel_size, kernel_size)
    return cv2.GaussianBlur(image, square_kernel, sigma)

def process_with_specific_augmentations(img_path, output_dir, prefix):
    """Resize one image to 256x256 and write it plus five augmented variants.

    The files written to `output_dir` are named `<prefix>_<suffix>.jpg` with
    the suffixes original, rot_cw (-10 degrees), rot_ccw (+10 degrees),
    zoom_in (factor 1.1), zoom_out (factor 0.9) and blurred.

    Args:
        img_path: Path of the image to read.
        output_dir: Directory that receives the output files.
        prefix: File-name prefix shared by all variants of this image.

    Returns:
        None. An unreadable image is skipped with a warning, and a warning is
        printed for every variant that could not be written.
    """
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread returns None instead of raising when a file cannot be decoded
        print(f"Warning: could not read image '{img_path}', skipping.")
        return
    img = cv2.resize(img, (256, 256))

    # NOTE(review): with a 3x3 kernel and sigma=0.05 the off-center Gaussian
    # weights are ~exp(-200), so "blurred" is presumably almost identical to
    # "original" - confirm the intended sigma.
    variants = [
        ("original", img),
        ("rot_cw", rotate_image(img, -10)),
        ("rot_ccw", rotate_image(img, 10)),
        ("zoom_in", zoom_image(img, 1.1)),
        ("zoom_out", zoom_image(img, 0.9)),
        ("blurred", apply_light_blur(img, 3, 0.05)),
    ]

    for suffix, variant in variants:
        out_path = os.path.join(output_dir, f"{prefix}_{suffix}.jpg")
        # cv2.imwrite reports failure through its boolean return value
        if not cv2.imwrite(out_path, variant):
            print(f"Warning: failed to write '{out_path}'.")


Applying augmentation

In [9]:
import os
import cv2
import numpy as np
#from tensorflow.keras.preprocessing.image import ImageDataGenerator
from pathlib import Path

# Base input and output directories
input_base_dir = "data"
output_base_dir = "processed_data"

# Rebuild the output tree from scratch so results of earlier runs do not linger
if os.path.exists(output_base_dir):
    shutil.rmtree(output_base_dir)
os.makedirs(output_base_dir, exist_ok=True)

# File extensions (lower-case) treated as images
image_extensions = ('.jpg', '.jpeg', '.png')

# Traverse datasets: every sub-folder of input_base_dir is one dataset
for dataset_name in os.listdir(input_base_dir):
    dataset_path = os.path.join(input_base_dir, dataset_name)

    # Loose files directly inside input_base_dir are ignored
    if not os.path.isdir(dataset_path):
        continue

    for root, dirs, files in os.walk(dataset_path):
        for file in files:
            if file.lower().endswith(image_extensions):
                full_path = os.path.join(root, file)

                # Create output path preserving the structure relative to input_base_dir
                relative_path = os.path.relpath(root, input_base_dir)
                out_folder = os.path.join(output_base_dir, relative_path)
                os.makedirs(out_folder, exist_ok=True)

                # The file name without extension prefixes every generated variant
                prefix = Path(file).stem
                process_with_specific_augmentations(full_path, out_folder, prefix)

print("Image resizing and augmentation complete.")


Image resizing and augmentation complete.
