In [4]:
import os
import re

# Root folder of the dataset; the cells below walk it with os.walk
data_folder = "Data"

# Define the regex pattern for matching the filenames.
# Capture groups, in order: the 4 digits after "OAS1_", the digits after "MR",
# the single digit after "mpr-", and the 3 digits right before ".jpg".
# extract_info_from_filename reads them as subject ID, session, mpr and slice.
pattern = re.compile(r"^OAS1_(\d{4})_MR(\d+)_mpr-(\d{1})_(\d{3})\.jpg$")

# Function to check if a filename matches the pattern


def is_valid_filename(filename):
    """Return True when `filename` matches the module-level `pattern` regex."""
    return pattern.match(filename) is not None


# Function to iterate through all files and folders and check filenames


def check_file_structure(root_folder):
    """Check that every file under `root_folder` has a valid filename.

    Walks `root_folder` recursively and validates each filename with
    `is_valid_filename`. Every offending file is printed, not only the first
    one encountered, so a single run lists all problems.

    Args:
        root_folder: Directory to walk.

    Returns:
        True if every filename is valid (or no files were found),
        False if at least one filename is invalid.
    """
    all_valid = True
    for subdir, _, files in os.walk(root_folder):
        for file in files:
            if not is_valid_filename(file):
                print(f"Invalid filename found: {file} in directory: {subdir}")
                # Keep scanning instead of returning here, so that later
                # offenders are reported as well.
                all_valid = False
    return all_valid


# Validate the data folder and report the overall verdict
print(
    "All filenames follow the correct structure."
    if check_file_structure(data_folder)
    else "Some filenames do not follow the correct structure."
)

Invalid filename found: .DS_Store in directory: Data
Some filenames do not follow the correct structure.


In [5]:
import os
from collections import defaultdict
from PIL import Image

# Image paths grouped by dementia level; the keys are the expected class folder names
image_paths = {
    "Non Demented": [],
    "Very mild Dementia": [],
    "Mild Dementia": [],
    "Moderate Dementia": [],
}

# Traverse through the folder structure
for root, _, files in os.walk(data_folder):
    # The dementia level is the name of the folder that directly contains the files
    dementia_level = os.path.basename(root)
    for file in files:
        if file.endswith(".jpg"):
            # Add image path to the corresponding dementia level.
            # A .jpg located in a folder that is not one of the keys above
            # raises KeyError.
            image_paths[dementia_level].append(os.path.join(root, file))


def check_image_formats(image_paths):
    """Report which file extensions and image sizes occur across all images.

    Args:
        image_paths: Mapping whose values are lists of image file paths.

    Returns:
        A pair of booleans: whether exactly one extension was seen, and
        whether exactly one image size was seen.
    """

    def report(heading, values):
        # Print the heading, then one collected value per line.
        print(heading)
        for value in values:
            print(value)

    extensions = set()
    dimensions = set()
    every_path = (path for group in image_paths.values() for path in group)
    for path in every_path:
        # The "format" is whatever follows the last dot of the path
        extensions.add(path.rsplit(".", 1)[-1])

        # The size is read from the opened image
        with Image.open(path) as img:
            dimensions.add(img.size)

    report("Different image formats:", extensions)
    report("\nDifferent image sizes:", dimensions)

    return len(extensions) == 1, len(dimensions) == 1


# Run the check on the collected paths; as the last expression of the cell,
# the returned pair of booleans is displayed as the cell result
check_image_formats(image_paths)

Different image formats:
jpg

Different image sizes:
(496, 248)


(True, True)

In [6]:
import pandas as pd

# Function to extract information from filename


def extract_info_from_filename(filename):
    """Split a dataset filename into its components.

    Args:
        filename: Bare filename (no directory part) expected to match the
            module-level `pattern`.

    Returns:
        Tuple `(subject_id, session, mpr, slice_number)`. The first three are
        the matched digit strings; `slice_number` is converted to int.

    Raises:
        ValueError: If `filename` does not match `pattern`.
    """
    match = pattern.match(filename)
    if match is None:
        # Without this guard a non-matching name fails further down with an
        # opaque AttributeError on None.
        raise ValueError(
            f"Filename does not match the expected pattern: {filename!r}"
        )
    subject_id, session, mpr, slice_text = match.group(1, 2, 3, 4)
    return subject_id, session, mpr, int(slice_text)


# Function to create DataFrame


def create_dataframe(root_folder):
    """Build one DataFrame row per validly named file under `root_folder`.

    The class label of a row is the name of the folder that directly contains
    the file; the remaining fields come from `extract_info_from_filename`.
    Files rejected by `is_valid_filename` are skipped.

    Args:
        root_folder: Directory to walk recursively.

    Returns:
        DataFrame with columns class, subject_ID, session, mpr, slice, path.
    """
    column_names = ["class", "subject_ID", "session", "mpr", "slice", "path"]
    rows = []
    for folder, _, filenames in os.walk(root_folder):
        label = os.path.basename(folder)
        rows.extend(
            (label, *extract_info_from_filename(name), os.path.join(folder, name))
            for name in filenames
            if is_valid_filename(name)
        )
    return pd.DataFrame(rows, columns=column_names)


# Build the DataFrame and order its rows by class, subject, session, mpr and slice
df = create_dataframe(data_folder).sort_values(
    by=["class", "subject_ID", "session", "mpr", "slice"]
)


# Show the first rows
print(df.head())

              class subject_ID session mpr  slice  \
916   Mild Dementia       0028       1   1    100   
1140  Mild Dementia       0028       1   1    101   
836   Mild Dementia       0028       1   1    102   
630   Mild Dementia       0028       1   1    103   
278   Mild Dementia       0028       1   1    104   

                                                path  
916   Data/Mild Dementia/OAS1_0028_MR1_mpr-1_100.jpg  
1140  Data/Mild Dementia/OAS1_0028_MR1_mpr-1_101.jpg  
836   Data/Mild Dementia/OAS1_0028_MR1_mpr-1_102.jpg  
630   Data/Mild Dementia/OAS1_0028_MR1_mpr-1_103.jpg  
278   Data/Mild Dementia/OAS1_0028_MR1_mpr-1_104.jpg  


In [8]:
# One row per (class, subject_ID, session, mpr) combination: the paths of the
# group are collected into a list, each list is ordered by the slice number
# found at the end of the file name, and the column is renamed to 'paths'
grouped_df = (
    df.groupby(["class", "subject_ID", "session", "mpr"])
    .agg({"path": list})
    .reset_index()
    .assign(
        path=lambda frame: frame["path"].map(
            lambda group_paths: sorted(
                group_paths,
                key=lambda p: int(p.split("_")[-1].split(".")[0]),
            )
        )
    )
    .rename(columns={"path": "paths"})
)

grouped_df

# Write the grouped DataFrame to a JSON file, one record per group
grouped_df.to_json("Data/alzheimer_data.json", orient="records")