In [3]:
import nibabel as nib
import numpy as np
import os
import pandas as pd
from PIL import Image
import xml.etree.ElementTree as ET
from skimage.transform import resize
from skimage import img_as_ubyte  # Additional import to safely convert to ubyte

def convert_img_to_jpegs(input_img_path, output_folder, img_dim=(224, 224)):
    # Load the .img file using nibabel
    img = nib.load(input_img_path)
    data = img.get_fdata()

    # Normalize the data to 0-255
    data = 255 * (data - np.min(data)) / (np.max(data) - np.min(data))
    data = data.astype(np.uint8)

    output_paths = []

    # Iterate over all slices in the 3rd dimension
    for slice_index in range(data.shape[2]):
        slice_data = data[:, :, slice_index]
        
        # Resize image if needed
        if slice_data.shape != img_dim:
            slice_data = resize(slice_data, img_dim, anti_aliasing=True)
            slice_data = img_as_ubyte(slice_data)  # Ensure type is uint8 and avoid overflow
        
        # Ensure the data is 2D for PIL conversion
        if len(slice_data.shape) != 2:
            slice_data = slice_data.squeeze()  # Removes any singleton dimensions
        
        # Convert to PIL image and save as JPEG
        img_pil = Image.fromarray(slice_data)
        img_filename = f"{os.path.basename(input_img_path).replace('.img', '')}_slice_{slice_index}.jpeg"
        output_path = os.path.join(output_folder, img_filename)
        img_pil.save(output_path, "JPEG")
        output_paths.append(output_path)

    return output_paths


def extract_metadata_from_xml(xml_file_path):
    """Extracts metadata from XML file."""
    tree = ET.parse(xml_file_path)
    root = tree.getroot()
    metadata = {}
    for child in root:
        metadata[child.tag] = child.text
    return metadata

def process_images(base_dir, output_dir):
    # Ensure the output directory exists
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    records = []
    # Walk through the directory structure
    for root, dirs, files in os.walk(base_dir):
        # Find only the .img files in the T88_111 subdirectory
        if 'T88_111' in root:
            img_files = [f for f in files if f.endswith('.img')]
            subject_id = root.split(os.sep)[-4]  # Assuming 'OAS1_xxxx_MRy' is 4 levels up
            session_id = root.split(os.sep)[-3]  # Session folder is directly 3 levels up
            
            # Path to the XML file
            xml_path = os.path.join(os.sep.join(root.split(os.sep)[:-2]), f'{session_id}.xml')

            # Extract metadata if XML file exists
            metadata = {}
            if os.path.exists(xml_path):
                metadata = extract_metadata_from_xml(xml_path)

            # Process each image file found in the T88_111 directory
            for img_file in img_files:
                img_path = os.path.join(root, img_file)
                output_paths = convert_img_to_jpegs(img_path, output_dir)
                
                # Store metadata and paths for each slice
                for output_path in output_paths:
                    records.append({
                        "file_path": output_path,
                        "subject_id": subject_id,
                        "session_id": session_id,
                        **metadata  # Merge additional metadata from XML
                    })

    # Create a DataFrame with all records
    df = pd.DataFrame(records)
    return df

# Example usage:
base_directory = 'Data/OASIS1'
output_directory = 'Data/OASIS_Extracted'
df_images = process_images(base_directory, output_directory)
print(df_images.head())



                                           file_path     subject_id session_id
0  Data/OASIS_Extracted/OAS1_0448_MR1_mpr_n4_anon...  OAS1_0448_MR1  PROCESSED
1  Data/OASIS_Extracted/OAS1_0448_MR1_mpr_n4_anon...  OAS1_0448_MR1  PROCESSED
2  Data/OASIS_Extracted/OAS1_0448_MR1_mpr_n4_anon...  OAS1_0448_MR1  PROCESSED
3  Data/OASIS_Extracted/OAS1_0448_MR1_mpr_n4_anon...  OAS1_0448_MR1  PROCESSED
4  Data/OASIS_Extracted/OAS1_0448_MR1_mpr_n4_anon...  OAS1_0448_MR1  PROCESSED
