In [11]:
import pandas as pd
# Load the raw subject metadata table that the label clean-up and the
# class balancing below operate on.
subject_info_path = r"references\Subject_info.xlsx"
df = pd.read_excel(subject_info_path)
def preprocess_labels(df):
    """Merge fine-grained diagnostic labels into their parent category.

    In the ``Research Group`` column, ``EMCI`` and ``LMCI`` are relabelled as
    ``MCI`` and ``SMC`` is relabelled as ``CN``; every other value is kept.

    The frame is updated in place and the same object is returned.
    """
    # Parent category -> labels that are folded into it. Remove 'SMC' from the
    # 'CN' entry if SMC subjects should not be treated as CN.
    merged_into = {
        'MCI': ('EMCI', 'LMCI'),
        'CN': ('SMC',),
    }
    relabel = {
        old_label: parent
        for parent, old_labels in merged_into.items()
        for old_label in old_labels
    }
    df['Research Group'] = df['Research Group'].replace(relabel)
    return df

# Collapse the fine-grained diagnostic labels (EMCI/LMCI -> MCI, SMC -> CN).
# preprocess_labels updates the frame in place and also returns it; binding the
# result makes the data flow explicit.
df = preprocess_labels(df)

# Drop the AD subjects; only the remaining research groups are balanced below.
df = df[df['Research Group'] != 'AD']

# Within each dataset split, down-sample every Research Group to the size of
# the smallest group in that split. The samples are collected in a list and
# concatenated once: this avoids re-copying a growing frame on every iteration
# and avoids concatenating onto an empty DataFrame, which recent pandas
# releases deprecate.
balanced_parts = []
for split in df['dataset_split'].unique():
    # Rows belonging to the current split only
    split_df = df[df['dataset_split'] == split]

    # Size of the smallest Research Group within this split
    min_size = split_df['Research Group'].value_counts().min()

    # Fixed random_state so the same subjects are drawn on every run
    balanced_parts.extend(
        group_df.sample(n=min_size, random_state=1)
        for _, group_df in split_df.groupby('Research Group')
    )

if balanced_parts:
    balanced_df = pd.concat(balanced_parts, ignore_index=True)
else:
    # No rows to balance: keep an empty frame that still has the input columns
    # so the column accesses below do not fail.
    balanced_df = df.iloc[0:0].copy()

df = balanced_df

# Integer-encode the research group. pd.Categorical orders the distinct labels
# by sorting them, so the codes follow sorted label order (CN -> 0, MCI -> 1 in
# the saved table); missing labels are encoded as -1.
df['Research Group_INT'] = pd.Categorical(df['Research Group']).codes


In [12]:
import os
import pandas as pd

import os
import pandas as pd
import re  # Import regular expressions

def assign_image_paths(df, directory, column_name):
    """Attach a per-subject image path column to ``df``.

    Every entry of ``directory`` whose name contains a subject ID of the form
    ``<digits>_S_<digits>`` (e.g. ``002_S_1280``) is matched against the
    ``Subject`` column, and the entry's full path is stored in ``column_name``.

    Args:
        df: DataFrame with a ``Subject`` column. It is modified in place.
        directory: Directory whose entries are scanned (not recursively).
        column_name: Name of the column that receives the paths.

    Returns:
        The same ``df`` object, with ``column_name`` added or overwritten.

    Raises:
        ValueError: If at least one subject in ``df`` has no matching entry in
            ``directory``. The column has already been written at that point.
    """
    # Subject IDs are digits, '_S_', digits; the first such match in a name wins.
    subject_pattern = re.compile(r'(\d+_S_\d+)')

    # A set gives constant-time membership tests instead of scanning the whole
    # Subject column once per directory entry.
    known_subjects = set(df['Subject'])

    # Mapping of subject ID -> full path of the matching entry
    subject_paths = {}

    # os.listdir returns entries in arbitrary order. Sorting makes the result
    # reproducible when several entries match the same subject: the entry that
    # comes last in sorted order is the one that is kept.
    for filename in sorted(os.listdir(directory)):
        match = subject_pattern.search(filename)
        if match is None:
            continue
        subject_id = match.group(1)
        if subject_id in known_subjects:
            subject_paths[subject_id] = os.path.join(directory, filename)

    # Subjects without a matching entry end up as NaN in the new column
    df[column_name] = df['Subject'].map(subject_paths)

    missing = df[column_name].isnull()
    if missing.any():
        missing_subjects = df.loc[missing, 'Subject'].tolist()
        raise ValueError(f"No path found for subjects: {missing_subjects}")

    return df


# Directories holding the preprocessed PET and MRI volumes
pet_directory = r"D:\Data\Preprocessed\Masked PET"
mri_directory = r"D:\Data\Preprocessed\spatial_normalization"

# Load the balanced subject table. The path is a raw string because "\S" is not
# a valid escape sequence: a plain string literal only works by accident and
# emits a Deprecation/SyntaxWarning on current Python versions. The resulting
# path value is unchanged.
df = pd.read_excel(r"references\Subject_info_balanced.xlsx")

# Add one path column per modality; assign_image_paths raises ValueError if any
# subject has no matching file in the given directory.
df = assign_image_paths(df, pet_directory, 'PATH_PET')
df = assign_image_paths(df, mri_directory, 'PATH_MRI')


In [13]:
# Give the generic 'PATH' column a descriptive name. rename() ignores labels
# that are not present, so this leaves the frame unchanged when there is no
# 'PATH' column.
df = df.rename({'PATH': 'PATH_MRI_PET'}, axis='columns')


In [14]:
# Display the table to check the new PATH_PET / PATH_MRI columns; the rendering
# is truncated to the first and last rows.
df

Unnamed: 0,Subject,Sex,Weight,Research Group,APOE A1,APOE A2,Age,dataset_split,File_Path,File_Path_desktop,PATH_MRI_PET,Research Group_INT,PATH_PET,PATH_MRI
0,002_S_1280,F,89.4,CN,3.0,4.0,75.1,train,C:\Users\Micha\OneDrive - Høyskolen Kristiania...,D:\Data\Preprocessed\Fused Images\002_S_1280_f...,data\processed\002_S_1280_fused.nii,0,D:\Data\Preprocessed\Masked PET\002_S_1280_Mas...,D:\Data\Preprocessed\spatial_normalization\002...
1,128_S_0863,M,92.1,CN,3.0,3.0,79.3,train,C:\Users\Micha\OneDrive - Høyskolen Kristiania...,D:\Data\Preprocessed\Fused Images\128_S_0863_f...,data\processed\128_S_0863_fused.nii,0,D:\Data\Preprocessed\Masked PET\128_S_0863_Mas...,D:\Data\Preprocessed\spatial_normalization\128...
2,002_S_4213,F,80.0,CN,3.0,3.0,78.1,train,C:\Users\Micha\OneDrive - Høyskolen Kristiania...,D:\Data\Preprocessed\Fused Images\002_S_4213_f...,data\processed\002_S_4213_fused.nii,0,D:\Data\Preprocessed\Masked PET\002_S_4213_Mas...,D:\Data\Preprocessed\spatial_normalization\002...
3,032_S_5289,F,79.4,CN,3.0,4.0,59.8,train,C:\Users\Micha\OneDrive - Høyskolen Kristiania...,D:\Data\Preprocessed\Fused Images\032_S_5289_f...,data\processed\032_S_5289_fused.nii,0,D:\Data\Preprocessed\Masked PET\032_S_5289_Mas...,D:\Data\Preprocessed\spatial_normalization\032...
4,053_S_5296,M,76.0,CN,3.0,3.0,69.3,train,C:\Users\Micha\OneDrive - Høyskolen Kristiania...,D:\Data\Preprocessed\Fused Images\053_S_5296_f...,data\processed\053_S_5296_fused.nii,0,D:\Data\Preprocessed\Masked PET\053_S_5296_Mas...,D:\Data\Preprocessed\spatial_normalization\053...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,002_S_4447,F,68.5,MCI,3.0,4.0,69.7,validation,C:\Users\Micha\OneDrive - Høyskolen Kristiania...,D:\Data\Preprocessed\Fused Images\002_S_4447_f...,data\processed\002_S_4447_fused.nii,1,D:\Data\Preprocessed\Masked PET\002_S_4447_Mas...,D:\Data\Preprocessed\spatial_normalization\002...
124,022_S_2167,M,71.4,MCI,3.0,3.0,83.2,validation,C:\Users\Micha\OneDrive - Høyskolen Kristiania...,D:\Data\Preprocessed\Fused Images\022_S_2167_f...,data\processed\022_S_2167_fused.nii,1,D:\Data\Preprocessed\Masked PET\022_S_2167_Mas...,D:\Data\Preprocessed\spatial_normalization\022...
125,022_S_4805,F,43.5,MCI,3.0,4.0,72.2,validation,C:\Users\Micha\OneDrive - Høyskolen Kristiania...,D:\Data\Preprocessed\Fused Images\022_S_4805_f...,data\processed\022_S_4805_fused.nii,1,D:\Data\Preprocessed\Masked PET\022_S_4805_Mas...,D:\Data\Preprocessed\spatial_normalization\022...
126,002_S_4251,M,78.0,MCI,3.0,3.0,72.0,validation,C:\Users\Micha\OneDrive - Høyskolen Kristiania...,D:\Data\Preprocessed\Fused Images\002_S_4251_f...,data\processed\002_S_4251_fused.nii,1,D:\Data\Preprocessed\Masked PET\002_S_4251_Mas...,D:\Data\Preprocessed\spatial_normalization\002...


In [15]:
# Persist the table, without the row index, next to the other reference files.
# NOTE: this overwrites the workbook that is read back in when the image paths
# are assigned earlier in the notebook.
balanced_table_path = 'references/Subject_info_balanced.xlsx'
df.to_excel(balanced_table_path, index=False)