# Organising the Dataset

In [1]:
import os
import pandas as pd
import shutil

# Root folder that holds the raw images (one sub-folder per class)
source_folder = "C:/Users/Adison/Downloads/Data"

# Curated output tree; it lives inside the source folder
dest_folder = f"{source_folder}/Curated OASIS"

# Excel workbook with the patient ids and class labels, kept in the curated folder
excel_file = f"{dest_folder}/patient_list.xlsx"

In [52]:
# Train / Validation / Test folders under the curated destination folder
train_folder = dest_folder + "/Train"
validation_folder = dest_folder + "/Validation"
test_folder = dest_folder + "/Test"

# exist_ok=True makes this cell safe to re-run and removes the separate
# "check, then create" step
for split_folder in (train_folder, validation_folder, test_folder):
    os.makedirs(split_folder, exist_ok=True)

In [3]:
# Load the patient list (patient id and class label) from the Excel workbook
df = pd.read_excel(io=excel_file)

# Preview the first rows
df.head()

Unnamed: 0,id,class
0,1,Non
1,2,Non
2,4,Non
3,5,Non
4,6,Non


# Assigning Patients to Train, Validation or Testing Set

In [4]:
import pandas as pd
import numpy as np

# Partition the patient table by its "class" label. Each subset is taken as an
# independent copy, so adding the "assign" column later does not raise
# SettingWithCopyWarning.
df_non = df.loc[df['class'] == 'Non'].copy()
df_vmild = df.loc[df['class'] == 'Very Mild'].copy()
df_mildmoderate = df.loc[df['class'] == 'Mild-Moderate'].copy()

# Probability of a patient being assigned to each split
probabilities = {'Train': 0.7, 'Validation': 0.2, 'Test': 0.1}


def assign_values():
    """Draw one split label at random, weighted by ``probabilities``."""
    split_names, split_weights = zip(*probabilities.items())
    return np.random.choice(split_names, p=split_weights)

def print_count_and_percentage(df, title):
    """Print how many rows of ``df`` fall into each value of its "assign" column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with an "assign" column.
    title : str
        Heading printed before the breakdown.

    Output is the title, then one "<value>: <count> (<percent>%)" line per
    distinct value (percent with two decimals), then an empty line.
    """
    print(title)
    tally = df['assign'].value_counts()
    shares = tally / tally.sum() * 100
    for name, n_rows, share in zip(tally.index, tally, shares):
        print(f"{name}: {n_rows} ({share:.2f}%)")
    print()

In [5]:
# Assign every "Non" patient to Train / Validation / Test.
# A dedicated, seeded generator makes the draw reproducible: re-running this cell
# (in any order, any number of times) always yields the same split.
rng_non = np.random.default_rng(42)
df_non['assign'] = rng_non.choice(
    list(probabilities.keys()),
    size=len(df_non),
    p=list(probabilities.values()),
)

print_count_and_percentage(df_non, "Counts and Percentages for df_non:")

Counts and Percentages for df_non:
Train: 185 (69.55%)
Validation: 56 (21.05%)
Test: 25 (9.40%)



In [24]:
# Assign every "Very Mild" patient to Train / Validation / Test.
# A dedicated, seeded generator makes the draw reproducible: re-running this cell
# (in any order, any number of times) always yields the same split.
rng_vmild = np.random.default_rng(43)
df_vmild['assign'] = rng_vmild.choice(
    list(probabilities.keys()),
    size=len(df_vmild),
    p=list(probabilities.values()),
)

print_count_and_percentage(df_vmild, "Counts and Percentages for df_vmild:")

Counts and Percentages for df_vmild:
Train: 40 (68.97%)
Validation: 13 (22.41%)
Test: 5 (8.62%)



In [49]:
# Assign every "Mild-Moderate" patient to Train / Validation / Test.
# A dedicated, seeded generator makes the draw reproducible: re-running this cell
# (in any order, any number of times) always yields the same split.
rng_mildmoderate = np.random.default_rng(44)
df_mildmoderate['assign'] = rng_mildmoderate.choice(
    list(probabilities.keys()),
    size=len(df_mildmoderate),
    p=list(probabilities.values()),
)

print_count_and_percentage(df_mildmoderate, "Counts and Percentages for df_mildmoderate:")

Counts and Percentages for df_mildmoderate:
Train: 16 (69.57%)
Validation: 5 (21.74%)
Test: 2 (8.70%)



In [50]:
# Stack the three per-class tables (each carrying its "assign" column) into one
combined_df = pd.concat([df_non, df_vmild, df_mildmoderate], axis=0)

# Save the patient-level split, without the index column, as
# "patient_list_assigned.csv" in the current working directory
combined_df.to_csv('patient_list_assigned.csv', index=False)


In [83]:
# Display the "Non" patients together with the split each one was assigned to
df_non

Unnamed: 0,id,class,assign
0,1,Non,Train
1,2,Non,Train
2,4,Non,Train
3,5,Non,Test
4,6,Non,Train
...,...,...,...
261,376,Non,Train
262,377,Non,Validation
263,378,Non,Train
264,379,Non,Test


# Collecting names of all files

### Mild-Moderate Dementia

In [66]:
# Folder holding the raw "Mild-Moderate Dementia" images
source_folder_mildmod = f"{source_folder}/Mild-Moderate Dementia"

# One table row per entry found in that folder
filenames_mildmod = os.listdir(source_folder_mildmod)
df_filenames_mildmod = pd.DataFrame(data={'filename': filenames_mildmod})

# Report the table size, then show the table
print("DataFrame size:", df_filenames_mildmod.shape)
df_filenames_mildmod


DataFrame size: (5490, 1)


Unnamed: 0,filename
0,OAS1_0028_MR1_mpr-1_100.jpg
1,OAS1_0028_MR1_mpr-1_101.jpg
2,OAS1_0028_MR1_mpr-1_102.jpg
3,OAS1_0028_MR1_mpr-1_103.jpg
4,OAS1_0028_MR1_mpr-1_104.jpg
...,...
5485,OAS1_0382_MR1_mpr-4_156.jpg
5486,OAS1_0382_MR1_mpr-4_157.jpg
5487,OAS1_0382_MR1_mpr-4_158.jpg
5488,OAS1_0382_MR1_mpr-4_159.jpg


In [67]:
# Function to extract ID from filename
def extract_id(filename):
    """Return the numeric ID embedded in an image filename.

    The ID is read from the second underscore-separated segment of the name,
    keeping only its digits, e.g. "OAS1_0028_MR1_mpr-1_100.jpg" -> 28
    (leading zeros are dropped by the integer conversion).

    Parameters
    ----------
    filename : str
        File name such as "OAS1_0028_MR1_mpr-1_100.jpg".

    Returns
    -------
    int
        The ID as an integer.

    Raises
    ------
    ValueError
        If the name has no second "_"-separated segment, or that segment
        contains no digits (for example a stray non-image file in the folder).
        The message names the offending file so it is easy to locate.
    """
    parts = filename.split('_')
    if len(parts) < 2:
        raise ValueError(
            f"Cannot extract ID from {filename!r}: expected at least two '_'-separated segments"
        )

    # Keep only the digits of the ID segment
    id_numeric = ''.join(filter(str.isdigit, parts[1]))
    if not id_numeric:
        raise ValueError(
            f"Cannot extract ID from {filename!r}: segment {parts[1]!r} contains no digits"
        )

    return int(id_numeric)

# Derive the patient id of every image from its filename
df_filenames_mildmod['id'] = df_filenames_mildmod['filename'].apply(extract_id)

# Inner join on "id": every image row picks up its patient's class and split
df_filenames_mildmod_merged = df_filenames_mildmod.merge(
    df_mildmoderate, how='inner', on='id'
)

# Report the table size, then show the table
print("DataFrame size:", df_filenames_mildmod_merged.shape)
df_filenames_mildmod_merged

DataFrame size: (5490, 4)


Unnamed: 0,filename,id,class,assign
0,OAS1_0028_MR1_mpr-1_100.jpg,28,Mild-Moderate,Train
1,OAS1_0028_MR1_mpr-1_101.jpg,28,Mild-Moderate,Train
2,OAS1_0028_MR1_mpr-1_102.jpg,28,Mild-Moderate,Train
3,OAS1_0028_MR1_mpr-1_103.jpg,28,Mild-Moderate,Train
4,OAS1_0028_MR1_mpr-1_104.jpg,28,Mild-Moderate,Train
...,...,...,...,...
5485,OAS1_0382_MR1_mpr-4_156.jpg,382,Mild-Moderate,Train
5486,OAS1_0382_MR1_mpr-4_157.jpg,382,Mild-Moderate,Train
5487,OAS1_0382_MR1_mpr-4_158.jpg,382,Mild-Moderate,Train
5488,OAS1_0382_MR1_mpr-4_159.jpg,382,Mild-Moderate,Train


### Very Mild Dementia

In [75]:
# Folder holding the raw "Very mild Dementia" images
source_folder_vmild = f"{source_folder}/Very mild Dementia"

# One table row per entry found in that folder
filenames_vmild = os.listdir(source_folder_vmild)
df_filenames_vmild = pd.DataFrame(data={'filename': filenames_vmild})

# Report the table size, then show the table
print("DataFrame size:", df_filenames_vmild.shape)
df_filenames_vmild


DataFrame size: (13725, 1)


Unnamed: 0,filename
0,OAS1_0003_MR1_mpr-1_100.jpg
1,OAS1_0003_MR1_mpr-1_101.jpg
2,OAS1_0003_MR1_mpr-1_102.jpg
3,OAS1_0003_MR1_mpr-1_103.jpg
4,OAS1_0003_MR1_mpr-1_104.jpg
...,...
13720,OAS1_0380_MR1_mpr-4_156.jpg
13721,OAS1_0380_MR1_mpr-4_157.jpg
13722,OAS1_0380_MR1_mpr-4_158.jpg
13723,OAS1_0380_MR1_mpr-4_159.jpg


In [77]:
# Derive the patient id of every image from its filename
df_filenames_vmild['id'] = df_filenames_vmild['filename'].apply(extract_id)

# Inner join on "id": every image row picks up its patient's class and split
df_filenames_vmild_merged = df_filenames_vmild.merge(
    df_vmild, how='inner', on='id'
)

# Report the table size, then show the table
print("DataFrame size:", df_filenames_vmild_merged.shape)
df_filenames_vmild_merged

DataFrame size: (13725, 4)


Unnamed: 0,filename,id,class,assign
0,OAS1_0003_MR1_mpr-1_100.jpg,3,Very Mild,Train
1,OAS1_0003_MR1_mpr-1_101.jpg,3,Very Mild,Train
2,OAS1_0003_MR1_mpr-1_102.jpg,3,Very Mild,Train
3,OAS1_0003_MR1_mpr-1_103.jpg,3,Very Mild,Train
4,OAS1_0003_MR1_mpr-1_104.jpg,3,Very Mild,Train
...,...,...,...,...
13720,OAS1_0380_MR1_mpr-4_156.jpg,380,Very Mild,Train
13721,OAS1_0380_MR1_mpr-4_157.jpg,380,Very Mild,Train
13722,OAS1_0380_MR1_mpr-4_158.jpg,380,Very Mild,Train
13723,OAS1_0380_MR1_mpr-4_159.jpg,380,Very Mild,Train


### No Dementia (healthy)

In [82]:
# Folder holding the raw "Non Demented" images
source_folder_healthy = f"{source_folder}/Non Demented"

# One table row per entry found in that folder
filenames_healthy = os.listdir(source_folder_healthy)
df_filenames_healthy = pd.DataFrame(data={'filename': filenames_healthy})

# Report the table size, then show the table
print("DataFrame size:", df_filenames_healthy.shape)
df_filenames_healthy


DataFrame size: (67222, 1)


Unnamed: 0,filename
0,OAS1_0001_MR1_mpr-1_100.jpg
1,OAS1_0001_MR1_mpr-1_101.jpg
2,OAS1_0001_MR1_mpr-1_102.jpg
3,OAS1_0001_MR1_mpr-1_103.jpg
4,OAS1_0001_MR1_mpr-1_104.jpg
...,...
67217,OAS1_0381_MR1_mpr-4_156.jpg
67218,OAS1_0381_MR1_mpr-4_157.jpg
67219,OAS1_0381_MR1_mpr-4_158.jpg
67220,OAS1_0381_MR1_mpr-4_159.jpg


In [84]:
# Derive the patient id of every image from its filename
df_filenames_healthy['id'] = df_filenames_healthy['filename'].apply(extract_id)

# Inner join on "id": every image row picks up its patient's class and split
df_filenames_healthy_merged = df_filenames_healthy.merge(
    df_non, how='inner', on='id'
)

# Report the table size, then show the table
print("DataFrame size:", df_filenames_healthy_merged.shape)
df_filenames_healthy_merged

DataFrame size: (67222, 4)


Unnamed: 0,filename,id,class,assign
0,OAS1_0001_MR1_mpr-1_100.jpg,1,Non,Train
1,OAS1_0001_MR1_mpr-1_101.jpg,1,Non,Train
2,OAS1_0001_MR1_mpr-1_102.jpg,1,Non,Train
3,OAS1_0001_MR1_mpr-1_103.jpg,1,Non,Train
4,OAS1_0001_MR1_mpr-1_104.jpg,1,Non,Train
...,...,...,...,...
67217,OAS1_0381_MR1_mpr-4_156.jpg,381,Non,Train
67218,OAS1_0381_MR1_mpr-4_157.jpg,381,Non,Train
67219,OAS1_0381_MR1_mpr-4_158.jpg,381,Non,Train
67220,OAS1_0381_MR1_mpr-4_159.jpg,381,Non,Train


In [85]:
# CHECK
# Number of healthy-class image rows assigned to each split
assign_counts = df_filenames_healthy_merged['assign'].value_counts()

# Same breakdown expressed as a percentage of all rows
assign_percentages = assign_counts.div(assign_counts.sum()).mul(100)
print(assign_percentages)

# Column dtypes and non-null counts of the merged table
df_filenames_healthy_merged.info()

Train         70.417423
Validation    19.782214
Test           9.800363
Name: assign, dtype: float64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 67222 entries, 0 to 67221
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  67222 non-null  object
 1   id        67222 non-null  int64 
 2   class     67222 non-null  object
 3   assign    67222 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.6+ MB


# Moving Images for the "Mild-Moderate" Class

Sample file name: OAS1_0351_MR1_mpr-4_134.jpg

```
# print(row['id'])
# print(row['filename'])
# print(row['assign'])

316
OAS1_0316_MR1_mpr-3_105.jpg
Validation
```

In [53]:
# One "label_MildModerate" folder inside each of the Train / Validation / Test folders
label_mildmod_folder_train = train_folder + "/label_MildModerate"
label_mildmod_folder_val = validation_folder + "/label_MildModerate"
label_mildmod_folder_test = test_folder + "/label_MildModerate"

# Create all three folders, the Test one included, from the variables defined above.
# exist_ok=True makes the cell safe to re-run.
for label_folder in (
    label_mildmod_folder_train,
    label_mildmod_folder_val,
    label_mildmod_folder_test,
):
    os.makedirs(label_folder, exist_ok=True)

In [73]:
# Destination folder for each split label
mildmod_split_folders = {
    "Train": label_mildmod_folder_train,
    "Validation": label_mildmod_folder_val,
    "Test": label_mildmod_folder_test,
}

# Move every Mild-Moderate image into the folder of the split its patient belongs to
for index, row in df_filenames_mildmod_merged.iterrows():
    image_name = "/" + row['filename']
    label = row['assign']    # Train, Validation, Test

    # Rows with an unknown split label are reported and skipped
    destination_folder = mildmod_split_folders.get(label)
    if destination_folder is None:
        print(f"Invalid label {label} for image {image_name}. Skipping...")
        continue

    source_img_path = source_folder_mildmod + image_name
    destination_img_path = destination_folder + image_name

    # Images missing from the source folder are reported, not treated as an error
    if not os.path.exists(source_img_path):
        print(f"{image_name} not found in {source_img_path}.")
        continue

    shutil.move(source_img_path, destination_img_path)

print("Image organization based on labels complete.")


Image organization based on labels complete.


# Moving Images for the "Very Mild" Class

In [74]:
# One "label_VeryMild" folder inside each of the Train / Validation / Test folders
label_vmild_folder_train = train_folder + "/label_VeryMild"
label_vmild_folder_val = validation_folder + "/label_VeryMild"
label_vmild_folder_test = test_folder + "/label_VeryMild"

# exist_ok=True makes the cell safe to re-run and removes the separate
# "check, then create" step
for label_folder in (
    label_vmild_folder_train,
    label_vmild_folder_val,
    label_vmild_folder_test,
):
    os.makedirs(label_folder, exist_ok=True)

In [80]:
# Destination folder for each split label
vmild_split_folders = {
    "Train": label_vmild_folder_train,
    "Validation": label_vmild_folder_val,
    "Test": label_vmild_folder_test,
}

# Move every Very Mild image into the folder of the split its patient belongs to
for index, row in df_filenames_vmild_merged.iterrows():
    image_name = "/" + row['filename']
    label = row['assign']    # Train, Validation, Test

    # Rows with an unknown split label are reported and skipped
    destination_folder = vmild_split_folders.get(label)
    if destination_folder is None:
        print(f"Invalid label {label} for image {image_name}. Skipping...")
        continue

    source_img_path = source_folder_vmild + image_name
    destination_img_path = destination_folder + image_name

    # Images missing from the source folder are reported, not treated as an error
    if not os.path.exists(source_img_path):
        print(f"{image_name} not found in {source_img_path}.")
        continue

    shutil.move(source_img_path, destination_img_path)

print("Image organization based on labels complete.")


Image organization based on labels complete.


# Moving Images for the "Non Demented" Class

In [81]:
# One "label_Healthy" folder inside each of the Train / Validation / Test folders
label_healthy_folder_train = train_folder + "/label_Healthy"
label_healthy_folder_val = validation_folder + "/label_Healthy"
label_healthy_folder_test = test_folder + "/label_Healthy"

# exist_ok=True makes the cell safe to re-run and removes the separate
# "check, then create" step
for label_folder in (
    label_healthy_folder_train,
    label_healthy_folder_val,
    label_healthy_folder_test,
):
    os.makedirs(label_folder, exist_ok=True)

In [86]:
# Destination folder for each split label
healthy_split_folders = {
    "Train": label_healthy_folder_train,
    "Validation": label_healthy_folder_val,
    "Test": label_healthy_folder_test,
}

# Move every healthy-class image into the folder of the split its patient belongs to
for index, row in df_filenames_healthy_merged.iterrows():
    image_name = "/" + row['filename']
    label = row['assign']    # Train, Validation, Test

    # Rows with an unknown split label are reported and skipped
    destination_folder = healthy_split_folders.get(label)
    if destination_folder is None:
        print(f"Invalid label {label} for image {image_name}. Skipping...")
        continue

    source_img_path = source_folder_healthy + image_name
    destination_img_path = destination_folder + image_name

    # Images missing from the source folder are reported, not treated as an error
    if not os.path.exists(source_img_path):
        print(f"{image_name} not found in {source_img_path}.")
        continue

    shutil.move(source_img_path, destination_img_path)

print("Image organization based on labels complete.")


Image organization based on labels complete.


# Export names of files to CSV

In [2]:
import os
import csv

# Root of the curated folder tree
folder_path = r"C:\Users\Adison\Downloads\Data\Curated OASIS"

# Full path of every file found anywhere below the root
all_files = [
    os.path.join(root, name)
    for root, _subdirs, names in os.walk(folder_path)
    for name in names
]

# Output CSV, written to the current working directory
csv_file_path = "curated_file_list.csv"

# One header row, then one row per file path
with open(csv_file_path, mode='w', newline='', encoding='utf-8') as csv_handle:
    writer = csv.writer(csv_handle)
    writer.writerow(['File Path'])
    for path in all_files:
        writer.writerow([path])

print("File list saved to:", csv_file_path)


File list saved to: curated_file_list.csv


# Delete Non-MPR1 Files

In [None]:
import pandas as pd
import os

# CSV listing the files to delete (change this to the actual CSV file path)
file_list_csv = "curated_file_list_fordelete.csv"

# Load the list and display it
file_df = pd.read_csv(filepath_or_buffer=file_list_csv)
file_df

Unnamed: 0,filepaths_fordeletion
0,C:/Users/Adison/Downloads/Data/Curated OASIS/T...
1,C:/Users/Adison/Downloads/Data/Curated OASIS/T...
2,C:/Users/Adison/Downloads/Data/Curated OASIS/T...
3,C:/Users/Adison/Downloads/Data/Curated OASIS/T...
4,C:/Users/Adison/Downloads/Data/Curated OASIS/T...
...,...
64167,C:/Users/Adison/Downloads/Data/Curated OASIS/V...
64168,C:/Users/Adison/Downloads/Data/Curated OASIS/V...
64169,C:/Users/Adison/Downloads/Data/Curated OASIS/V...
64170,C:/Users/Adison/Downloads/Data/Curated OASIS/V...


In [None]:
# Delete every file listed in the "filepaths_fordeletion" column; paths that are
# not on disk are reported instead of raising
for filepath in file_df['filepaths_fordeletion']:
    if not os.path.exists(filepath):
        print(f"File not found: {filepath}")
        continue
    os.remove(filepath)

# Export list of existing files

In [None]:
import os
import csv

# Root of the curated folder tree
folder_path = r"C:\Users\Adison\Downloads\Data\Curated OASIS"

# Full path of every file found anywhere below the root
all_files = [
    os.path.join(root, name)
    for root, _subdirs, names in os.walk(folder_path)
    for name in names
]

# Output CSV, written to the current working directory
csv_file_path = "curated_file_list_short.csv"

# One header row, then one row per file path
with open(csv_file_path, mode='w', newline='', encoding='utf-8') as csv_handle:
    writer = csv.writer(csv_handle)
    writer.writerow(['File Path'])
    for path in all_files:
        writer.writerow([path])

print("File list saved to:", csv_file_path)


File list saved to: curated_file_list_short.csv
