# Data preprocessing
This notebook contains the code to preprocess the various datasets for the age prediction task.

## ABIDE

In [None]:
import pandas as pd
import requests
from io import StringIO
from tqdm import tqdm
import numpy as np

# Phenotypic table; its FILE_ID column drives which thickness files are fetched.
df = pd.read_csv("data/datasets/abide/Phenotypic_V1_0b_preprocessed1.csv")

# Base URL of the per-subject ROI thickness text files
base_url = "https://s3.amazonaws.com/fcp-indi/data/Projects/ABIDE_Initiative/Outputs/ants/roi_thickness/"

# Lists to collect one row of thickness values and one FILE_ID per successful download
data = []
index = []

# Raw text of the first file that was processed successfully. The column
# names are parsed from it after the loop, so no second HTTP request
# (previously issued without timeout or status check) is needed.
header_text = None

# Download and parse each file in memory
for file_id in tqdm(df["FILE_ID"].unique(), desc="Downloading and parsing thickness"):
    url = f"{base_url}{file_id}_roi_thickness.txt"
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        # Skip the first line (the header) and read the remainder without column names
        file_buffer = StringIO(response.text)
        file_df = pd.read_csv(file_buffer, sep="\t", skiprows=1, header=None)

        # First data row, thickness values start at column 2
        thickness_values = file_df.iloc[0, 2:].astype(float)
        data.append(thickness_values.values)
        index.append(file_id)

        if header_text is None:
            header_text = response.text

    except Exception as e:
        # Best effort: a subject whose file is missing or malformed is reported and skipped
        print(f"Failed to process {file_id}: {e}")

# Build the final DataFrame (one row per successfully processed FILE_ID)
thickness_df = pd.DataFrame(data, index=index)

# Set column names from the header of the first successfully processed file
try:
    if header_text is None:
        raise ValueError("no thickness file was downloaded successfully")
    header = pd.read_csv(StringIO(header_text), sep="\t", nrows=0).columns[2:]
    thickness_df.columns = header
except Exception as e:
    print(f"Could not extract column names: {e}")

# Result: thickness_df contains one row per subject, one column per region
print(thickness_df.shape)
print(thickness_df.head())


In [None]:
# Persist the complete, unfiltered thickness table first.
thickness_df.to_csv("data/datasets/abide/full_cortical_thickness.csv")

# Keep only the columns named 'Mean_[number]' whose number is above 1000.
# Label mapping reference: https://mindboggle.readthedocs.io/en/latest/labels.html
# (only those 31 regions on each of the 2 sides are of interest)
selected_columns = []
for column_name in thickness_df.columns:
    if not column_name.startswith("Mean_"):
        continue
    label_number = int(column_name.split("_")[1])
    if label_number > 1000:
        selected_columns.append(column_name)

filtered_df = thickness_df.loc[:, selected_columns]

# Report the shape of the frame and of its float array form
print(f"Filtered DataFrame shape: {filtered_df.shape}")
filtered_values = filtered_df.to_numpy().astype(float)
print(filtered_values.shape)


In [145]:
# Index the phenotypic table by FILE_ID so it lines up with the thickness frame's index
df_indexed = df.set_index("FILE_ID")

# Inner join on the index: only subjects present in both tables are kept,
# and AGE_AT_SCAN is appended as the last column
age_column = df_indexed["AGE_AT_SCAN"]
final_df = filtered_df.join(age_column, how="inner")

final_df.to_csv("data/datasets/abide/cortical_thickness_age.csv")

# Split the joined table into features (all but the last column) and label (last column)
final_values = final_df.to_numpy()
y = final_values[:, -1].astype(float)
X = final_values[:, :-1].astype(float)

np.save('data/datasets/abide/X_age.npy', X)
np.save('data/datasets/abide/y_age.npy', y)

## ADNI

In [None]:
datadic_file = "ADSP_PHC_T1_FS_DATADIC_10Jun2025.csv"
data_file = "ADSP_PHC_T1_FS_10Jun2025.csv"

# Single root for every ADNI read and write in this cell. Previously the data
# file was read once from "data/..." and again from "../data/...", and the
# outputs were written under "../data/...", so the cell could not run from
# any one working directory. "data/..." matches the other cells of the notebook.
adni_dir = "data/datasets/adni"

phase_to_select = 'ADNI2'
label_column = "PHC_Age_T1"

# Load the data dictionary and the dataset (each read exactly once).
# NOTE: this rebinds `df`, which the ABIDE cells also use for their phenotypic table.
df_dic = pd.read_csv(f"{adni_dir}/{datadic_file}")
df = pd.read_csv(f"{adni_dir}/{data_file}")

# Filter current phase
df_single_phase = df[df['PHASE'] == phase_to_select].copy()
# Keep only one row per patient corresponding to the earliest scan date
# NOTE(review): idxmin compares PHC_SCANDATE as loaded from the CSV; this presumably
# relies on the column sorting chronologically (e.g. ISO dates) - confirm.
df_single_phase = df_single_phase.loc[df_single_phase.groupby('PTID')['PHC_SCANDATE'].idxmin()]

# remove rows with NaN in the label column
df_single_phase = df_single_phase.dropna(subset=[label_column])

# keep columns that end with '_thickness_combat' and remove the mean per hemisphere
all_columns = df.columns.tolist()
cortical_thickness_columns = [col for col in all_columns if col.endswith('_thickness_combat') and "MeanThickness" not in col]

# Define features and labels (rows of X and entries of y stay aligned)
X = df_single_phase[cortical_thickness_columns].to_numpy(dtype=float)
y = df_single_phase[label_column].to_numpy(dtype=float)

np.save(f"{adni_dir}/X_{phase_to_select.lower()}_age.npy", X)
np.save(f"{adni_dir}/y_{phase_to_select.lower()}_age.npy", y)
