# Organize Participant Information

This notebook merges demographic, clinical, and pathology data for two centers (AKH and Neimeng) — read from each center's DICOM-header CSV and histology workbook — into a single `participants.tsv` file. This file is essential for subject-level analysis and for organizing the dataset according to BIDS standards.

In [29]:
import pandas as pd
import os

# Input tables (per center: a DICOM-header CSV and a histology workbook)
# and the location of the merged output file.
dicom_akh_path = '../../metadata/AKH_nifti_637_dicom_header.csv'
histo_akh_path = '../../metadata/AKH_nifti_637_histo.xlsx'
dicom_neimeng_path = '../../metadata/Neimeng_nifti_425_dicom_header.csv'
histo_neimeng_path = '../../metadata/Neimeng_nifti_425_histo.xlsx'
output_path = '../../metadata/participants.tsv'

# --- AKH ---
# Only the PID key and the Pathology label are taken from the histology
# workbook; the left join keeps every DICOM-header row even when no
# pathology entry matches its PID.
df_dicom_akh = pd.read_csv(dicom_akh_path)
df_histo_akh = pd.read_excel(histo_akh_path).loc[:, ['PID', 'Pathology']]
df_akh = (
    df_dicom_akh
    .merge(df_histo_akh, how='left', on='PID')
    .assign(center='AKH')
)

# --- Neimeng ---
# PIDs are cast to text and left-padded with zeros to at least three
# characters in BOTH tables so the merge keys share one format.
df_dicom_neimeng = pd.read_csv(dicom_neimeng_path).assign(
    PID=lambda frame: frame['PID'].astype(str).str.zfill(3)
)
# sheet_name=1 selects the second worksheet (the index is 0-based).
df_histo_neimeng = (
    pd.read_excel(histo_neimeng_path, sheet_name=1)
    .loc[:, ['PID', 'Pathology']]
    .assign(PID=lambda frame: frame['PID'].astype(str).str.zfill(3))
)
df_neimeng = (
    df_dicom_neimeng
    .merge(df_histo_neimeng, how='left', on='PID')
    .assign(center='Neimeng')
)

# --- Combine data ---
# Stack both centers and renumber the rows 0..n-1.
df_combined = pd.concat(
    [df_akh, df_neimeng],
    ignore_index=True,
)

# --- Create subject_id ---
def convert_pid_to_subject(row):
    """Build the subject identifier for one row of the combined table.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) that provides the
            'center' and 'PID' entries.

    Returns:
        'AKH' followed by the PID text with every hyphen removed when the
        center is 'AKH'; 'NM' followed by the PID text when the center is
        'Neimeng'; otherwise the PID value itself, unchanged.
    """
    site, raw_pid = row['center'], row['PID']

    if site == 'AKH':
        # Hyphens are stripped from the PID text before prefixing.
        return 'AKH' + str(raw_pid).replace('-', '')

    if site == 'Neimeng':
        return 'NM' + str(raw_pid)

    # Unknown center: hand the PID back untouched.
    return raw_pid

# Derive the subject identifier of every row from its center and PID.
df_combined['subject_id'] = df_combined.apply(convert_pid_to_subject, axis=1)

# --- Select and reorder columns ---
# Columns written to participants.tsv, in output order. Selecting by label
# below raises KeyError if any of them is missing from the merged table.
# NOTE(review): Patient_name and Study_date are written to the file, and the
# AKH PID/subject_id values in the preview embed the patient name and a date --
# confirm that the file and this notebook's rendered output are only shared
# where identifiable patient data is permitted.
final_columns = [
    'subject_id',
    'center',
    'PID',
    'Pathology',
    'SUV_ratio',
    'Patient_name',
    'Study_date',
    'ManufacturerModelName',
    'Dose',
    'post_inj_time',
    'Radiopharmaceutical',
    'Radionuclide',
    'gender',
    'age',
    'weight',
    'height',
    'BMI'
]

# Keep only the listed columns and order the rows by subject_id. A stable sort
# is requested so rows sharing a subject_id keep their original relative order,
# which makes the written file reproducible; with unique ids the result is the
# same as the default sort.
df_final = (
    df_combined[final_columns]
    .sort_values(by='subject_id', kind='mergesort')
    .reset_index(drop=True)
)

# The left merges that built df_combined repeat a row whenever a PID occurs
# more than once in a histology sheet, which would show up here as a repeated
# subject_id. Report it (count only, no identifiers) instead of letting it
# pass silently; the file is still written.
n_repeated_rows = int(df_final['subject_id'].duplicated().sum())
if n_repeated_rows:
    print(f"WARNING: {n_repeated_rows} row(s) repeat a subject_id that already appears in the table.")

# --- Save to TSV ---
# Tab-separated, no index column, floats written with 15 decimal places.
df_final.to_csv(output_path, sep='\t', index=False, float_format='%.15f')

print(f"Successfully created participants.tsv with {len(df_final)} subjects.")
df_final.head()


Successfully created participants.tsv with 1061 subjects.


Unnamed: 0,subject_id,center,PID,Pathology,SUV_ratio,Patient_name,Study_date,ManufacturerModelName,Dose,post_inj_time,Radiopharmaceutical,Radionuclide,gender,age,weight,height,BMI
0,AKHABDALLAADELAHMED20091023,AKH,ABDALLA-ADEL-AHMED20091023,SCC,0.000325,ABDALLA^ADEL-AHMED,20091023,1094,253000000,63,,F^18^[^18^Fluorine],M,50.0,55.0,1.7,19.031142
1,AKHABTBRIGITTE20160818,AKH,ABT-BRIGITTE20160818,ADC,0.000353,ABT^Brigitte,20160818,Biograph64_TruePoint,282000000,62,Fluorodeoxyglucose,^18^Fluorine,F,57.0,67.0,1.65,24.609734
2,AKHADAMEKKARIN20211124,AKH,ADAMEK-KARIN20211124,ADC,0.000534,ADAMEK^Karin,20211124,Biograph128_Vision 600 Edge,275000000,84,Fluorodeoxyglucose,^18^Fluorine,F,58.0,86.0,1.7,29.757785
3,AKHAHAMERGERHARD20240213,AKH,AHAMER-GERHARD20240213,LCNEC,0.000448,Ahamer^Gerhard,20240213,Biograph128_Vision 600 Edge,285610000,56,Fluorodeoxyglucose,^18^Fluorine,M,77.0,90.0,1.85,26.296567
4,AKHAHMEDINAZIF20230111,AKH,AHMEDI-NAZIF20230111,SCC,0.000578,AHMEDI^Nazif,20230111,Biograph128_Vision Quadra Edge,194000000,93,Fluorodeoxyglucose,^18^Fluorine,M,66.0,62.0,1.63,23.335466
