# Organize Participant Information

This notebook merges demographic, clinical, and pathology data from multiple files to create a single `participants.tsv` file. This file is essential for subject-level analysis and for organizing the dataset according to BIDS standards.

In [None]:
import pandas as pd
import os

# --- Input / output locations (relative to the notebook's working directory) ---
project_root = '../..'
dicom_akh_path = f'{project_root}/metadata/AKH_nifti_637_dicom_header.csv'
histo_akh_path = f'{project_root}/metadata/AKH_nifti_637_histo.xlsx'
dicom_neimeng_path = f'{project_root}/metadata/Neimeng_nifti_425_dicom_header.csv'
histo_neimeng_path = f'{project_root}/metadata/Neimeng_nifti_425_histo.xlsx'
output_path = f'{project_root}/metadata/participants.tsv'

# --- Read the raw tables ---
df_dicom_akh = pd.read_csv(dicom_akh_path)
df_histo_akh = pd.read_excel(histo_akh_path)

# Neimeng PIDs are cast to text and left-padded with zeros to width 3 in
# BOTH tables, so the DICOM and histology join keys have the same format.
df_dicom_neimeng = pd.read_csv(dicom_neimeng_path)
df_dicom_neimeng = df_dicom_neimeng.assign(
    PID=df_dicom_neimeng['PID'].astype(str).str.zfill(3)
)

# sheet_name=1 selects the second worksheet (sheet positions are 0-based).
df_histo_neimeng = pd.read_excel(histo_neimeng_path, sheet_name=1)
df_histo_neimeng = df_histo_neimeng.assign(
    PID=df_histo_neimeng['PID'].astype(str).str.zfill(3)
)

# --- AKH: attach pathology to every DICOM-header row ---
# Only the join key and the pathology label are taken from the histology table.
# A left join keeps every DICOM row; rows without a histology match get NaN.
df_histo_akh = df_histo_akh.loc[:, ['PID', 'Pathology']]
df_akh = (
    df_dicom_akh
    .merge(df_histo_akh, on='PID', how='left')
    .assign(center='AKH')
)

# --- Neimeng: same join, tagged with its own center label ---
df_histo_neimeng = df_histo_neimeng.loc[:, ['PID', 'Pathology']]
df_neimeng = (
    df_dicom_neimeng
    .merge(df_histo_neimeng, on='PID', how='left')
    .assign(center='Neimeng')
)

# --- Stack both centers into one table with a fresh 0..n-1 index ---
df_combined = pd.concat((df_akh, df_neimeng), ignore_index=True)

# --- Create subject_id ---
def convert_pid_to_subject(row):
    """Build the subject identifier for one row of the combined table.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide the keys 'center' and 'PID'.

    Returns
    -------
    str
        'AKH' followed by the PID with all hyphens removed, when the center
        is 'AKH'; 'NM' followed by the PID, when the center is 'Neimeng'.
    object
        The PID itself, unchanged, for any other center value.
    """
    site = row['center']
    raw_pid = row['PID']

    if site == 'AKH':
        # str() guards against numeric PIDs; hyphens are dropped from the ID.
        return 'AKH' + str(raw_pid).replace('-', '')
    if site == 'Neimeng':
        return 'NM' + str(raw_pid)

    # Unknown center: hand the PID back untouched.
    return raw_pid

# Derive one subject identifier per row from that row's center and PID.
df_combined['subject_id'] = df_combined.apply(convert_pid_to_subject, axis=1)

# --- Column selection ---
# Columns written to the TSV, in output order. Indexing with this list raises
# KeyError if any of them is missing from the combined table.
# NOTE(review): 'Patient_name' is exported as-is — confirm that identifying
# fields are meant to be part of participants.tsv before sharing the file.
final_columns = [
    # identifiers
    'subject_id', 'center', 'PID',
    # clinical / pathology
    'Pathology', 'SUV_ratio',
    # DICOM header fields
    'Patient_name', 'Study_date', 'ManufacturerModelName',
    'Dose', 'post_inj_time', 'Radiopharmaceutical', 'Radionuclide',
    # demographics
    'gender', 'age', 'weight', 'height', 'BMI',
]

# Keep only those columns, order rows by subject_id, and renumber the index.
df_final = (
    df_combined[final_columns]
    .sort_values(by='subject_id')
    .reset_index(drop=True)
)

# --- Write the table ---
# Tab-separated, without the index column; floats are written with 15 decimals.
df_final.to_csv(output_path, sep='\t', index=False, float_format='%.15f')

# The reported number is the row count of the final table.
print(f"Successfully created participants.tsv with {len(df_final)} subjects.")
df_final.head()


# Statistical Analysis of Participants

This section performs a statistical analysis on the generated `participants.tsv` file. It focuses on patients diagnosed with Adenocarcinoma (ADC) and Squamous Cell Carcinoma (SCC), providing a breakdown of key metrics for each medical center.

The analysis includes:
- **Pathology Distribution**: Counts of ADC and SCC cases.
- **Gender Distribution**: Counts of male and female patients.
- **Age Statistics**: Mean and standard deviation of patient ages.
- **Weight Statistics**: Mean and standard deviation of patient weights.

The data is loaded directly from the `participants.tsv` file to ensure the analysis is independent of the data generation process above.


In [8]:

import pandas as pd
import os

# --- Verification Step ---
# Cross-checks the subject IDs listed in 'participants.tsv' against the
# 'sub-<id>' folders found in the NIfTI directory, and reports any ID that
# appears on only one side.
# NOTE(review): the printed IDs can embed patient names (see this cell's saved
# output) — consider clearing the output before sharing the notebook.

# 1. Get subject IDs from the 'participants.tsv' file
participants_path = '../../metadata/participants.tsv'
df_participants = pd.read_csv(participants_path, sep='\t')
tsv_subjects = set(df_participants['subject_id'])

# 2. Get subject IDs from the 'data/0_nifti' directory
nifti_dir = '../../data/0_nifti'
subject_prefix = 'sub-'
# Keep only 'sub-*' directories and strip the LEADING prefix only
# (str.replace would also remove any later 'sub-' inside the ID).
dir_subjects = {
    entry[len(subject_prefix):]
    for entry in os.listdir(nifti_dir)
    if entry.startswith(subject_prefix)
    and os.path.isdir(os.path.join(nifti_dir, entry))
}

# 3. Find the differences
tsv_only = tsv_subjects - dir_subjects
dir_only = dir_subjects - tsv_subjects

# 4. Report the findings
# "\n" is a real newline here; the previous "\\n" printed a literal backslash-n.
print("--- Data Consistency Check ---")

if tsv_only:
    print(f"\nFound {len(tsv_only)} subjects in 'participants.tsv' that are NOT in the '0_nifti' directory.")
    # More than 10 mismatches: show only the first 5 (sorted) as examples
    if len(tsv_only) > 10:
        print("Examples:", sorted(tsv_only)[:5])
    else:
        print(sorted(tsv_only))
else:
    print("\nAll subjects in 'participants.tsv' are present in the '0_nifti' directory.")

if dir_only:
    print(f"\nFound {len(dir_only)} subjects in the '0_nifti' directory that are NOT in 'participants.tsv'.")
    print("List:", sorted(dir_only))
else:
    print("\nAll subjects in the '0_nifti' directory are present in 'participants.tsv'.")

if not tsv_only and not dir_only:
    print("\nPerfect match! The participant list and data directory are consistent.")
else:
    # Suggestion for the user
    print("\n\nSuggestion: Review the data generation script in Cell 3 to understand why these discrepancies exist.")
    print("The subject IDs 'sub-AKH...' seem to have issues during the 'convert_pid_to_subject' conversion.")

print("\n" + "="*40 + "\n")


--- Data Consistency Check ---
\nFound 28 subjects in 'participants.tsv' that are NOT in the '0_nifti' directory.
Examples: ['AKHAHAMERGERHARD20240213', 'AKHBELLINILAURADR20221020', 'AKHCZERNYJOHANN20190801', 'AKHERISTAVIDAVID20091022', 'AKHGEBAUERGABRIELE20100929']
\nAll subjects in the '0_nifti' directory are present in 'participants.tsv'.
\n\nSuggestion: Review the data generation script in Cell 3 to understand why these discrepancies exist.
The subject IDs 'sub-AKH...' seem to have issues during the 'convert_pid_to_subject' conversion.


# Verify Data Consistency

This section checks for consistency between the `participants.tsv` file and the subject directories in `data/0_nifti`. It identifies subjects that are present in one but not the other, which is crucial for ensuring data integrity.


In [9]:

import pandas as pd
import numpy as np

# Per-center summary of ADC/SCC participants, read back from the saved TSV.
# NOTE(review): rows are not de-duplicated by subject_id here; if the TSV holds
# more than one row per subject, all counts and means are per row — confirm.

# Load the dataset directly from the .tsv file
file_path = '../../metadata/participants.tsv'
df = pd.read_csv(file_path, sep='\t')

# Filter for ADC and SCC pathologies (.copy() so the conversions below do not
# write into a view of df)
df_filtered = df[df['Pathology'].isin(['ADC', 'SCC'])].copy()

# Convert 'age' and 'weight' to numeric, coercing errors to NaN
# (mean()/std() below skip NaN values)
df_filtered['age'] = pd.to_numeric(df_filtered['age'], errors='coerce')
df_filtered['weight'] = pd.to_numeric(df_filtered['weight'], errors='coerce')

# Get unique centers
centers = df_filtered['center'].unique()

# "\n" is a real newline below; the previous "\\n" printed a literal backslash-n.
for center in centers:
    print(f"--- Statistics for Center: {center} ---")
    df_center = df_filtered[df_filtered['center'] == center]

    # 1./2. Categorical distributions (value_counts() omits missing values)
    pathology_counts = df_center['Pathology'].value_counts()
    gender_counts = df_center['gender'].value_counts()
    for title, counts in (("Pathology", pathology_counts), ("Gender", gender_counts)):
        print(f"**{title} Distribution:**")
        for label, count in counts.items():
            print(f"- {label}: {count}")
        print("\n")

    # 3. Age statistics
    age_mean = df_center['age'].mean()
    age_std = df_center['age'].std()
    print("**Age Statistics:**")
    print(f"- Mean: {age_mean:.2f}")
    print(f"- Std Dev: {age_std:.2f}")
    print("\n")

    # 4. Weight statistics
    weight_mean = df_center['weight'].mean()
    weight_std = df_center['weight'].std()
    print("**Weight Statistics:**")
    print(f"- Mean: {weight_mean:.2f}")
    print(f"- Std Dev: {weight_std:.2f}")
    print("\n" + "="*40 + "\n")


--- Statistics for Center: AKH ---
**Pathology Distribution:**
- ADC: 341
- SCC: 199
\n
**Gender Distribution:**
- M: 288
- F: 252
\n
**Age Statistics:**
- Mean: 65.60
- Std Dev: 10.05
\n
**Weight Statistics:**
- Mean: 73.62
- Std Dev: 17.20
--- Statistics for Center: Neimeng ---
**Pathology Distribution:**
- ADC: 239
- SCC: 135
\n
**Gender Distribution:**
- M: 268
- F: 106
\n
**Age Statistics:**
- Mean: 63.28
- Std Dev: 9.16
\n
**Weight Statistics:**
- Mean: 64.82
- Std Dev: 11.02
