## II/ Assesments Data

#### Imports

In [4]:
import pandas as pd
import numpy as np

import os
import shutil

from sklearn.impute import SimpleImputer
from dateutil.parser import parse
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

#### load and merge data

In [None]:
csv_folder = "Dataset/CSV/Assesments"

# Find all CSV files in that folder
csv_files = [f for f in os.listdir(csv_folder) if f.endswith('.csv')]

# Columns to drop (repeated meta columns)
useless_cols = ['REC_ID', 'PAG_NAME', 'INFODT', 'ORIG_ENTRY', 'LAST_UPDATE']


dfs = []
for file in csv_files:
    df = pd.read_csv(os.path.join(csv_folder, file), dtype=str, low_memory=False)

    df.drop(columns=[col for col in useless_cols if col in df.columns], inplace=True)

    dfs.append(df)


# Merge all DataFrames on 'patient ID' and 'event ID'
merged_df = dfs[0]
for df in dfs[1:]:
    merged_df = pd.merge(merged_df, df, on=['PATNO', 'EVENT_ID'], how='inner')


# Drop columns ending with _x or _y (duplicates)
cols_to_drop = [
    col for col in merged_df.columns
    if (col.endswith('_x') or col.endswith('_y'))
]

merged_df.drop(columns=cols_to_drop, inplace=True)


print(f"Merged Shape: {merged_df.shape}\n")
print(f"Merged Columns: {merged_df.columns}\n")


Merged Shape: (10213, 162)

Merged Columns: Index(['PATNO', 'EVENT_ID', 'ESS1', 'ESS2', 'ESS3', 'ESS4', 'ESS5', 'ESS6',
       'ESS7', 'ESS8',
       ...
       'SCAU24', 'SCAU25', 'SCAU26A', 'SCAU26AT', 'SCAU26B', 'SCAU26BT',
       'SCAU26C', 'SCAU26CT', 'SCAU26D', 'SCAU26DT'],
      dtype='object', length=162)



#### head

In [271]:
merged_df.head()

Unnamed: 0,PATNO,EVENT_ID,ESS1,ESS2,ESS3,ESS4,ESS5,ESS6,ESS7,ESS8,...,SCAU24,SCAU25,SCAU26A,SCAU26AT,SCAU26B,SCAU26BT,SCAU26C,SCAU26CT,SCAU26D,SCAU26DT
0,3001,V10,1,2,1,0,2,0,0,0,...,,,0,,1,vesicare,1,lisinopril,1,"finacea,metronidazole,clelopirox dlaminde"
1,3001,V10,1,2,1,0,2,0,0,0,...,,,0,,1,vesicare,1,lisinopril,1,"finacea,metronidazole,clelopirox dlaminde"
2,3001,V12,1,2,1,0,2,0,0,0,...,,,1,"miralax,colace",1,vesicare,1,lisinopril,1,"lipitor,isradipine,lamisil,"
3,3001,V12,1,2,1,0,2,0,0,0,...,,,1,"miralax,colace",1,vesicare,1,lisinopril,1,"lipitor,isradipine,lamisil,"
4,3001,V14,1,2,0,0,2,0,1,0,...,,,1,colace,1,trospium chloride er,1,lisinopril,1,"isradipine (Raynaud's), atorvastatin (hypertip..."


#### info

In [20]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10213 entries, 0 to 10212
Columns: 162 entries, PATNO to SCAU26DT
dtypes: object(162)
memory usage: 12.6+ MB


#### describe

In [273]:
merged_df.describe(include="all")

Unnamed: 0,PATNO,EVENT_ID,ESS1,ESS2,ESS3,ESS4,ESS5,ESS6,ESS7,ESS8,...,SCAU24,SCAU25,SCAU26A,SCAU26AT,SCAU26B,SCAU26BT,SCAU26C,SCAU26CT,SCAU26D,SCAU26DT
count,10213,10213,10213,10210,10211,10213,10213,10211,10209,10200,...,5681,5674,10208,2987,10208,1886,10207,3510,10194,3708
unique,1335,23,4,4,4,4,4,4,4,4,...,5,5,2,785,2,567,2,1107,2,1850
top,3400,V04,1,1,0,0,3,0,0,0,...,9,9,0,Miralax,0,Tamsulosin,0,Losartan,0,synthroid
freq,27,1515,4366,4159,5717,4080,3817,8519,4320,8531,...,3377,3365,7183,328,8314,149,6675,148,6484,24


#### nunique

In [274]:
merged_df.nunique().sort_values()

ONOFFORDER       2
OFFEXAM          2
DBSOFFYN         2
ONEXAM           2
DBSYN            2
              ... 
SCAU26AT       785
HRPOSTMED     1051
SCAU26CT      1107
PATNO         1335
SCAU26DT      1850
Length: 162, dtype: int64

### Order By Patiend ID

In [71]:
is_sorted = merged_df['PATNO'].is_monotonic_increasing
print(f"Is PATNO sorted? {is_sorted}")

Is PATNO sorted? False


In [72]:
merged_df.sort_values(by='PATNO', inplace=True)

### HOEHN and YAHR Stage

In [73]:
print(f"Merged Shape: {merged_df.shape}\n")

print(merged_df['NHY'].value_counts(dropna=False).sort_index())

Merged Shape: (10213, 162)

NHY
0       162
1      1380
101      55
2      6760
3       734
4       152
5        62
NaN     908
Name: count, dtype: int64


In [74]:
# Drop rows where NHY is NaN or '101'
merged_df = merged_df[~(merged_df['NHY'].isna() | (merged_df['NHY'] == '101'))]

# Merge '5' and '4' into one stage
merged_df['NHY'] = merged_df['NHY'].replace('5', '4')

In [75]:
print(f"Merged Shape: {merged_df.shape}\n")

print(merged_df['NHY'].value_counts(dropna=False).sort_index())

Merged Shape: (9250, 162)

NHY
0     162
1    1380
2    6760
3     734
4     214
Name: count, dtype: int64


### Patient's Functional State

In [76]:
print(f"Merged Shape: {merged_df.shape}\n")

print(merged_df['PDSTATE'].value_counts(dropna=False))

Merged Shape: (9250, 162)

PDSTATE
ON     5153
OFF    3676
NaN     421
Name: count, dtype: int64


In [77]:
merged_df = merged_df[~(merged_df['PDSTATE'] == 'ON')]

In [78]:
print(f"Merged Shape: {merged_df.shape}\n")

print(merged_df['PDSTATE'].value_counts(dropna=False))

Merged Shape: (4097, 162)

PDSTATE
OFF    3676
NaN     421
Name: count, dtype: int64


### 1. Handle Duplicates

In [79]:
print(f"BEFORE: Number of duplicate rows: {merged_df.duplicated().sum()}")

merged_df.drop_duplicates(inplace=True)
print(f"AFTER: Number of duplicate rows: {merged_df.duplicated().sum()}")

BEFORE: Number of duplicate rows: 0
AFTER: Number of duplicate rows: 0


### 2. Drop Useless/Sparse Columns

In [80]:
missing_values = merged_df.isnull().sum()
missing_percent = (missing_values / len(merged_df)) * 100

missing_df = pd.DataFrame({
    'Missing Values': missing_values,
    'Percentage (%)': missing_percent
})

missing_df = missing_df[missing_df['Missing Values'] > 0].sort_values(by='Percentage (%)', ascending=False)
missing_df

Unnamed: 0,Missing Values,Percentage (%)
HRDBSON,4097,100.000000
ONEXAM,4097,100.000000
ONNORSN,4097,100.000000
DBSOFFYN,4094,99.926776
OFFNORSN,4094,99.926776
...,...,...
SCAU3,1,0.024408
SCAU1,1,0.024408
SCAU4,1,0.024408
SCAU26B,1,0.024408


In [81]:
useless_cols = [
    'DBSOFFTM',
    'DBSONTM',
    'PTCGBOTH',
    'PDMEDDT',       # Date of medication start
    'PDMEDTM',       # Time of medication start
    'EXAMTM',        # Time of motor exam
]
print(f"Number of usless columns: {len(useless_cols)}")

sparse_cols = missing_df[missing_df['Percentage (%)'] > 55].index.tolist()
print(f"Number of sparse columns: {len(sparse_cols)}")

merged_df.drop(columns=sparse_cols + useless_cols, inplace=True)
print(f"New Shape: {merged_df.shape}\n")

Number of usless columns: 6
Number of sparse columns: 27
New Shape: (4097, 131)



### 3. Impute missing values

In [82]:
# Separate numerical and categorical
numerical_cols = merged_df.select_dtypes(include='number').columns
categorical_cols = merged_df.select_dtypes(include='object').columns
print(f"Number of numerical columns: {len(numerical_cols)}")
print(f"Number of categorical columns: {len(categorical_cols)}\n")

# Print missing values before imputation
missing_before = merged_df[categorical_cols].isna().sum().sum()
print(f"Missing values in categorical columns before imputation: {missing_before}")

# Imputers
categorical_imputer = SimpleImputer(strategy='most_frequent')

# Apply
merged_df[categorical_cols] = categorical_imputer.fit_transform(merged_df[categorical_cols])

# Print missing values after imputation
missing_after = merged_df[categorical_cols].isna().sum().sum()
print(f"Missing values in categorical columns after imputation: {missing_after}")

Number of numerical columns: 0
Number of categorical columns: 131

Missing values in categorical columns before imputation: 8958
Missing values in categorical columns after imputation: 0


### 4. Handle date columns

In [83]:
parsed_date_columns = []

for col in merged_df.columns:
    if merged_df[col].dtype == 'object':
        try:
            merged_df[col] = pd.to_datetime(merged_df[col], format='%Y-%m', errors='raise')
            parsed_date_columns.append(col)
        except:
            try:
                # Fallback to dateutil
                merged_df[col] = merged_df[col].apply(parse)
                parsed_date_columns.append(col)
            except:
                continue

print(f"Successfully parsed {len(parsed_date_columns)} columns as datetime:")
print(parsed_date_columns)


Successfully parsed 1 columns as datetime:
['EXAMDT']


In [84]:
# Find all datetime columns
datetime_cols = merged_df.select_dtypes(include=['datetime64[ns]', 'datetime64[ns, UTC]']).columns
print(f"Number of datetime columns: {len(datetime_cols)}\n")


# Create new columns in a separate DataFrame
new_cols = {}

for col in datetime_cols:
    new_cols[f'{col}_YEAR'] = merged_df[col].dt.year
    new_cols[f'{col}_MONTH'] = merged_df[col].dt.month

# Concatenate new columns all at once
merged_df = pd.concat([merged_df, pd.DataFrame(new_cols, index=merged_df.index)], axis=1)

# Drop the original datetime columns
merged_df.drop(columns=datetime_cols, inplace=True)


Number of datetime columns: 1



### 5. Encode categorical columns

In [85]:
# Columns to encode
object_cols = merged_df.select_dtypes(include='object').columns.drop(['PATNO', 'EVENT_ID', 'NHY'], errors='ignore')

print(f"Number of object columns to encode: {len(object_cols)}")

# Fit and transform with OrdinalEncoder
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
merged_df[object_cols] = encoder.fit_transform(merged_df[object_cols].astype(str))

merged_df.head()

Number of object columns to encode: 127


Unnamed: 0,PATNO,EVENT_ID,ESS1,ESS2,ESS3,ESS4,ESS5,ESS6,ESS7,ESS8,...,SCAU23,SCAU23A,SCAU24,SCAU25,SCAU26A,SCAU26B,SCAU26C,SCAU26D,EXAMDT_YEAR,EXAMDT_MONTH
8384,100005,V04,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,4.0,4.0,0.0,0.0,0.0,0.0,2022,2
8385,100005,V08,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,2.0,1.0,4.0,4.0,0.0,0.0,0.0,0.0,2024,5
8387,100006,V04,0.0,1.0,0.0,1.0,2.0,0.0,0.0,0.0,...,4.0,0.0,4.0,4.0,0.0,0.0,0.0,1.0,2021,12
8390,100006,V06,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2023,3
8392,100006,V10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,4.0,4.0,0.0,0.0,1.0,0.0,2025,1


### 6. Normalize

In [86]:
# Identify all numeric columns
num_cols = merged_df.select_dtypes(include=['float64', 'int64', 'int32']).columns.tolist()
# Exclude PATNO and NHY from normalization
num_cols = [col for col in num_cols if col not in ['PATNO', 'EVENT_ID', 'NHY']]
print(f"Number of numerical columns to normalize: {len(num_cols)}")

# Fit and transform the data
merged_df[num_cols] = StandardScaler().fit_transform(merged_df[num_cols])

merged_df.head()


Number of numerical columns to normalize: 129


Unnamed: 0,PATNO,EVENT_ID,ESS1,ESS2,ESS3,ESS4,ESS5,ESS6,ESS7,ESS8,...,SCAU23,SCAU23A,SCAU24,SCAU25,SCAU26A,SCAU26B,SCAU26C,SCAU26D,EXAMDT_YEAR,EXAMDT_MONTH
8384,100005,V04,-1.315982,-1.466021,-0.735445,0.087481,-0.93008,-0.387542,-0.929043,-0.375284,...,-0.061244,-0.351268,0.503311,0.508357,-0.635266,-0.474145,-0.722829,-0.748047,0.824657,-1.291969
8385,100005,V08,-1.315982,-1.466021,-0.735445,-0.963577,-0.93008,-0.387542,-0.929043,-0.375284,...,0.588803,2.846831,0.503311,0.508357,-0.635266,-0.474145,-0.722829,-0.748047,1.36167,-0.39689
8387,100006,V04,-1.315982,-0.380874,-0.735445,0.087481,0.076135,-0.387542,-0.929043,-0.375284,...,1.888897,-0.351268,0.503311,0.508357,-0.635266,-0.474145,-0.722829,1.336815,0.55615,1.691627
8390,100006,V06,0.826509,-1.466021,-0.735445,-0.963577,-0.93008,-0.387542,-0.929043,-0.375284,...,1.888897,-0.351268,-2.335045,-2.340806,-0.635266,-0.474145,1.383452,-0.748047,1.093163,-0.993609
8392,100006,V10,-1.315982,-1.466021,-0.735445,-0.963577,-1.936295,-0.387542,-0.929043,-0.375284,...,1.888897,-0.351268,0.503311,0.508357,-0.635266,-0.474145,1.383452,-0.748047,1.630176,-1.590328


### 7. compute correlation

In [87]:
# Compute correlation matrix (absolute values)
corr_matrix = merged_df.corr(numeric_only=True).abs()

# Create a boolean mask for the upper triangle (to avoid double-checking pairs)
upper_tri = np.triu(corr_matrix, k=1)

# Find column pairs with correlation > 0.5
to_drop = [column for column in corr_matrix.columns if any(upper_tri[:, corr_matrix.columns.get_loc(column)] > 0.5)]

print(f"Original shape: {merged_df.shape}")

# Drop those columns from the dataframe
merged_df.drop(columns=to_drop, inplace=True)

print(f"Reduced shape: {merged_df.shape}")

print("Dropped columns due to high correlation:")
print(to_drop)


Original shape: (4097, 132)
Reduced shape: (4097, 84)
Dropped columns due to high correlation:
['ESS2', 'ESS7', 'NP1ANXS', 'NP1RTOT', 'PDMEDYN', 'NP3FACXP', 'NP3RIGRL', 'NP3RIGLL', 'NP3FTAPL', 'NP3HMOVR', 'NP3HMOVL', 'NP3PRSPR', 'NP3PRSPL', 'NP3TTAPR', 'NP3TTAPL', 'NP3LGAGR', 'NP3LGAGL', 'NP3GAIT', 'NP3PSTBL', 'NP3POSTR', 'NP3BRADY', 'NP3KTRML', 'NP3RTARU', 'NP3RTCON', 'NP4DYSKI', 'NP4FLCTI', 'NP4FLCTX', 'NP4TOT', 'NP2DRES', 'NP2HYGN', 'NP2HOBB', 'NP2TURN', 'NP2TRMR', 'NP2RISE', 'NP2WALK', 'NP2FREZ', 'DRMFIGHT', 'DRMUMV', 'SCAU1', 'SCAU2', 'SCAU3', 'SCAU6', 'SCAU9', 'SCAU11', 'SCAU15', 'SCAU18', 'SCAU23', 'SCAU25']


### 8. Export to a csv file

In [88]:
# Move 'NHY' to the end
nhy_col = merged_df.pop('NHY')
merged_df['NHY'] = nhy_col

In [None]:
output_path = "Dataset/CSV/assesments_data.csv"
merged_df.to_csv(output_path, index=False)

In [90]:
print(merged_df.shape)
print(merged_df.columns)

(4097, 84)
Index(['PATNO', 'EVENT_ID', 'ESS1', 'ESS3', 'ESS4', 'ESS5', 'ESS6', 'ESS8',
       'NP1COG', 'NP1HALL', 'NP1DPRS', 'NP1APAT', 'NP1DDS', 'PDTRTMNT',
       'PDSTATE', 'HRPOSTMED', 'DBSYN', 'NP3SPCH', 'NP3RIGN', 'NP3RIGRU',
       'NP3RIGLU', 'NP3FTAPR', 'NP3RISNG', 'NP3FRZGT', 'NP3PTRMR', 'NP3PTRML',
       'NP3KTRMR', 'NP3RTALU', 'NP3RTARL', 'NP3RTALL', 'NP3RTALJ', 'NP3TOT',
       'DYSKPRES', 'NP4WDYSK', 'NP4OFF', 'NP4DYSTN', 'NP2SPCH', 'NP2SALV',
       'NP2SWAL', 'NP2EAT', 'NP2HWRT', 'NP2PTOT', 'DRMVIVID', 'DRMAGRAC',
       'DRMNOCTB', 'SLPLMBMV', 'SLPINJUR', 'DRMVERBL', 'DRMOBJFL', 'MVAWAKEN',
       'DRMREMEM', 'SLPDSTRB', 'STROKE', 'HETRA', 'PARKISM', 'RLS', 'NARCLPSY',
       'DEPRS', 'EPILEPSY', 'BRNINFM', 'CNSOTH', 'SCAU4', 'SCAU5', 'SCAU7',
       'SCAU8', 'SCAU10', 'SCAU12', 'SCAU13', 'SCAU14', 'SCAU16', 'SCAU17',
       'SCAU19', 'SCAU20', 'SCAU21', 'SCAU22', 'SCAU23A', 'SCAU24', 'SCAU26A',
       'SCAU26B', 'SCAU26C', 'SCAU26D', 'EXAMDT_YEAR', 'EXAMDT_MONTH', '

## III/ Merge Both Data Types

In [None]:
# Load both CSV files
assessments_df = pd.read_csv("Dataset/CSV/assesments_data.csv")
DaTscan_df = pd.read_csv("Dataset/Images/DaTscan_dataset.csv")


# Remove corrupted image entries
corrupted_entries = [
    # failed in Conversion
    {"Image Data ID": "I770243",  "Subject": 3107},
    {"Image Data ID": "I1317564", "Subject": 3179},
    # failed in Denoising
    {"Image Data ID": "I1317554", "Subject": 3372},
    {"Image Data ID": "I449046",  "Subject": 4092},
    {"Image Data ID": "I1273880",  "Subject": 50028},
    # failed in cropping
    {"Image Data ID": "I1317553",  "Subject": 3078},
    {"Image Data ID": "I1317557",  "Subject": 3378}
]

for entry in corrupted_entries:
    DaTscan_df = DaTscan_df[~(
        (DaTscan_df["Image Data ID"] == entry["Image Data ID"]) &
        (DaTscan_df["Subject"] == entry["Subject"])
    )]


# Rename columns in DaTscan to match those in assessments
DaTscan_df = DaTscan_df.rename(columns={"Subject": "PATNO", "Visit": "EVENT_ID"})


# Merge on both keys, keeping only matches in both dataframes
merged_df = pd.merge(
    assessments_df,
    DaTscan_df[["PATNO", "EVENT_ID", "Image Data ID"]],
    on=["PATNO", "EVENT_ID"],
    how="inner"
)


# Move "Image Data ID" to be 3rd column
if "Image Data ID" in merged_df.columns:
    cols = merged_df.columns.tolist()
    cols.remove("Image Data ID")
    cols.insert(2, "Image Data ID")
    merged_df = merged_df[cols]


# Save the merged dataset
merged_df.to_csv("Dataset/CSV/assesments_data.csv", index=False)

print(f"Original dataset shape: {(4097, 84)}")
print(f"Merged dataset shape: {merged_df.shape}") # (986, 85)

Original dataset shape: (4097, 84)
Merged dataset shape: (986, 85)


In [None]:
# Paths
merged_csv_path = "Dataset/CSV/assesments_data.csv"
images_dir = "Dataset/Images/Cropped_NIfTI"

# Load merged dataset
merged_df = pd.read_csv(merged_csv_path)


# Create mapping: patient -> set of allowed image IDs
patient_to_images = (
    merged_df.groupby("PATNO")["Image Data ID"]
    .apply(lambda x: set(x.astype(str))) # convert image_id to string for comparaison
    .to_dict()
)

removed_patients_count = 0
removed_images_count = 0

# Go through each patient folder
for patient_folder in os.listdir(images_dir):
    patient_path = os.path.join(images_dir, patient_folder)
    if not os.path.isdir(patient_path):
        continue  # Skip if not a folder

    patient_id = str(patient_folder)

    # Case 1: Patient not in CSV → delete whole folder
    if patient_id not in map(str, patient_to_images.keys()):
        shutil.rmtree(patient_path)
        removed_patients_count += 1
        print(f"🗑 Deleted entire folder for patient {patient_folder}")
        continue
    
    # Case 2: Patient exists → check their images
    allowed_images = patient_to_images[int(patient_id)]

    for file_name in os.listdir(patient_path):
        if not file_name.endswith(".nii.gz"):
            continue

        image_id = file_name.replace(".nii.gz", "") # remove .nii.gz

        if image_id not in allowed_images:
            os.remove(os.path.join(patient_path, file_name))
            removed_images_count += 1
            print(f"🗑 Removed image {image_id} from patient {patient_folder}")


# Rename folder after cleanup
os.rename(images_dir, "Dataset/Processed_NIfTI")

# ==== FINAL SUMMARY ====
print("\n" + "="*60)
print(f"✅ Cropped images folder synced with merged dataset")
print(f"   Deleted {removed_patients_count} entire patient folders")
print(f"   Deleted {removed_images_count} individual images")


🗑 Deleted entire folder for patient 100001
🗑 Deleted entire folder for patient 100002
🗑 Deleted entire folder for patient 100004
🗑 Removed image I1461543 from patient 100005
🗑 Removed image I1461544 from patient 100006
🗑 Removed image I1461545 from patient 100007
🗑 Removed image I1461546 from patient 100012
🗑 Removed image I1474775 from patient 100017
🗑 Removed image I1619847 from patient 100017
🗑 Removed image I1461547 from patient 100018
🗑 Removed image I10380475 from patient 100267
🗑 Removed image I1461548 from patient 100267
🗑 Removed image I1461549 from patient 100268
🗑 Removed image I1616053 from patient 100268
🗑 Deleted entire folder for patient 100639
🗑 Deleted entire folder for patient 100738
🗑 Removed image I1461551 from patient 100842
🗑 Removed image I1573147 from patient 100842
🗑 Removed image I10380489 from patient 100878
🗑 Removed image I1461552 from patient 100878
🗑 Removed image I1461553 from patient 100889
🗑 Removed image I1582688 from patient 100889
🗑 Deleted entire f

### Verification

In [None]:
# Paths
csv_path = "Dataset/CSV/assesments_data.csv"
images_root = "Dataset/Images/Processed_NIfTI"

# --- Get patient IDs from CSV ---
df = pd.read_csv(csv_path)
csv_patients = set(df['PATNO'].astype(str).unique())

# --- Get patient IDs from image folders ---
folder_patients = set(os.listdir(images_root))

# --- Compare ---
only_in_csv = csv_patients - folder_patients
only_in_folder = folder_patients - csv_patients
both = csv_patients & folder_patients

print("Patients only in CSV:", len(only_in_csv))
print("Patients only in Folder:", len(only_in_folder))

if only_in_csv:
    print("Missing in images:", only_in_csv)
if only_in_folder:
    print("Missing in CSV:", only_in_folder)

# --- Count images for patients in both ---
total_images = 0
for patient_id in both:
    patient_folder = os.path.join(images_root, patient_id)
    total_images += len([f for f in os.listdir(patient_folder) if f.endswith(".nii") or f.endswith(".nii.gz")])


print("\nSummary:")
print(f"Total patients in CSV: {len(csv_patients)}")
print(f"Total patients in Folder: {len(folder_patients)}")
print(f"Total patients in both: {len(both)}")


Patients only in CSV: 0
Patients only in Folder: 0

Summary:
Total patients in CSV: 642
Total patients in Folder: 642
Total patients in both: 642


In [None]:
# Paths
csv_path = "Dataset/CSV/assesments_data.csv"
images_root = "Dataset/Images/Processed_NIfTI"

# --- Get image IDs from CSV ---
df = pd.read_csv(csv_path)
csv_images = set(df['Image Data ID'].astype(str).unique())

# --- Get image IDs from folder ---
folder_images = set()
for patient_folder in os.listdir(images_root):
    patient_path = os.path.join(images_root, patient_folder)
    if os.path.isdir(patient_path):
        for file_name in os.listdir(patient_path):
            if file_name.endswith(".nii") or file_name.endswith(".nii.gz"):
                image_id = os.path.splitext(file_name)[0]
                if image_id.endswith(".nii"):  # Handles .nii.gz double extension
                    image_id = os.path.splitext(image_id)[0]
                folder_images.add(image_id)

# --- Compare ---
only_in_csv = csv_images - folder_images
only_in_folder = folder_images - csv_images
both = csv_images & folder_images

print("Images only in CSV:", len(only_in_csv))
print("Images only in Folder:", len(only_in_folder))

if only_in_csv:
    print("Missing in folder:", only_in_csv)
if only_in_folder:
    print("Missing in CSV:", only_in_folder)


print("\nSummary:")
print(f"Total images in CSV: {len(csv_images)}")
print(f"Total images in Folder: {len(folder_images)}")
print(f"Total images in both: {len(both)}")


Images only in CSV: 0
Images only in Folder: 0

Summary:
Total images in CSV: 986
Total images in Folder: 986
Total images in both: 986
