In [None]:
import os
import random
import pandas as pd
from sklearn.model_selection import train_test_split

# Old Split (image-level random train/val split)

In [None]:
# Paths
all_images_dir = "/mnt/datassd0/chest-xray/data/images/all_images"
output_dir = "/mnt/datassd0/chest-xray/data/train_val_test"
os.makedirs(output_dir, exist_ok=True)

# Output folders to optionally save images in splits
# (disabled: only the list files are produced)

# train_dir = os.path.join(output_dir, "train")
# val_dir = os.path.join(output_dir, "val")
# test_dir = os.path.join(output_dir, "test")

# for d in [train_dir, val_dir, test_dir]:
#     os.makedirs(d, exist_ok=True)

# Read file lists.
# Iterate the file handle directly (no need to materialise readlines()) and
# drop blank lines, so a stray empty line in a list file (e.g. an extra empty
# line at the end) never becomes an empty "filename" in one of the splits.
with open("/mnt/datassd0/chest-xray/data/test_list.txt", "r") as f:
    test_files = [name for name in (line.strip() for line in f) if name]

with open("/mnt/datassd0/chest-xray/data/train_val_list.txt", "r") as f:
    train_val_files = [name for name in (line.strip() for line in f) if name]

# Determine split ratio.
# Validation takes 10 parts for every 70 parts of training, i.e. 12.5% of the
# train/val pool. Presumably this targets a 70/10/20 overall split with the
# test list as the remaining 20% -- TODO confirm against the list sizes.
val_ratio_from_remaining = 10 / (70 + 10)
val_size = int(len(train_val_files) * val_ratio_from_remaining)

# Shuffle + split (fixed seed so the split is reproducible).
# NOTE: this shuffles individual image names; nothing here groups images by
# patient.
random.seed(42)
random.shuffle(train_val_files)
val_files = train_val_files[:val_size]
train_files = train_val_files[val_size:]

# Function to copy and save list
def copy_and_save(file_list, list_path, dest_img_dir=None):
    """Write the names in ``file_list`` to ``list_path``, one per line.

    Args:
        file_list: Iterable of filename strings to record.
        list_path: Path of the text file to create (overwritten if present).
        dest_img_dir: Accepted for the disabled image-copy step sketched
            below; currently ignored.
    """
    with open(list_path, "w") as out:
        out.writelines(name + "\n" for name in file_list)

    # Image copying is disabled. The per-file logic that used to sit next to
    # the write was:
    #     src_path = os.path.join(all_images_dir, fname)
    #     dst_path = os.path.join(dest_img_dir, fname)
    #     if os.path.exists(src_path):
    #         shutil.copy2(src_path, dst_path)
    #     else:
    #         print(f"Warning: {fname} not found in all_images.")

# Write one list file per split into output_dir
# (dest_img_dir is not passed, so it stays at its default of None).
split_lists = (
    ("train.txt", train_files),
    ("val.txt", val_files),
    ("test.txt", test_files),
)
for list_name, split_files in split_lists:
    copy_and_save(split_files, os.path.join(output_dir, list_name))

# Report how many images landed in each split
print(f"Train: {len(train_files)} images")
print(f"Val:   {len(val_files)} images")
print(f"Test:  {len(test_files)} images")

# New Patient-Level Split (70% / 15% / 15% of patients)

In [None]:
# Load full CSV file
df = pd.read_csv("/mnt/datassd0/chest-xray/data/Data_Entry_2017_v2020.csv")  # must contain columns: 'Image Index', 'Patient ID'

# Get unique patient IDs
unique_patients = df['Patient ID'].unique()
print(f"Total unique patients: {len(unique_patients)}")

# Split into 70% train, 30% temp (which will be split to val and test).
# Splitting patient IDs rather than rows keeps all images of one patient in
# the same subset.
train_patients, temp_patients = train_test_split(unique_patients, test_size=0.30, random_state=42)

# Split temp (30%) into val and test equally (15% each)
val_patients, test_patients = train_test_split(temp_patients, test_size=0.5, random_state=42)

# Sanity check: no patient may appear in more than one subset
assert set(train_patients).isdisjoint(val_patients), "patient leakage between train and val"
assert set(train_patients).isdisjoint(test_patients), "patient leakage between train and test"
assert set(val_patients).isdisjoint(test_patients), "patient leakage between val and test"

# Filter images based on patient groups
train_df = df[df['Patient ID'].isin(train_patients)]
val_df = df[df['Patient ID'].isin(val_patients)]
test_df = df[df['Patient ID'].isin(test_patients)]

# Sanity check: every image row was assigned to exactly one subset
assert len(train_df) + len(val_df) + len(test_df) == len(df), "some images were not assigned to a split"

# Write image filenames to separate txt files.
# to_csv does not create missing parent directories, so make sure the output
# folder exists first (the old split does the same for its own folder).
new_split_dir = "/mnt/datassd0/chest-xray/data/train_val_test_new"
os.makedirs(new_split_dir, exist_ok=True)

for split_name, split_df in (("train", train_df), ("val", val_df), ("test", test_df)):
    split_df['Image Index'].to_csv(
        os.path.join(new_split_dir, f"{split_name}.txt"), index=False, header=False
    )

print(f"Train patients: {len(train_patients)}, Val: {len(val_patients)}, Test: {len(test_patients)}")
print(f"Train images: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")
