In [3]:
import pandas as pd
import numpy as np

# Path of the labels CSV. The file is read here and overwritten in place
# further down with the added 'split' column.
# (Named `labels_path` rather than `input` so the builtin input() is not
# shadowed for the rest of the kernel session.)
labels_path = r'd:\Kananat\Data\alldata_new_normalized\labels.csv'
df = pd.read_csv(labels_path)

# Patient ID = first space-separated token of the 'ID' column.
# Assumes IDs look like '<patient_id> <L/R> <year>' or '<patient_id> <L/R>'.
# Kept as a standalone Series so no temporary column has to be added to df
# and dropped again before saving.
patient_ids = df['ID'].str.split(' ').str[0]

# Get unique patient IDs
unique_patients = patient_ids.unique()

# Shuffle patient IDs for random assignment
np.random.seed(1)  # Set seed for reproducibility
shuffled_patients = np.random.permutation(unique_patients)

# Split boundaries: first 70% of patients -> train, next 20% -> val,
# remaining 10% -> test (boundaries are truncated to whole patients).
n_patients = len(shuffled_patients)
train_end = int(0.7 * n_patients)
val_end = int(0.9 * n_patients)

# Assign patients (not individual rows) to splits, so every row of a given
# patient lands in the same split.
train_patients = set(shuffled_patients[:train_end])
val_patients = set(shuffled_patients[train_end:val_end])
test_patients = set(shuffled_patients[val_end:])

# Label every row with the split of its patient
df['split'] = patient_ids.apply(
    lambda pid: 'train' if pid in train_patients
    else ('val' if pid in val_patients else 'test')
)

# Save the result back to the same file
df.to_csv(labels_path, index=False)

# Print split statistics: the printed numbers are patient counts, while the
# value_counts() shown as the cell output are row counts.
print("Split distribution:")
print(f"Train: {len(train_patients)}")
print(f"Val: {len(val_patients)}")
print(f"Test: {len(test_patients)}")
df['split'].value_counts()

Split distribution:
Train: 275
Val: 79
Test: 40


In [None]:
# Keep only the rows whose 'OA' column equals 1, then carve that subset
# into one frame per value of the 'split' column.
OA_positive = df.loc[df['OA'].eq(1)]
train_df, val_df, test_df = (
    OA_positive.loc[OA_positive['split'].eq(split_name)]
    for split_name in ('train', 'val', 'test')
)

In [None]:
cols = ['erosion', 'subCyst', 'genSclerosis', 'osteophyte', 'flattening']


def report_balance(title, split_df, cols):
    """Print the class balance of each column in ``cols`` for one split.

    For every column the number of rows equal to 1 (pos) and equal to 0
    (neg) is counted and a balance score ``1 - |pos - neg| / (pos + neg)``
    is printed: 1.0 means equal counts, 0.0 means only one of the two
    values occurs. The mean score over all columns is printed last.

    Args:
        title: Header line printed before the per-column lines.
        split_df: DataFrame holding the rows of one split.
        cols: Names of the columns to score.
    """
    print(title)
    score = 0
    for col in cols:
        pos = (split_df[col] == 1).sum()
        neg = (split_df[col] == 0).sum()
        pos_neg = [pos, neg]
        split_score = 1 - abs(pos - neg) / sum(pos_neg)
        score += split_score
        print(f"{col}: {pos_neg}, diff = {split_score:.2f}")
    # Average over the columns actually scored instead of a hard-coded 5,
    # so the mean stays correct if `cols` is edited.
    print(score / len(cols))


report_balance("TRAIN", train_df, cols)
report_balance("\nVAL", val_df, cols)
report_balance("\nTEST", test_df, cols)

In [None]:
from pathlib import Path
import pandas as pd

# Paths and task name for this cell; only labels_csv is read below.
labels_csv = r"d:\Kananat\Data\alldata_new_normalized\labels.csv"
dataset_source = Path(r"D:\Kananat\Data\alldata_new_normalized")
dataset_target_folder = Path(r"d:\Kananat\Data\Last0\OA")
task = 'OA'

df = pd.read_csv(labels_csv)

# One [patient_id, split] pair per CSV row, where patient_id is the first
# space-separated token of the row's 'ID'.
check = [
    [f"{sample_id.split(' ')[0]}", split_name]
    for sample_id, split_name in zip(df['ID'], df['split'])
]

def check_consistent_splits(data):
    """Return the IDs that appear with more than one split value.

    The first split seen for an ID is taken as its reference; any later
    pair for the same ID with a different split marks the ID as
    inconsistent.

    Args:
        data: Iterable of (id, split) pairs. IDs must be hashable.

    Returns:
        List of inconsistent IDs in the order their first conflict was
        encountered. Each ID is listed once, no matter how many of its
        rows disagree with the reference split.
    """
    id_to_splits = {}
    inconsistent_ids = []
    already_reported = set()

    for id_val, split_val in data:
        # setdefault stores the split on first sight and otherwise returns
        # the reference split recorded earlier for this ID.
        reference_split = id_to_splits.setdefault(id_val, split_val)
        if reference_split != split_val and id_val not in already_reported:
            already_reported.add(id_val)
            inconsistent_ids.append(id_val)

    return inconsistent_ids

# Run the consistency check over the [patient_id, split] pairs and print
# a one-line verdict.
inconsistent = check_consistent_splits(check)

print(
    f"IDs with inconsistent splits: {inconsistent}"
    if inconsistent
    else "All IDs have consistent split values"
)

In [None]:
# Folder that is scanned for preprocessed volumes.
dataset_source = Path(r"D:\Kananat\Data\Last0\3_Preprocessed")

# Print the bare file name of every *.nii.gz directly inside that folder.
nifti_paths = dataset_source.glob("*.nii.gz")
for file in nifti_paths:
    print(file.name)

In [None]:
from pathlib import Path
import pandas as pd
import shutil
import os

# Labels table, folder holding the preprocessed volumes, and the root under
# which the <task>/<split>/<class> folder tree is created.
labels_csv = r"D:\Kananat\Data\Last0\labels.csv"
dataset_source = Path(r"D:\Kananat\Data\Last0\3_Preprocessed")
dataset_target_folder = Path(r"D:\Kananat\Data\Last0")

df = pd.read_csv(labels_csv)

task = 'OA'
splits = ["train", "val", "test"]
classes = ["0", "1"]

# Create every <task>/<split>/<class> destination folder up front.
for split in splits:
    for cls in classes:
        dir_path = dataset_target_folder / task / split / cls
        dir_path.mkdir(parents=True, exist_ok=True)

# Copy each labelled volume into the folder matching its split and label.
for sample_id, split_value, label_value in zip(df['ID'], df['split'], df[task]):
    file = f"{sample_id}_preprocessed.nii.gz"
    split = str(split_value)

    # If the label column was parsed as float, a plain str() would give
    # '1.0' instead of '1'; collapse whole-number floats so they match the
    # entries of `classes`.
    if isinstance(label_value, float) and label_value.is_integer():
        label = str(int(label_value))
    else:
        label = str(label_value)

    # Only the folders created above exist, so a row with any other
    # split/label would make the copy fail part-way through the run.
    # Report and skip such rows instead.
    if split not in splits or label not in classes:
        print("="*40)
        print(f"Skipping ID: {sample_id}, unexpected split/label: [{split}, {label}]")
        print("="*40)
        continue

    selected_files = list(dataset_source.glob(file))

    # Exactly one source file must match; anything else is reported and skipped.
    if len(selected_files) != 1:
        print("="*40)
        print(f"Skipping ID: {sample_id}, found {len(selected_files)} files: {selected_files}")
        print("="*40)
        continue

    move_to = dataset_target_folder / task / split / label / file

    print(f"Copying [{file}, {split}, {label}] to {move_to}")

    # copy2 copies (does not move) the file; the source folder is left untouched.
    shutil.copy2(selected_files[0], move_to)

Copying [47-4881 L 2014_preprocessed.nii.gz, train, 1] to D:\Kananat\Data\Last0\OA\train\1\47-4881 L 2014_preprocessed.nii.gz
Copying [47-4881 R 2014_preprocessed.nii.gz, train, 1] to D:\Kananat\Data\Last0\OA\train\1\47-4881 R 2014_preprocessed.nii.gz
Copying [54-21497 L 2014_preprocessed.nii.gz, val, 1] to D:\Kananat\Data\Last0\OA\val\1\54-21497 L 2014_preprocessed.nii.gz
Copying [54-21497 R 2014_preprocessed.nii.gz, val, 0] to D:\Kananat\Data\Last0\OA\val\0\54-21497 R 2014_preprocessed.nii.gz
Copying [57-10397 L 2015_preprocessed.nii.gz, train, 0] to D:\Kananat\Data\Last0\OA\train\0\57-10397 L 2015_preprocessed.nii.gz
Copying [57-10397 R 2015_preprocessed.nii.gz, train, 0] to D:\Kananat\Data\Last0\OA\train\0\57-10397 R 2015_preprocessed.nii.gz
Copying [54-21497 L 2016_preprocessed.nii.gz, val, 1] to D:\Kananat\Data\Last0\OA\val\1\54-21497 L 2016_preprocessed.nii.gz
Copying [54-21497 R 2016_preprocessed.nii.gz, val, 0] to D:\Kananat\Data\Last0\OA\val\0\54-21497 R 2016_preprocessed.nii

In [15]:
# Rows whose 'OA' column equals 0; the bare name on the last line makes
# the frame the cell's displayed output.
is_oa_zero = df['OA'].eq(0)
OA_neg = df.loc[is_oa_zero]
OA_neg

Unnamed: 0,ID,OA,erosion,subCyst,genSclerosis,osteophyte,flattening,split
3,54-21497 R 2014,0,0,0,0,0,1,train
4,57-10397 L 2015,0,0,0,0,0,1,val
5,57-10397 R 2015,0,0,0,0,0,0,train
7,54-21497 R 2016,0,0,0,0,0,0,train
12,54-50 L,0,0,0,0,0,1,train
...,...,...,...,...,...,...,...,...
383,50-11620 R,0,0,0,0,0,0,train
384,50-11620 L,0,0,0,0,0,1,val
385,67-24015 R,0,0,0,0,0,1,train
387,68-700050 R,0,0,0,0,0,1,val


In [4]:
# Subset of rows with 'OA' equal to 1, followed by one frame per value of
# the 'split' column taken from that subset.
OA_positive = df.loc[df['OA'].eq(1)]
split_of_row = OA_positive['split']
train_df = OA_positive.loc[split_of_row.eq('train')]
val_df = OA_positive.loc[split_of_row.eq('val')]
test_df = OA_positive.loc[split_of_row.eq('test')]

In [14]:
cols = ['erosion', 'subCyst', 'genSclerosis', 'osteophyte', 'flattening']

# For each split, print a header and then, per column, the [count of 1s,
# count of 0s] pair together with 1 - |pos - neg| / (pos + neg)
# (1.0 = equal counts, 0.0 = only one of the two values present).
for header, split_df in (("TRAIN", train_df), ("\nVAL", val_df), ("\nTEST", test_df)):
    print(header)
    for col in cols:
        column = split_df[col]
        n_pos = (column == 1).sum()
        n_neg = (column == 0).sum()
        counts = [n_pos, n_neg]
        balance = 1 - abs(n_pos - n_neg) / sum(counts)
        print(f"{col}: {counts}, diff = {balance}")

TRAIN
erosion: [149, 17], diff = 0.20481927710843373
subCyst: [80, 86], diff = 0.963855421686747
genSclerosis: [50, 116], diff = 0.6024096385542168
osteophyte: [126, 40], diff = 0.4819277108433735
flattening: [115, 51], diff = 0.6144578313253012

VAL
erosion: [43, 2], diff = 0.0888888888888889
subCyst: [21, 24], diff = 0.9333333333333333
genSclerosis: [9, 36], diff = 0.4
osteophyte: [33, 12], diff = 0.5333333333333333
flattening: [27, 18], diff = 0.8

TEST
erosion: [26, 1], diff = 0.07407407407407407
subCyst: [16, 11], diff = 0.8148148148148149
genSclerosis: [10, 17], diff = 0.7407407407407407
osteophyte: [23, 4], diff = 0.2962962962962963
flattening: [21, 6], diff = 0.4444444444444444
