In [3]:
import pandas as pd
import numpy as np

# Read the CSV file
df = pd.read_csv(r'C:\Users\acer\Desktop\Project_TMJOA\Data\Labels.csv')

# Extract patient ID from the id column
# Assumes format: 'patient_id_L/R_Year' or 'patient_id_L/R'
df['patient_id'] = df['ID'].str.split('_').str[0]

# Get unique patient IDs
unique_patients = df['patient_id'].unique()

# Shuffle patient IDs for random assignment
np.random.seed(42)  # Set seed for reproducibility
shuffled_patients = np.random.permutation(unique_patients)

# Calculate split indices
n_patients = len(shuffled_patients)
train_end = int(0.7 * n_patients)
val_end = int(0.9 * n_patients)

# Assign patients to splits
train_patients = set(shuffled_patients[:train_end])
val_patients = set(shuffled_patients[train_end:val_end])
test_patients = set(shuffled_patients[val_end:])

# Create split column based on patient ID
df['split'] = df['patient_id'].apply(
    lambda pid: 'train' if pid in train_patients 
    else ('val' if pid in val_patients else 'test')
)

# Optional: Remove the temporary patient_id column if you don't need it
df = df.drop('patient_id', axis=1)

# Save the result
#df.to_csv('Labels_with_split.csv', index=False)

# Print split statistics
print("Split distribution:")
print(f"Train: {len(train_patients)}")
print(f"Val: {len(val_patients)}")
print(f"Test: {len(test_patients)}")

Split distribution:
Train: 275
Val: 79
Test: 40


In [15]:
OA_neg = df[df['OA'] == 0]
OA_neg

Unnamed: 0,ID,OA,erosion,subCyst,genSclerosis,osteophyte,flattening,split
3,54-21497 R 2014,0,0,0,0,0,1,train
4,57-10397 L 2015,0,0,0,0,0,1,val
5,57-10397 R 2015,0,0,0,0,0,0,train
7,54-21497 R 2016,0,0,0,0,0,0,train
12,54-50 L,0,0,0,0,0,1,train
...,...,...,...,...,...,...,...,...
383,50-11620 R,0,0,0,0,0,0,train
384,50-11620 L,0,0,0,0,0,1,val
385,67-24015 R,0,0,0,0,0,1,train
387,68-700050 R,0,0,0,0,0,1,val


In [4]:
OA_positive = df[df['OA'] == 1]
train_df = OA_positive[OA_positive['split'] == 'train']
val_df = OA_positive[OA_positive['split'] == 'val']
test_df = OA_positive[OA_positive['split'] == 'test']

In [14]:
cols = ['erosion', 'subCyst', 'genSclerosis', 'osteophyte', 'flattening']

print("TRAIN")
split_df = train_df
for col in cols:
    pos = (split_df[col] == 1).sum()
    neg = (split_df[col] == 0).sum()
    pos_neg = [pos, neg]
    print(f"{col}: {pos_neg}, diff = {1-abs(pos-neg)/sum(pos_neg)}")

print("\nVAL")
split_df = val_df
for col in cols:
    pos = (split_df[col] == 1).sum()
    neg = (split_df[col] == 0).sum()
    pos_neg = [pos, neg]
    print(f"{col}: {pos_neg}, diff = {1-abs(pos-neg)/sum(pos_neg)}")

print("\nTEST")
split_df = test_df
for col in cols:
    pos = (split_df[col] == 1).sum()
    neg = (split_df[col] == 0).sum()
    pos_neg = [pos, neg]
    print(f"{col}: {pos_neg}, diff = {1-abs(pos-neg)/sum(pos_neg)}")

TRAIN
erosion: [149, 17], diff = 0.20481927710843373
subCyst: [80, 86], diff = 0.963855421686747
genSclerosis: [50, 116], diff = 0.6024096385542168
osteophyte: [126, 40], diff = 0.4819277108433735
flattening: [115, 51], diff = 0.6144578313253012

VAL
erosion: [43, 2], diff = 0.0888888888888889
subCyst: [21, 24], diff = 0.9333333333333333
genSclerosis: [9, 36], diff = 0.4
osteophyte: [33, 12], diff = 0.5333333333333333
flattening: [27, 18], diff = 0.8

TEST
erosion: [26, 1], diff = 0.07407407407407407
subCyst: [16, 11], diff = 0.8148148148148149
genSclerosis: [10, 17], diff = 0.7407407407407407
osteophyte: [23, 4], diff = 0.2962962962962963
flattening: [21, 6], diff = 0.4444444444444444
