In [5]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Build stratified train/validation/test splits of the recurrence metadata,
# restricted to the rows that have at least one ROI image on disk.

# Configuration: one base folder instead of repeating the absolute path everywhere.
DATA_DIR = "/Users/harrydo/Documents/UTS/Spring24/Ilab/archive-6"  # Update with your actual path
TEST_FRACTION = 0.15  # share of the matched rows held out for testing
VAL_FRACTION = 0.15   # share of the remaining train+val rows used for validation (~12.75% overall)
SEED = 42             # fixed seed so the splits are reproducible

# Load the CSV with metadata (recurrence.csv)
df = pd.read_csv(os.path.join(DATA_DIR, "recurrence.csv"), low_memory=False)

# List all images in the ROI folder (only files ending in .tiff are considered)
image_dir = os.path.join(DATA_DIR, "ROI")
image_files = [f for f in os.listdir(image_dir) if f.endswith('.tiff')]

# Derive the identifier from each filename: drop the extension, then keep the text
# before the first underscore (this removes suffixes like _1, _2).
# Several files can therefore map to the same identifier, so this list may contain duplicates.
filtered_images = [os.path.splitext(f)[0].split('_')[0] for f in image_files]

# Check the first few identifiers after extraction
print(f"First few filenames after extraction: {filtered_images[:5]}")  # Debug

# Keep only the metadata rows whose SOPInstanceUID has a matching file in the ROI folder.
# A set removes the duplicate identifiers before the membership test.
roi_uids = set(filtered_images)
df_images = df[df['SOPInstanceUID'].isin(roi_uids)]

# Report how many files were seen versus how many metadata rows matched
print(f"Number of images found in ROI folder: {len(filtered_images)}")
print(f"Number of rows in filtered dataframe: {len(df_images)}")

# Fail fast when nothing matches: continuing would leave train/val/test undefined
# (or stale from an earlier run) for every cell that follows.
if df_images.empty:
    raise ValueError("No matching images found. Please check if the image filenames and SOPInstanceUID match.")

# Check the distribution of the target variable 'Recurrence'
print(df_images['Recurrence'].value_counts())

# Stratified split on 'Recurrence': first carve out the test set, then split the
# remainder into train and validation.
# NOTE(review): the split is done per metadata row. If several rows can belong to the
# same patient, they may end up in different splits -- confirm whether a grouped,
# patient-level split is needed to avoid leakage.
train_val, test = train_test_split(
    df_images,
    test_size=TEST_FRACTION,
    stratify=df_images['Recurrence'],
    random_state=SEED,
)
train, val = train_test_split(
    train_val,
    test_size=VAL_FRACTION,
    stratify=train_val['Recurrence'],
    random_state=SEED,
)

# Every matched row must land in exactly one split.
assert len(train) + len(val) + len(test) == len(df_images)

# Save the splits. Each CSV holds the metadata rows of that split (same columns as
# the input frame); no image-path column is added here.
train.to_csv(os.path.join(DATA_DIR, "train.csv"), index=False)
val.to_csv(os.path.join(DATA_DIR, "val.csv"), index=False)
test.to_csv(os.path.join(DATA_DIR, "test.csv"), index=False)

print(f"Training set size: {len(train)}")
print(f"Validation set size: {len(val)}")
print(f"Test set size: {len(test)}")


First few filenames after extraction: ['1.3.6.1.4.1.14519.5.2.1.4334.1501.254560949327495733867144438706', '1.3.6.1.4.1.14519.5.2.1.4334.1501.431417775957188292593442155881', '1.3.6.1.4.1.14519.5.2.1.4334.1501.207443943407455870340595435813', '1.3.6.1.4.1.14519.5.2.1.4334.1501.153713896655134488487304641509', '1.3.6.1.4.1.14519.5.2.1.4334.1501.165554541812737547045545126699']
Number of images found in ROI folder: 720
Number of rows in filtered dataframe: 144
Recurrence
no     104
yes     40
Name: count, dtype: int64
Training set size: 103
Validation set size: 19
Test set size: 22


In [6]:
# Show the 'Recurrence' class counts of each split to confirm the stratification
# kept the class balance comparable across train, validation and test.
split_reports = (
    ("Recurrence distribution in training set:", train),
    ("\nRecurrence distribution in validation set:", val),
    ("\nRecurrence distribution in test set:", test),
)

for heading, split_frame in split_reports:
    print(heading)
    print(split_frame['Recurrence'].value_counts())

Recurrence distribution in training set:
Recurrence
no     74
yes    29
Name: count, dtype: int64

Recurrence distribution in validation set:
Recurrence
no     14
yes     5
Name: count, dtype: int64

Recurrence distribution in test set:
Recurrence
no     16
yes     6
Name: count, dtype: int64
