In [1]:
import os
from collections import defaultdict
import numpy as np

import pandas as pd


In [2]:
# Collect the preprocessed .hdf files.
# os.listdir() returns entries in arbitrary (filesystem-dependent) order, and the
# seeded shuffle further down permutes patients in the order they are first seen
# here. Sorting the listing makes the resulting split reproducible across
# machines and re-runs instead of depending on directory order.
files = sorted(
    f
    for f in os.listdir('/home/fabian/raid5/schuerch_dataset/preprocessed/full')
    if f.endswith('.hdf')
)

# Bucket the files by patient number so that all files of one patient end up
# in the same split. The number is whatever follows the three-character 'reg'
# prefix in the part of the name before the first underscore
# (e.g. 'reg006_A.hdf' -> '006').
patient_groups = defaultdict(list)
for fname in files:
    prefix, _, _ = fname.partition('_')
    patient_groups[prefix[3:]].append(fname)

# Patient numbers in first-seen order, plus their count
unique_patients = list(patient_groups)
n_patients = len(unique_patients)

# Split sizes at patient level: 70 % train, 15 % validation (both rounded
# down), and whatever is left over goes to test.
n_train = int(n_patients * 0.7)
n_val = int(n_patients * 0.15)
n_test = n_patients - n_train - n_val

# Shuffle the patients with a fixed seed, then cut the shuffled array at the
# two boundaries to obtain the three consecutive groups.
np.random.seed(42)
patients_shuffled = np.random.permutation(unique_patients)
train_patients, val_patients, test_patients = np.split(
    patients_shuffled, [n_train, n_train + n_val]
)

# Expand each group of patients into the flat list of files that belong to it,
# keeping patient order and the per-patient file order.
train_files = [f for patient in train_patients for f in patient_groups[patient]]
val_files = [f for patient in val_patients for f in patient_groups[patient]]
test_files = [f for patient in test_patients for f in patient_groups[patient]]

# Report how many patients and files landed in each split
print(f"Total patients: {n_patients}")
for label, patients, split_files in (
    ("Train", train_patients, train_files),
    ("Val", val_patients, val_files),
    ("Test", test_patients, test_files),
):
    print(f"{label} patients: {len(patients)} ({len(split_files)} files)")

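# Save splits to files (currently disabled; uncomment the lines below to write
# one sorted, newline-separated file list per split)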
# with open('train_split.txt', 'w') as f:
#     f.write('\n'.join(sorted(train_files)))
# with open('val_split.txt', 'w') as f:
#     f.write('\n'.join(sorted(val_files)))
# with open('test_split.txt', 'w') as f:
#     f.write('\n'.join(sorted(test_files)))

Total patients: 67
Train patients: 46 (74 files)
Val patients: 10 (16 files)
Test patients: 11 (19 files)


In [3]:
# Pair every file with the label of the split it was assigned to.
# Note the validation label is 'valid', not 'val'.
split_entries = (
    [(name, 'train') for name in train_files]
    + [(name, 'valid') for name in val_files]
    + [(name, 'test') for name in test_files]
)

# One row per file, ordered by file name
df = pd.DataFrame(
    split_entries, columns=['sample_name', 'train_test_val_split']
).sort_values('sample_name')

# Save to CSV
# df.to_csv('/home/fabian/raid5/schuerch_dataset/splits/schuerch_dataset_split.csv', index=False)

In [4]:
import lmdb
import pickle
import numpy as np

# Scan every record of the LMDB and collect the distinct values that occur in
# the 'semantic_mask' entry of the stored dictionaries.
all_unique_values = set()

# Open the environment as a context manager so it is closed as soon as the scan
# finishes. Without this the handle stays open for the life of the kernel, and
# re-running the cell would open the same environment a second time in the same
# process, which py-lmdb explicitly warns against.
with lmdb.open("/home/fabian/raid5/schuerch_dataset/schuerch_dataset_lmdb/lmdb/", readonly=True) as env:
    with env.begin() as txn:
        for _key, value in txn.cursor():
            # NOTE: pickle.loads can execute arbitrary code embedded in the
            # payload; only run this against an LMDB you created or trust.
            tile_dict = pickle.loads(value)
            # .tolist() converts NumPy scalars to plain Python values before
            # they are added to the set.
            all_unique_values.update(np.unique(tile_dict['semantic_mask']).tolist())

print("All unique semantic mask values found:", sorted(all_unique_values))

All unique semantic mask values found: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]


In [5]:
# Display how many distinct values were collected into all_unique_values
len(all_unique_values)

15