In [1]:
import pandas as pd
import shutil
import os
from sklearn.model_selection import train_test_split

In [2]:
# Project folder; data_split() reads OK_annotations_GT.csv from here.
folder_path = '/home/habtamu/Substrate/'
# Image directory (presumably the PNG versions of the annotated images -- confirm).
# It is passed to data_split(), which does not actually use it.
img_dir = '/home/habtamu/data/substrate/OK_annotations_png/'

Split the data into training and validation sets

In [14]:
def data_split(img_dir, val_fraction=0.2, random_state=42):
    """Split the annotated images patient-wise into training and validation sets.

    Reads ``OK_annotations_GT.csv`` from the module-level ``folder_path``,
    derives a patient id from the part of ``filename`` before the first
    underscore, and writes ``training_set.csv`` and ``validation_set.csv`` to
    the current working directory.

    Every image of a patient that has more than one image goes to the
    training set, so a patient can never end up in both sets. Only patients
    with a single image are randomly divided between the two sets.

    Parameters
    ----------
    img_dir : str
        Not used by this function; kept so existing calls keep working.
    val_fraction : float, default 0.2
        Target size of the validation set as a fraction of ALL images. The
        validation images are drawn from the single-image patients only.
    random_state : int or None, default 42
        Seed forwarded to ``train_test_split`` so the split is reproducible.
        Pass ``None`` for a different split on every run.

    Raises
    ------
    ValueError
        If the requested validation size is zero or is not smaller than the
        number of single-image patients.
    """
    csv_file = os.path.join(folder_path, "OK_annotations_GT.csv")
    df = pd.read_csv(csv_file)
    print ('Total number of images:', len(df))
    df['patient id'] = df['filename'].str.split('_').str[0]
    unique_patients = df['patient id'].nunique()
    print('Total number of unique patients:', unique_patients)

    # keep=False flags EVERY image of a patient that occurs more than once.
    is_repeated = df['patient id'].duplicated(keep=False)
    rows_with_duplicated_patients = df[is_repeated]
    # Count distinct patients, not surplus rows: a patient with three images
    # is still one patient.
    print('Patients with more than one image:', rows_with_duplicated_patients['patient id'].nunique())
    print('Total number of images of duplicated patients:', len(rows_with_duplicated_patients))

    # The complement of the mask above: one row per remaining patient.
    patients_with_single_image = df[~is_repeated]
    print('Patients with only one image:', len(patients_with_single_image))

    # All multi-image rows are added to the training set afterwards, so the
    # validation size is computed against the whole dataset and then passed to
    # train_test_split as an absolute number of rows.
    n_validation = int(round(val_fraction * len(df)))
    if not 0 < n_validation < len(patients_with_single_image):
        raise ValueError(
            f"val_fraction={val_fraction} asks for {n_validation} validation images, "
            f"but only {len(patients_with_single_image)} single-image patients are available"
        )
    training_patients, validation_patients = train_test_split(
        patients_with_single_image,
        test_size=n_validation,
        random_state=random_state,
    )
    print('Number of patients in the training set:', len(training_patients))
    print('Number of patients in the validation set:', len(validation_patients))

    frames = [training_patients, rows_with_duplicated_patients]
    total_train_set = pd.concat(frames)
    print("Total number of images in the training set:", len(total_train_set))

    total_train_set.to_csv("training_set.csv",index=False)
    validation_patients.to_csv("validation_set.csv", index=False)

In [15]:
# Run the split; this writes training_set.csv and validation_set.csv to the
# current working directory and prints the summary counts shown below.
data_split(img_dir)

Total number of images: 199
Total number of unique patients: 192
Patients with more than one image: 7
Total number of images of duplicated patients: 14
Patients with only one image: 185
Number of patients in the training set: 145
Number of patients in the validation set: 40
Total number of images in the training set: 159


Check that the split is patient-wise (no patient appears in both sets)

In [19]:
# Reload both splits from the CSV files so the check runs on what was written to disk.
df_train = pd.read_csv("training_set.csv")
df_val = pd.read_csv("validation_set.csv")

# n is the number of (validation row, training row) pairs that share a
# patient id. For each validation row, look up how many training rows carry
# the same patient id and add those counts up; ids missing from the training
# set contribute 0. This avoids a nested Python loop over both frames.
train_rows_per_patient = df_train['patient id'].value_counts()
n = int(df_val['patient id'].map(train_rows_per_patient).fillna(0).sum())

In [20]:
# Number of validation/training row pairs that share a patient id;
# 0 means no patient appears in both sets.
print(n)

0


Copy the training and validation set annotations to separate directories

In [None]:
# Source folder of the per-image JSON annotations and root of the split dataset.
ANNOTATIONS_DIR = os.path.join(folder_path, "annotations")
DATASET_DIR = os.path.join(folder_path, "dataset_dir")


def copy_annotations(split_df, dest_dir):
    """Copy the JSON annotation of every image listed in ``split_df`` into ``dest_dir``.

    Parameters
    ----------
    split_df : pandas.DataFrame
        Frame with a ``filename`` column holding image file names.
    dest_dir : str
        Destination directory; created if it does not exist yet.

    Raises
    ------
    FileNotFoundError
        If an annotation file is missing from ``ANNOTATIONS_DIR``.
    """
    # shutil.copy does not create the destination directory itself.
    os.makedirs(dest_dir, exist_ok=True)
    for filename in split_df["filename"]:
        # Swap only the final extension for ".json"; a ".png" elsewhere in the
        # name is left alone.
        annotation_name = os.path.splitext(filename)[0] + ".json"
        shutil.copy(os.path.join(ANNOTATIONS_DIR, annotation_name), dest_dir)


copy_annotations(df_train, os.path.join(DATASET_DIR, "train"))
copy_annotations(df_val, os.path.join(DATASET_DIR, "val"))