# Data Splitting
This notebook applies stratified (group) k-fold splitting to the dataset so that no patient leaks between the training, validation and test sets and every class is represented proportionally in each of them.

In [1]:
import pandas as pd
import os
from sklearn.model_selection import StratifiedGroupKFold
from pathlib import Path
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [5]:
# Root folder of the project data. Set the PROWISS_DATA_ROOT environment variable to
# run the notebook on another machine; the default is the original location.
data_root = Path(os.environ.get("PROWISS_DATA_ROOT", "/Users/LennartPhilipp/Desktop/Uni/Prowiss"))
dataset_root = data_root / "Datensatz_RGB"

# Participants table (tab-separated), folder with one entry per preprocessed patient,
# and folder with one sub-folder of .tfrecord files per patient.
path_to_patients_tsv = data_root / "Dateien" / "participants_04_09_24.tsv"
path_to_preprocessed_images = dataset_root / "preprocessed_n4_brainlesion_percentile_20240612-083743"
path_to_tfr = dataset_root / "regensburg_slices_tfrecords" / "all_pats_single_slice_gray"

In [6]:
# Patient that is excluded from the dataset by hand.
EXCLUDED_PATIENT_ID = "sub-01383503"
# Fixed seed: without it the row order, and therefore every split derived from it
# further down, changes on each run of the notebook.
SHUFFLE_SEED = 42

training_patients = pd.read_csv(path_to_patients_tsv, sep="\t", index_col=False)

# drop the excluded patient (builds a new frame instead of mutating in place)
training_patients = training_patients[
    training_patients["participant_id"] != EXCLUDED_PATIENT_ID
].reset_index(drop=True)

print(f"Before removing unfit files: {len(training_patients)}")

# keep only patients that have an entry in the preprocessed image folder;
# a vectorised filter replaces dropping rows while iterating over the frame
patient_files_list = os.listdir(path_to_preprocessed_images)
has_preprocessed_entry = training_patients["participant_id"].isin(patient_files_list)
training_patients = training_patients[has_preprocessed_entry].reset_index(drop=True)

print(f"After removing unfit files: {len(training_patients)}")

# shuffle dataset (reproducibly)
training_patients = training_patients.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

patient_ids = training_patients["participant_id"].tolist()

Before removing unfit files: 472
After removing unfit files: 470


In [7]:
# Sub-folders of the TFRecord directory; plain files in that directory are ignored.
tfr_patients = [
    entry
    for entry in os.listdir(path_to_tfr)
    if os.path.isdir(os.path.join(path_to_tfr, entry))
]

# Number of ".tfrecord" files found in each patient folder, keyed by folder name.
lesion_counter = {
    patient_id: sum(
        file_name.endswith(".tfrecord")
        for file_name in os.listdir(os.path.join(path_to_tfr, patient_id))
    )
    for patient_id in tfr_patients
}

In [8]:
# Show the per-patient .tfrecord counts.
print(lesion_counter)

# Attach the count to each patient row. Series.map leaves NaN for every
# participant_id that has no key in lesion_counter.
training_patients["lesion"] = training_patients["participant_id"].map(lesion_counter)

{'sub-01812518': 2, 'sub-02095303': 1, 'sub-01729917': 1, 'sub-02155605': 5, 'sub-01892684': 6, 'sub-01104996': 1, 'sub-01710250': 2, 'sub-02092748': 25, 'sub-01883957': 8, 'sub-01381621': 2, 'sub-01979997': 4, 'sub-02038513': 1, 'sub-02148372': 8, 'sub-01398968': 4, 'sub-01702596': 1, 'sub-93002557': 1, 'sub-01452858': 1, 'sub-01009590': 1, 'sub-02122538': 1, 'sub-01699532': 2, 'sub-01589112': 1, 'sub-02095961': 17, 'sub-02110064': 2, 'sub-02145605': 1, 'sub-02031868': 3, 'sub-01652130': 2, 'sub-01122863': 6, 'sub-01108350': 1, 'sub-95001254': 1, 'sub-01691369': 1, 'sub-01969755': 3, 'sub-02018743': 9, 'sub-02137073': 1, 'sub-02094018': 12, 'sub-02120805': 6, 'sub-01262362': 1, 'sub-01905848': 1, 'sub-01871625': 5, 'sub-01805334': 1, 'sub-01706562': 1, 'sub-01164049': 1, 'sub-02055312': 2, 'sub-02167792': 3, 'sub-01409764': 2, 'sub-01281168': 2, 'sub-01018613': 1, 'sub-01616246': 1, 'sub-02188930': 4, 'sub-02115377': 0, 'sub-02031256': 2, 'sub-01798755': 2, 'sub-01654658': 1, 'sub-017

In [11]:
# Raw primary codes look like 1e, 25, 13, 2f, ...; the full list is in config.py.
# For a given number of classes, e.g. 3, the goal is to end up with
# classes such as: lung, melanoma, other.
# Build a lookup from participant_id to the raw primary code.
primaries_array_dict = training_patients.set_index("participant_id")["primary"].to_dict()

print("Original Primaries Dict:", primaries_array_dict, "", sep="\n")

def compress_primaries_dict(primaries_dict):
    """
    Reduce raw primary codes to integers and merge related codes into one group code.

    The input dict is left untouched; a new dict is returned.

    Args:
    primaries_dict (dict): Maps a patient id to its primary code. A code is either a
        string such as "1e" or "23" (only its digits are used) or an int, which
        allows the function to be applied to its own output.

    Returns:
    dict: Same keys, integer codes. Codes 3-10 become 3, 13-18 become 13 and
        19-25 become 19; every other code keeps its numeric value.

    Raises:
    TypeError: If a code is neither a string nor an int (e.g. a missing value).
    ValueError: If a string code contains no digit.
    """
    # (first code, last code, group code) for the groups that are merged
    merged_ranges = (
        (3, 10, 3),     # genitourinary cancers
        (13, 18, 13),   # head and neck cancers
        (19, 25, 19),   # gastrointestinal cancers
    )

    # Step 1: keep only the digits of every code
    digit_codes = {}
    for key, value in primaries_dict.items():
        if isinstance(value, bool) or not isinstance(value, (str, int)):
            raise TypeError(f"Primary code of {key!r} must be str or int, got {value!r}")
        digits = ''.join(filter(str.isdigit, str(value)))
        if not digits:
            raise ValueError(f"Primary code {value!r} of {key!r} contains no digits")
        digit_codes[key] = digits

    print(digit_codes)

    # Step 2: replace every code inside a merged range by the code of its group
    compressed = {}
    for key, digits in digit_codes.items():
        primary_num = int(digits)
        renamed_primary = primary_num
        for first, last, group_code in merged_ranges:
            if first <= primary_num <= last:
                renamed_primary = group_code
                break
        compressed[key] = renamed_primary

    print(compressed)
    return compressed


def return_specific_primaries_dict(primaries_dict, specific_classes, dataset):
    """
    Turn compressed primary codes into consecutive class labels, with 0 meaning "other".

    The n-th entry of specific_classes (counted from 1) becomes label n. Every code
    that does not belong to a requested class becomes 0. The input dict is left
    untouched; a new dict is returned.

    Args:
    primaries_dict (dict): Maps a patient id to its compressed primary code (int).
    specific_classes (list): Class selectors to keep, in label order:
        1 = lung, 2 = melanoma, 3 = breast, 4 = genito urinary, 5 = gastro.
        Other selectors match nothing but still use up a label number.
    dataset (str): Name of the dataset ("Regensburg" or "Yale").

    Returns:
    dict: Same keys as primaries_dict, values in 0..len(specific_classes).

    Raises:
    ValueError: If dataset is not one of the known names. Previously an unknown
        name silently turned every patient into class 0.
    """
    # Class selector -> primary code used by each dataset.
    # Yale lung is 0, as in the earlier `replace(0, 101)` variant of this code. The
    # previous check used 1, which is the Yale breast code, so breast was never
    # labelled. NOTE(review): confirm 0 against the Yale code list.
    # Yale genito urinary only covers renal cell carcinoma.
    codes_by_dataset = {
        "Regensburg": {1: 1, 2: 12, 3: 2, 4: 3, 5: 19},
        "Yale": {1: 0, 2: 4, 3: 1, 4: 6, 5: 2},
    }
    if dataset not in codes_by_dataset:
        raise ValueError(
            f"Unknown dataset {dataset!r}, expected one of {sorted(codes_by_dataset)}"
        )
    selector_to_code = codes_by_dataset[dataset]

    # Primary code -> final label. A direct lookup replaces the temporary 101..105
    # markers, which could collide with real codes of 100 or more.
    code_to_label = {}
    for label, selector in enumerate(specific_classes, start=1):
        code = selector_to_code.get(selector)
        if code is not None:
            # a selector listed twice keeps the label of its first position
            code_to_label.setdefault(code, label)

    relabelled = {
        key: code_to_label.get(value, 0) for key, value in primaries_dict.items()
    }

    print(relabelled)

    return relabelled

# Stage 1: numeric codes with the related primaries merged into groups.
print("Compress Primaries Dict Function output:")
compressed = compress_primaries_dict(primaries_array_dict)
print()

# Stage 2: keep the five named classes (labels 1-5) and map everything else to 0.
print("Return Specific Primaries Dict Function output:")
specific_primaries_dict = return_specific_primaries_dict(
    primaries_dict=compressed,
    specific_classes=[1, 2, 3, 4, 5],
    dataset="Regensburg",
)

# Number of patients per final class label.
class_counts = pd.Series(specific_primaries_dict.values()).value_counts()
print(class_counts)

Original Primaries Dict:
{'sub-02094018': '1a', 'sub-01773716': '1e', 'sub-01119720': '12', 'sub-02094355': '12', 'sub-02012594': '19', 'sub-01461078': '4', 'sub-01882989': '12', 'sub-01071055': '5', 'sub-02064363': '1e', 'sub-01457167': '1a', 'sub-01548397': '2', 'sub-01985065': '5', 'sub-01717958': '1a', 'sub-02035864': '23', 'sub-01455312': '1f', 'sub-01946271': '21', 'sub-01953116': '23', 'sub-01707721': '1c', 'sub-01952689': '12', 'sub-80004059': '1c', 'sub-01696845': '1c', 'sub-01641960': '1e', 'sub-01589112': '11e', 'sub-01957247': '23', 'sub-02063986': '14', 'sub-01677324': '12', 'sub-01332588': '1f', 'sub-02183443': '1e', 'sub-93002557': '23', 'sub-01340749': '12', 'sub-01309950': '23', 'sub-01596127': '1a', 'sub-01854308': '1e', 'sub-01415245': '12', 'sub-01201482': '1c', 'sub-02088404': '1c', 'sub-02075769': '23', 'sub-01982853': '1c', 'sub-01703264': '9', 'sub-02010452': '1c', 'sub-90001992': '3', 'sub-01763867': '1c', 'sub-02124336': '1a', 'sub-01547588': '1c', 'sub-011523

In [12]:
# Store the final class label of every patient as a column; it is the
# stratification target of all splits below.
training_patients["primary_compressed"] = training_patients["participant_id"].map(specific_primaries_dict)

In [13]:
import pandas as pd
from collections import Counter
from sklearn.model_selection import StratifiedGroupKFold

# The class label is the stratification target and also stands in as the feature
# column, because only the row indices of the split are needed here.
X = training_patients['primary_compressed']
y = training_patients['primary_compressed']
groups = training_patients['participant_id']

# Initialize the StratifiedGroupKFold
sgkf = StratifiedGroupKFold(n_splits=5)

# Split into train/val/test.
# Only one partition is used: the last of the five folds. This is the fold the
# earlier loop ended on after overwriting its result four times.
outer_folds = list(sgkf.split(X, y, groups))
train_idx, temp_idx = outer_folds[-1]

# Further split the held-out fold into validation and test; the first of the two
# inner folds is taken.
X_temp, y_temp, groups_temp = X.iloc[temp_idx], y.iloc[temp_idx], groups.iloc[temp_idx]
sgkf_temp = StratifiedGroupKFold(n_splits=2)
val_idx_, test_idx_ = next(sgkf_temp.split(X_temp, y_temp, groups_temp))
# the inner indices are positions inside temp_idx, translate them back to rows
val_idx = temp_idx[val_idx_]
test_idx = temp_idx[test_idx_]

# Extract the datasets based on the indices
X_train, y_train, groups_train = X.iloc[train_idx], y.iloc[train_idx], groups.iloc[train_idx]
X_val, y_val, groups_val = X.iloc[val_idx], y.iloc[val_idx], groups.iloc[val_idx]
X_test, y_test, groups_test = X.iloc[test_idx], y.iloc[test_idx], groups.iloc[test_idx]

# Step 1: Verify Group Integrity
def check_group_integrity(groups, train, val, test):
    """
    Report whether any group appears in more than one of the three sets.

    Args:
    groups (Series or sequence): Group label of every sample, e.g. the patient id.
    train, val, test (array-like of int): Positional indices of the samples in each set.

    Returns:
    bool: True when the three sets share no group, False otherwise.
    """
    # The indices come from a splitter and are positions, so select with iloc.
    group_labels = pd.Series(groups)
    train_groups = set(group_labels.iloc[train])
    val_groups = set(group_labels.iloc[val])
    test_groups = set(group_labels.iloc[test])

    # Compare the sets pairwise. Comparing the size of their union with the number
    # of groups only shows that every group was assigned; it cannot see overlap.
    shared_groups = (
        (train_groups & val_groups)
        | (train_groups & test_groups)
        | (val_groups & test_groups)
    )
    is_disjoint = not shared_groups

    if is_disjoint:
        print("Group integrity maintained. No overlap between groups in different sets.")
    else:
        print("Group integrity issue detected! Some groups are split across sets.")

    # Groups missing from all three sets are reported separately.
    unassigned = set(group_labels) - (train_groups | val_groups | test_groups)
    if unassigned:
        print(f"Note: {len(unassigned)} group(s) are not assigned to any set.")

    return is_disjoint

# Step 2: Verify Class Distribution
def check_class_distribution(y, train, val, test):
    """
    Print the class counts and the sample total of the train, validation and test set.

    Args:
    y (Series or sequence): Class label of every sample.
    train, val, test (array-like of int): Positional indices of the samples in each set.
    """
    # The indices are positions, so select with iloc. Label-based `y[train]` only
    # gives the same rows while y has a default 0..n-1 index.
    labels = pd.Series(y)

    train_distribution = Counter(labels.iloc[train])
    val_distribution = Counter(labels.iloc[val])
    test_distribution = Counter(labels.iloc[test])

    print("Training set class distribution:", train_distribution)
    print("Total traing set samples:", sum(train_distribution.values()))
    print("Validation set class distribution:", val_distribution)
    print("Total validation set samples:", sum(val_distribution.values()))
    print("Test set class distribution:", test_distribution)
    print("Total test set samples:", sum(test_distribution.values()))

# Run both checks on the split computed in this cell
check_group_integrity(groups, train_idx, val_idx, test_idx)
check_class_distribution(y, train_idx, val_idx, test_idx)

# Show the raw row indices of each set
for split_indices in (train_idx, val_idx, test_idx):
    print(split_indices)

# Report every row index that was placed in more than one set
combined_indices = pd.Series([*train_idx, *val_idx, *test_idx])
occurrences = combined_indices.value_counts()
for key, value in occurrences[occurrences > 1].items():
    print(f"Patient {key} has {value} samples in total.")

Group integrity maintained. No overlap between groups in different sets.
Training set class distribution: Counter({1: 204, 2: 59, 4: 33, 3: 31, 5: 26, 0: 23})
Total traing set samples: 376
Validation set class distribution: Counter({1: 25, 2: 7, 4: 5, 3: 4, 5: 3, 0: 3})
Total validation set samples: 47
Test set class distribution: Counter({1: 26, 2: 8, 4: 4, 3: 4, 5: 3, 0: 2})
Total test set samples: 47
[  0   1   2   3   4   5   6   7   8   9  12  13  14  15  16  17  18  19
  20  21  22  24  25  27  28  29  30  31  32  33  34  35  36  37  38  39
  40  43  44  46  47  49  50  51  53  54  55  56  57  58  61  62  63  64
  65  66  68  69  70  71  72  73  74  76  77  80  81  82  83  84  86  88
  90  92  93  94  95  96  97  98 102 103 104 105 106 107 109 110 111 112
 113 114 115 116 117 118 119 120 124 125 126 127 128 129 132 133 134 135
 136 137 138 139 140 141 142 143 145 146 148 149 150 151 153 156 157 158
 160 161 162 163 164 167 168 169 171 172 173 174 176 177 178 180 181 183
 184 185 

In [14]:
import pandas as pd
from collections import Counter
from sklearn.model_selection import StratifiedGroupKFold, KFold
from sklearn.metrics import accuracy_score  # Replace with the appropriate metric

# The class label is the stratification target and also stands in as the feature
# column; only the resulting row indices are of interest in this cell.
X = training_patients['primary_compressed']
y = training_patients['primary_compressed']
groups = training_patients['participant_id']

sgkf = StratifiedGroupKFold(n_splits=5)

# Outer loop: each of the five folds is held out once
for train_idx, temp_idx in sgkf.split(X, y, groups):

    # Halve the held-out fold into validation and test (first of the two inner folds)
    X_temp, y_temp, groups_temp = X.iloc[temp_idx], y.iloc[temp_idx], groups.iloc[temp_idx]
    sgkf_temp = StratifiedGroupKFold(n_splits=2)
    val_idx_, test_idx_ = next(sgkf_temp.split(X_temp, y_temp, groups_temp))
    val_idx, test_idx = temp_idx[val_idx_], temp_idx[test_idx_]

    # Materialise the three subsets of this outer fold
    X_train, y_train, groups_train = X.iloc[train_idx], y.iloc[train_idx], groups.iloc[train_idx]
    X_val, y_val, groups_val = X.iloc[val_idx], y.iloc[val_idx], groups.iloc[val_idx]
    X_test, y_test, groups_test = X.iloc[test_idx], y.iloc[test_idx], groups.iloc[test_idx]

    # Inner loop: plain shuffled 5-fold cross-validation inside the training subset
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    for fold_number, (cv_train_idx, cv_val_idx) in enumerate(kf.split(X_train, y_train), start=1):
        X_cv_train, y_cv_train = X_train.iloc[cv_train_idx], y_train.iloc[cv_train_idx]
        X_cv_val, y_cv_val = X_train.iloc[cv_val_idx], y_train.iloc[cv_val_idx]

        print(f"Fold {fold_number}:")
        print(f"X_cv_train: {X_cv_train.shape}, y_cv_train: {y_cv_train.shape}")

        # Placeholder for the actual training, e.g.:
        #   model.fit(X_cv_train, y_cv_train)
        #   y_cv_pred = model.predict(X_cv_val)
        #   accuracy = accuracy_score(y_cv_val, y_cv_pred)
        #   print(f"Fold {fold_number} Accuracy: {accuracy}")

    # Step 1: groups must not be shared between the three subsets
    check_group_integrity(groups, train_idx, val_idx, test_idx)

    # Step 2: class counts per subset
    check_class_distribution(y, train_idx, val_idx, test_idx)

# Placeholder for the final evaluation after cross-validation, e.g.:
#   final_model.fit(X_train, y_train)
#   y_test_pred = final_model.predict(X_test)
#   final_accuracy = accuracy_score(y_test, y_test_pred)
#   print(f"Final Test Accuracy: {final_accuracy}")


Fold 1:
X_cv_train: (300,), y_cv_train: (300,)
Fold 2:
X_cv_train: (301,), y_cv_train: (301,)
Fold 3:
X_cv_train: (301,), y_cv_train: (301,)
Fold 4:
X_cv_train: (301,), y_cv_train: (301,)
Fold 5:
X_cv_train: (301,), y_cv_train: (301,)
Group integrity maintained. No overlap between groups in different sets.
Training set class distribution: Counter({1: 204, 2: 59, 4: 34, 3: 31, 5: 26, 0: 22})
Total traing set samples: 376
Validation set class distribution: Counter({1: 26, 2: 7, 4: 4, 3: 4, 5: 3, 0: 3})
Total validation set samples: 47
Test set class distribution: Counter({1: 25, 2: 8, 4: 4, 3: 4, 5: 3, 0: 3})
Total test set samples: 47
Fold 1:
X_cv_train: (300,), y_cv_train: (300,)
Fold 2:
X_cv_train: (301,), y_cv_train: (301,)
Fold 3:
X_cv_train: (301,), y_cv_train: (301,)
Fold 4:
X_cv_train: (301,), y_cv_train: (301,)
Fold 5:
X_cv_train: (301,), y_cv_train: (301,)
Group integrity maintained. No overlap between groups in different sets.
Training set class distribution: Counter({1: 204, 

In [16]:
def check_distribution(df, label):
    """
    Print the class shares, the row count and the summed lesion count of a subset.

    Args:
    df (DataFrame): Subset with the columns 'primary_compressed' and 'lesion'.
    label (str): Name of the subset, used as prefix of every printed line.
    """
    class_shares = df['primary_compressed'].value_counts(normalize=True)
    total_lesions = df['lesion'].sum()

    report_lines = (
        f"{label} - Lesion Type Distribution:\n{class_shares}\n",
        f"{label} - Total Patient Count: {len(df)}",
        f"{label} - Total Lesion Count: {total_lesions}\n",
    )
    for line in report_lines:
        print(line)

# # Checking distributions
# check_distribution(train_df, "Training Set")
# check_distribution(val_df, "Validation Set")
# check_distribution(test_df, "Testing Set")

# print(len(list(train_df["participant_id"])))

## The code below should work

In [17]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit

df = training_patients

# Step 1: hold out a stratified test set (10% of the rows)
stratified_split_test = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)

X = df['participant_id']
y = df['primary_compressed']

# n_splits=1, so the splitter yields exactly one (remaining, test) index pair
temp_index, test_index = next(stratified_split_test.split(X, y))
temp_df = df.iloc[temp_index]
test_df = df.iloc[test_index]

# Step 2: stratified 10-fold cross-validation on the remaining rows (temp_df)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

X_temp = temp_df['participant_id']
y_temp = temp_df['primary_compressed']

# One (train_df, val_df) pair per fold; test_df is never part of a fold
folds = []
for train_index, val_index in skf.split(X_temp, y_temp):
    train_df, val_df = temp_df.iloc[train_index], temp_df.iloc[val_index]
    folds.append((train_df, val_df))

# Print the distribution report of every fold
for fold_number, (fold_train_df, fold_val_df) in enumerate(folds, start=1):
    print(f"Fold {fold_number}:")
    check_distribution(fold_train_df, "Training Set")
    check_distribution(fold_val_df, "Validation Set")

Fold 1:
Training Set - Lesion Type Distribution:
primary_compressed
1    0.542105
2    0.157895
4    0.089474
3    0.081579
5    0.071053
0    0.057895
Name: proportion, dtype: float64

Training Set - Total Patient Count: 380
Training Set - Total Lesion Count: 1537

Validation Set - Lesion Type Distribution:
primary_compressed
1    0.534884
2    0.162791
3    0.093023
4    0.093023
0    0.069767
5    0.046512
Name: proportion, dtype: float64

Validation Set - Total Patient Count: 43
Validation Set - Total Lesion Count: 131

Fold 2:
Training Set - Lesion Type Distribution:
primary_compressed
1    0.542105
2    0.157895
4    0.089474
3    0.084211
5    0.068421
0    0.057895
Name: proportion, dtype: float64

Training Set - Total Patient Count: 380
Training Set - Total Lesion Count: 1436

Validation Set - Lesion Type Distribution:
primary_compressed
1    0.534884
2    0.162791
4    0.093023
5    0.069767
0    0.069767
3    0.069767
Name: proportion, dtype: float64

Validation Set - Total 

To-do:
export .txt file of patient IDs\
use these for training

## Export patient IDs as .txt files

In [None]:
def _write_patient_ids(file_path, patient_ids_to_write):
    """Write one patient id per line to file_path, replacing an existing file."""
    # the context manager closes the file even when writing fails
    with open(file_path, "w") as id_file:
        id_file.writelines(f"{patient_id}\n" for patient_id in patient_ids_to_write)


# The split files are stored next to the TFRecord folder, i.e. in
# <...>/regensburg_slices_tfrecords/split_text_files
path_to_txt_folder = path_to_tfr.parent / "split_text_files"

os.makedirs(path_to_txt_folder, exist_ok=True)

# save pat ids for testing
path_to_test_ids = path_to_txt_folder / "test_ids.txt"
test_ids = test_df["participant_id"]
print(f"Test IDs amount: {len(test_ids)}")
_write_patient_ids(path_to_test_ids, test_ids)

# save pat ids for training and validation of every fold; file names count from 0
for fold_index, (fold_train_df, fold_val_df) in enumerate(folds):
    path_to_train_ids = path_to_txt_folder / f"fold_{fold_index}_train_ids.txt"
    path_to_val_ids = path_to_txt_folder / f"fold_{fold_index}_val_ids.txt"

    train_ids = fold_train_df["participant_id"]
    val_ids = fold_val_df["participant_id"]

    print(f"Train IDs amount for fold {fold_index}: {len(train_ids)} | Val IDs amount: {len(val_ids)}")

    _write_patient_ids(path_to_train_ids, train_ids)
    _write_patient_ids(path_to_val_ids, val_ids)

Test IDs amount: 47
Train IDs amount for fold 0: 380 | Val IDs amount: 43
Train IDs amount for fold 1: 380 | Val IDs amount: 43
Train IDs amount for fold 2: 380 | Val IDs amount: 43
Train IDs amount for fold 3: 381 | Val IDs amount: 42
Train IDs amount for fold 4: 381 | Val IDs amount: 42
Train IDs amount for fold 5: 381 | Val IDs amount: 42
Train IDs amount for fold 6: 381 | Val IDs amount: 42
Train IDs amount for fold 7: 381 | Val IDs amount: 42
Train IDs amount for fold 8: 381 | Val IDs amount: 42
Train IDs amount for fold 9: 381 | Val IDs amount: 42
