In [1]:
import pandas as pd
import os

# Read the combined label sheet (one row per De-ID, with the task labels,
# cohort tag and patient id as columns) and preview the first rows.
labels_xlsx = "../../src/combined_labels_with_patient_id.xlsx"
df = pd.read_excel(labels_xlsx)
df.head()


Unnamed: 0,ID,Binary TNM Stage,Binary Stage T,3-class Stage T,Binary Stage N,Stage M,MSI,KRAS,NRAS,BRAF,Vital Status,Overall Survival (months),Cohort,De-ID,died_within_5_years,Patient ID
0,SL-1,1.0,1.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,Dead,32.0,RIH,SL-1,1.0,SL-1
1,SL-2,0.0,1.0,1.0,0.0,0.0,,,,,Dead,50.0,RIH,SL-2,1.0,SL-2
2,SL-3,0.0,1.0,2.0,0.0,0.0,0.0,,,,Alive,59.0,RIH,SL-3,0.0,SL-3
3,SL-4,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,Alive,57.0,RIH,SL-4,0.0,SL-4
4,SL-5,1.0,1.0,1.0,1.0,0.0,1.0,,,,Alive,56.0,RIH,SL-5,0.0,SL-5


In [2]:
# Keep only the rows whose 'Cohort' value starts with "SR" (the subset the
# SurGen_* fold files below are built from).
# na=False makes a missing Cohort count as "not SR"; without it a NaN in the
# column propagates into the mask and boolean indexing raises ValueError.
surgen_df = df[df['Cohort'].str.startswith('SR', na=False)]

In [3]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split

# Cohort tag for this run (one of: SR, TCGA, ALL).
# NOTE(review): this name is not read anywhere in this cell; the "SR" filter
# itself is applied where `surgen_df` is built. Kept in case other cells use it.
training_corhort = "SR" # SR, TCGA, ALL

# Label columns for which a separate fold file is written.
TASKS = [
    "Binary TNM Stage",
    "Binary Stage T",
    "3-class Stage T",
    "Binary Stage N",
    "Stage M",
    "MSI",
    "KRAS",
    "NRAS",
    "BRAF",
    "died_within_5_years",
]

# Split parameters, named once so they can be tuned in a single place.
HOLDOUT_FRACTION = 0.2   # share of labelled rows kept out of every fold
N_FOLDS = 5              # number of stratified folds over the remaining rows
SPLIT_SEED = 42          # seed for both the hold-out split and the k-fold shuffle

for task in TASKS:
    # Work on an independent frame holding only the rows labelled for this task.
    new_df = surgen_df.dropna(subset=[task]).copy()

    # Class balance of the task before splitting.
    print(f"Value counts for {task}:")
    print(new_df[task].value_counts())

    # Stratified hold-out: rows that land in the second part are never given a
    # fold number and therefore keep k_fold == -1 in the exported file.
    train_part, _holdout = train_test_split(
        new_df,
        test_size=HOLDOUT_FRACTION,
        random_state=SPLIT_SEED,
        stratify=new_df[task],
    )

    # 1) Every row starts as "not in any fold".
    new_df["k_fold"] = -1

    # 2) Pick the training rows by index label rather than by matching De-ID
    #    strings: a repeated De-ID would otherwise drag held-out rows into the
    #    folds. The boolean mask keeps new_df's row order, so the shuffled
    #    k-fold below sees the rows in the same order as before.
    in_train = new_df.index.isin(train_part.index)
    df_train = new_df[in_train]

    # 3) Stratified K-Fold over the training rows only. split() yields
    #    positions within df_train; df_train.index maps them back to the
    #    labels of new_df so the fold number can be written with .loc.
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SPLIT_SEED)
    for fold, (_, val_pos) in enumerate(skf.split(df_train, df_train[task].values)):
        new_df.loc[df_train.index[val_pos], "k_fold"] = fold

    # 4) Inspect results (-1 is the hold-out group).
    print(f"K-Fold distribution for {task}:")
    print(new_df["k_fold"].value_counts())
    print("\n")

    # 5) Export only the identifiers, the cohort, this task's label and the fold.
    new_df = new_df[["De-ID", "Patient ID", "Cohort", task, "k_fold"]]
    new_df.to_csv(f'../../src/SurGen_{task}_k_fold.csv', index=False)

Value counts for Binary TNM Stage:
Binary TNM Stage
1.0    418
0.0    296
Name: count, dtype: int64
K-Fold distribution for Binary TNM Stage:
k_fold
-1    143
 0    115
 2    114
 4    114
 3    114
 1    114
Name: count, dtype: int64


Value counts for Binary Stage T:
Binary Stage T
1.0    611
0.0    102
Name: count, dtype: int64
K-Fold distribution for Binary Stage T:
k_fold
-1    143
 1    114
 0    114
 4    114
 3    114
 2    114
Name: count, dtype: int64


Value counts for 3-class Stage T:
3-class Stage T
1.0    351
2.0    260
0.0    102
Name: count, dtype: int64
K-Fold distribution for 3-class Stage T:
k_fold
-1    143
 1    114
 3    114
 0    114
 4    114
 2    114
Name: count, dtype: int64


Value counts for Binary Stage N:
Binary Stage N
1.0    400
0.0    304
Name: count, dtype: int64
K-Fold distribution for Binary Stage N:
k_fold
-1    141
 0    113
 1    113
 2    113
 4    112
 3    112
Name: count, dtype: int64


Value counts for Stage M:
Stage M
0.0    626
1.0     88
