# Creation of 10 shuffles of 5-fold splits 

Identify complete samples, which have all input data types and risk labels

Divide complete samples into 5 roughly equal portions (about 133–135 samples each in the runs shown below)

Stratified by risk label 

Assign remaining samples to training data

Example output: 

`data/splits/1/train_1.txt`

`data/splits/8/valid_5.txt`

Data types considered:

FHR risk labels, gene expression, broad CN, mutations, canonical IG translocations, segment CN (**no gene fusion**)

In [76]:
import os
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.model_selection import RepeatedStratifiedKFold,train_test_split

In [2]:
# Load the annotation table, keeping only the sample ID and its risk label.
labels = pd.read_csv(
    '/home/jiageng/Documents/fhr/annotations/fhr-annotations.2Mar25.tsv',
    sep='\t',
)[['PUBLIC_ID', 'risk']]
# IDs whose risk is not -1 (presumably -1 marks an unlabelled sample; confirm against the annotation file).
public_ids_labels = labels.loc[labels['risk'].ne(-1), 'PUBLIC_ID']
print(public_ids_labels.shape[0])

880


In [3]:
# Sample IDs listed in the gene-expression matrix.
public_ids_gene_exp = pd.read_csv(
    '/home/jiageng/Documents/fhr/matrices/gene_exp_matrix_k20.tsv',
    sep='\t',
).loc[:, 'PUBLIC_ID']
print(public_ids_gene_exp.size)

806


In [4]:
# Sample IDs listed in the broad copy-number matrix.
public_ids_broad_cn = pd.read_csv(
    '/home/jiageng/Documents/fhr/matrices/broad_cn_matrix.tsv',
    sep='\t',
).loc[:, 'PUBLIC_ID']
print(public_ids_broad_cn.size)

924


In [5]:
# Sample IDs listed in the gene-mutation matrix.
public_ids_mut = pd.read_csv(
    '/home/jiageng/Documents/fhr/matrices/gene_mut_matrix_fdr.tsv',
    sep='\t',
).loc[:, 'PUBLIC_ID']
print(public_ids_mut.size)

974


In [6]:
# Sample IDs for the canonical IG translocation table, taken from the row index
# rather than from a 'PUBLIC_ID' column.
# NOTE(review): presumably the file has an unnamed leading ID column that pandas
# promotes to the index; otherwise these would be 0..n-1 row numbers. Confirm.
canonical_ig_index = pd.read_csv(
    '/home/jiageng/Documents/fhr/matrices/canonical_ig_translocations.tsv',
    sep='\t',
).index
public_ids_canonical_ig = canonical_ig_index.tolist()
print(len(public_ids_canonical_ig))

906


In [7]:
# Sample IDs listed in the gene-fusion matrix (loaded for reference; the set
# operations further down leave gene fusion commented out).
public_ids_gene_fusion = pd.read_csv(
    '/home/jiageng/Documents/fhr/matrices/gene_fusion_matrix.tsv',
    sep='\t',
).loc[:, 'PUBLIC_ID']
print(public_ids_gene_fusion.size)

798


In [8]:
# Sample IDs for the segment copy-number matrix, taken from the row index
# rather than from a 'PUBLIC_ID' column.
# NOTE(review): presumably the file has an unnamed leading ID column that pandas
# promotes to the index; otherwise these would be 0..n-1 row numbers. Confirm.
cn_segment_index = pd.read_csv(
    '/home/jiageng/Documents/fhr/matrices/segment_cn_matrix_uncorrelated.tsv',
    sep='\t',
).index
public_ids_cn_segment = cn_segment_index.tolist()
print(len(public_ids_cn_segment))

924


In [9]:
# Every sample ID that appears in the annotations or in any considered data type.
# Gene fusion is deliberately left out: the total is 1170 with or without it.
public_ids_all = set().union(
    labels.PUBLIC_ID,
    public_ids_gene_exp,
    public_ids_broad_cn,
    public_ids_mut,
    public_ids_canonical_ig,
    public_ids_cn_segment,
)
print(len(public_ids_all))

1170


In [10]:
# "Complete" samples: a usable risk label plus a row in every considered data type.
# Gene fusion is deliberately left out: 668 complete samples without it, 664 with it.
public_ids_common = (
    set(public_ids_labels)
    & set(public_ids_gene_exp)
    & set(public_ids_broad_cn)
    & set(public_ids_mut)
    & set(public_ids_canonical_ig)
    & set(public_ids_cn_segment)
)
print(len(public_ids_common))

668


In [11]:
# One label row per known sample ID; IDs absent from the annotations get risk = -1.
#
# `public_ids_all` is a set, and string hashes are randomised per Python process,
# so iterating it directly yields a different row order after every kernel
# restart. That row order is what the seeded splitters below operate on, so the
# IDs are sorted first to make the resulting folds reproducible.
# key=str keeps the sort well-defined even if an ID source holds non-string values.
labels_all = (
    labels
    .set_index('PUBLIC_ID')
    .reindex(sorted(public_ids_all, key=str))
    .fillna(-1)
    .astype(int)
    .reset_index('PUBLIC_ID')
)

## Option 1 - a dataset with complete and incomplete samples

In [29]:
# Results table: one row per known sample, complete or not.
labels_ann = labels_all.copy()
# Temporary table of complete samples only, re-indexed 0..n-1 so the positional
# indices returned by the splitter can be used directly with .loc.
labels_valid = (
    labels_all
    .loc[labels_all['PUBLIC_ID'].isin(public_ids_common)]
    .reset_index(drop=True)
)

In [47]:
# 10 repeats x 5 folds = 50 splits of the complete samples, stratified by risk.
# Each split adds a boolean column '<shuffle>_<fold>' to labels_ann that is True
# for the samples held out in that fold and False for every other row.
splitter = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 10, random_state=42)
fold_splits = splitter.split(labels_valid[['risk']], labels_valid['risk'])
for i, (_, test_ind) in enumerate(fold_splits):
    column = f'{i//5 + 1}_{i%5 + 1}'
    public_ids_valid = labels_valid.loc[test_ind, 'PUBLIC_ID']
    labels_ann[column] = labels_ann['PUBLIC_ID'].isin(public_ids_valid)

In [52]:
# Sanity check on shuffle 6: count rows per combination of its five fold flags.
labels_ann.groupby([f'6_{fold_no}' for fold_no in range(1, 6)]).size()

6_1    6_2    6_3    6_4    6_5  
False  False  False  False  False    497
                            True     134
                     True   False    134
              True   False  False    135
       True   False  False  False    135
True   False  False  False  False    135
dtype: int64

Write public ids to text files

In [53]:
# For every shuffle/fold, write the held-out IDs to valid_<fold>.txt and all
# remaining rows of labels_ann to train_<fold>.txt (one ID per line, with a
# 'PUBLIC_ID' header).
# NOTE: the Option 2 and Option 3 cells write to these same paths, so whichever
# section runs last determines the files on disk.
for shuffle in range(1,11):
    # to_csv does not create missing directories, so make sure the target exists.
    os.makedirs(f'../data/splits/{shuffle}', exist_ok=True)
    for fold in range(1,6):
        held_out = labels_ann[f'{shuffle}_{fold}']
        labels_ann.loc[held_out, 'PUBLIC_ID'].to_csv(f'../data/splits/{shuffle}/valid_{fold}.txt',index=False)
        labels_ann.loc[~held_out, 'PUBLIC_ID'].to_csv(f'../data/splits/{shuffle}/train_{fold}.txt',index=False)

In [72]:
# Risk-label distribution inside (True) and outside (False) the held-out part of shuffle 1, fold 1.
labels_ann.groupby('1_1')['risk'].value_counts()

1_1    risk
False   0      451
       -1      286
        1      221
        2       77
True    0       77
        1       42
        2       16
Name: count, dtype: int64

In [75]:
# Risk-label distribution inside (True) and outside (False) the held-out part of shuffle 7, fold 4.
labels_ann.groupby('7_4')['risk'].value_counts()

7_4    risk
False   0      452
       -1      286
        1      221
        2       77
True    0       76
        1       42
        2       16
Name: count, dtype: int64

## Option 2 - a dataset with only complete samples

In [30]:
# Reference table: complete samples only, re-indexed 0..n-1 so the positional
# indices returned by the splitter can be used directly with .loc.
labels_valid = (
    labels_all
    .loc[labels_all['PUBLIC_ID'].isin(public_ids_common)]
    .reset_index(drop=True)
)
# Results table: starts as a copy of the reference and receives the fold flags.
labels_ann = labels_valid.copy()

In [31]:
# 10 repeats x 5 folds = 50 splits of the complete samples, stratified by risk.
# Each split adds a boolean column '<shuffle>_<fold>' to labels_ann that is True
# for the samples held out in that fold and False for every other row.
splitter = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 10, random_state=42)
fold_splits = splitter.split(labels_valid[['risk']], labels_valid['risk'])
for i, (_, test_ind) in enumerate(fold_splits):
    column = f'{i//5 + 1}_{i%5 + 1}'
    public_ids_valid = labels_valid.loc[test_ind, 'PUBLIC_ID']
    labels_ann[column] = labels_ann['PUBLIC_ID'].isin(public_ids_valid)

In [32]:
# Sanity check on shuffle 6: count rows per combination of its five fold flags.
labels_ann.groupby([f'6_{fold_no}' for fold_no in range(1, 6)]).size()

6_1    6_2    6_3    6_4    6_5  
False  False  False  False  True     133
                     True   False    133
              True   False  False    134
       True   False  False  False    134
True   False  False  False  False    134
dtype: int64

Write public ids to text files

In [33]:
# For every shuffle/fold, write the held-out IDs to valid_<fold>.txt and the
# remaining complete samples to train_<fold>.txt (one ID per line, with a
# 'PUBLIC_ID' header).
# NOTE: the Option 1 and Option 3 cells write to these same paths, so whichever
# section runs last determines the files on disk.
for shuffle in range(1,11):
    # to_csv does not create missing directories, so make sure the target exists.
    os.makedirs(f'../data/splits/{shuffle}', exist_ok=True)
    for fold in range(1,6):
        held_out = labels_ann[f'{shuffle}_{fold}']
        labels_ann.loc[held_out, 'PUBLIC_ID'].to_csv(f'../data/splits/{shuffle}/valid_{fold}.txt',index=False)
        labels_ann.loc[~held_out, 'PUBLIC_ID'].to_csv(f'../data/splits/{shuffle}/train_{fold}.txt',index=False)

Check the label distribution

In [34]:
# Risk-label distribution inside (True) and outside (False) the held-out part of shuffle 1, fold 1.
labels_ann.groupby('1_1')['risk'].value_counts()

1_1    risk
False  0       324
       1       157
       2        53
True   0        81
       1        40
       2        13
Name: count, dtype: int64

In [35]:
# Risk-label distribution inside (True) and outside (False) the held-out part of shuffle 2, fold 4.
labels_ann.groupby('2_4')['risk'].value_counts()

2_4    risk
False  0       324
       1       158
       2        53
True   0        81
       1        39
       2        13
Name: count, dtype: int64

## Option 3 - train/valid/test split

In [22]:
# Complete samples only (label plus every considered data type), re-indexed 0..n-1.
labels_full = (
    labels_all
    .loc[labels_all['PUBLIC_ID'].isin(public_ids_common)]
    .reset_index(drop=True)
)

In [63]:
# Hold out a fixed, risk-stratified test set of 150 complete samples;
# everything else becomes the development set used for the train/valid folds.
labels_dev, labels_test = train_test_split(
    labels_full,
    stratify=labels_full['risk'],
    test_size=150,
    random_state=42950342,
)

In [80]:
# Persist the held-out test set and the development set (all columns of each
# table, no row index).
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('../data/splits', exist_ok=True)
labels_test.to_csv('../data/splits/test.txt',index=False)
labels_dev.to_csv('../data/splits/dev.txt',index=False)

In [67]:
# Take the first of the repeated stratified splits of the development set, to
# inspect its class balance in the next cell.
# The splitter is defined here so this cell does not rely on one left over from
# another section of the notebook.
splitter = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 10, random_state=42)
# split(X, y): the 1-D risk Series goes in as y (the stratification target);
# passing the 2-D frame as y triggers a column-vector conversion warning.
train_ind, valid_ind = next(splitter.split(labels_dev[['risk']], labels_dev['risk']))

In [69]:
# Risk-label counts in the training part of that first development split.
labels_dev['risk'].iloc[train_ind].value_counts()

risk
0    251
1    122
2     41
Name: count, dtype: int64

In [81]:
# Write risk-stratified train/valid tables for 10 shuffles x 5 folds of the
# development set. Each file holds all columns of labels_dev for its rows.
splitter = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 10, random_state=42)
dev_splits = splitter.split(labels_dev[['risk']], labels_dev['risk'])

for i, (train_ind, valid_ind) in enumerate(dev_splits):
    # Splits arrive shuffle by shuffle, five folds each; both numbers are 1-based.
    shuffle, fold = i//5 + 1, i%5 + 1
    os.makedirs(f'../data/splits/{shuffle}', exist_ok=True)
    # The splitter yields positional indices, hence .iloc.
    train_labels = labels_dev.iloc[train_ind]
    train_labels.to_csv(f'../data/splits/{shuffle}/train_{fold}.txt',index=False)
    valid_labels = labels_dev.iloc[valid_ind]
    valid_labels.to_csv(f'../data/splits/{shuffle}/valid_{fold}.txt',index=False)