## Generate tissue_position_list

In [6]:
import os 
from glob import glob
import pandas as pd

meta = pd.read_excel('/data/temp/spatial/TRIPLEX/data/test/DRP2/Yale_trastuzumab_response_cohort_metadata_clean.xlsx')
cases = meta['Patient'].astype('str').to_list()

data_path = "/data/temp/spatial/TRIPLEX/data/test/DRP2"
pos_path = f"{data_path}/ST-spotfiles"
os.makedirs(pos_path, exist_ok=True)

samples = glob("./tiles/*")
for sample in cases:
    files = os.listdir(f"./tiles/{sample}/unannotated")
    
    position_abs = []
    for file in files:
        _,_,x,y,_,_,_ = file.split("_")
        pos = int(x[:-1]), int(y[:-1])
        position_abs.append(pos)
        
    position_abs = pd.DataFrame(position_abs)
    position_abs.columns = ["pixel_x","pixel_y"]
    
    position_rel = (position_abs / 200).astype('int')
    position_rel.columns = ["x","y"]
    
    position = pd.merge(position_rel, position_abs, left_index=True, right_index=True)
    sname = sample.split('/')[-1]
    position.to_csv(f"{pos_path}/{sname}_selection.tsv", sep='\t', index=None)
    

In [11]:
from tqdm import tqdm
from glob import glob
import pickle

import pyvips as pv

result = {}
for path in tqdm(glob(f"{data_path}/ST-imgs/*")):
    im = pv.Image.new_from_file(path, level=0)
    
    sname = path.split('/')[-1].split('.')[0]
    result[sname] = im.numpy().shape[:2]
    
with open(file=f'{data_path}/slide_shape.pickle', mode='wb') as f:
    pickle.dump(result, f)

  0%|          | 0/85 [00:00<?, ?it/s]

100%|██████████| 85/85 [10:19<00:00,  7.29s/it]


## Preprocess label

### Trastuzumab

In [1]:
import pandas as pd

suppl = pd.read_excel('/data/temp/spatial/TRIPLEX/data/test/DRP2/Yale_trastuzumab_response_cohort_metadata_clean.xlsx')
suppl
# suppl["NAT.regimen"].unique()

Unnamed: 0,Patient,ER,PR,HER2 IHC,HER2,resid inv size (cm),HER2:CEP17 ratio,Her2 CN (signals/cell),CEP17,Responder,Response
0,O09-03495,0,0,2,amp,2.5,,,,nonresponder,positive
1,O10-12717,95,40,2,amp,2.5,3.8,,,nonresponder,positive
2,O14-02301,1,1,2,amp,2,2.8,11.3,4,nonresponder,positive
3,O16-11870,10,1,3,amp,no resid,5.8,18.9,3.3,responder,positive
4,O16-18464,90,90,3,amp,1.2,5,13.3,2.7,nonresponder,positive
...,...,...,...,...,...,...,...,...,...,...,...
80,S18-31022,0,0,2,amp,1.1,4,10.7,2.7,responder,positive
81,S18-32412,0,0,3,,no resid,,,,nonresponder,positive
82,S13-07627,0,0,3,amp,0.1,5.6,,,responder,positive
83,S13-08586,90,20,2,amp,no resid,,,,nonresponder,positive


In [2]:
suppl["NAT.regimen"].str.contains("T-FEC").sum()
suppl["NAT.regimen"].str.contains("FEC-T").sum()
suppl["NAT.regimen"].str.contains("FEC").sum()

KeyError: 'NAT.regimen'

In [5]:
suppl["label"] = suppl["Responder"].map({'responder':1, 'nonresponder':0})

In [3]:
# suppl = suppl[suppl["NAT.regimen"].str.contains("Trastuzumab")][["Donor.ID", "RCB.category"]]
suppl = suppl[suppl["NAT.regimen"].str.contains("T-FEC") | suppl["NAT.regimen"].str.contains("FEC-T")][["Donor.ID", "RCB.category"]]
# suppl = suppl[suppl["NAT.regimen"].str.contains("FEC")][["Donor.ID", "RCB.category"]]
suppl["label"] = suppl["RCB.category"].map({'pCR':1, 'RCB-I':1, 'RCB-II':0, "RCB-III":0})
suppl
# {'pCR':1, 'RCB-I':1, 'RCB-II':0, "RCB-III":0}

KeyError: 'NAT.regimen'

In [35]:
meta = pd.read_excel("data/test/DRP1/slide_metadata.xlsx")

In [36]:
labs = pd.merge(meta, suppl, left_on='Donor.ID', right_on='Donor.ID')

In [38]:
labs = labs[~labs["RCB.category"].isna()]

In [7]:
suppl

Unnamed: 0,Patient,ER,PR,HER2 IHC,HER2,resid inv size (cm),HER2:CEP17 ratio,Her2 CN (signals/cell),CEP17,Responder,Response,label
0,O09-03495,0,0,2,amp,2.5,,,,nonresponder,positive,0
1,O10-12717,95,40,2,amp,2.5,3.8,,,nonresponder,positive,0
2,O14-02301,1,1,2,amp,2,2.8,11.3,4,nonresponder,positive,0
3,O16-11870,10,1,3,amp,no resid,5.8,18.9,3.3,responder,positive,1
4,O16-18464,90,90,3,amp,1.2,5,13.3,2.7,nonresponder,positive,0
...,...,...,...,...,...,...,...,...,...,...,...,...
80,S18-31022,0,0,2,amp,1.1,4,10.7,2.7,responder,positive,1
81,S18-32412,0,0,3,,no resid,,,,nonresponder,positive,0
82,S13-07627,0,0,3,amp,0.1,5.6,,,responder,positive,1
83,S13-08586,90,20,2,amp,no resid,,,,nonresponder,positive,0


In [6]:
X = suppl["Patient"]
y = suppl["label"]

In [8]:
import os
from sklearn.model_selection import StratifiedKFold

kf = StratifiedKFold(n_splits=10, random_state=41412, shuffle=True)

data_dir = "/home/chungym/project/TransMIL/dataset_csv/drp2"
os.makedirs(data_dir, exist_ok=True)

for i, (train_index, test_index) in enumerate(kf.split(X,y)):
    df_train = pd.concat((X.iloc[train_index], y.iloc[train_index]), axis=1)
    df_test = pd.concat((X.iloc[test_index], y.iloc[test_index]), axis=1)
    
    df = pd.concat((df_train.reset_index(drop=True), df_test.reset_index(drop=True)), axis=1)
    df.columns = ['train', 'train_label','val', 'val_label']
    df.to_csv(f"{data_dir}/fold{i}.csv")