In [1]:
import openslide as ops
from glob import glob
import os
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import cv2
import anndata as ad
import pandas as pd
import h5py
import json
from tqdm import tqdm
def create_dir(path):
    """Create directory ``path`` (and any missing parents) if it is not there yet.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` already exists but is not a directory.
        OSError: If the directory cannot be created.
    """
    # exist_ok=True avoids the check-then-create race of a separate
    # os.path.exists() test followed by os.makedirs().
    os.makedirs(path, exist_ok=True)

In [2]:
# Integer class id -> cell class name. The id is simply the position of the
# name in this sequence, starting at 0.
class_list = dict(enumerate((
    "epithelial",
    "Basal/Myoepithelial",
    "Smooth muscle",
    "Fibroblast",
    "Endothelial",
    "Lymphocyte",                # T and B cells merged
    "Plasma cell",
    "Macrophage/Histiocyte",     # merged
    "Neutrophil",
    "Adipocyte",
    "Other/Unknown",
)))
# Marker gene symbols for each cell class, keyed by the same class names that
# appear as values of class_list.
# Gene symbols are bytes literals (b"...") rather than str -- presumably so they
# compare equal to byte-string gene names read from HDF5; TODO confirm.
# Some symbols are listed under more than one class (KRT5/KRT14 under both
# "epithelial" and "Basal/Myoepithelial", ACTA2 under "Basal/Myoepithelial" and
# "Smooth muscle", CEACAM8 under "epithelial" and "Neutrophil"), so a gene does
# not identify a single class on its own.
marker_genes = {
    "epithelial": [
        b"EPCAM", b"KRT5", b"KRT14", b"KRT23",
        b"ERBB2", b"MKI67", b"GATA3", b"KRT6B",
        b"CEACAM6", b"CCND1", b"TFAP2A", b"ANKRD30A",
        b"CEACAM8", b"CXCL5", b"CDH1", b"KRT7",
        b"DSP", b"MUC6", b"JUP", b"SCGB2A1"
    ],

    "Basal/Myoepithelial": [
        b"KRT5", b"KRT14", b"ACTA2"
    ],

    "Smooth muscle": [
        b"MYH11", b"ACTA2", b"MYLK"
    ],

    "Fibroblast": [
        b"PDGFRA", b"PDGFRB", b"DPT", b"LUM",
        b"SFRP1", b"FBLN1", b"SFRP4", b"POSTN"
    ],

    "Endothelial": [
        b"PECAM1", b"KDR", b"CD93", b"EGFL7",
        b"VWF", b"CLEC14A", b"MMRN2", b"ESM1"
    ],

    "Lymphocyte": [  # T and B cells merged into one class
        b"CD3E", b"CD3G",
        b"GZMA", b"GZMK", b"NKG7", b"CCL5",
        b"TRAC", b"TCF7", b"LTB", b"IL2RG",
        b"CD4", b"CD8A",
        b"CD79A", b"CD79B", b"MS4A1", b"CD19",
        b"CD69", b"CXCR4", b"CCR7", b"SELL"
    ],

    "Plasma cell": [
        b"MZB1", b"PRDM1", b"TNFRSF17", b"SLAMF7"
    ],

    "Macrophage/Histiocyte": [
        b"CD68", b"CD163", b"MRC1", b"C1QA",
        b"AIF1", b"CD14", b"FCGR3A", b"CX3CR1"
    ],

    "Neutrophil": [
        b"S100A8", b"LYZ", b"CEACAM8", b"MMP12"
    ],

    "Adipocyte": [
        b"ADIPOQ", b"LPL", b"PPARG"
    ],

    # Catch-all class: no marker genes are assigned to it.
    "Other/Unknown": []
}

# Display colour of every cell class as a "#RRGGBB" string.
class_colors_hex = {
    "epithelial": "#FF0000",              # red
    "Basal/Myoepithelial": "#FFA500",     # orange
    "Smooth muscle": "#8B4513",           # brown
    "Fibroblast": "#00FF00",              # green
    "Endothelial": "#0000FF",             # blue
    "Lymphocyte": "#FFFF00",              # yellow (T/B lymphocytes merged)
    "Plasma cell": "#9400D3",             # violet
    "Macrophage/Histiocyte": "#00FFFF",   # cyan
    "Neutrophil": "#1E90FF",              # DodgerBlue (light blue)
    "Adipocyte": "#FFC0CB",               # pink
    "Other/Unknown": "#808080",           # grey
}

# The same palette as [R, G, B] lists of 0-255 integers, decoded from the hex
# strings so the two tables cannot drift apart. Offsets 1, 3 and 5 skip the
# leading "#" and select the RR, GG and BB byte pairs.
class_colors = {
    class_name: [int(hex_code[offset:offset + 2], 16) for offset in (1, 3, 5)]
    for class_name, hex_code in class_colors_hex.items()
}

# Reverse lookup of class_list: cell class name -> integer class id.
class_list_inv = dict(zip(class_list.values(), class_list.keys()))


In [3]:
# Whole-slide images to process. glob() returns matches in arbitrary order, so
# the result is sorted to make the processing order reproducible between runs.
wsi_list=sorted(glob('../../data/Brest_spatialTranscriptome/preprocessed_xenium/wsis/*.tif'))
# Companion files of each slide, derived from the slide path by swapping the
# "wsis" directory and the ".tif" extension. All four lists are index-aligned.
annotation_list=[f.replace('/wsis','/labels').replace('.tif','.csv') for f in wsi_list]   # per-cell labels
metadata_list=[f.replace('/wsis','/metadata').replace('.tif','.json') for f in wsi_list]  # slide metadata
coord_list=[f.replace('/wsis','/patches').replace('.tif','.h5') for f in wsi_list]        # patch coordinates


In [4]:

# Cut every whole-slide image into square training tiles.
# For each tile that lies inside the trimmed bounding box of the stored patch
# coordinates and contains at least one annotated cell, three files are written
# under patch_train_data/<slide name>/:
#   image/         the tile itself, resized to patch_image_size
#   tissue_image/  a 4x wider context window around the tile, cropped from a
#                  downsampled copy of the slide (also patch_image_size wide)
#   annotation/    one CSV row per cell: x, y (box centre) and w, h (box size)
#                  as fractions of the tile side, plus the integer class id
patch_image_size=512          # side of the saved images, in pixels
origin_patch_image_size=1024  # tile side in level-0 pixels on a 40x slide
for i in range(len(wsi_list)):
    metadata_file=metadata_list[i]
    annotation_file=annotation_list[i]
    coord_file=coord_list[i]
    with h5py.File(coord_file, "r") as f:
        coords = f["coords"][:]
    with open(metadata_file, 'r') as f:
        metadata = json.load(f)

    # `magnification` is used as a divisor of the tile size: 40x -> 1, 20x -> 2.
    # The branches must be mutually exclusive (elif); otherwise 20x slides fall
    # into the final else and are reported as unsupported.
    if metadata['magnification']=='20x':
        magnification=2
    elif metadata['magnification']=='40x':
        magnification=1
    else:
        print("Unsupported magnification", metadata['magnification'])
        # Skip the slide instead of carrying on with a stale or undefined factor.
        continue

    tile=origin_patch_image_size//magnification  # tile side in level-0 pixels
    thumb_scale=(4*2)//magnification             # level-0 pixels per thumbnail pixel
    context_size=tile*4                          # context window side in level-0 pixels

    annotation_df=pd.read_csv(annotation_file)
    # Region of interest: bounding box of the patch coordinates, shrunk by
    # 1000 pixels on every side.
    x_min=coords[:,0].min()+1000
    y_min=coords[:,1].min()+1000
    x_max=coords[:,0].max()-1000
    y_max=coords[:,1].max()-1000

    # Output locations are the same for every tile of a slide.
    slide_name=os.path.basename(wsi_list[i]).replace(".tif","")
    save_image_dir=f'../../data/Brest_spatialTranscriptome/preprocessed_xenium/patch_train_data/{slide_name}/image/'
    save_annotation_dir=f'../../data/Brest_spatialTranscriptome/preprocessed_xenium/patch_train_data/{slide_name}/annotation/'
    save_tissue_dir=f'../../data/Brest_spatialTranscriptome/preprocessed_xenium/patch_train_data/{slide_name}/tissue_image/'

    # The context manager closes the slide handle even if a tile fails.
    with ops.OpenSlide(wsi_list[i]) as slide:
        width, height = slide.dimensions
        # Downsampled copy of the whole slide used for the context crops.
        # Integer sizes are passed (the float division used before produced floats).
        tissue_slide=np.array(slide.get_thumbnail((width//thumb_scale, height//thumb_scale)))

        for row in tqdm(range(height//tile)):
            y0=row*tile
            if y0<y_min or y0+tile>y_max:
                continue
            # Cells whose top-left corner falls in this row band; filtered once
            # per row instead of once per tile.
            row_df=annotation_df.loc[(annotation_df['y1']>y0) & (annotation_df['y1']<y0+tile)]
            if row_df.shape[0]==0:
                continue

            for col in range(width//tile):
                x0=col*tile
                if x0<x_min or x0+tile>x_max:
                    continue

                # A cell belongs to the tile when its top-left corner (x1, y1)
                # lies strictly inside it.
                filter_df=row_df.loc[(row_df['x1']>x0) & (row_df['x1']<x0+tile)]
                if filter_df.shape[0]==0:
                    continue

                patch=slide.read_region((x0, y0), 0, (tile, tile)).convert("RGB")
                patch=patch.resize((patch_image_size,patch_image_size))

                # Context window: 4 tiles wide, centred on the tile (starts 1.5
                # tiles up/left), shifted to stay inside the slide. Clamping to 0
                # last keeps the origin non-negative on slides narrower than the
                # window.
                tissue_patch_x=max(0, min(x0-tile//2-tile, width-context_size))
                tissue_patch_y=max(0, min(y0-tile//2-tile, height-context_size))
                thumb_x=tissue_patch_x//thumb_scale
                thumb_y=tissue_patch_y//thumb_scale
                tissue_patch=tissue_slide[thumb_y:thumb_y+patch_image_size, thumb_x:thumb_x+patch_image_size, :]

                # Build all label rows at once instead of growing a DataFrame
                # one row at a time.
                box_x1=filter_df['x1'].to_numpy()
                box_y1=filter_df['y1'].to_numpy()
                box_x2=filter_df['x2'].to_numpy()
                box_y2=filter_df['y2'].to_numpy()
                # Box centre relative to the tile origin, truncated to whole
                # pixels, as a fraction of the tile side; capped at 1 because a
                # box may extend past the tile edge.
                x=np.minimum(np.trunc((box_x1+box_x2)//2 - x0)/tile, 1)
                y=np.minimum(np.trunc((box_y1+box_y2)//2 - y0)/tile, 1)
                # Box size as a fraction of the tile side
                # (tile == patch_image_size * 2 / magnification).
                w=np.trunc(box_x2-box_x1)/(2/magnification)/patch_image_size
                h=np.trunc(box_y2-box_y1)/(2/magnification)/patch_image_size
                # Both epithelial labels collapse into the single 'epithelial'
                # class; any other unknown label raises KeyError.
                class_ids=[
                    class_list_inv['epithelial' if name in ('Tumor epithelial', 'Non-tumor epithelial') else name]
                    for name in filter_df['class_name']
                ]
                pre_df=pd.DataFrame(
                    {'x':x, 'y':y, 'w':w, 'h':h, 'class':class_ids},
                    columns=['x','y','w','h','class'],
                )

                # Directories are created only once there is something to save,
                # so slides without usable tiles leave no empty folders behind.
                create_dir(save_image_dir)
                create_dir(save_annotation_dir)
                create_dir(save_tissue_dir)
                # Files are named after the tile's level-0 origin: patch_<y>_<x>.
                patch_name=f'patch_{y0}_{x0}'
                patch.save(f'{save_image_dir}/{patch_name}.png')
                Image.fromarray(tissue_patch).save(f'{save_tissue_dir}/{patch_name}.png')
                pre_df.to_csv(f'{save_annotation_dir}/{patch_name}.csv', index=False)

100%|██████████| 31/31 [03:16<00:00,  6.33s/it]
100%|██████████| 19/19 [01:34<00:00,  4.97s/it]
100%|██████████| 23/23 [02:32<00:00,  6.64s/it]
100%|██████████| 40/40 [10:09<00:00, 15.23s/it]
100%|██████████| 52/52 [14:10<00:00, 16.36s/it]
100%|██████████| 40/40 [10:29<00:00, 15.74s/it]
100%|██████████| 52/52 [15:06<00:00, 17.42s/it]
100%|██████████| 108/108 [30:43<00:00, 17.07s/it]
100%|██████████| 104/104 [30:33<00:00, 17.63s/it]
