In [1]:
import csv
import argparse
import zipfile
import pandas as pd
from pathlib import Path
import numpy as np, scipy.sparse as sp
import sys
from shapely.geometry import box, shape, Polygon,mapping

import os, json, h5py
from collections import Counter
import matplotlib.pyplot as plt
import geopandas as gpd
import pyvips
from pathlib import Path

# Run from the project root so that the relative data paths used by the later
# cells resolve. The location defaults to the original cluster directory and
# can be overridden with the WSI_WORK_DIR environment variable, which avoids
# editing the notebook when it is run on another machine.
os.chdir(os.environ.get("WSI_WORK_DIR", '/scratch/users/ntu/lizh0106/nscc_work'))
print(os.getcwd())

/scratch/users/ntu/lizh0106/nscc_work


In [2]:
# Slide-level metadata table. The cells below read its "WSI File Names",
# "G3_Mask", "G4_Mask" and "G5_Mask" columns.
# Quick preview, if needed: df.head(2)
df = pd.read_csv(filepath_or_buffer="WsiBERT/AGGC_metadata.csv")

In [3]:
# Edge length of the square tiles; used below to slice the annotation masks.
tile_size = 512

# feature_folder: directory holding one "<wsi_id>.h5" feature file per slide.
# mask_folder: directory holding one "<wsi_id>/" sub-directory of mask TIFFs per slide.
feature_folder, mask_folder = (
    "AGGC_Features/Train_all_features",
    "AGGC_Annotation/Annotation_all_file",
)

def read_h5(h5_path):  ##features
    """Load the "coords" and "features" datasets of one HDF5 file.

    Args:
        h5_path: Path of the HDF5 file to open read-only.

    Returns:
        Tuple ``(coords, features)`` with both datasets fully read into memory.
    """
    with h5py.File(h5_path, mode="r") as h5_file:
        # Slicing with [:] copies the data out before the file is closed.
        tile_coords, tile_features = h5_file["coords"][:], h5_file["features"][:]
    return tile_coords, tile_features

def load_mask_vips(path: Path):
    """Read a mask image with pyvips and return it as a binary uint8 array.

    Every sample greater than zero becomes 1, everything else becomes 0.

    Args:
        path: Location of the mask image file.

    Returns:
        ``np.uint8`` array of shape (height, width) for single-band images,
        or (height, width, bands) when the image has more than one band.

    Raises:
        ValueError: If the image uses a pixel format that is not listed in
            the format table below.
    """
    # pyvips band-format name -> NumPy dtype of the raw buffer returned by
    # write_to_memory(). Reading the buffer with a fixed uint8 dtype only
    # works for 8-bit images; wider formats would break the reshape below.
    format_to_dtype = {
        "uchar": np.uint8,
        "char": np.int8,
        "ushort": np.uint16,
        "short": np.int16,
        "uint": np.uint32,
        "int": np.int32,
        "float": np.float32,
        "double": np.float64,
    }
    img = pyvips.Image.new_from_file(str(path), access="sequential")
    try:
        dtype = format_to_dtype[img.format]
    except KeyError:
        raise ValueError(
            f"Unsupported pixel format {img.format!r} in mask {path}"
        ) from None
    arr = np.frombuffer(img.write_to_memory(), dtype=dtype)
    arr = arr.reshape(img.height, img.width, img.bands)
    # Binarise: any positive sample counts as annotated.
    arr = (arr > 0).astype(np.uint8)
    # The reshape above always yields three axes; drop a singleton band axis.
    if arr.shape[2] == 1:
        arr = arr[..., 0]
    return arr
    


In [4]:
# Build per-tile soft labels for every slide listed in the metadata table.
#
# For each tile the target is [r3, r4, r5, ro]: the fraction of the tile's
# pixels covered by the G3, G4 and G5 masks and by "other" (pixels outside the
# union of the three masks). The accompanying mask row flags which of the four
# entries are backed by an annotation file for that slide.
targets_list = []
masks_list   = []
names_list   = []

# Number of pixels in one tile; invariant across slides and tiles.
area = tile_size * tile_size

for _, row in df.iterrows():
    wsi_id = row["WSI File Names"]      # for example "Subset1_Train_1"
    print(f"opening {wsi_id}")
    h5_path = os.path.join(feature_folder, wsi_id + ".h5")
    anno_dir = os.path.join(mask_folder, wsi_id)

    # Only the tile coordinates are used in this cell; the features are
    # returned by read_h5 but not consumed here.
    coords, features = read_h5(h5_path)

    # A missing (NaN) entry in the metadata means no mask of that grade.
    has_G3 = not pd.isna(row["G3_Mask"])
    has_G4 = not pd.isna(row["G4_Mask"])
    has_G5 = not pd.isna(row["G5_Mask"])

    g3 = g4 = g5 = None

    # Mask height/width, taken from the first mask that is available.
    H = W = None

    if has_G3:
        g3 = load_mask_vips(Path(anno_dir) / "G3_Mask.tif")
        H, W = g3.shape
    if has_G4:
        g4 = load_mask_vips(Path(anno_dir) / "G4_Mask.tif")
        H, W = g4.shape if H is None else (H, W)
    if has_G5:
        g5 = load_mask_vips(Path(anno_dir) / "G5_Mask.tif")
        H, W = g5.shape if H is None else (H, W)

    if H is None:
        print("H = None, NO G3/G4/G5 Annotation")
        continue

    # Grades without an annotation file are represented by an all-zero mask.
    if g3 is None: g3 = np.zeros((H, W), np.uint8)
    if g4 is None: g4 = np.zeros((H, W), np.uint8)
    if g5 is None: g5 = np.zeros((H, W), np.uint8)

    # Fail early with a clear message instead of a broadcasting error (or a
    # silent broadcast) when the masks of one slide disagree in size.
    for label, mask in (("G3", g3), ("G4", g4), ("G5", g5)):
        if mask.shape != (H, W):
            raise ValueError(
                f"{wsi_id}: {label} mask has shape {mask.shape}, expected {(H, W)}"
            )

    tumor_union = (g3 | g4 | g5).astype(np.uint8)
    other = (1 - tumor_union).astype(np.uint8)

    # Per-slide supervision flags for tiles that lie inside the mask.
    in_bounds_mask = [int(has_G3), int(has_G4), int(has_G5), 1]

    out_of_bound = 0

    for (x, y) in coords:
        x = int(x); y = int(y)
        # Negative coordinates are treated as out of bound as well: a negative
        # slice start would otherwise index from the far edge of the mask.
        if x < 0 or y < 0 or x + tile_size > W or y + tile_size > H:
            # out of bound = other class
            t = [0., 0., 0., 1.]
            m = [0, 0, 0, 1]
            out_of_bound += 1
        else:
            patch_g3 = g3[y:y+tile_size, x:x+tile_size]
            patch_g4 = g4[y:y+tile_size, x:x+tile_size]
            patch_g5 = g5[y:y+tile_size, x:x+tile_size]
            patch_ot = other[y:y+tile_size, x:x+tile_size]

            r3 = float(patch_g3.sum()) / area
            r4 = float(patch_g4.sum()) / area
            r5 = float(patch_g5.sum()) / area
            ro = float(patch_ot.sum()) / area

            t = [r3, r4, r5, ro]

            m = in_bounds_mask

        targets_list.append(t)
        masks_list.append(m)
        names_list.append(wsi_id)

    print(f"{wsi_id}: {out_of_bound} tiles out of mask range and was assigned to the other class")

# reshape(-1, 4) keeps the arrays two-dimensional even when no tile was kept.
targets = np.array(targets_list, dtype=np.float32).reshape(-1, 4)
masks   = np.array(masks_list,   dtype=np.float32).reshape(-1, 4)
# Let NumPy size the string dtype from the data; a fixed "U32" would silently
# truncate slide names longer than 32 characters.
names   = np.array(names_list,   dtype=str)

np.savez_compressed(
    "WsiBERT/AGGC_tile_targets_masks_names.npz",
    targets=targets,
    masks=masks,
    names=names,)



opening Subset1_Train_1
Subset1_Train_1: 100 tiles out of mask range and was assigned to the other class
opening Subset1_Train_10
Subset1_Train_10: 28 tiles out of mask range and was assigned to the other class
opening Subset1_Train_100
Subset1_Train_100: 0 tiles out of mask range and was assigned to the other class
opening Subset1_Train_101
Subset1_Train_101: 0 tiles out of mask range and was assigned to the other class
opening Subset1_Train_102
Subset1_Train_102: 12 tiles out of mask range and was assigned to the other class
opening Subset1_Train_103
Subset1_Train_103: 51 tiles out of mask range and was assigned to the other class
opening Subset1_Train_104
Subset1_Train_104: 62 tiles out of mask range and was assigned to the other class
opening Subset1_Train_105
Subset1_Train_105: 0 tiles out of mask range and was assigned to the other class
opening Subset1_Train_11
Subset1_Train_11: 49 tiles out of mask range and was assigned to the other class
opening Subset1_Train_12
Subset1_Train

In [6]:
# Reload the archive written by the previous cell as a sanity check.
# "targets" holds one row of four values (G3, G4, G5, other) per tile.
tmp = np.load("WsiBERT/AGGC_tile_targets_masks_names.npz")
tmp["targets"].shape