In [1]:
import csv
import argparse
import zipfile
import pandas as pd
from pathlib import Path
import numpy as np, scipy.sparse as sp
import sys
from shapely.geometry import box, shape, Polygon,mapping

import os, json, h5py
from collections import Counter
import matplotlib.pyplot as plt
import geopandas as gpd
import pyvips
from pathlib import Path

# Every path used later in this notebook (annotation folders, output CSV) is
# relative, so the working directory decides where data is read and written.
# The original cluster location stays the default; set NSCC_WORK_DIR to run
# the notebook from another checkout or machine without editing this cell.
WORK_DIR = os.environ.get('NSCC_WORK_DIR', '/scratch/users/ntu/lizh0106/nscc_work')
os.chdir(WORK_DIR)
print(os.getcwd())

/scratch/users/ntu/lizh0106/nscc_work


In [2]:
def load_mask_vips(path):
    """Read a mask image with pyvips and return it as a binary 2-D array.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the mask image; converted with ``str()`` before being
        handed to pyvips.

    Returns
    -------
    numpy.ndarray
        ``uint8`` array of shape ``(height, width)`` holding 1 where the
        first band of the image is greater than zero and 0 elsewhere.

    Raises
    ------
    ValueError
        If the image uses a band format with no real-valued numpy dtype in
        the table below (for example a complex format).
    """
    img = pyvips.Image.new_from_file(str(path), access="sequential")

    # The raw buffer must be decoded with the dtype matching the image's band
    # format; assuming uint8 for a wider format makes the reshape below fail.
    vips_dtypes = {
        "uchar": np.uint8,
        "char": np.int8,
        "ushort": np.uint16,
        "short": np.int16,
        "uint": np.uint32,
        "int": np.int32,
        "float": np.float32,
        "double": np.float64,
    }
    try:
        dtype = vips_dtypes[img.format]
    except KeyError:
        raise ValueError(f"unsupported pyvips band format: {img.format!r}") from None

    arr = np.frombuffer(img.write_to_memory(), dtype=dtype)
    arr = arr.reshape(img.height, img.width, img.bands)

    # Always keep band 0 so single-band images come back as (H, W) as well,
    # not (H, W, 1).
    return (arr[..., 0] > 0).astype(np.uint8)
    
# arr = load_mask_vips("AGGC_Annotation/Subset1_Test_annotation/Subset1_Test_1/Stroma_Mask.tif")
# print(arr.shape)

In [3]:
# Output schema: the case identifier, one pixel-count column per mask file,
# then the grading columns that are derived later in the notebook.
column_names = (
    ['WSI File Names']
    + ['G3_Mask', 'G4_Mask', 'G5_Mask', 'Normal_Mask', 'Stroma_Mask']
    + ['primary_gleason_grade', 'secondary_gleason_grade', 'gleason_score', 'ISUP_grade_group']
)

# Empty table carrying only the schema; the scan cells append rows to it.
df = pd.DataFrame(columns=column_names)

In [4]:
# Mask file stems (file name without ".tif") whose foreground pixels are counted;
# each one is also the name of the column that receives the count.
MASK_COLUMNS = [
    'G3_Mask',
    'G4_Mask',
    'G5_Mask',
    'Normal_Mask',
    'Stroma_Mask',
]

def load_mask_vips(path: Path) -> np.ndarray:
    """Read a mask image into memory as a binary ``(H, W)`` array.

    The whole image is decoded into memory with pyvips, the first band is
    kept, and every value greater than zero is mapped to 1.

    Parameters
    ----------
    path : Path
        Location of the mask image; converted with ``str()`` before being
        handed to pyvips.

    Returns
    -------
    np.ndarray
        ``uint8`` array of shape ``(height, width)`` with values in {0, 1}.

    Raises
    ------
    ValueError
        If the image uses a band format with no real-valued numpy dtype in
        the table below (for example a complex format).
    """
    img = pyvips.Image.new_from_file(str(path), access="sequential")

    # Decode the raw buffer with the dtype that matches the image's band
    # format; assuming uint8 for a wider format makes the reshape below fail.
    vips_dtypes = {
        "uchar": np.uint8,
        "char": np.int8,
        "ushort": np.uint16,
        "short": np.int16,
        "uint": np.uint32,
        "int": np.int32,
        "float": np.float32,
        "double": np.float64,
    }
    try:
        dtype = vips_dtypes[img.format]
    except KeyError:
        raise ValueError(f"unsupported pyvips band format: {img.format!r}") from None

    arr = np.frombuffer(img.write_to_memory(), dtype=dtype)
    arr = arr.reshape(img.height, img.width, img.bands)

    # Keep only the first band. Doing this unconditionally means single-band
    # images are returned as (H, W) too, rather than (H, W, 1).
    arr = arr[..., 0]

    # Binarise: any positive value counts as foreground.
    return (arr > 0).astype(np.uint8)

def scan_annotation_folder_numpy(anno_folder: str, df: pd.DataFrame) -> pd.DataFrame:
    """Count foreground pixels per mask for each case folder and append the rows to ``df``.

    Every immediate sub-directory of ``anno_folder`` that contains ``*.tif``
    files is treated as one case. All of its tif files are loaded with
    ``load_mask_vips``; a case is skipped (with a printed message) when any
    file fails to load or when the files do not all share the same
    width/height. For an accepted case, one row is produced with the folder
    name in ``'WSI File Names'`` and, for each tif whose stem is listed in
    ``MASK_COLUMNS``, the number of non-zero pixels in that column. All other
    columns of ``df`` are filled with NaN.

    Parameters
    ----------
    anno_folder : str
        Directory whose sub-directories hold the mask tif files.
    df : pd.DataFrame
        Existing table; its columns define the schema of the new rows.

    Returns
    -------
    pd.DataFrame
        A new frame holding the rows of ``df`` followed by the newly scanned
        rows, with a fresh 0..n-1 index. ``df`` itself is not modified.
    """
    anno_folder = Path(anno_folder)
    rows = []

    for subdir in sorted(p for p in anno_folder.iterdir() if p.is_dir()):
        tifs = sorted(subdir.glob("*.tif"))
        if not tifs:
            continue

        sizes = set()       # distinct (W, H) pairs seen in this case
        pixel_counts = {}   # mask stem -> number of foreground pixels
        try:
            for tif_path in tifs:
                arr = load_mask_vips(tif_path)
                sizes.add((arr.shape[1], arr.shape[0]))
                if tif_path.stem in MASK_COLUMNS:
                    # Only the count is needed, so it is taken right away and
                    # the full-resolution array is not kept alive. For a 0/1
                    # mask the non-zero count equals the sum.
                    pixel_counts[tif_path.stem] = int(np.count_nonzero(arr))
        except Exception as exc:
            # Best effort: report the unreadable file and skip the whole case.
            print(f"[READFAIL] {subdir.name}: {tif_path.name}: {exc}")
            continue

        # All tif files of one case must have identical dimensions.
        if len(sizes) > 1:
            print(f"[MISMATCH] {subdir.name}: sizes={sorted(sizes)}")
            continue

        row = {c: np.nan for c in df.columns}
        row['WSI File Names'] = subdir.name
        row.update(pixel_counts)
        rows.append(row)

    new_df = pd.DataFrame(rows, columns=df.columns)

    # Concatenating an empty frame makes pandas emit a FutureWarning about
    # dtype inference; leaving empty frames out keeps the current result
    # dtypes and silences the warning.
    frames = [frame for frame in (df, new_df) if not frame.empty]
    if not frames:
        return new_df
    return pd.concat(frames, ignore_index=True)


In [5]:
# Scan the Subset1 training annotations and append the resulting rows to `df`.
# NOTE(review): `df` is rebuilt from its own previous value, so running this
# cell a second time appends the same cases again.
anno_folder = r"AGGC_Annotation/Subset1_Train_annotation"
df = scan_annotation_folder_numpy(anno_folder, df)
print(df.head())

      WSI File Names      G3_Mask      G4_Mask  G5_Mask  Normal_Mask  \
0    Subset1_Train_1  173260546.0     634859.0      NaN   21093167.0   
1   Subset1_Train_10          NaN  107628662.0      NaN          NaN   
2  Subset1_Train_100          NaN  161595606.0      NaN          NaN   
3  Subset1_Train_101  163468689.0    5011088.0      NaN   55355568.0   
4  Subset1_Train_102    2646743.0   86854930.0      NaN   93403489.0   

   Stroma_Mask  primary_gleason_grade  secondary_gleason_grade  gleason_score  \
0   73414456.0                    NaN                      NaN            NaN   
1  192627690.0                    NaN                      NaN            NaN   
2   39781616.0                    NaN                      NaN            NaN   
3  112591835.0                    NaN                      NaN            NaN   
4  164922487.0                    NaN                      NaN            NaN   

   ISUP_grade_group  
0               NaN  
1               NaN  
2             

  return pd.concat([df, new_df], ignore_index=True)


In [6]:
# Scan the Subset1 test annotations and append the resulting rows to `df`.
# NOTE(review): `df` is rebuilt from its own previous value, so running this
# cell a second time appends the same cases again.
anno_folder = r"AGGC_Annotation/Subset1_Test_annotation"
df = scan_annotation_folder_numpy(anno_folder, df)
df.head(2)

Unnamed: 0,WSI File Names,G3_Mask,G4_Mask,G5_Mask,Normal_Mask,Stroma_Mask,primary_gleason_grade,secondary_gleason_grade,gleason_score,ISUP_grade_group
0,Subset1_Train_1,173260546.0,634859.0,,21093167.0,73414456.0,,,,
1,Subset1_Train_10,,107628662.0,,,192627690.0,,,,


In [11]:
# Row/column count of the accumulated table.
# NOTE(review): this cell's execution count (In [11]) is later than the scan
# cells that follow it in the notebook, so the displayed shape presumably
# reflects the state after those scans — confirm with Restart & Run All.
df.shape

(187, 10)

In [8]:
# Scan the Subset3 training annotations in the "Akoya" folder and append the
# resulting rows to `df`.
# NOTE(review): `df` is rebuilt from its own previous value, so running this
# cell a second time appends the same cases again.
anno_folder = r"AGGC_Annotation/Subset3_Train_annotation/Akoya"
df = scan_annotation_folder_numpy(anno_folder, df)
df.head(2)

Unnamed: 0,WSI File Names,G3_Mask,G4_Mask,G5_Mask,Normal_Mask,Stroma_Mask,primary_gleason_grade,secondary_gleason_grade,gleason_score,ISUP_grade_group
0,Subset1_Train_1,173260546.0,634859.0,,21093167.0,73414456.0,,,,
1,Subset1_Train_10,,107628662.0,,,192627690.0,,,,


In [10]:
# Scan the Subset3 test annotations in the "Akoya" folder and append the
# resulting rows to `df`.
# NOTE(review): `df` is rebuilt from its own previous value, so running this
# cell a second time appends the same cases again.
anno_folder = r"AGGC_Annotation/Subset3_Test_annotation/Akoya"
df = scan_annotation_folder_numpy(anno_folder, df)
df.head(2)

Unnamed: 0,WSI File Names,G3_Mask,G4_Mask,G5_Mask,Normal_Mask,Stroma_Mask,primary_gleason_grade,secondary_gleason_grade,gleason_score,ISUP_grade_group
0,Subset1_Train_1,173260546.0,634859.0,,21093167.0,73414456.0,,,,
1,Subset1_Train_10,,107628662.0,,,192627690.0,,,,


In [14]:
import numpy as np
import pandas as pd

# Make sure every grading column exists; columns that are missing are created
# as all-NaN so the assignments further down always have a target.
for grade_col in ('primary_gleason_grade', 'secondary_gleason_grade', 'gleason_score', 'ISUP_grade_group'):
    if grade_col in df.columns:
        continue
    df[grade_col] = np.nan

def pick_primary_secondary(row):
    """Choose the primary and secondary Gleason pattern from mask pixel counts.

    ``row`` is read through ``row.get`` for the keys ``'G3_Mask'``,
    ``'G4_Mask'`` and ``'G5_Mask'``; a missing key or a missing value counts
    as zero pixels. Patterns with a positive pixel count are ranked by count,
    largest first, and ties go to the higher pattern.

    Returns
    -------
    tuple
        ``(primary, secondary)`` as ints. When only one pattern is present it
        is returned for both positions. When none is present the result is
        ``(np.nan, np.nan)``.
    """
    def _pixel_count(column):
        # Missing key or missing value -> 0 pixels.
        value = row.get(column, np.nan)
        return int(value) if pd.notna(value) else 0

    present = []
    for pattern, column in ((3, 'G3_Mask'), (4, 'G4_Mask'), (5, 'G5_Mask')):
        count = _pixel_count(column)
        if count > 0:
            present.append((count, pattern))

    if not present:
        # No pattern annotated at all: leave the grades missing.
        return np.nan, np.nan

    # Largest area first; equal areas are ordered by pattern, higher first.
    present.sort(reverse=True)

    primary = present[0][1]
    secondary = present[1][1] if len(present) > 1 else primary
    return int(primary), int(secondary)

def gleason_to_isup(primary, secondary):
    """Turn a primary/secondary pattern pair into a Gleason score and an encoded grade group.

    Grade group mapping used here (1..5):
      * 3+3           -> 1
      * 3+4           -> 2
      * 4+3           -> 3
      * any sum of 8  -> 4
      * any sum >= 9  -> 5

    Returns
    -------
    tuple
        ``(gleason_score, encoded_group)`` where ``encoded_group`` is the
        grade group minus one (0..4, intended as a machine-learning label).
        Both are ``np.nan`` when either input is missing; ``encoded_group``
        alone is ``np.nan`` for a combination not covered by the mapping.
    """
    if pd.isna(primary) or pd.isna(secondary):
        return np.nan, np.nan

    first, second = int(primary), int(secondary)
    score = first + second

    # Pairs whose group depends on the order of the patterns, not just the sum.
    order_sensitive = {(3, 3): 1, (3, 4): 2, (4, 3): 3}
    group = order_sensitive.get((first, second))

    if group is None:
        if score == 8:
            group = 4
        elif score >= 9:
            group = 5
        else:
            # Defensive fallback for combinations outside the mapping.
            group = np.nan

    # Shift to a 0-based label, keeping missing values missing.
    encoded = np.nan if pd.isna(group) else group - 1
    return score, encoded

# Derive the grading values for every row of df.
grade_pairs = [pick_primary_secondary(record) for _, record in df.iterrows()]
score_pairs = [gleason_to_isup(first, second) for first, second in grade_pairs]

primaries = [first for first, _second in grade_pairs]
secondaries = [second for _first, second in grade_pairs]
scores = [score for score, _encoded in score_pairs]
gg_encs = [encoded for _score, encoded in score_pairs]

# Store each list in its column, then switch the column to the nullable
# integer dtype so whole numbers stay integers while missing values remain.
for column, values in (
    ('primary_gleason_grade', primaries),
    ('secondary_gleason_grade', secondaries),
    ('gleason_score', scores),
    ('ISUP_grade_group', gg_encs),
):
    df[column] = values
    df[column] = df[column].astype('Int64')


In [23]:
# Class balance of the label column. The values are the 0-based encoding
# written by gleason_to_isup (grade group minus one), not grade groups 1..5.
df["ISUP_grade_group"].value_counts()

ISUP_grade_group
1    78
2    66
4    22
0    11
3    10
Name: count, dtype: Int64

In [24]:
# Write the final table to a path relative to the current working directory.
# NOTE(review): `index=False` is not passed, so the row index is written as an
# unnamed first column; readers need `index_col=0` (or will see "Unnamed: 0").
# Confirm what downstream consumers expect before changing this.
df.to_csv("WsiBERT/AGGC_metadata.csv")