In [1]:
import pandas as pd
from pathlib import Path

DATA_ROOT = Path("Data")
OUT_SOR   = DATA_ROOT / "processed" / "sorensen"
OUT_MERGE = DATA_ROOT / "processed" / "emphysema_merged"

OUT_MERGE.mkdir(parents=True, exist_ok=True)

# manifest cũ (đã preprocess)
df_sor = pd.read_csv(OUT_SOR / "manifest_slices_preprocessed_clean.csv")

# optional: thêm cột nguồn
df_sor["source"] = "sorensen"

df_sor.head()


Unnamed: 0,slice_key,subject_id,level,label_code,label_name,severity_code,image_path,preprocessed_path,lung_area_px,lung_ratio,hu_min_in_mask,hu_max_in_mask,area_ratio,laa,touch_edge,qc_bad,source
0,subject24_top,24,top,1,NT,0,Data/raw/Emphysema_Database_Sorensen/slices/su...,Data/processed/sorensen/preprocessed/subject24...,87880,0.335236,-990.456726,193.0,0.335236,0.000569,False,False,sorensen
1,subject24_middle,24,middle,1,NT,0,Data/raw/Emphysema_Database_Sorensen/slices/su...,Data/processed/sorensen/preprocessed/subject24...,122592,0.467651,-1000.0,200.0,0.467651,0.003557,False,False,sorensen
2,subject24_bottom,24,bottom,1,NT,0,Data/raw/Emphysema_Database_Sorensen/slices/su...,Data/processed/sorensen/preprocessed/subject24...,109252,0.416763,-1000.0,200.0,0.416763,0.001153,False,False,sorensen
3,subject9_top,9,top,1,NT,0,Data/raw/Emphysema_Database_Sorensen/slices/su...,Data/processed/sorensen/preprocessed/subject9_...,93539,0.356823,-1000.0,200.0,0.356823,0.002544,False,False,sorensen
4,subject9_middle,9,middle,1,NT,0,Data/raw/Emphysema_Database_Sorensen/slices/su...,Data/processed/sorensen/preprocessed/subject9_...,118708,0.452835,-999.20752,200.000015,0.452835,0.001499,False,False,sorensen


In [4]:
label_map = {
    "NT": 0,
    "CLE": 1,
    "PSE": 2,
    "PLE": 3,
}


In [5]:
import numpy as np
from PIL import Image
import os

RAW_WEB_DIR = DATA_ROOT / "raw" / "Emphysema_Web"
WEB_OUT_DIR = OUT_MERGE / "slices_npz_web"
WEB_OUT_DIR.mkdir(parents=True, exist_ok=True)

def preprocess_web_image(in_path, out_path, out_size=512):
    """
    Đọc 1 ảnh web, convert grayscale, resize, min-max normalize về [0,1],
    lưu thành .npz (key: 'image').
    """
    img = Image.open(in_path).convert("L")          # grayscale 8-bit
    img = img.resize((out_size, out_size))         # cho đơn giản: resize thẳng

    arr = np.array(img).astype(np.float32)

    # min-max normalize
    _min, _max = arr.min(), arr.max()
    if _max > _min:
        arr = (arr - _min) / (_max - _min)
    else:
        arr = np.zeros_like(arr, dtype=np.float32)

    # thêm trục channel nếu muốn (C,H,W) -> trong Dataset có thể thêm sau
    # ở đây cứ lưu (H,W) cho giống NPZ cũ
    np.savez_compressed(out_path, image=arr)


In [6]:
rows = []
next_subject_id = 1000   # tránh trùng với subject_id Sorensen (1–39)

for label_name in ["NT", "CLE", "PSE", "PLE"]:
    class_dir = RAW_WEB_DIR / label_name
    if not class_dir.exists():
        continue

    for fname in os.listdir(class_dir):
        if not fname.lower().endswith((".png",".jpg",".jpeg",".tif",".tiff")):
            continue

        in_path  = class_dir / fname

        # tạo key & path lưu
        slice_key = f"web_{label_name}_{fname.rsplit('.',1)[0]}"
        out_fname = f"{slice_key}.npz"
        out_path  = WEB_OUT_DIR / out_fname

        # preprocess & save
        preprocess_web_image(in_path, out_path)

        # ghi 1 hàng vào manifest
        rows.append({
            "slice_key":        slice_key,
            "subject_id":       next_subject_id,   # mỗi ảnh web coi như 1 subject riêng
            "label_name":       label_name,
            "label_code":       label_map[label_name],
            "preprocessed_path": str(out_path),
            "source":           "web",
        })

        next_subject_id += 1  # mỗi ảnh 1 subject_id khác nhau
        

df_web = pd.DataFrame(rows)
df_web.head(), len(df_web)


(       slice_key  subject_id label_name  label_code  \
 0   web_CLE_CLE1        1000        CLE           1   
 1  web_CLE_CLE10        1001        CLE           1   
 2  web_CLE_CLE11        1002        CLE           1   
 3  web_CLE_CLE12        1003        CLE           1   
 4  web_CLE_CLE13        1004        CLE           1   
 
                                    preprocessed_path source  
 0  Data\processed\emphysema_merged\slices_npz_web...    web  
 1  Data\processed\emphysema_merged\slices_npz_web...    web  
 2  Data\processed\emphysema_merged\slices_npz_web...    web  
 3  Data\processed\emphysema_merged\slices_npz_web...    web  
 4  Data\processed\emphysema_merged\slices_npz_web...    web  ,
 46)

In [7]:
rows = []
next_subject_id = 1000   # tránh trùng với subject_id Sorensen (1–39)

for label_name in ["NT", "CLE", "PSE", "PLE"]:
    class_dir = RAW_WEB_DIR / label_name
    if not class_dir.exists():
        continue

    for fname in os.listdir(class_dir):
        if not fname.lower().endswith((".png",".jpg",".jpeg",".tif",".tiff")):
            continue

        in_path  = class_dir / fname

        # tạo key & path lưu
        slice_key = f"web_{label_name}_{fname.rsplit('.',1)[0]}"
        out_fname = f"{slice_key}.npz"
        out_path  = WEB_OUT_DIR / out_fname

        # preprocess & save
        preprocess_web_image(in_path, out_path)

        # ghi 1 hàng vào manifest
        rows.append({
            "slice_key":        slice_key,
            "subject_id":       next_subject_id,   # mỗi ảnh web coi như 1 subject riêng
            "label_name":       label_name,
            "label_code":       label_map[label_name],
            "preprocessed_path": str(out_path),
            "source":           "web",
        })

        next_subject_id += 1  # mỗi ảnh 1 subject_id khác nhau
        

df_web = pd.DataFrame(rows)
df_web.head(), len(df_web)


(       slice_key  subject_id label_name  label_code  \
 0   web_CLE_CLE1        1000        CLE           1   
 1  web_CLE_CLE10        1001        CLE           1   
 2  web_CLE_CLE11        1002        CLE           1   
 3  web_CLE_CLE12        1003        CLE           1   
 4  web_CLE_CLE13        1004        CLE           1   
 
                                    preprocessed_path source  
 0  Data\processed\emphysema_merged\slices_npz_web...    web  
 1  Data\processed\emphysema_merged\slices_npz_web...    web  
 2  Data\processed\emphysema_merged\slices_npz_web...    web  
 3  Data\processed\emphysema_merged\slices_npz_web...    web  
 4  Data\processed\emphysema_merged\slices_npz_web...    web  ,
 46)

In [8]:
# Đảm bảo hai DataFrame có cùng cột
common_cols = ["slice_key","subject_id","label_name","label_code","preprocessed_path","source"]

df_sor2 = df_sor.copy()
if "source" not in df_sor2.columns:
    df_sor2["source"] = "sorensen"

df_sor2 = df_sor2[common_cols]
df_web  = df_web[common_cols]

df_merged = pd.concat([df_sor2, df_web], ignore_index=True)

print(df_merged["source"].value_counts())
print(df_merged["label_name"].value_counts())

# lưu manifest merged
MERGED_MANIFEST = OUT_MERGE / "manifest_merged_preprocessed.csv"
df_merged.to_csv(MERGED_MANIFEST, index=False)

MERGED_MANIFEST



source
sorensen    115
web          46
Name: count, dtype: int64
label_name
NT     61
CLE    43
PSE    37
PLE    20
Name: count, dtype: int64


WindowsPath('Data/processed/emphysema_merged/manifest_merged_preprocessed.csv')