In [None]:
import os
import gc
import cv2
import math
import copy
import time
import tqdm
import random

# For data manipulation
import numpy as np
import pandas as pd

# Pytorch Imports
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.cuda import amp

# Albumentations for augmentations
import albumentations as A
from albumentations.pytorch import ToTensorV2


from sklearn.metrics import f1_score,roc_auc_score


import timm
from timm.models.efficientnet import *

# Utils
import joblib
from tqdm import tqdm
from collections import defaultdict


import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import scipy
from scipy import ndimage

import glob

import pickle, pandas as pd, numpy as np, os

In [None]:
def _load_slice_table(csv_path):
    """Read a split CSV, keep `path`/`slice_name`, and add `full_path` = path + '/' + slice_name."""
    slice_table = pd.read_csv(csv_path)[['path', 'slice_name']]
    slice_table['full_path'] = slice_table['path'] + '/' + slice_table['slice_name']
    return slice_table


# Training split: table of slices plus the flat list of full image paths.
train_ct_all_pd = _load_slice_table('/ssd7/ICCV2025_COVID19/processing_by_hospital_0/chih_4_fold_covid_train_df.csv')
train_ct_all_list = train_ct_all_pd['full_path'].tolist()

# Validation split: same layout as the training split.
valid_ct_all_pd = _load_slice_table('/ssd7/ICCV2025_COVID19/processing_by_hospital_0/chih_4_fold_covid_valid_df.csv')
valid_ct_all_list = valid_ct_all_pd['full_path'].tolist()

print(train_ct_all_pd.shape, valid_ct_all_pd.shape)

In [None]:
# Per-slice area measurement for the training split.
# For each image: apply a size-5 minimum filter, threshold at 100 into a 0/255
# image, fill the holes of channel 0, and count the pixels that were 0 before
# filling but are set afterwards (dark regions fully enclosed by bright pixels).
# NOTE(review): presumably this approximates the lung area of a CT slice — confirm.
train_area = []
for path in tqdm(train_ct_all_list):
    img = cv2.imread(path)
    if img is None:
        # cv2.imread reports a missing/unreadable file by returning None instead
        # of raising; fail here with the offending path rather than deep inside scipy.
        raise FileNotFoundError(f"cv2.imread could not read image: {path}")
    min_filtered = ndimage.minimum_filter(img, 5)
    binary = np.where(min_filtered < 100, 0, 255)
    filled = scipy.ndimage.binary_fill_holes(binary[:, :, 0])
    # 255 where a hole was filled, 0 everywhere else
    hole_pixels = filled * 255 - binary[:, :, 0]
    train_area.append(hole_pixels.sum() / 255)

# NOTE(review): str.replace rewrites EVERY 'train' substring of the path, not
# only the split directory — verify the paths contain it exactly once.
modified_list = [item.replace('train','train_pure_crop_challenge') for item in train_ct_all_list]
train_area_df = pd.DataFrame(list(zip(modified_list, train_area)), columns=['path', 'area'])
train_area_df.to_csv("/ssd7/ICCV2025_COVID19/processing_by_hospital_0/train_area_df1_challenge.csv", index=False, encoding='utf-8-sig')
print(train_area_df.shape)



In [None]:
# Per-slice area measurement for the validation split.
# For each image: apply a size-5 minimum filter, threshold at 100 into a 0/255
# image, fill the holes of channel 0, and count the pixels that were 0 before
# filling but are set afterwards (dark regions fully enclosed by bright pixels).
# NOTE(review): presumably this approximates the lung area of a CT slice — confirm.
valid_area = []
for path in tqdm(valid_ct_all_list):
    img = cv2.imread(path)
    if img is None:
        # cv2.imread reports a missing/unreadable file by returning None instead
        # of raising; fail here with the offending path rather than deep inside scipy.
        raise FileNotFoundError(f"cv2.imread could not read image: {path}")
    min_filtered = ndimage.minimum_filter(img, 5)
    binary = np.where(min_filtered < 100, 0, 255)
    filled = scipy.ndimage.binary_fill_holes(binary[:, :, 0])
    # 255 where a hole was filled, 0 everywhere else
    hole_pixels = filled * 255 - binary[:, :, 0]
    valid_area.append(hole_pixels.sum() / 255)

# NOTE(review): str.replace rewrites EVERY 'val' substring of the path (it would
# also hit e.g. 'valid' or 'value') — verify the paths contain it exactly once.
modified_list = [item.replace('val','valid_pure_crop_challenge') for item in valid_ct_all_list]
valid_area_df = pd.DataFrame(list(zip(modified_list, valid_area)), columns=['path', 'area'])
# The original line ended with a stray 'ㄣ' character, which made the cell a SyntaxError.
valid_area_df.to_csv("/ssd7/ICCV2025_COVID19/processing_by_hospital_0/valid_area_df1_challenge.csv", index=False, encoding='utf-8-sig')
print(valid_area_df.shape)

In [None]:
# Reload the per-slice area tables written by the two area cells, so the
# following cells can run without recomputing the areas.
train_area_df, valid_area_df = (
    pd.read_csv(area_csv)
    for area_csv in (
        "/ssd7/ICCV2025_COVID19/processing_by_hospital_0/train_area_df1_challenge.csv",
        "/ssd7/ICCV2025_COVID19/processing_by_hospital_0/valid_area_df1_challenge.csv",
    )
)
print(train_area_df.shape, valid_area_df.shape)

In [None]:
def _parent_dir(slice_path):
    """Return everything before the last '/' of `slice_path`."""
    return "/".join(slice_path.split("/")[:-1])


def _slice_number(slice_path):
    """Return the file name's text before its first '.', parsed as an int."""
    return int(slice_path.split("/")[-1].split(".")[0])


# Add the scan directory and the numeric slice index to both tables, then order
# every table by scan and by slice index within the scan (in place).
for area_df in (train_area_df, valid_area_df):
    area_df["ct_path"] = area_df["path"].apply(_parent_dir)
    area_df["ct_slice"] = area_df["path"].apply(_slice_number)
    area_df.sort_values(by=['ct_path', 'ct_slice'], inplace=True)

In [None]:
def sum_max(a, w=0.4):
    """Locate the contiguous window of `a` with the largest sum.

    The window length is ``k = ceil(len(a) * w)``. Windows are scanned from
    left to right and, on ties, the left-most window wins.

    Args:
        a: One-dimensional sequence or array of numbers (must support ``len``
            and slicing).
        w: Fraction of ``len(a)`` used as the window length.

    Returns:
        Tuple ``(start, stop)``: the half-open index range of the best window.
        When no window fits (``k > len(a)``) the result is ``(0, k)``.
    """
    n = len(a)
    k = int(np.ceil(n * w))
    best_start = 0
    # Start below every possible sum so that inputs whose window sums are all
    # negative still select their best window (a start value of 0 never would).
    best_sum = -np.inf
    for start in range(n - k + 1):
        window_sum = np.sum(a[start:start + k])  # computed once per window
        if window_sum > best_sum:
            best_sum = window_sum
            best_start = start
    return best_start, best_start + k

In [None]:
def _best_windows(area_df, w=0.5):
    """Map every `ct_path` of `area_df` to its `[start, stop]` window from `sum_max`.

    The table is grouped once instead of being re-filtered for every scan.
    `sort=False` keeps the scans in order of first appearance (the order
    `unique()` would give), and rows keep their order inside each group.

    Args:
        area_df: Table with at least the columns `ct_path` and `area`.
        w: Window fraction forwarded to `sum_max`.

    Returns:
        Dict of `ct_path` -> two-element list `[start, stop]`.
    """
    grouped = area_df.groupby("ct_path", sort=False)
    return {
        ct_path: list(sum_max(scan_rows["area"].values, w))
        for ct_path, scan_rows in tqdm(grouped, total=grouped.ngroups)
    }


train_dic = _best_windows(train_area_df, 0.5)
valid_dic = _best_windows(valid_area_df, 0.5)

In [None]:
# Persist both window dictionaries with the highest available pickle protocol.
for pickle_path, window_dic in (
    ('/ssd7/ICCV2025_COVID19/processing_by_hospital_0/train_dic1_05_challenge.pickle', train_dic),
    ('/ssd7/ICCV2025_COVID19/processing_by_hospital_0/valid_dic1_05_challenge.pickle', valid_dic),
):
    with open(pickle_path, 'wb') as handle:
        pickle.dump(window_dic, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Read the two window dictionaries back from disk (train first, then valid).
loaded_dics = []
for pickle_path in (
    '/ssd7/ICCV2025_COVID19/processing_by_hospital_0/train_dic1_05_challenge.pickle',
    '/ssd7/ICCV2025_COVID19/processing_by_hospital_0/valid_dic1_05_challenge.pickle',
):
    with open(pickle_path, 'rb') as f:
        loaded_dics.append(pickle.load(f))
train_dic, valid_dic = loaded_dics
print(len(train_dic),len(valid_dic))

In [None]:
# Bare expression: as the last statement of the cell it displays the whole
# validation mapping (ct_path -> [start, stop]) for visual inspection.
valid_dic

In [None]:
def _list_ct_slices(ct_dirs):
    """Build a `path` / `filename` / `label` table with one row per file in each directory.

    Directories that do not exist, are empty, or raise while being listed or
    sorted (for example a file name whose stem is not an integer) are reported
    on stdout and skipped. File names are ordered by the integer value of their
    stem. The label is 0 when the directory path contains 'non-covid', else 1.

    The per-directory frames are collected and concatenated once at the end
    rather than growing the result with `pd.concat` inside the loop.

    Args:
        ct_dirs: Iterable of directory paths.

    Returns:
        DataFrame with the columns `path`, `filename`, `label`. Each directory
        keeps its own 0..n-1 index, as repeated concatenation would produce.
    """
    columns = ['path', 'filename', 'label']
    frames = []
    for path in ct_dirs:
        if not os.path.exists(path):
            print(f"⚠️  跳過不存在的路徑: {path}")
            continue

        try:
            image_list = os.listdir(path)
            if len(image_list) == 0:
                print(f"⚠️  跳過空資料夾: {path}")
                continue

            image_list.sort(key=lambda x: int(os.path.splitext(x)[0]))
            label = 0 if 'non-covid' in path else 1
            frames.append(
                pd.DataFrame({'path': path, 'filename': image_list, 'label': label}, columns=columns)
            )
        except Exception as e:
            # Deliberate best effort: report the folder and carry on with the rest.
            print(f"❌ 處理路徑時發生錯誤: {path}, 錯誤: {e}")
            continue

    if not frames:
        return pd.DataFrame(None, columns=columns)
    return pd.concat(frames)


print("="*10, "loading data DataFrame", "="*10)

# Process the training data
train_df = _list_ct_slices(list(train_dic.keys()))

# Process the validation data
valid_df = _list_ct_slices(list(valid_dic.keys()))

print(f"✅ 完成！訓練資料: {train_df.shape[0]} 筆, 驗證資料: {valid_df.shape[0]} 筆")

In [None]:
# Independent deep copies of both window dictionaries, so later edits to the
# copies leave train_dic / valid_dic untouched. `copy` is already imported in
# the notebook's import cell.
cp_train_dic, cp_valid_dic = (
    copy.deepcopy(window_dic) for window_dic in (train_dic, valid_dic)
)
print(len(train_dic), len(valid_dic))
print(train_df.shape, valid_df.shape)

In [None]:
filter_num = 5


def _drop_short_scans(scan_dirs, window_dic, slice_df, min_slices):
    """Remove scans whose directory holds fewer than `min_slices` entries.

    For every such directory the entry count and path are printed, the path is
    removed from `window_dic` (mutated in place), and its rows are removed from
    `slice_df`. All rows are removed with one `isin` filter after the scan
    rather than re-filtering the table once per dropped directory.

    NOTE(review): directories that do not exist or cannot be listed are only
    reported — they are NOT removed from `window_dic`. Confirm that is intended.

    Args:
        scan_dirs: Iterable of directory paths to inspect.
        window_dic: Dict keyed by directory path; short scans are deleted from it.
        slice_df: DataFrame with a `path` column.
        min_slices: Minimum number of directory entries required to keep a scan.

    Returns:
        Tuple `(filtered slice_df, total number of entries in dropped directories)`.
    """
    dropped_paths = []
    dropped_entries = 0
    for path_ in scan_dirs:
        if not os.path.exists(path_):
            print(f"⚠️  跳過不存在的路徑: {path_}")
            continue

        try:
            n_entries = len(os.listdir(path_))
            if n_entries < min_slices:
                print(f"張數{n_entries}", path_)
                dropped_entries = dropped_entries + n_entries
                window_dic.pop(path_, None)
                dropped_paths.append(path_)
        except Exception as e:
            # Deliberate best effort: report the folder and carry on with the rest.
            print(f"❌ 處理路徑時發生錯誤: {path_}, 錯誤: {e}")
            continue

    if dropped_paths:
        slice_df = slice_df[~slice_df.path.isin(dropped_paths)]
    return slice_df, dropped_entries


# Process the training data
train_df, drop_count_t = _drop_short_scans(train_dic, cp_train_dic, train_df, filter_num)

# Process the validation data
valid_df, drop_count_v = _drop_short_scans(valid_dic, cp_valid_dic, valid_df, filter_num)

print(len(cp_train_dic), len(cp_valid_dic))
print(train_df.shape, valid_df.shape)
print(drop_count_t, drop_count_v)

In [None]:
# Collapse the per-file tables to one row per scan directory: keep only the
# directory path and its label, then keep the first row for every path.
scan_columns = ['path', 'label']
train_df = train_df.loc[:, scan_columns].drop_duplicates(subset='path')
valid_df = valid_df.loc[:, scan_columns].drop_duplicates(subset='path')
print(train_df.shape, valid_df.shape)

In [11]:
# Persist the filtered window dictionaries (train first, then valid).
for pickle_path, window_dic in (
    ('/ssd7/ICCV2025_COVID19/processing_by_hospital_0/filter_slice_train_dic1_05_challenge.pickle', cp_train_dic),
    ('/ssd7/ICCV2025_COVID19/processing_by_hospital_0/filter_slice_valid_dic1_05_challenge.pickle', cp_valid_dic),
):
    with open(pickle_path, 'wb') as handle:
        pickle.dump(window_dic, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Persist the per-scan tables next to them, without the index column.
for csv_path, scan_df in (
    ("/ssd7/ICCV2025_COVID19/processing_by_hospital_0/filter_slice_train_df_challenge.csv", train_df),
    ("/ssd7/ICCV2025_COVID19/processing_by_hospital_0/filter_slice_valid_df_challenge.csv", valid_df),
):
    scan_df.to_csv(csv_path, index=False, encoding='utf-8-sig')