### 將所有的 data 分為 80% 訓練以及 20% 的測試

In [None]:
import pandas as pd
import numpy as np
import os
from config import Args


In [None]:

# Load the metadata table from the configured metadata directory.
data = pd.read_csv(
    os.path.join(Args.META_DATA_PATH, 'train_metadata.csv')
)

# Group the rows by SeriesInstanceUID so each series can be handled as a unit.
grouped_data = data.groupby(by='SeriesInstanceUID')

# Empty frames with the same columns as the metadata; they receive the rows
# of the training and test splits respectively.
train_data, test_data = (pd.DataFrame(columns=data.columns) for _ in range(2))


In [None]:
# Assign every SeriesInstanceUID group, as a whole, to either the training or
# the test split, so rows of one series never end up in both files.
# NOTE(review): the draws come from NumPy's global RNG and no seed is set in
# this section, so the split differs between runs -- call np.random.seed(...)
# beforehand if a reproducible split is required.
train_groups = []
test_groups = []
for _, group in grouped_data:
    # Each series independently lands in training with probability 0.8, so the
    # resulting proportions are approximately (not exactly) 80% / 20%.
    if np.random.rand() < 0.8:
        train_groups.append(group)
    else:
        test_groups.append(group)

# Concatenate once per split instead of once per group: pd.concat copies all
# of its inputs, so growing a frame inside the loop is quadratic in the number
# of rows. Building the frames from scratch also means that re-running this
# cell no longer appends to the result of a previous run.
train_data = pd.concat(train_groups) if train_groups else pd.DataFrame(columns=data.columns)
test_data = pd.concat(test_groups) if test_groups else pd.DataFrame(columns=data.columns)

# Write both splits as CSV files.
train_data.to_csv(os.path.join(Args.FL_META_DATA_PATH, 'fl_train_meta_data.csv'), index=False)
test_data.to_csv(os.path.join(Args.FL_META_DATA_PATH, 'fl_test_meta_data.csv'), index=False)

 ### 將分好的 80% 訓練資料再切成十等分

In [None]:
import pandas as pd
import os

In [None]:
# Number of equal parts (sites) the training data is split into.
total_partitions = 10

In [None]:
# Split the training metadata into `total_partitions` site files, keeping every
# SeriesInstanceUID group intact inside a single site.
data = pd.read_csv(os.path.join(Args.FL_META_DATA_PATH, 'fl_train_meta_data.csv'))
group_sort = data.groupby('SeriesInstanceUID')

# Number of series per site. Clamped to at least 1 so the integer division
# below stays defined when there are fewer series than sites (the trailing
# sites are then simply left empty).
partition = max(1, len(group_sort) // total_partitions)

# Collect each site's groups in a list and concatenate once at the end;
# calling pd.concat inside the loop would re-copy the site frame every time.
site_groups = [[] for _ in range(total_partitions)]
for index, (_, group) in enumerate(group_sort):
    # Consecutive runs of `partition` series go to sites 1..N in order; the
    # remainder left after the last full run is folded into the final site.
    site_index = min(index // partition, total_partitions - 1)
    site_groups[site_index].append(group)

site_frames = {}
for i, groups in enumerate(site_groups):
    if groups:
        site_frames[f'site_meta_{i+1}'] = pd.concat(groups)
    else:
        # Keep the header so an empty site still produces a well-formed CSV.
        site_frames[f'site_meta_{i+1}'] = pd.DataFrame(columns=data.columns)

# Keep site_meta_1 .. site_meta_N available as module-level names, as before,
# in case other cells refer to them.
globals().update(site_frames)

for name, frame in site_frames.items():
    file_name = f'{name}.csv'
    frame.to_csv(os.path.join(Args.FL_SITE_META_DATA_PATH, file_name), index=False)

### 把 label 和切割好的資料對應出來 ( train )

In [None]:
# For every per-site metadata file, pick the rows of train.csv whose 'Image'
# value matches one of the site's SOPInstanceUIDs and save them as the site's
# own CSV.
dirs = os.listdir(Args.FL_SITE_META_DATA_PATH)

# train.csv is the same for every site, so read it once instead of once per
# site file.
image_data = pd.read_csv(os.path.join(Args.TRAIN_CSV_PATH, 'train.csv'))

for file in dirs:
    # os.listdir returns every directory entry; skip anything that is not a
    # CSV (for example a notebook checkpoint folder) rather than failing on it.
    if not file.endswith('.csv'):
        continue
    print(file)

    # Site metadata: one row per SOPInstanceUID, tagged with its SeriesInstanceUID.
    series_data = pd.read_csv(os.path.join(Args.FL_SITE_META_DATA_PATH, file))
    grouped_series = series_data.groupby('SeriesInstanceUID')

    # Collect the matching rows series by series (this keeps the output ordered
    # by series) and concatenate once at the end instead of growing a frame
    # inside the loop.
    matched = [
        image_data[image_data['Image'].isin(list(group['SOPInstanceUID']))]
        for _, group in grouped_series
    ]
    if matched:
        result_df = pd.concat(matched)
    else:
        # No series in this file: still emit a CSV with the expected header.
        result_df = pd.DataFrame(columns=image_data.columns)

    # Build the output name from the first and third underscore-separated
    # parts of the input name, e.g. 'site_meta_3.csv' -> 'site-3.csv'.
    temp = file.split(".")
    file_name = temp[0].split("_")

    file_path = os.path.join(Args.FL_SITE_CSV_PATH, f"{file_name[0]}-{file_name[2]}.csv")
    # Save the final result as CSV.
    result_df.to_csv(file_path, index=False)

### 把 label 和切割好的資料對應出來 ( test )

In [None]:
# Build the test CSV: the rows of train.csv whose 'Image' value matches a
# SOPInstanceUID listed in the held-out test metadata.
series_data = pd.read_csv(os.path.join(Args.FL_META_DATA_PATH, 'fl_test_meta_data.csv'))

image_data = pd.read_csv(os.path.join(Args.TRAIN_CSV_PATH, 'train.csv'))

grouped_series = series_data.groupby('SeriesInstanceUID')

# Collect the matching rows series by series (this keeps the output ordered by
# series) and concatenate once; pd.concat inside the loop would re-copy the
# accumulated frame for every series.
matched = [
    image_data[image_data['Image'].isin(list(group['SOPInstanceUID']))]
    for _, group in grouped_series
]
if matched:
    result_df = pd.concat(matched)
else:
    # No test series: still emit a CSV with the expected header.
    result_df = pd.DataFrame(columns=image_data.columns)

file_path = os.path.join(Args.FL_TEST_CSV_PATH, "test_data.csv")
result_df.to_csv(file_path, index=False)