In [1]:
import json
import random
import os
import shutil

# Web dataset: source metadata and image folder
web_input_file = "../ShowUI-web-8k/metadata/hf_train.json"
web_images_dir = "../ShowUI-web-8k/images"

# Mobile dataset: source metadata and image folder
mobile_input_file = "../AMEX-8k/metadata/hf_train.json"
mobile_images_dir = "../AMEX-8k/images"

# Desktop dataset: source metadata and image folder
desktop_input_file = "../ShowUI-desktop/metadata/hf_train_ori_coord.json"
desktop_images_dir = "../ShowUI-desktop/images"

# Destinations for the merged metadata file and the copied images
output_file = "./metadata/hf_train.json"
images_output_dir = "./images"

# Create both output directories up front (no error if they already exist)
os.makedirs(images_output_dir, exist_ok=True)
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Seed the global random generator so repeated runs draw the same samples
random.seed(42)

# Random sampling helper
def sample_data(input_file, num_samples):
    """Load a JSON file and return a random sample of its entries.

    Args:
        input_file: Path of a UTF-8 encoded JSON file. Its top-level value is
            passed to ``random.sample``, so it must be a sequence (e.g. a list).
        num_samples: Number of entries requested.

    Returns:
        A list of entries chosen without replacement via the module-level
        ``random`` generator. When the file holds fewer than ``num_samples``
        entries, every entry is returned (in random order).
    """
    with open(input_file, "r", encoding='utf-8') as src:
        records = json.load(src)

    # Cap the request at the number of available entries; asking
    # random.sample for more than the population size raises ValueError.
    available = len(records)
    sample_size = available if available < num_samples else num_samples
    return random.sample(records, sample_size)

# Image copying helper
def copy_images(samples, source_images_dir, dataset_name, *, output_dir=None):
    """Copy the image referenced by each sample into the output image directory.

    The image's relative path is read from the first non-empty of the keys
    ``"img_url"``, ``"image"`` and ``"image_path"``. The same relative path is
    used below both ``source_images_dir`` and the output directory, so any
    sub-directory layout is preserved.

    Copying is best-effort: a sample with no image path, a missing source file
    or a failed copy is reported with ``print`` and counted as a failure; the
    remaining samples are still processed.

    Args:
        samples: Iterable of dict-like samples (each must support ``.get``).
        source_images_dir: Directory the relative image paths are resolved in.
        dataset_name: Label used only in the printed summary line.
        output_dir: Destination root. Defaults to the module-level
            ``images_output_dir``, looked up at call time.

    Returns:
        A ``(copied_count, failed_count)`` tuple.
    """
    if output_dir is None:
        # Resolved per call so a later change to the module setting is honored.
        output_dir = images_output_dir

    copied_count = 0
    failed_count = 0

    for sample in samples:
        # The datasets do not agree on the field name, so try each in turn.
        img_url = sample.get("img_url") or sample.get("image") or sample.get("image_path")
        if not img_url:
            print("警告：樣本中沒有找到圖片路徑字段")
            failed_count += 1
            continue

        source_path = os.path.join(source_images_dir, img_url)
        target_path = os.path.join(output_dir, img_url)

        # Check the source first so a missing image does not leave an empty
        # directory behind in the output tree.
        if not os.path.exists(source_path):
            print(f"警告：找不到圖片 {source_path}")
            failed_count += 1
            continue

        try:
            target_dir = os.path.dirname(target_path)
            if target_dir:
                # os.makedirs("") raises, so skip it for a bare filename.
                os.makedirs(target_dir, exist_ok=True)
            shutil.copy2(source_path, target_path)
        except (OSError, ValueError) as e:
            # Directory creation is covered too, so a single unwritable
            # target counts as one failure instead of aborting the batch.
            print(f"複製圖片失敗 {source_path}: {e}")
            failed_count += 1
        else:
            copied_count += 1

    print(f"{dataset_name} 圖片複製完成：成功 {copied_count} 個，失敗 {failed_count} 個")
    return copied_count, failed_count

# Draw the per-dataset samples. Each call uses the shared random generator,
# so the mobile -> web -> desktop call order determines which entries are picked.
mobile_samples = sample_data(mobile_input_file, 185)
web_samples = sample_data(web_input_file, 185)
desktop_samples = sample_data(desktop_input_file, 30)


def _write_json(path, payload):
    """Write *payload* to *path* as indented UTF-8 JSON without ASCII escaping."""
    with open(path, "w", encoding='utf-8') as handle:
        json.dump(payload, handle, indent=2, ensure_ascii=False)


# Keep a separate file for each dataset's sample
_write_json("./metadata/mobile_sample_185.json", mobile_samples)
_write_json("./metadata/web_sample_185.json", web_samples)
_write_json("./metadata/desktop_sample_30.json", desktop_samples)

# Copy the images that belong to the sampled entries into the output image directory
copy_images(mobile_samples, mobile_images_dir, "Mobile")
copy_images(web_samples, web_images_dir, "Web")
copy_images(desktop_samples, desktop_images_dir, "Desktop")

# Merge the three samples and store them as the combined metadata file
all_samples = [*mobile_samples, *web_samples, *desktop_samples]
_write_json(output_file, all_samples)

print("\n✅ 處理完成！")
print(f"總計：{len(all_samples)} 個樣本已存到 {output_file}")
print(f"圖片已複製到 {images_output_dir} 目錄")

Mobile 圖片複製完成：成功 185 個，失敗 0 個
Web 圖片複製完成：成功 185 個，失敗 0 個
Desktop 圖片複製完成：成功 30 個，失敗 0 個

✅ 處理完成！
總計：400 個樣本已存到 ./metadata/hf_train.json
圖片已複製到 ./images 目錄
