In [None]:
import os
import json
import csv
import pandas as pd
from shutil import copy2
from collections import defaultdict

# ------------------- CONFIG -------------------
# Project root on the author's Windows machine; every other path derives from it.
ROOT = r"C:\Users\BMEi\Documents\GitHub\WORK\Windows\CODE_BME\PROJECT_MALARIA"

# Inputs: the image folder and the two annotation files under <ROOT>/malaria.
IMAGE_ROOT, TRAIN_JSON, TEST_JSON = (
    os.path.join(ROOT, "malaria", leaf)
    for leaf in ("images", "training.json", "test.json")
)

# Outputs: per-split CSV files and image folders under <ROOT>/DATA_SET.
OUTPUT_DIR = os.path.join(ROOT, "DATA_SET")
TRAIN_CSV, TEST_CSV, TRAIN_IMG_DIR, TEST_IMG_DIR = (
    os.path.join(OUTPUT_DIR, leaf)
    for leaf in ("train.csv", "test.csv", "train_images", "test_images")
)

# Create the output tree up front so later steps can write into it.
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(TRAIN_IMG_DIR, exist_ok=True)
os.makedirs(TEST_IMG_DIR, exist_ok=True)

# ------------------- STEP 1: JSON → CSV -------------------
def json_to_csv(json_path, csv_path):
    """Flatten a JSON annotation file into a CSV with one row per object.

    The JSON document is read as a list of items. Each item supplies
    ``item['image']['pathname']`` (only its base name is kept) and a list
    ``item['objects']``; every object supplies ``category`` and a
    ``bounding_box`` whose ``minimum``/``maximum`` corners are keyed by
    ``r`` and ``c``.

    The CSV columns are ``image_name, label, xmin, ymin, xmax, ymax``;
    x values are taken from ``c`` and y values from ``r``.

    Args:
        json_path: Path of the JSON annotation file to read.
        csv_path: Path of the CSV file to write (overwritten if it exists).

    Returns:
        None. A summary line with the row count is printed.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform locale
    # encoding that open() would otherwise pick.
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    rows = []
    for item in data:
        image_name = os.path.basename(item['image']['pathname'])
        for obj in item['objects']:
            bbox = obj['bounding_box']
            low, high = bbox['minimum'], bbox['maximum']
            rows.append([
                image_name, obj['category'],
                low['c'], low['r'],
                high['c'], high['r'],
            ])

    # Write UTF-8 so the file can be read back with pandas.read_csv, whose
    # default encoding is UTF-8, regardless of the machine's locale.
    with open(csv_path, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["image_name", "label", "xmin", "ymin", "xmax", "ymax"])
        writer.writerows(rows)
    print(f"✅ STEP 1: CSV created: {csv_path} | Total rows: {len(rows)}")

# ------------------- STEP 2: Validate CSV ↔ JSON (Row-by-row) -------------------
def validate_csv_rows_against_json(json_path, csv_path, image_dir):
    """Print a report comparing a generated CSV with its source JSON.

    Two checks are made; results are only printed, nothing is returned or
    modified on disk:

    * STEP 2.1 - every ``image_name`` in the CSV exists as a path inside
      ``image_dir``.
    * STEP 2.2 - every CSV row whose image exists also appears among the rows
      rebuilt from the JSON file. Rows present in the JSON but absent from
      the CSV are not reported.

    Args:
        json_path: Path of the JSON annotation file.
        csv_path: Path of the CSV file to check.
        image_dir: Directory in which the referenced images are looked up.
    """
    key_columns = ["image_name", "label", "xmin", "ymin", "xmax", "ymax"]

    # Decode explicitly as UTF-8 rather than with the platform locale encoding.
    with open(json_path, 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    json_rows = []
    for item in json_data:
        image_name = os.path.basename(item['image']['pathname'])
        for obj in item['objects']:
            bbox = obj['bounding_box']
            json_rows.append({
                "image_name": image_name,
                "label": obj['category'],
                "xmin": bbox['minimum']['c'],
                "ymin": bbox['minimum']['r'],
                "xmax": bbox['maximum']['c'],
                "ymax": bbox['maximum']['r']
            })

    # Explicit columns keep the frame mergeable even when the JSON holds no
    # objects; duplicates are dropped because they cannot change which CSV
    # rows are unmatched and would only multiply the merged rows.
    df_json = pd.DataFrame(json_rows, columns=key_columns).drop_duplicates()
    df_csv = pd.read_csv(csv_path)

    # Check that the images exist: probe the filesystem once per distinct
    # image rather than once per row. isin() always yields a boolean mask,
    # also for a header-only CSV.
    existing_names = [
        name for name in df_csv['image_name'].unique()
        if os.path.exists(os.path.join(image_dir, name))
    ]
    image_exists = df_csv['image_name'].isin(existing_names)
    missing_images = df_csv.loc[~image_exists, 'image_name'].unique()

    if len(missing_images) > 0:
        print(f"❌ พบ {len(missing_images)} รูปที่ไม่มีอยู่จริงในโฟลเดอร์: {image_dir}")
        print("📛 ตัวอย่าง:", list(missing_images[:5]))
    else:
        print(f"✅ STEP 2.1: ทุกภาพใน {csv_path} มีอยู่จริง")

    # Check that each CSV row matches the JSON: a left merge on the shared
    # columns marks CSV rows with no JSON counterpart as 'left_only'.
    df_csv_valid = df_csv[image_exists]
    df_merged = df_csv_valid.merge(df_json, how="left", indicator=True)
    mismatched = df_merged[df_merged['_merge'] == 'left_only']

    if not mismatched.empty:
        print(f"❌ STEP 2.2: พบ {len(mismatched)} แถวที่ไม่ตรงกับ JSON:")
        print(mismatched[['image_name', 'label', 'xmin', 'ymin', 'xmax', 'ymax']].head())
    else:
        print(f"✅ STEP 2.2: ทุกแถวใน {csv_path} ตรงกับข้อมูลจาก {json_path}")

# ------------------- STEP 3–4: Filter the .csv down to images that actually exist -------------------
def filter_csv_if_image_missing(csv_path, image_dir):
    """Rewrite a CSV in place, keeping only rows whose image file exists.

    Args:
        csv_path: Path of the CSV file; it is read and then overwritten.
        image_dir: Directory in which each ``image_name`` is looked up.

    Returns:
        None. The number of removed and remaining rows is printed.
    """
    df = pd.read_csv(csv_path)
    total = len(df)

    # Probe the filesystem once per distinct image instead of once per row.
    # isin() always returns a boolean mask, so a header-only CSV is handled
    # like any other input.
    existing_names = [
        name for name in df['image_name'].unique()
        if os.path.exists(os.path.join(image_dir, name))
    ]
    kept = df[df['image_name'].isin(existing_names)]
    removed = total - len(kept)

    kept.to_csv(csv_path, index=False)
    print(f"✅ STEP 3–4: กรอง CSV แล้ว: {csv_path}")
    print(f"   → ลบ {removed} แถวที่รูปหายไป (เหลือ: {len(kept)}/{total})")

# ------------------- STEP 5: Copy images -------------------
def copy_images_from_csv(csv_path, image_src_dir, image_dst_dir):
    """Copy every distinct image named in a CSV from one folder to another.

    Names whose source path does not exist are skipped without error.

    Args:
        csv_path: CSV file with an ``image_name`` column.
        image_src_dir: Folder the images are copied from.
        image_dst_dir: Folder the images are copied into.

    Returns:
        None. The number of copied images is printed.
    """
    unique_names = pd.read_csv(csv_path)['image_name'].unique()

    copied = 0
    for file_name in unique_names:
        source_path = os.path.join(image_src_dir, file_name)
        target_path = os.path.join(image_dst_dir, file_name)
        if not os.path.exists(source_path):
            continue  # best effort: missing sources are ignored
        copy2(source_path, target_path)
        copied += 1

    print(f"✅ STEP 5: คัดลอกภาพ {copied} รูป → {image_dst_dir}")

# ------------------- RUN -------------------
# The steps below build on each other's output and must run in this order.

# STEP 1: write one CSV per split from its JSON annotation file.
json_to_csv(TRAIN_JSON, TRAIN_CSV)
json_to_csv(TEST_JSON, TEST_CSV)

# STEP 2: print a report of missing images and of CSV rows not found in the JSON.
validate_csv_rows_against_json(TRAIN_JSON, TRAIN_CSV, IMAGE_ROOT)
validate_csv_rows_against_json(TEST_JSON, TEST_CSV, IMAGE_ROOT)

# STEP 3–4: rewrite each CSV in place, dropping rows whose image is not in IMAGE_ROOT.
filter_csv_if_image_missing(TRAIN_CSV, IMAGE_ROOT)
filter_csv_if_image_missing(TEST_CSV, IMAGE_ROOT)

# STEP 5: copy the images still listed in each CSV into that split's image folder.
copy_images_from_csv(TRAIN_CSV, IMAGE_ROOT, TRAIN_IMG_DIR)
copy_images_from_csv(TEST_CSV, IMAGE_ROOT, TEST_IMG_DIR)

In [None]:
# SELECT_DATA_TRANSFORM_2CLASS

import os
import json
import csv
import pandas as pd
from shutil import copy2
from collections import defaultdict

# ------------------- CONFIG -------------------
# Project root on the author's Windows machine; every other path derives from it.
ROOT = r"C:\Users\BMEi\Documents\GitHub\WORK\Windows\CODE_BME\PROJECT_MALARIA"

# Inputs: the image folder and the two annotation files under <ROOT>/malaria.
IMAGE_ROOT, TRAIN_JSON, TEST_JSON = (
    os.path.join(ROOT, "malaria", leaf)
    for leaf in ("images", "training.json", "test.json")
)

# Outputs: per-split CSV files and image folders under <ROOT>/DATA_SET_2CLASS.
OUTPUT_DIR = os.path.join(ROOT, "DATA_SET_2CLASS")
TRAIN_CSV, TEST_CSV, TRAIN_IMG_DIR, TEST_IMG_DIR = (
    os.path.join(OUTPUT_DIR, leaf)
    for leaf in ("train.csv", "test.csv", "train_images", "test_images")
)

# Create the output tree up front so later steps can write into it.
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(TRAIN_IMG_DIR, exist_ok=True)
os.makedirs(TEST_IMG_DIR, exist_ok=True)

# ------------------- STEP 1: JSON → CSV (convert labels to binary) -------------------
def label_to_binary(label: str) -> str:
    """Collapse a category name into one of two classes.

    Returns ``"normal"`` only for the exact string ``"red blood cell"``;
    every other value maps to ``"abnormal"``.
    """
    if label == "red blood cell":
        return "normal"
    return "abnormal"

def json_to_csv(json_path, csv_path):
    """Flatten a JSON annotation file into a two-class CSV, one row per object.

    The JSON document is read as a list of items. Each item supplies
    ``item['image']['pathname']`` (only its base name is kept) and a list
    ``item['objects']``; every object supplies ``category`` and a
    ``bounding_box`` whose ``minimum``/``maximum`` corners are keyed by
    ``r`` and ``c``. The category is passed through ``label_to_binary``
    before it is written.

    The CSV columns are ``image_name, label, xmin, ymin, xmax, ymax``;
    x values are taken from ``c`` and y values from ``r``.

    Args:
        json_path: Path of the JSON annotation file to read.
        csv_path: Path of the CSV file to write (overwritten if it exists).

    Returns:
        None. A summary line with the row count is printed.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform locale
    # encoding that open() would otherwise pick.
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    rows = []
    for item in data:
        image_name = os.path.basename(item['image']['pathname'])
        for obj in item['objects']:
            bbox = obj['bounding_box']
            low, high = bbox['minimum'], bbox['maximum']
            rows.append([
                image_name,
                label_to_binary(obj['category']),  # use the 2-class label
                low['c'], low['r'],
                high['c'], high['r'],
            ])

    # Write UTF-8 so the file can be read back with pandas.read_csv, whose
    # default encoding is UTF-8, regardless of the machine's locale.
    with open(csv_path, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["image_name", "label", "xmin", "ymin", "xmax", "ymax"])
        writer.writerows(rows)
    print(f"✅ STEP 1: CSV created: {csv_path} | Total rows: {len(rows)}")

# ------------------- STEP 2: Validate CSV ↔ JSON -------------------
def validate_csv_rows_against_json(json_path, csv_path, image_dir):
    """Print a report comparing a generated two-class CSV with its source JSON.

    Two checks are made; results are only printed, nothing is returned or
    modified on disk:

    * STEP 2.1 - every ``image_name`` in the CSV exists as a path inside
      ``image_dir``.
    * STEP 2.2 - every CSV row whose image exists also appears among the rows
      rebuilt from the JSON file (with categories mapped through
      ``label_to_binary``). Rows present in the JSON but absent from the CSV
      are not reported.

    Args:
        json_path: Path of the JSON annotation file.
        csv_path: Path of the CSV file to check.
        image_dir: Directory in which the referenced images are looked up.
    """
    key_columns = ["image_name", "label", "xmin", "ymin", "xmax", "ymax"]

    # Decode explicitly as UTF-8 rather than with the platform locale encoding.
    with open(json_path, 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    json_rows = []
    for item in json_data:
        image_name = os.path.basename(item['image']['pathname'])
        for obj in item['objects']:
            bbox = obj['bounding_box']
            json_rows.append({
                "image_name": image_name,
                "label": label_to_binary(obj['category']),
                "xmin": bbox['minimum']['c'],
                "ymin": bbox['minimum']['r'],
                "xmax": bbox['maximum']['c'],
                "ymax": bbox['maximum']['r']
            })

    # Explicit columns keep the frame mergeable even when the JSON holds no
    # objects; duplicates are dropped because they cannot change which CSV
    # rows are unmatched and would only multiply the merged rows.
    df_json = pd.DataFrame(json_rows, columns=key_columns).drop_duplicates()
    df_csv = pd.read_csv(csv_path)

    # Check that the images exist: probe the filesystem once per distinct
    # image rather than once per row. isin() always yields a boolean mask,
    # also for a header-only CSV.
    existing_names = [
        name for name in df_csv['image_name'].unique()
        if os.path.exists(os.path.join(image_dir, name))
    ]
    image_exists = df_csv['image_name'].isin(existing_names)
    missing_images = df_csv.loc[~image_exists, 'image_name'].unique()

    if len(missing_images) > 0:
        print(f"❌ พบ {len(missing_images)} รูปที่ไม่มีอยู่จริงในโฟลเดอร์: {image_dir}")
        print("📛 ตัวอย่าง:", list(missing_images[:5]))
    else:
        print(f"✅ STEP 2.1: ทุกภาพใน {csv_path} มีอยู่จริง")

    # Check that each CSV row matches the JSON: a left merge on the shared
    # columns marks CSV rows with no JSON counterpart as 'left_only'.
    df_csv_valid = df_csv[image_exists]
    df_merged = df_csv_valid.merge(df_json, how="left", indicator=True)
    mismatched = df_merged[df_merged['_merge'] == 'left_only']

    if not mismatched.empty:
        print(f"❌ STEP 2.2: พบ {len(mismatched)} แถวที่ไม่ตรงกับ JSON:")
        print(mismatched[['image_name', 'label', 'xmin', 'ymin', 'xmax', 'ymax']].head())
    else:
        print(f"✅ STEP 2.2: ทุกแถวใน {csv_path} ตรงกับข้อมูลจาก {json_path}")

# ------------------- STEP 3–4: Filter the .csv down to images that actually exist -------------------
def filter_csv_if_image_missing(csv_path, image_dir):
    """Rewrite a CSV in place, keeping only rows whose image file exists.

    Args:
        csv_path: Path of the CSV file; it is read and then overwritten.
        image_dir: Directory in which each ``image_name`` is looked up.

    Returns:
        None. The number of removed and remaining rows is printed.
    """
    df = pd.read_csv(csv_path)
    total = len(df)

    # Probe the filesystem once per distinct image instead of once per row.
    # isin() always returns a boolean mask, so a header-only CSV is handled
    # like any other input.
    existing_names = [
        name for name in df['image_name'].unique()
        if os.path.exists(os.path.join(image_dir, name))
    ]
    kept = df[df['image_name'].isin(existing_names)]
    removed = total - len(kept)

    kept.to_csv(csv_path, index=False)
    print(f"✅ STEP 3–4: กรอง CSV แล้ว: {csv_path}")
    print(f"   → ลบ {removed} แถวที่รูปหายไป (เหลือ: {len(kept)}/{total})")

# ------------------- STEP 5: Copy images -------------------
def copy_images_from_csv(csv_path, image_src_dir, image_dst_dir):
    """Copy every distinct image named in a CSV from one folder to another.

    Names whose source path does not exist are skipped without error.

    Args:
        csv_path: CSV file with an ``image_name`` column.
        image_src_dir: Folder the images are copied from.
        image_dst_dir: Folder the images are copied into.

    Returns:
        None. The number of copied images is printed.
    """
    unique_names = pd.read_csv(csv_path)['image_name'].unique()

    copied = 0
    for file_name in unique_names:
        source_path = os.path.join(image_src_dir, file_name)
        target_path = os.path.join(image_dst_dir, file_name)
        if not os.path.exists(source_path):
            continue  # best effort: missing sources are ignored
        copy2(source_path, target_path)
        copied += 1

    print(f"✅ STEP 5: คัดลอกภาพ {copied} รูป → {image_dst_dir}")

# ------------------- RUN -------------------
# The steps below build on each other's output and must run in this order.

# STEP 1: write one two-class CSV per split from its JSON annotation file.
json_to_csv(TRAIN_JSON, TRAIN_CSV)
json_to_csv(TEST_JSON, TEST_CSV)

# STEP 2: print a report of missing images and of CSV rows not found in the JSON.
validate_csv_rows_against_json(TRAIN_JSON, TRAIN_CSV, IMAGE_ROOT)
validate_csv_rows_against_json(TEST_JSON, TEST_CSV, IMAGE_ROOT)

# STEP 3–4: rewrite each CSV in place, dropping rows whose image is not in IMAGE_ROOT.
filter_csv_if_image_missing(TRAIN_CSV, IMAGE_ROOT)
filter_csv_if_image_missing(TEST_CSV, IMAGE_ROOT)

# STEP 5: copy the images still listed in each CSV into that split's image folder.
copy_images_from_csv(TRAIN_CSV, IMAGE_ROOT, TRAIN_IMG_DIR)
copy_images_from_csv(TEST_CSV, IMAGE_ROOT, TEST_IMG_DIR)
