In [7]:
import os
import json
import csv
import pandas as pd
from shutil import copy2
from collections import defaultdict

# ------------------- CONFIG -------------------
# Project root; every other path below is derived from it.
ROOT = r"C:\Users\BMEi\Documents\GitHub\WORK\Windows\CODE_BME\PROJECT_MALARIA"

# Inputs: the "malaria" folder holds the source images and both annotation files.
_MALARIA_DIR = os.path.join(ROOT, "malaria")
IMAGE_ROOT = os.path.join(_MALARIA_DIR, "images")
TRAIN_JSON = os.path.join(_MALARIA_DIR, "training.json")
TEST_JSON = os.path.join(_MALARIA_DIR, "test.json")

# Outputs: one CSV and one image folder per split, all under DATA_SET.
OUTPUT_DIR = os.path.join(ROOT, "DATA_SET")
TRAIN_CSV = os.path.join(OUTPUT_DIR, "train.csv")
TEST_CSV = os.path.join(OUTPUT_DIR, "test.csv")
TRAIN_IMG_DIR = os.path.join(OUTPUT_DIR, "train_images")
TEST_IMG_DIR = os.path.join(OUTPUT_DIR, "test_images")

# Create the output tree up front; exist_ok makes re-running harmless.
for _out_dir in (OUTPUT_DIR, TRAIN_IMG_DIR, TEST_IMG_DIR):
    os.makedirs(_out_dir, exist_ok=True)

# ------------------- STEP 1: JSON → CSV -------------------
def json_to_csv(json_path, csv_path):
    """Flatten an annotation JSON file into a one-row-per-object CSV file.

    The JSON document is iterated as a sequence of items. Each item must
    provide ``item['image']['pathname']`` and ``item['objects']``; each object
    must provide ``category`` and a ``bounding_box`` whose ``minimum`` and
    ``maximum`` entries are keyed by ``r`` and ``c``.

    The CSV gets the header ``image_name,label,xmin,ymin,xmax,ymax`` where
    ``image_name`` is the basename of the pathname, ``xmin``/``xmax`` come
    from the ``c`` values and ``ymin``/``ymax`` from the ``r`` values.

    Args:
        json_path: Path of the annotation JSON file to read.
        csv_path: Path of the CSV file to create (overwritten if present).

    Returns:
        None. A one-line summary with the row count is printed.

    Raises:
        KeyError: If an item or object lacks one of the keys listed above.
    """
    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding, which on Windows is a legacy code page.
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    rows = []
    for item in data:
        # Only the file name is kept; the directory part of the path is dropped.
        image_name = os.path.basename(item['image']['pathname'])
        for obj in item['objects']:
            bbox = obj['bounding_box']
            lower, upper = bbox['minimum'], bbox['maximum']
            rows.append([
                image_name, obj['category'],
                lower['c'], lower['r'],
                upper['c'], upper['r'],
            ])

    # Written as UTF-8 so that pandas.read_csv (UTF-8 by default) reads back
    # exactly what was written; newline='' is what the csv module expects.
    with open(csv_path, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["image_name", "label", "xmin", "ymin", "xmax", "ymax"])
        writer.writerows(rows)
    print(f"‚úÖ STEP 1: CSV created: {csv_path} | Total rows: {len(rows)}")

# ------------------- STEP 2: Validate CSV ↔ JSON (Row-by-row) -------------------
def validate_csv_rows_against_json(json_path, csv_path, image_dir):
    """Check a generated CSV against its source JSON and the image folder.

    Two checks are performed and their results are printed:

    * STEP 2.1 - every ``image_name`` in the CSV exists under ``image_dir``.
    * STEP 2.2 - every CSV row whose image exists has an identical row
      (same image name, label and box coordinates) derived from the JSON.

    The second check is one-directional: rows present in the JSON but absent
    from the CSV are not reported.

    Args:
        json_path: Path of the annotation JSON file the CSV was built from.
        csv_path: Path of the CSV file to validate.
        image_dir: Directory in which the image files are looked up.

    Returns:
        None. Findings are only printed.

    Raises:
        KeyError: If the JSON lacks an expected key or the CSV has no
            ``image_name`` column.
    """
    columns = ["image_name", "label", "xmin", "ymin", "xmax", "ymax"]

    # Read as UTF-8 explicitly instead of relying on the platform default.
    with open(json_path, 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    # Rebuild the rows the CSV is expected to contain.
    json_rows = []
    for item in json_data:
        image_name = os.path.basename(item['image']['pathname'])
        for obj in item['objects']:
            bbox = obj['bounding_box']
            json_rows.append({
                "image_name": image_name,
                "label": obj['category'],
                "xmin": bbox['minimum']['c'],
                "ymin": bbox['minimum']['r'],
                "xmax": bbox['maximum']['c'],
                "ymax": bbox['maximum']['r']
            })

    # Naming the columns keeps the frame mergeable even when the JSON holds no
    # objects (a frame built from an empty list would have no columns at all).
    df_json = pd.DataFrame(json_rows, columns=columns)
    df_csv = pd.read_csv(csv_path)

    # Check that the images actually exist. The file system is queried once
    # per distinct image name rather than once per row, and isin() always
    # yields a boolean mask, including for an empty CSV.
    existing = {
        name for name in df_csv['image_name'].dropna().unique()
        if os.path.exists(os.path.join(image_dir, name))
    }
    image_exists = df_csv['image_name'].isin(existing)
    missing_images = df_csv[~image_exists]['image_name'].unique()

    if len(missing_images) > 0:
        print(f"‚ùå ‡∏û‡∏ö {len(missing_images)} ‡∏£‡∏π‡∏õ‡∏ó‡∏µ‡πà‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏≠‡∏¢‡∏π‡πà‡∏à‡∏£‡∏¥‡∏á‡πÉ‡∏ô‡πÇ‡∏ü‡∏•‡πÄ‡∏î‡∏≠‡∏£‡πå: {image_dir}")
        print("üìõ ‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á:", list(missing_images[:5]))
    else:
        print(f"‚úÖ STEP 2.1: ‡∏ó‡∏∏‡∏Å‡∏†‡∏≤‡∏û‡πÉ‡∏ô {csv_path} ‡∏°‡∏µ‡∏≠‡∏¢‡∏π‡πà‡∏à‡∏£‡∏¥‡∏á")

    # Check that the rows in the .csv match the .json. Only rows whose image
    # exists are compared; a left merge with an indicator marks CSV rows that
    # have no identical JSON row as 'left_only'.
    df_csv_valid = df_csv[image_exists]
    df_merged = df_csv_valid.merge(df_json, how="left", indicator=True)
    mismatched = df_merged[df_merged['_merge'] == 'left_only']

    if not mismatched.empty:
        print(f"‚ùå STEP 2.2: ‡∏û‡∏ö {len(mismatched)} ‡πÅ‡∏ñ‡∏ß‡∏ó‡∏µ‡πà‡πÑ‡∏°‡πà‡∏ï‡∏£‡∏á‡∏Å‡∏±‡∏ö JSON:")
        print(mismatched[['image_name', 'label', 'xmin', 'ymin', 'xmax', 'ymax']].head())
    else:
        print(f"‚úÖ STEP 2.2: ‡∏ó‡∏∏‡∏Å‡πÅ‡∏ñ‡∏ß‡πÉ‡∏ô {csv_path} ‡∏ï‡∏£‡∏á‡∏Å‡∏±‡∏ö‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏à‡∏≤‡∏Å {json_path}")

# ------------------- STEP 3–4: Filter the .csv to keep only images that actually exist -------------------
def filter_csv_if_image_missing(csv_path, image_dir):
    """Rewrite a CSV in place, keeping only rows whose image file exists.

    Args:
        csv_path: Path of the CSV file; it is read and then overwritten with
            the filtered rows (written without the index column).
        image_dir: Directory in which each ``image_name`` is looked up.

    Returns:
        None. Two summary lines (file path, rows removed and rows kept out of
        the original total) are printed.

    Raises:
        KeyError: If the CSV has no ``image_name`` column.
    """
    df = pd.read_csv(csv_path)
    total = len(df)

    # Query the file system once per distinct image name instead of once per
    # row. Rows with a blank image name can never match and are dropped too.
    existing = {
        name for name in df['image_name'].dropna().unique()
        if os.path.exists(os.path.join(image_dir, name))
    }
    # isin() always returns a boolean mask, even for a header-only CSV.
    keep = df['image_name'].isin(existing)

    missing = df[~keep]
    df = df[keep]
    df.to_csv(csv_path, index=False)
    print(f"‚úÖ STEP 3‚Äì4: ‡∏Å‡∏£‡∏≠‡∏á CSV ‡πÅ‡∏•‡πâ‡∏ß: {csv_path}")
    print(f"   ‚Üí ‡∏•‡∏ö {len(missing)} ‡πÅ‡∏ñ‡∏ß‡∏ó‡∏µ‡πà‡∏£‡∏π‡∏õ‡∏´‡∏≤‡∏¢‡πÑ‡∏õ (‡πÄ‡∏´‡∏•‡∏∑‡∏≠: {len(df)}/{total})")

# ------------------- STEP 5: Copy images -------------------
def copy_images_from_csv(csv_path, image_src_dir, image_dst_dir):
    """Copy every distinct image listed in a CSV from one folder to another.

    Args:
        csv_path: Path of a CSV file with an ``image_name`` column.
        image_src_dir: Directory the images are copied from.
        image_dst_dir: Directory the images are copied to. It is created
            (including parents) when it does not exist yet.

    Returns:
        None. A summary line with the number of copied images is printed.

    Raises:
        KeyError: If the CSV has no ``image_name`` column.
    """
    df = pd.read_csv(csv_path)

    # Do not depend on the caller having prepared the destination folder.
    os.makedirs(image_dst_dir, exist_ok=True)

    copied = 0
    # Each image is copied once no matter how many rows refer to it; blank
    # names are skipped because they cannot be joined into a path.
    for image_name in df['image_name'].dropna().unique():
        src = os.path.join(image_src_dir, image_name)
        # Names that are missing, or that resolve to something other than a
        # regular file, are skipped rather than handed to copy2.
        if not os.path.isfile(src):
            continue
        copy2(src, os.path.join(image_dst_dir, image_name))
        copied += 1
    print(f"‚úÖ STEP 5: ‡∏Ñ‡∏±‡∏î‡∏•‡∏≠‡∏Å‡∏†‡∏≤‡∏û {copied} ‡∏£‡∏π‡∏õ ‚Üí {image_dst_dir}")

# ------------------- RUN -------------------
# Each stage is run for the training split and then the test split, and a
# stage is finished for both splits before the next stage begins.
_SPLITS = (
    (TRAIN_JSON, TRAIN_CSV, TRAIN_IMG_DIR),
    (TEST_JSON, TEST_CSV, TEST_IMG_DIR),
)

# STEP 1: build the CSV files from the JSON annotations.
for _json_path, _csv_path, _ in _SPLITS:
    json_to_csv(_json_path, _csv_path)

# STEP 2: check the CSV files against the JSON and the image folder.
for _json_path, _csv_path, _ in _SPLITS:
    validate_csv_rows_against_json(_json_path, _csv_path, IMAGE_ROOT)

# STEP 3-4: drop CSV rows whose image is not present.
for _, _csv_path, _ in _SPLITS:
    filter_csv_if_image_missing(_csv_path, IMAGE_ROOT)

# STEP 5: copy the remaining images into the per-split output folders.
for _, _csv_path, _img_dir in _SPLITS:
    copy_images_from_csv(_csv_path, IMAGE_ROOT, _img_dir)

‚úÖ STEP 1: CSV created: C:\Users\BMEi\Documents\GitHub\WORK\Windows\CODE_BME\PROJECT_MALARIA\DATA_SET\train.csv | Total rows: 80113
‚úÖ STEP 1: CSV created: C:\Users\BMEi\Documents\GitHub\WORK\Windows\CODE_BME\PROJECT_MALARIA\DATA_SET\test.csv | Total rows: 5922
‚ùå ‡∏û‡∏ö 1 ‡∏£‡∏π‡∏õ‡∏ó‡∏µ‡πà‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏≠‡∏¢‡∏π‡πà‡∏à‡∏£‡∏¥‡∏á‡πÉ‡∏ô‡πÇ‡∏ü‡∏•‡πÄ‡∏î‡∏≠‡∏£‡πå: C:\Users\BMEi\Documents\GitHub\WORK\Windows\CODE_BME\PROJECT_MALARIA\malaria\images
üìõ ‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á: ['8d02117d-6c71-4e47-b50a-6cc8d5eb1d55.png']
‚úÖ STEP 2.2: ‡∏ó‡∏∏‡∏Å‡πÅ‡∏ñ‡∏ß‡πÉ‡∏ô C:\Users\BMEi\Documents\GitHub\WORK\Windows\CODE_BME\PROJECT_MALARIA\DATA_SET\train.csv ‡∏ï‡∏£‡∏á‡∏Å‡∏±‡∏ö‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏à‡∏≤‡∏Å C:\Users\BMEi\Documents\GitHub\WORK\Windows\CODE_BME\PROJECT_MALARIA\malaria\training.json
‚úÖ STEP 2.1: ‡∏ó‡∏∏‡∏Å‡∏†‡∏≤‡∏û‡πÉ‡∏ô C:\Users\BMEi\Documents\GitHub\WORK\Windows\CODE_BME\PROJECT_MALARIA\DATA_SET\test.csv ‡∏°‡∏µ‡∏≠‡∏¢‡∏π‡πà‡∏à‡∏£‡∏¥‡∏á
‚úÖ STEP 2.2: ‡∏ó‡∏∏‡∏Å‡πÅ‡∏ñ‡∏ß‡πÉ‡∏ô C:\Users\BMEi\