<a href="https://colab.research.google.com/github/Kolo-Naukowe-Axion/Angiography/blob/main/arcade_cadica.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import glob
import json
import shutil
import subprocess
import pandas as pd
import cv2
from pathlib import Path
from tqdm.auto import tqdm
import kagglehub

# ==========================================
# 0. SETUP & HELPER FUNCTIONS
# ==========================================
print("--- INITIALIZING FUSION PIPELINE ---")

# Three output roots, each with an `images/` and a `labels/` subfolder:
#   1. every processed frame (background + stenosis)
#   2. frames that carry at least one stenosis box
#   3. frames without any stenosis box (background only)
DIR_ALL = Path('fused_dataset_all')
DIR_STENOSIS = Path('fused_dataset_stenosis_only')
DIR_NO_STENOSIS = Path('fused_dataset_no_stenosis')

IMG_ALL, LBL_ALL = DIR_ALL / 'images', DIR_ALL / 'labels'
IMG_STENOSIS, LBL_STENOSIS = DIR_STENOSIS / 'images', DIR_STENOSIS / 'labels'
IMG_NO_STENOSIS, LBL_NO_STENOSIS = DIR_NO_STENOSIS / 'images', DIR_NO_STENOSIS / 'labels'

# Make sure every output folder exists before anything is copied into it.
for p in (IMG_ALL, LBL_ALL, IMG_STENOSIS, LBL_STENOSIS, IMG_NO_STENOSIS, LBL_NO_STENOSIS):
    p.mkdir(parents=True, exist_ok=True)

def run_cmd(cmd):
    """Run `cmd` through the shell and wait for it to finish.

    Raises subprocess.CalledProcessError when the command exits non-zero.
    """
    subprocess.run(
        cmd,
        check=True,
        shell=True,
    )

def to_yolo(img_w, img_h, x_min, y_min, box_w, box_h):
    """Convert a top-left-anchored pixel box into normalized YOLO form.

    Args:
        img_w, img_h: image size used as the normalization divisor.
        x_min, y_min: top-left corner of the box.
        box_w, box_h: box width and height.

    Returns:
        (x_center, y_center, width, height), each divided by the matching
        image dimension.
    """
    half_w = box_w / 2.0
    half_h = box_h / 2.0
    return (
        (x_min + half_w) / img_w,
        (y_min + half_h) / img_h,
        box_w / img_w,
        box_h / img_h,
    )

def save_image_and_labels(src_img_path, img_target_dir, lbl_target_dir, new_img_name, new_lbl_name, annotations):
    """Copy one image and write its label file next to it.

    Args:
        src_img_path: image file to copy.
        img_target_dir: directory (Path) that receives the image copy.
        lbl_target_dir: directory (Path) that receives the label file.
        new_img_name: file name of the image copy.
        new_lbl_name: file name of the label file.
        annotations: iterable of label lines, written back to back exactly as
            given; an empty iterable produces an empty label file.
    """
    image_dst = img_target_dir / new_img_name
    shutil.copy(src_img_path, image_dst)

    label_dst = lbl_target_dir / new_lbl_name
    with open(label_dst, 'w') as label_file:
        label_file.write(''.join(annotations))

# ==========================================
# 1. DOWNLOAD & PROCESS CADICA
# ==========================================
# Downloads the CADICA archive unless a `selectedVideos` tree already exists
# somewhere below the working directory, converts every ground-truth box to a
# YOLO line with class id 0, and copies each readable frame into the ALL
# folder plus either the STENOSIS or the NO_STENOSIS folder.
print("\n--- PROCESSING CADICA DATASET ---")
cadica_link = "https://data.mendeley.com/public-api/zip/p9bpx9ctcv/download/5"
cadica_zip = "cadica_outer.zip"

# Look for the extracted data the same way it is located for processing below.
# Checking only ./selectedVideos would trigger a fresh download (and an unzip
# over the existing files) whenever the archive unpacks into a subfolder.
cadica_roots = glob.glob('**/selectedVideos', recursive=True)

if not cadica_roots:
    print("Downloading CADICA...")
    # Snapshot the archives that are already present so that only an archive
    # delivered by this download is ever extracted and deleted.
    zips_before = set(glob.glob('**/*.zip', recursive=True))
    run_cmd(f"wget -q --show-progress -O {cadica_zip} {cadica_link}")
    run_cmd(f"unzip -q {cadica_zip}")
    os.remove(cadica_zip)
    inner_zips = [z for z in glob.glob('**/*.zip', recursive=True) if z not in zips_before]
    if inner_zips:
        run_cmd(f"unzip -q '{inner_zips[0]}'")
        os.remove(inner_zips[0])
    # Re-scan now that the archive(s) have been extracted.
    cadica_roots = glob.glob('**/selectedVideos', recursive=True)

if cadica_roots:
    cadica_path = cadica_roots[0]
    patients = [p for p in os.listdir(cadica_path) if os.path.isdir(os.path.join(cadica_path, p))]
    cadica_tasks = []

    # Pass 1: walk <patient>/<video>/input and pair every frame with the
    # same-named .txt file in <patient>/<video>/groundtruth, if there is one.
    for p_id in patients:
        p_path = os.path.join(cadica_path, p_id)
        videos = [v for v in os.listdir(p_path) if os.path.isdir(os.path.join(p_path, v))]

        for v_id in videos:
            input_path = os.path.join(p_path, v_id, 'input')
            gt_path = os.path.join(p_path, v_id, 'groundtruth')
            if not os.path.exists(input_path):
                continue

            for frame_path in glob.glob(os.path.join(input_path, '*.*')):
                frame_name = os.path.basename(frame_path)
                gt_file = os.path.join(gt_path, f"{os.path.splitext(frame_name)[0]}.txt")
                # NOTE(review): a frame without a ground-truth file ends up in
                # the NO_STENOSIS folder. Confirm that a missing file means
                # "no lesion" and not merely "frame was not annotated".
                if not os.path.exists(gt_file):
                    gt_file = None
                cadica_tasks.append((frame_path, gt_file, p_id, v_id, frame_name))

    # Pass 2: convert the labels and copy the files.
    for frame_path, gt_file, p_id, v_id, frame_name in tqdm(cadica_tasks, desc="CADICA All Images"):
        img = cv2.imread(frame_path)
        if img is None:
            # Not a readable image (the '*.*' glob matches any file type).
            continue
        h, w = img.shape[:2]

        # Prefix with patient and video ids so names stay unique in the flat
        # output folders.
        new_img_name = f"cadica_{p_id}_{v_id}_{frame_name}"
        new_lbl_name = new_img_name.rsplit('.', 1)[0] + '.txt'

        # 1. Parse annotations first. The first four fields of a line are read
        #    as x_min, y_min, width, height in pixels; extra fields are ignored
        #    and lines that are too short or non-numeric are skipped.
        annotations = []
        if gt_file is not None:
            with open(gt_file, 'r') as f_in:
                for line in f_in:
                    parts = line.split()
                    if len(parts) < 4:
                        continue
                    try:
                        x, y, bw, bh = (float(val) for val in parts[:4])
                    except ValueError:
                        continue
                    xc, yc, nw, nh = to_yolo(w, h, x, y, bw, bh)
                    annotations.append(f"0 {xc:.6f} {yc:.6f} {nw:.6f} {nh:.6f}\n")

        # 2. Check if the image contains stenosis
        has_stenosis = len(annotations) > 0

        # 3. Save to the ALL folder
        save_image_and_labels(frame_path, IMG_ALL, LBL_ALL, new_img_name, new_lbl_name, annotations)

        # 4. Save to the respective STENOSIS / NO_STENOSIS folders
        if has_stenosis:
            save_image_and_labels(frame_path, IMG_STENOSIS, LBL_STENOSIS, new_img_name, new_lbl_name, annotations)
        else:
            save_image_and_labels(frame_path, IMG_NO_STENOSIS, LBL_NO_STENOSIS, new_img_name, new_lbl_name, annotations)
else:
    # Previously this case was skipped without any message.
    print("-> [WARNING] No 'selectedVideos' folder found after download; CADICA was not processed.")


In [None]:
# ==========================================
# 2. PROCESS ARCADE (ROBUST VERSION)
# ==========================================
# Downloads ARCADE through kagglehub, converts the COCO boxes of every split to
# YOLO lines with class id 0, and copies each image into the ALL folder plus
# either the STENOSIS or the NO_STENOSIS folder.
print("\n--- PROCESSING ARCADE DATASET ---")

import kagglehub
arcade_base = kagglehub.dataset_download("nikitamanaenkov/annotated-x-ray-angiography-dataset")
print(f"Path to dataset files: {arcade_base}")

# 1. Dynamically find all JSON files instead of hardcoding the paths
json_paths = glob.glob(os.path.join(arcade_base, '**', '*.json'), recursive=True)

if not json_paths:
    print("-> [ERROR] Could not find ANY .json files in the ARCADE dataset! Check the download path.")

# 2. Prefer annotation files that live under a `stenosis` folder.
#    Every box below is written as class 0 and every output file is named
#    arcade_<split>_<file>, so annotation files of another task that share a
#    split name would be labelled as stenosis and overwrite each other.
#    NOTE(review): assumes the stenosis task is stored under a folder literally
#    named `stenosis` - confirm against the downloaded layout. When no such
#    folder exists, all JSON files are processed as before.
stenosis_json_paths = sorted(set(
    glob.glob(os.path.join(arcade_base, '**', 'stenosis', '**', '*.json'), recursive=True)
))
if stenosis_json_paths:
    ignored_count = len(set(json_paths) - set(stenosis_json_paths))
    if ignored_count > 0:
        print(f"-> [INFO] Ignoring {ignored_count} annotation file(s) outside a 'stenosis' folder.")
    json_paths = stenosis_json_paths

for json_path in json_paths:
    # Deduce split name from the json filename (e.g., 'train.json' -> 'train')
    split = os.path.basename(json_path).replace('.json', '')
    print(f"\n-> Loading annotations from: {json_path}")

    # Deduce the image directory. Usually: parent_dir/annotations/train.json -> parent_dir/images
    parent_dir = os.path.dirname(os.path.dirname(json_path))
    img_dir = os.path.join(parent_dir, 'images')

    if not os.path.exists(img_dir):
        print(f"-> [WARNING] Expected image folder not found at: {img_dir}. Skipping this split.")
        continue

    with open(json_path, 'r') as f:
        coco = json.load(f)

    # A JSON file that is not a COCO annotation file has no 'images' list.
    if not isinstance(coco, dict) or 'images' not in coco:
        print(f"-> [WARNING] {json_path} has no 'images' section. Skipping this file.")
        continue

    # image id -> file name and pixel size (the size normalizes the boxes)
    img_dict = {img['id']: {'name': img['file_name'], 'w': img['width'], 'h': img['height']} for img in coco['images']}

    # image id -> list of its annotation records
    ann_dict = {}
    for ann in coco.get('annotations', []):
        ann_dict.setdefault(ann['image_id'], []).append(ann)

    missing_images = 0
    processed_count = 0

    for img_id, img_info in tqdm(img_dict.items(), desc=f"ARCADE {split.capitalize()}"):
        # os.path.basename fixes issues where COCO file_name is listed as "images/img.png"
        clean_img_name = os.path.basename(img_info['name'])
        orig_img_path = Path(img_dir) / clean_img_name

        if not orig_img_path.exists():
            missing_images += 1
            continue

        new_img_name = f"arcade_{split}_{clean_img_name}"
        new_lbl_name = new_img_name.rsplit('.', 1)[0] + '.txt'

        # Parse annotations
        annotations = []
        for ann in ann_dict.get(img_id, []):
            # COCO bbox is [x_min, y_min, width, height]
            x_min, y_min, bw, bh = ann['bbox']
            xc, yc, nw, nh = to_yolo(img_info['w'], img_info['h'], x_min, y_min, bw, bh)
            annotations.append(f"0 {xc:.6f} {yc:.6f} {nw:.6f} {nh:.6f}\n")

        has_stenosis = len(annotations) > 0

        # Save to ALL
        save_image_and_labels(orig_img_path, IMG_ALL, LBL_ALL, new_img_name, new_lbl_name, annotations)

        # Save to STENOSIS / NO_STENOSIS
        if has_stenosis:
            save_image_and_labels(orig_img_path, IMG_STENOSIS, LBL_STENOSIS, new_img_name, new_lbl_name, annotations)
        else:
            save_image_and_labels(orig_img_path, IMG_NO_STENOSIS, LBL_NO_STENOSIS, new_img_name, new_lbl_name, annotations)

        processed_count += 1

    if missing_images > 0:
        print(f"-> [WARNING] Skipped {missing_images} images because they could not be found on disk.")
    print(f"-> Successfully processed {processed_count} images for {split}.")

In [None]:
import os
from pathlib import Path

# Human-readable title -> root folder of each fused dataset variant.
directories = dict([
    ("All Data (Background + Stenosis)", Path('fused_dataset_all')),
    ("Stenosis Only", Path('fused_dataset_stenosis_only')),
    ("No Stenosis (Background Only)", Path('fused_dataset_no_stenosis')),
])

banner = "========================================"
print(banner)
print("          DATASET SUMMARY")
print(banner)

for name, base_path in directories.items():
    img_dir = base_path / 'images'
    lbl_dir = base_path / 'labels'

    print(f"\n{name}:")

    # A missing root folder is reported and skipped instead of raising.
    if not base_path.exists():
        print(f"  -> [ERROR] Directory '{base_path}' does not exist.")
        continue

    # Images: every regular file in images/. Labels: only *.txt in labels/.
    # A missing subfolder counts as zero.
    num_images = 0
    if img_dir.exists():
        num_images = sum(1 for entry in img_dir.iterdir() if entry.is_file())

    num_labels = 0
    if lbl_dir.exists():
        num_labels = sum(1 for _ in lbl_dir.glob('*.txt'))

    print(f"  -> Path:   {base_path}")
    print(f"  -> Images: {num_images}")
    print(f"  -> Labels: {num_labels}")

    # Sanity check: the two counts should agree (one label file per image).
    if num_images != num_labels:
        print(f"  -> [WARNING] Mismatch! {num_images} images vs {num_labels} labels.")

print("\n" + banner)

In [None]:
import shutil
import os

# Dataset folders to archive; each becomes <folder>.zip in the working directory.
folders = [
    'fused_dataset_all',
    'fused_dataset_stenosis_only',
]

print("--- COMPRESSING DATASETS ---")
for folder in folders:
    if os.path.exists(folder):
        print(f"Zipping {folder}...")
        # This creates a .zip file of the folder; the archive holds the
        # folder's contents (root_dir is the folder itself).
        shutil.make_archive(folder, 'zip', folder)
        print(f" -> Created {folder}.zip")
    else:
        print(f"[WARNING] {folder} does not exist.")

print("\n--- DONE! ---")

# google.colab is only importable inside a Colab runtime. Anywhere else the
# archives built above are kept and the browser download is skipped instead of
# ending the cell with an ImportError.
try:
    from google.colab import files
except ImportError:
    files = None
    print("[INFO] google.colab is not available; the .zip files were left in the working directory.")

if files is not None:
    for folder in folders:
        if os.path.exists(f"{folder}.zip"):
            print(f"Downloading {folder}.zip...")
            files.download(f"{folder}.zip")