# 02 — Build Cell Dataset (Kaggle‑Ready)

This notebook converts a **Roboflow COCO (Object Detection)** chess dataset into **per‑cell patches (13 classes)** for training a per‑cell CNN.

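- **Input (preferred on Kaggle):** `/kaggle/input/<dataset-slug>/` (COCO JSON). The setup cell defaults to `/kaggle/input/chess-pieces-coco`; change `ROBO_ROOT` there if your dataset slug differs (e.g. `chess-pieces-roboflow-coco`).
- **Output:** `<OUT_ROOT>/data/final/train|val/<CLASS>/*.jpg`, where `OUT_ROOT` is `/kaggle/working` on Kaggle and the repo root (`..`, relative to this notebook) locally.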
- **Classes:** `Empty, WP, WN, WB, WR, WQ, WK, BP, BN, BB, BR, BQ, BK`

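> Runs both **locally** (repo layout) and on **Kaggle**. If an input path (a dataset split or the videos folder) does not exist, the corresponding step prints a `[warn]`/`[skip]` message and continues instead of failing.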


In [4]:
# %%capture
# Basic deps (Kaggle has most preinstalled; keep minimal)
# !pip install --quiet opencv-python pycocotools tqdm pyyaml


In [1]:
import os, json, cv2, shutil, random
from pathlib import Path
from tqdm import tqdm
import numpy as np

# --- Environment: are we inside a Kaggle kernel? ---
ON_KAGGLE = Path('/kaggle').exists()
print("ON_KAGGLE =", ON_KAGGLE)

# --- Input / output roots ---
# Kaggle: ROBO_ROOT is the uploaded Roboflow COCO dataset (change the slug if yours differs);
#         it is expected to contain train/, valid/, test/, each with _annotations.coco.json.
#         OUT_ROOT is the writeable working directory.
# Local:  repo layout, relative to this notebook living in /notebooks/.
ROBO_ROOT = Path('/kaggle/input/chess-pieces-coco') if ON_KAGGLE else Path('../data/public/roboflow_coco')
OUT_ROOT  = Path('/kaggle/working') if ON_KAGGLE else Path('..')

# --- Output folders (follow repo config) ---
_DATA_ROOT      = OUT_ROOT / 'data'
CELLS_PUBLIC    = _DATA_ROOT / 'public' / 'cells'
CELLS_BOOTSTRAP = _DATA_ROOT / 'bootstrap' / 'cells'
FINAL_TRAIN     = _DATA_ROOT / 'final' / 'train'
FINAL_VAL       = _DATA_ROOT / 'final' / 'val'

# --- Patch geometry and label set ---
IMG_SIZE = 96  # side length of a per-cell patch; keep in sync with model config
CLASSES_13 = ["Empty","WP","WN","WB","WR","WQ","WK","BP","BN","BB","BR","BQ","BK"]

# --- Make sure every folder we write to exists ---
for folder in (CELLS_PUBLIC, CELLS_BOOTSTRAP, FINAL_TRAIN, FINAL_VAL):
    folder.mkdir(parents=True, exist_ok=True)
# 'Empty' is skipped here: it will be populated via bootstrap later
for piece_label in CLASSES_13[1:]:
    (CELLS_PUBLIC / piece_label).mkdir(parents=True, exist_ok=True)

print("ROBO_ROOT =", ROBO_ROOT.resolve())
print("OUT_ROOT  =", OUT_ROOT.resolve())


ON_KAGGLE = False
ROBO_ROOT = C:\Users\worap\Downloads\image_processing_term_orject\Chess_Detection_Competition\data\public\roboflow_coco
OUT_ROOT  = C:\Users\worap\Downloads\image_processing_term_orject\Chess_Detection_Competition


In [2]:
# --- 1) Name map + extractor (define this before the extraction cell runs) ---
# NOTE: a later cell defines another function that is also called `extract_from_coco`;
# whichever of the two cells was executed last is the one the extraction cell calls.
NAME_MAP = {
    "white-pawn":"WP","white-rook":"WR","white-knight":"WN","white-bishop":"WB","white-queen":"WQ","white-king":"WK",
    "black-pawn":"BP","black-rook":"BR","black-knight":"BN","black-bishop":"BB","black-queen":"BQ","black-king":"BK",
}

def extract_from_coco(coco_json, imgs_dir, out_root, img_size):
    """Crop every mapped piece box of one COCO split into ``out_root/<CLASS>/``.

    Args:
        coco_json: path to the COCO annotation JSON of the split.
        imgs_dir:  folder that holds the images named in the JSON.
        out_root:  root folder for the crops; one sub-folder per class label
                   is created on demand.
        img_size:  side length (pixels) of the square patch each crop is
                   resized to.

    Returns:
        Number of crops that were successfully written.

    Annotations are skipped when they are flagged ``iscrowd``, when their
    category name is not a key of ``NAME_MAP``, when the image is missing or
    unreadable, or when the box is empty after clipping to the image.

    Files are named ``<image file stem>_<annotation index within image>.jpg``.
    The image *id* is deliberately not used for naming: the same ids can occur
    in more than one split JSON, and all splits are written to the same class
    folders, so id-based names let one split overwrite another's crops. The
    stem/index scheme is also the one used by the other ``extract_from_coco``
    definition in this notebook, so re-extracting overwrites files instead of
    duplicating them.
    """
    import json, cv2
    from pathlib import Path
    from tqdm import tqdm

    coco_json, imgs_dir, out_root = Path(coco_json), Path(imgs_dir), Path(out_root)

    with open(coco_json, "r", encoding="utf-8") as f:
        data = json.load(f)

    id_to_file     = {img["id"]: img["file_name"] for img in data.get("images", [])}
    cat_id_to_name = {c["id"]:  c["name"]        for c   in data.get("categories", [])}

    boxes_by_img = {}
    anns_seen = {}  # image_id -> number of annotations encountered so far (kept or not)
    for ann in data.get("annotations", []):
        image_id = ann["image_id"]
        # Position of this annotation among ALL annotations of its image, so the
        # file name stays stable no matter which neighbours are skipped.
        ann_idx = anns_seen.get(image_id, 0)
        anns_seen[image_id] = ann_idx + 1

        if ann.get("iscrowd", 0) not in (0, None):
            continue
        mapped = NAME_MAP.get(cat_id_to_name.get(ann["category_id"], ""))
        if not mapped:
            continue
        x, y, w, h = ann["bbox"]  # COCO box: top-left corner plus width/height
        boxes_by_img.setdefault(image_id, []).append((ann_idx, mapped, (x, y, w, h)))

    total = 0
    for img_id, bbox_list in tqdm(boxes_by_img.items(), desc=f"Cropping {coco_json.stem}"):
        fn = id_to_file.get(img_id)
        if not fn:
            continue
        img = cv2.imread(str(imgs_dir / fn))
        if img is None:  # unreadable or missing image file
            continue
        H, W = img.shape[:2]
        stem = Path(fn).stem
        for ann_idx, mapped, (x, y, w, h) in bbox_list:
            # Round to the pixel grid and clip the box to the image bounds.
            x1 = max(0, int(round(x))); y1 = max(0, int(round(y)))
            x2 = min(W, int(round(x + w))); y2 = min(H, int(round(y + h)))
            if x2 <= x1 or y2 <= y1:
                continue
            crop = img[y1:y2, x1:x2]
            if crop.size == 0:
                continue
            # Honour the caller-supplied size and destination (not the notebook globals).
            crop = cv2.resize(crop, (img_size, img_size), interpolation=cv2.INTER_AREA)
            out_dir = out_root / mapped
            out_dir.mkdir(parents=True, exist_ok=True)
            # Count only crops that were actually written.
            if cv2.imwrite(str(out_dir / f"{stem}_{ann_idx}.jpg"), crop):
                total += 1
    return total


In [7]:
# COCO category name -> our 12 piece labels
# NOTE: an earlier cell also defines a function called `extract_from_coco`;
# whichever of the two cells was executed last is the one later cells call.
COCO2OUR = {
    "white-pawn":"WP", "white-rook":"WR", "white-knight":"WN", "white-bishop":"WB", "white-queen":"WQ", "white-king":"WK",
    "black-pawn":"BP", "black-rook":"BR", "black-knight":"BN", "black-bishop":"BB", "black-queen":"BQ", "black-king":"BK",
}

def extract_from_coco(coco_json_path: Path, images_dir: Path, out_dir: Path, img_size: int) -> int:
    """Extract bbox crops from a COCO JSON into per-class folders as square patches.

    Args:
        coco_json_path: COCO annotation file; if it does not exist a ``[skip]``
            message is printed and 0 is returned.
        images_dir: folder containing the images referenced by the JSON.
        out_dir: root output folder; crops go to ``out_dir/<label>/`` and the
            label folder is created when needed.
        img_size: side length (pixels) each crop is resized to.

    Returns:
        Number of crops actually written to disk.

    Annotations whose category name is not in ``COCO2OUR``, whose image is
    missing/unreadable, or whose box is empty after clipping are skipped.
    Files are named ``<image stem>_<annotation index within image>.jpg``.
    """
    coco_json_path, images_dir, out_dir = Path(coco_json_path), Path(images_dir), Path(out_dir)
    if not coco_json_path.exists():
        print(f"[skip] no COCO at {coco_json_path}")
        return 0

    with open(coco_json_path, 'r', encoding='utf-8') as f:
        coco = json.load(f)

    id2file = {im["id"]: im["file_name"] for im in coco.get("images", [])}
    cat_id2name = {c["id"]: c["name"] for c in coco.get("categories", [])}
    anns_by_img = {}
    for ann in coco.get("annotations", []):
        anns_by_img.setdefault(ann["image_id"], []).append(ann)

    saved = 0
    for img_id, anns in tqdm(anns_by_img.items(), desc=f"Extract {images_dir.name}"):
        file_name = id2file.get(img_id)
        if not file_name:
            # Annotation points at an image that is not listed; do not try to
            # read the directory itself as an image.
            continue
        img_path = images_dir / file_name
        img = cv2.imread(str(img_path))
        if img is None:
            continue
        H, W = img.shape[:2]
        for i, ann in enumerate(anns):
            cname = cat_id2name.get(ann["category_id"], None)
            lab = COCO2OUR.get(cname, None)
            if lab is None:
                continue
            x, y, w, h = ann["bbox"]
            # Truncate to integer pixels and clip the box to the image bounds.
            x0, y0 = max(0, int(x)), max(0, int(y))
            x1, y1 = min(W, int(x + w)), min(H, int(y + h))
            if x1 <= x0 or y1 <= y0:
                continue
            crop = img[y0:y1, x0:x1]
            crop = cv2.resize(crop, (img_size, img_size), interpolation=cv2.INTER_AREA)
            fname = f"{img_path.stem}_{i}.jpg"
            # cv2.imwrite reports failure through its return value (e.g. when the
            # target folder is missing), so create the folder and count only
            # successful writes.
            class_dir = out_dir / lab
            class_dir.mkdir(parents=True, exist_ok=True)
            if cv2.imwrite(str(class_dir / fname), crop):
                saved += 1
    return saved


In [3]:
# Crop every available Roboflow split into CELLS_PUBLIC and tally the patches.
total = 0
for split in ("train", "valid", "test"):
    imgs_dir  = ROBO_ROOT / split
    coco_json = imgs_dir / "_annotations.coco.json"
    if not coco_json.exists():
        # A split without an annotation file is reported and skipped.
        print(f"[warn] missing split: {split}")
        continue
    total += extract_from_coco(coco_json, imgs_dir, CELLS_PUBLIC, IMG_SIZE)
print("Saved patches from Roboflow =", total)


Cropping _annotations.coco: 100%|██████████| 606/606 [00:06<00:00, 97.96it/s] 
Cropping _annotations.coco: 100%|██████████| 58/58 [00:00<00:00, 125.21it/s]
Cropping _annotations.coco: 100%|██████████| 28/28 [00:00<00:00, 67.42it/s]

Saved patches from Roboflow = 7083





## (Optional) Bootstrap from class videos (first-frame → auto labels)

- On **Kaggle**, hidden test videos are not available → this step is **skipped**.
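- Locally, put class videos in the folder named by `paths.videos_dir` in the project config (e.g. `../data/public/videos/*.mp4`), then run the cell below **after** `src/board.py` & `src/cells.py` are available and the `Chess_Detection_Competition` package that the cell imports (`utils`, `cells`) can be imported.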


In [4]:
# Optional bootstrap step.
# Everything that can fail when the project package, its config, or the videos
# are unavailable (e.g. on Kaggle) lives inside the try block, so the cell
# prints "[skip]" with the traceback instead of aborting the notebook.
saved_boot = 0

try:
    from Chess_Detection_Competition.utils import load_config
    from Chess_Detection_Competition.cells import bootstrap_from_first_frame

    # Load the full config from configs/parameters.yaml (it holds board/cells/train/... sections)
    cfg = load_config()

    # Take the videos folder from the config (so nothing breaks if the folder is moved)
    VIDEOS_DIR = OUT_ROOT / cfg["paths"]["videos_dir"]   # previously OUT_ROOT/'data/public/videos'

    if VIDEOS_DIR.exists():
        for v in sorted(VIDEOS_DIR.glob("*.mp4")):
            saved_boot += bootstrap_from_first_frame(
                str(v),
                str(CELLS_BOOTSTRAP),
                cfg,                         # <<<< important: pass the whole cfg object
            )
        print("Saved bootstrap patches:", saved_boot)
    else:
        print("[skip] no videos dir found:", VIDEOS_DIR)
except Exception:
    # Deliberate best-effort: report the problem and let the notebook continue.
    import traceback
    print("[skip] bootstrap step error:\n", traceback.format_exc())


Saved bootstrap patches: 320


In [5]:
def _unique_name(name, taken):
    """Return `name`, or a ``__dupN``-suffixed variant if it was already used.

    `taken` holds lower-cased names handed out so far (lower-cased so that the
    check also protects case-insensitive filesystems); the result is recorded.
    """
    stem, ext = os.path.splitext(name)
    candidate, n = name, 0
    while candidate.lower() in taken:
        n += 1
        candidate = f"{stem}__dup{n}{ext}"
    taken.add(candidate.lower())
    return candidate


def copy_merge_split(src_dirs, out_train, out_val, classes, val_ratio=0.1, seed=2025):
    """Merge per-class images from several sources and copy them into train/val.

    Args:
        src_dirs:  iterable of source roots; images are read from ``<src>/<class>/``.
        out_train: destination root for training images (``<out_train>/<class>/``).
        out_val:   destination root for validation images (``<out_val>/<class>/``).
        classes:   class folder names to process.
        val_ratio: fraction of each class pool sent to validation; at least one
                   image per non-empty class always goes to validation.
        seed:      seed of the private RNG used for shuffling.

    Only ``.jpg`` / ``.png`` files (case-insensitive) are considered. One
    summary line per class is printed. Nothing is returned.

    Existing files in the destination folders are not removed, so copies left
    by an earlier run with a different pool or seed stay in place.
    """
    out_train, out_val = Path(out_train), Path(out_val)
    for c in classes:
        os.makedirs(out_train / c, exist_ok=True)
        os.makedirs(out_val / c, exist_ok=True)

    rng = random.Random(seed)
    for c in classes:
        pool = []
        for sd in src_dirs:
            p = Path(sd) / c
            if p.is_dir():
                # os.listdir returns entries in arbitrary order; sort them so the
                # seeded shuffle below produces the same split on every machine.
                pool += sorted(str(p / f) for f in os.listdir(p) if f.lower().endswith(('.jpg', '.png')))
        rng.shuffle(pool)
        n_val = max(1, int(len(pool) * val_ratio))
        val_set = pool[:n_val]
        tr_set  = pool[n_val:]

        # Different sources may contain files with the same basename; give each
        # copy a distinct destination name so none is silently overwritten and
        # the printed counts match what is on disk.
        taken = set()
        for dest_dir, subset in ((out_train / c, tr_set), (out_val / c, val_set)):
            for s in subset:
                shutil.copy(s, dest_dir / _unique_name(os.path.basename(s), taken))
        print(f"{c:>5}  train={len(tr_set):4d}  val={len(val_set):4d}")

# Combine the Roboflow crops with the (optional) bootstrap crops and
# write the result to the final train/val folders.
srcs = [CELLS_PUBLIC, CELLS_BOOTSTRAP]
copy_merge_split(srcs, FINAL_TRAIN, FINAL_VAL, CLASSES_13, val_ratio=0.1)
print("\n✅ Done. Final dataset at:")
for final_dir in (FINAL_TRAIN, FINAL_VAL):
    print("  ", final_dir.resolve())


Empty  train= 144  val=  16
   WP  train=2877  val= 319
   WN  train= 841  val=  93
   WB  train= 765  val=  85
   WR  train= 833  val=  92
   WQ  train= 493  val=  54
   WK  train= 648  val=  72
   BP  train=2989  val= 332
   BN  train= 867  val=  96
   BB  train= 607  val=  67
   BR  train= 898  val=  99
   BQ  train= 383  val=  42
   BK  train= 630  val=  70

✅ Done. Final dataset at:
   C:\Users\worap\Downloads\image_processing_term_orject\Chess_Detection_Competition\data\final\train
   C:\Users\worap\Downloads\image_processing_term_orject\Chess_Detection_Competition\data\final\val
