# 02 — Build Cell Dataset (Kaggle-Ready)

This notebook converts a **Roboflow COCO (Object Detection)** chess dataset into **per-cell patches (13 classes)** for training a per-cell CNN.

- **Input (preferred on Kaggle):** `/kaggle/input/chess-pieces-coco/` (COCO JSON; edit `ROBO_ROOT` in the setup cell if your dataset slug differs)
- **Output:** `./data/final/train|val/<CLASS>/*.jpg`
- **Classes:** `Empty, WP, WN, WB, WR, WQ, WK, BP, BN, BB, BR, BQ, BK`

> Runs both **locally** (repo layout) and on **Kaggle**. If paths don't exist, the cell will skip gracefully.


In [1]:
# %%capture
# Basic deps (Kaggle has most preinstalled; keep minimal)
# !pip install --quiet opencv-python pycocotools tqdm pyyaml


In [2]:
import os, json, cv2, shutil, random
from pathlib import Path
from tqdm import tqdm
import numpy as np

# --- Environment detection: treat the presence of /kaggle as "running on Kaggle" ---
ON_KAGGLE = Path('/kaggle').exists()
print("ON_KAGGLE =", ON_KAGGLE)

# --- Input / output roots ---
# Kaggle: ROBO_ROOT is the uploaded Roboflow COCO dataset (change it if your dataset slug
#         differs); expected layout is train/, valid/, test/, each holding an
#         _annotations.coco.json. /kaggle/working is the writeable output area.
# Local:  repo layout, relative to this notebook living in /notebooks/.
ROBO_ROOT, OUT_ROOT = (
    (Path('/kaggle/input/chess-pieces-coco'), Path('/kaggle/working'))
    if ON_KAGGLE
    else (Path('../data/public/roboflow_coco'), Path('..'))
)

# --- Output directories (follow repo config) ---
CELLS_PUBLIC = OUT_ROOT / 'data/public/cells'
CELLS_BOOTSTRAP = OUT_ROOT / 'data/bootstrap/cells'
FINAL_TRAIN = OUT_ROOT / 'data/final/train'
FINAL_VAL = OUT_ROOT / 'data/final/val'

for base_dir in (CELLS_PUBLIC, CELLS_BOOTSTRAP, FINAL_TRAIN, FINAL_VAL):
    base_dir.mkdir(parents=True, exist_ok=True)

# Side length (pixels) of each square per-cell patch
IMG_SIZE = 96  # keep in sync with model config

# The 13 target classes: the empty square plus 6 white and 6 black piece types
CLASSES_13 = ["Empty","WP","WN","WB","WR","WQ","WK","BP","BN","BB","BR","BQ","BK"]

# Create one sub-folder per class (including Empty) under both cell roots
for cells_root in (CELLS_PUBLIC, CELLS_BOOTSTRAP):
    for class_name in CLASSES_13:
        (cells_root / class_name).mkdir(parents=True, exist_ok=True)

print("‚úÖ Created folders for 13 classes (including Empty)")

print("ROBO_ROOT =", ROBO_ROOT.resolve())
print("OUT_ROOT  =", OUT_ROOT.resolve())


ON_KAGGLE = False
‚úÖ Created folders for 13 classes (including Empty)
ROBO_ROOT = C:\Users\worap\Downloads\image_processing_term_orject\Chess_Detection_Competition\data\public\roboflow_coco
OUT_ROOT  = C:\Users\worap\Downloads\image_processing_term_orject\Chess_Detection_Competition


In [3]:
# --- 1) Name map + extractor (defined first) ---
# NOTE(review): a later cell defines another `extract_from_coco`; when the notebook is run
# top to bottom, that later definition replaces this one.
NAME_MAP = {
    "white-pawn":"WP","white-rook":"WR","white-knight":"WN","white-bishop":"WB","white-queen":"WQ","white-king":"WK",
    "black-pawn":"BP","black-rook":"BR","black-knight":"BN","black-bishop":"BB","black-queen":"BQ","black-king":"BK",
}

def extract_from_coco(coco_json, imgs_dir, out_root, img_size):
    """Crop every mapped bounding box of a COCO file into ``out_root/<label>/``.

    Args:
        coco_json: Path to the COCO annotation JSON.
        imgs_dir: Directory containing the images referenced by ``file_name``.
        out_root: Root output folder; one sub-folder per label is created on demand.
        img_size: Side length in pixels of the square output patches.

    Returns:
        Number of patches that were successfully written.

    Annotations with a truthy ``iscrowd`` flag, categories missing from
    ``NAME_MAP``, unreadable images and boxes that are empty after clipping
    to the image are skipped.
    """
    import json, cv2
    from pathlib import Path
    from tqdm import tqdm

    # Accept plain strings as well as Path objects.
    coco_json, imgs_dir, out_root = Path(coco_json), Path(imgs_dir), Path(out_root)

    with open(coco_json, "r", encoding="utf-8") as f:
        data = json.load(f)

    id_to_file     = {img["id"]: img["file_name"] for img in data.get("images", [])}
    cat_id_to_name = {c["id"]:  c["name"]        for c   in data.get("categories", [])}

    # Group the usable boxes by image so each image is decoded only once.
    boxes_by_img = {}
    for ann in data.get("annotations", []):
        if ann.get("iscrowd", 0) not in (0, None):
            continue
        cls_name_raw = cat_id_to_name.get(ann["category_id"], "")
        mapped = NAME_MAP.get(cls_name_raw)
        if not mapped:
            continue
        x, y, w, h = ann["bbox"]
        boxes_by_img.setdefault(ann["image_id"], []).append((mapped, (x, y, w, h)))

    total = 0
    for img_id, bbox_list in tqdm(boxes_by_img.items(), desc=f"Cropping {coco_json.stem}"):
        fn = id_to_file.get(img_id)
        if not fn:
            continue
        img = cv2.imread(str(imgs_dir / fn))
        if img is None:
            continue
        H, W = img.shape[:2]
        # Name crops after the image file rather than the numeric COCO id: the splits are
        # all written to the same root, so ids reused across the per-split JSON files would
        # make later crops overwrite earlier ones.
        stem = Path(fn).stem
        idx = 0
        for mapped, (x, y, w, h) in bbox_list:
            # Round to pixel coordinates and clip the box to the image bounds.
            x1 = max(0, int(round(x))); y1 = max(0, int(round(y)))
            x2 = min(W, int(round(x+w))); y2 = min(H, int(round(y+h)))
            if x2 <= x1 or y2 <= y1:
                continue
            crop = img[y1:y2, x1:x2]
            if crop.size == 0:
                continue
            # Use the caller-supplied size and output root instead of notebook globals.
            crop = cv2.resize(crop, (img_size, img_size), interpolation=cv2.INTER_AREA)
            out_dir = out_root / mapped
            out_dir.mkdir(parents=True, exist_ok=True)
            # Count (and advance the per-image index) only when the write succeeded.
            if cv2.imwrite(str(out_dir / f"{stem}_{idx}.jpg"), crop):
                total += 1; idx += 1
    return total


In [4]:
# COCO category name -> our 12 piece labels
COCO2OUR = {
    "white-pawn":"WP", "white-rook":"WR", "white-knight":"WN", "white-bishop":"WB", "white-queen":"WQ", "white-king":"WK",
    "black-pawn":"BP", "black-rook":"BR", "black-knight":"BN", "black-bishop":"BB", "black-queen":"BQ", "black-king":"BK",
}

def extract_from_coco(coco_json_path: Path, images_dir: Path, out_dir: Path, img_size: int) -> int:
    """Extract bbox crops from a COCO JSON into per-class folders as square patches.

    Args:
        coco_json_path: COCO annotation file; if it does not exist the call is a no-op.
        images_dir: Directory holding the images referenced by ``file_name``.
        out_dir: Root output folder; crops are written to ``out_dir/<label>/``.
        img_size: Side length in pixels of the resized square patch.

    Returns:
        Number of patches actually written to disk.

    Annotations whose category is not in ``COCO2OUR``, images that cannot be
    read, and boxes that are empty after clipping to the image are skipped.
    """
    if not coco_json_path.exists():
        print(f"[skip] no COCO at {coco_json_path}")
        return 0

    with open(coco_json_path, 'r', encoding='utf-8') as f:
        coco = json.load(f)

    id2file = {im["id"]: im["file_name"] for im in coco.get("images", [])}
    cat_id2name = {c["id"]: c["name"] for c in coco.get("categories", [])}
    # Group annotations per image so each image is decoded only once.
    anns_by_img = {}
    for ann in coco.get("annotations", []):
        anns_by_img.setdefault(ann["image_id"], []).append(ann)

    saved = 0
    for img_id, anns in tqdm(anns_by_img.items(), desc=f"Extract {images_dir.name}"):
        file_name = id2file.get(img_id)
        if not file_name:
            # Annotation refers to an image id with no entry in "images".
            continue
        img_path = images_dir / file_name
        img = cv2.imread(str(img_path))
        if img is None:
            continue
        H, W = img.shape[:2]
        for i, ann in enumerate(anns):
            cname = cat_id2name.get(ann["category_id"], None)
            lab = COCO2OUR.get(cname, None)
            if lab is None:
                continue
            x,y,w,h = ann["bbox"]
            # Truncate to pixel coordinates and clip the box to the image bounds.
            x0,y0 = max(0,int(x)), max(0,int(y))
            x1,y1 = min(W,int(x+w)), min(H,int(y+h))
            if x1<=x0 or y1<=y0:
                continue
            crop = img[y0:y1, x0:x1]
            crop = cv2.resize(crop, (img_size, img_size), interpolation=cv2.INTER_AREA)
            # cv2.imwrite does not create missing folders, so make sure the class dir exists.
            class_dir = out_dir / lab
            class_dir.mkdir(parents=True, exist_ok=True)
            fname = f"{img_path.stem}_{i}.jpg"
            # imwrite reports failure through its return value; count real writes only.
            if cv2.imwrite(str(class_dir / fname), crop):
                saved += 1
    return saved


In [5]:
# Crop every available Roboflow split into CELLS_PUBLIC and tally the saved patches.
total = 0
for split in ("train", "valid", "test"):
    imgs_dir = ROBO_ROOT / split
    coco_json = imgs_dir / "_annotations.coco.json"
    if not coco_json.exists():
        print(f"[warn] missing split: {split}")
        continue
    total += extract_from_coco(coco_json, imgs_dir, CELLS_PUBLIC, IMG_SIZE)
print("Saved patches from Roboflow =", total)


Extract train: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 606/606 [00:05<00:00, 120.57it/s]
Extract valid: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 58/58 [00:00<00:00, 105.64it/s]
Extract test: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28/28 [00:00<00:00, 66.05it/s]

Saved patches from Roboflow = 7083





## (Optional) Bootstrap from class videos (first-frame → auto labels)

- On **Kaggle**, hidden test videos are not available → this step is **skipped**.
- Locally, put class videos in `../data/public/videos/*.mp4` then run the cell below **after** you have `src/board.py` & `src/cells.py` available.


In [None]:
# Bootstrap extraction is not re-run here; Empty cells from earlier runs are reused.
print("Skipping bootstrap step - using existing data")
print("CELLS_BOOTSTRAP contains the Empty cells already extracted")

# Tally the Empty patches that are already on disk
empty_bootstrap = [*(CELLS_BOOTSTRAP / "Empty").glob("*.jpg")]
print(f"Found {len(empty_bootstrap)} Empty cells in bootstrap")

if not empty_bootstrap:
    print("Warning: No Empty cells found. You may need to run the bootstrap extraction manually.")

In [6]:
from pathlib import Path

# Collect the Empty patches found under each cell root
empty_bootstrap = [*(CELLS_BOOTSTRAP / "Empty").glob("*.jpg")]
empty_public = [*(CELLS_PUBLIC / "Empty").glob("*.jpg")]

rule = "=" * 60
print(rule)
print("üìÅ Empty Dataset Location Check")
print(rule)
for root_label, found in (("CELLS_BOOTSTRAP", empty_bootstrap), ("CELLS_PUBLIC", empty_public)):
    print(f"{root_label}/Empty: {len(found)} files")
print(f"Total: {len(empty_bootstrap) + len(empty_public)} files")
print(rule)

# When a root has files, show one sample path from it
if len(empty_bootstrap) > 0:
    print(f"\n‚úÖ Sample from bootstrap: {empty_bootstrap[0]}")
if len(empty_public) > 0:
    print(f"‚úÖ Sample from public: {empty_public[0]}")

# Show the fully resolved location of both roots
print(f"\nüìÇ CELLS_BOOTSTRAP = {CELLS_BOOTSTRAP.resolve()}")
print(f"üìÇ CELLS_PUBLIC = {CELLS_PUBLIC.resolve()}")


üìÅ Empty Dataset Location Check
CELLS_BOOTSTRAP/Empty: 1084 files
CELLS_PUBLIC/Empty: 0 files
Total: 1084 files

‚úÖ Sample from bootstrap: ..\data\bootstrap\cells\Empty\2_Move_rotate_student_20.jpg

üìÇ CELLS_BOOTSTRAP = C:\Users\worap\Downloads\image_processing_term_orject\Chess_Detection_Competition\data\bootstrap\cells
üìÇ CELLS_PUBLIC = C:\Users\worap\Downloads\image_processing_term_orject\Chess_Detection_Competition\data\public\cells


In [7]:
# Augmentation is not run here; only confirm that enough Empty patches are on disk.
print("Skipping augmentation - using existing dataset")

empty_bootstrap = [*(CELLS_BOOTSTRAP / "Empty").glob("*.jpg")]
empty_public = [*(CELLS_PUBLIC / "Empty").glob("*.jpg")]
empty_total = len(empty_bootstrap) + len(empty_public)

for root_label, found in (("CELLS_BOOTSTRAP", empty_bootstrap), ("CELLS_PUBLIC", empty_public)):
    print(f"{root_label}/Empty: {len(found)} files")
print(f"Total: {empty_total} files")

# 300 is the minimum number of Empty patches this cell treats as sufficient
print(
    "SUCCESS: Sufficient Empty cells for training!"
    if empty_total >= 300
    else "WARNING: May need more Empty cells"
)

Skipping augmentation - using existing dataset
CELLS_BOOTSTRAP/Empty: 1084 files
CELLS_PUBLIC/Empty: 0 files
Total: 1084 files
SUCCESS: Sufficient Empty cells for training!


In [8]:
# Dataset already exists from a previous run - this cell only reports on it.
print("Verifying existing dataset...")

import os
from pathlib import Path


def count_jpgs(folder):
    """Return how many ``*.jpg`` files sit directly inside ``folder`` (0 if it is missing)."""
    folder = Path(folder)
    return len(list(folder.glob("*.jpg"))) if folder.exists() else 0


def format_growth(old_count, new_count):
    """Describe the growth from ``old_count`` to ``new_count`` as a ratio.

    Returns e.g. ``"2.0x increase!"``. When ``old_count`` is 0 there is no
    baseline to divide by, so a fixed "n/a" text is returned instead of
    raising ``ZeroDivisionError``.
    """
    if old_count > 0:
        return f"{new_count/old_count:.1f}x increase!"
    return "n/a - no Empty cells in old dataset"


# Reuse the output root chosen by the setup cell when it has been run, so the
# Kaggle layout keeps working; otherwise fall back to the repo-relative root.
_data_root = globals().get("OUT_ROOT", Path('..'))

# OLD dataset (original)
FINAL_TRAIN = _data_root / 'data/final/train'
FINAL_VAL = _data_root / 'data/final/val'

# NEW balanced dataset (with augmented Empty cells)
BALANCED_CELLS = _data_root / 'data/balanced/cells'

CLASSES_13 = ["Empty","WP","WN","WB","WR","WQ","WK","BP","BN","BB","BR","BQ","BK"]

print("="*70)
print("üìä DATASET COMPARISON")
print("="*70)

# Show OLD dataset
if FINAL_TRAIN.exists() and FINAL_VAL.exists():
    print("\nüî¥ OLD DATASET (data/final/train + val) - IMBALANCED:")
    print("-"*70)

    total_train = 0
    total_val = 0

    for c in CLASSES_13:
        train_count = count_jpgs(FINAL_TRAIN / c)
        val_count = count_jpgs(FINAL_VAL / c)
        # Separate name so the patch count stored in `total` by the extraction cell survives.
        class_total = train_count + val_count
        total_train += train_count
        total_val += val_count

        # Highlight Empty class (the marker is attached regardless of the actual count)
        marker = " ‚ö†Ô∏è LOW!" if c == "Empty" else ""
        print(f"{c:>5}  train={train_count:4d}  val={val_count:4d}  total={class_total:4d}{marker}")

    print(f"\nTotal: train={total_train} val={total_val} grand_total={total_train + total_val}")

# Show NEW balanced dataset
if BALANCED_CELLS.exists():
    print("\n‚úÖ NEW BALANCED DATASET (data/balanced/cells) - FIXED:")
    print("-"*70)

    total_balanced = 0

    for c in CLASSES_13:
        count = count_jpgs(BALANCED_CELLS / c)
        total_balanced += count

        # Highlight Empty class improvement (marker is attached regardless of the count)
        marker = " ‚ú® IMPROVED!" if c == "Empty" else ""
        print(f"{c:>5}  {count:4d} images{marker}")

    print(f"\nTotal: {total_balanced} images")

    # Show the improvement: Empty images in old train split vs. the balanced set
    old_empty = count_jpgs(FINAL_TRAIN / "Empty")
    new_empty = count_jpgs(BALANCED_CELLS / "Empty")

    print("\n"+"="*70)
    print(f"üéØ Empty cell improvement: {old_empty} ‚Üí {new_empty} ({format_growth(old_empty, new_empty)})")
    print("="*70)

    print("\n‚úÖ Balanced dataset ready for retraining!")
    print("   This will fix the Empty cell classification problem!")
else:
    print("\n‚ùå NEW balanced dataset not found!")
    print("   Run: python scripts/fix_dataset_balance.py")

Verifying existing dataset...
üìä DATASET COMPARISON

üî¥ OLD DATASET (data/final/train + val) - IMBALANCED:
----------------------------------------------------------------------
Empty  train= 815  val=  90  total= 905 ‚ö†Ô∏è LOW!
   WP  train=2913  val= 323  total=3236
   WN  train= 850  val=  94  total= 944
   WB  train= 774  val=  86  total= 860
   WR  train= 842  val=  93  total= 935
   WQ  train= 497  val=  55  total= 552
   WK  train= 653  val=  72  total= 725
   BP  train=3025  val= 336  total=3361
   BN  train= 876  val=  97  total= 973
   BB  train= 616  val=  68  total= 684
   BR  train= 907  val= 100  total=1007
   BQ  train= 387  val=  43  total= 430
   BK  train= 635  val=  70  total= 705

Total: train=13790 val=1527 grand_total=15317

‚úÖ NEW BALANCED DATASET (data/balanced/cells) - FIXED:
----------------------------------------------------------------------
Empty  1784 images ‚ú® IMPROVED!
   WP  3236 images
   WN   944 images
   WB   860 images
   WR   935 images
  