# Test Model Evaluations

## **Disclaimer**
Optional Unfinished Notebook file. Containing indepth measures to analyse test split data validation.

In [2]:
import json
from pathlib import Path
import pandas as pd
import os

In [None]:
# PAths & Directories
NB_DIR = Path.cwd()
REPO_ROOT = NB_DIR.parent

test_dir_path =  REPO_ROOT / 'data/processed/images/test'

In [37]:
import json
from pathlib import Path

# --- paths & directories---
REPO_ROOT = Path.cwd().parent
COCO_IN   = REPO_ROOT / 'data/processed/JSONs/train_annotations_coco.json'   # source (full) COCO
COCO_OUT  = REPO_ROOT / 'data/processed/JSONs/test_annotations_coco.json'    # where to write subset

# Gather Test Images
test_names = {p.name for p in (REPO_ROOT/'data/processed/images/test').glob('*.tif')}

# --- load full COCO ---
with open(COCO_IN, 'r') as f:
    coco = json.load(f)

# Index images by file_name for O(1) lookups
name_to_img = {img['file_name']: img for img in coco['images']}

# Warn about any filenames in the test split that aren’t in COCO
missing = sorted(test_names - set(name_to_img.keys()))
if missing:
    print(f"⚠️ {len(missing)} test images not found in COCO JSON (showing up to 5): {missing[:5]}")

# Keep only images present in COCO
subset_images = [name_to_img[n] for n in test_names if n in name_to_img]
subset_image_ids = {img['id'] for img in subset_images}

# Keep only annotations that belong to those images
subset_anns = [ann for ann in coco['annotations'] if ann['image_id'] in subset_image_ids]

# (optional) keep only categories that are actually used in this subset
used_cat_ids = {ann['category_id'] for ann in subset_anns}
subset_cats = [c for c in coco['categories'] if c['id'] in used_cat_ids]

# Assemble subset COCO
subset = {
    'images': subset_images,
    'annotations': subset_anns,
    'categories': subset_cats,
    'info' : [],
    'licenses' : []
}

# Write
COCO_OUT.parent.mkdir(parents=True, exist_ok=True)
with open(COCO_OUT, 'w') as f:
    json.dump(subset, f, indent=2)

print(f"✅ Wrote {COCO_OUT}")
print(f"   {len(subset_images)} images, {len(subset_anns)} annotations, {len(subset_cats)} categories")


✅ Wrote /Users/mitchellpalmer/Projects/solafune-canopy-capstone-clean/data/processed/JSONs/test_annotations_coco.json
   23 images, 6569 annotations, 2 categories


In [38]:
import json, numpy as np
from pathlib import Path
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

# ---- paths
REPO_ROOT = Path.cwd().parent  # adjust if needed
gt_json   = REPO_ROOT/'data/processed/JSONs/test_annotations_coco.json'
pred_json = REPO_ROOT/'runs/segment/Yolo8sTest19/predictions.json'  # <- your run
out_json  = REPO_ROOT/'runs/segment/Yolo8sTest19/predictions_coco.json'

# ---- load GT & build filename -> image_id map
gt = COCO(str(gt_json))                  # also validates schema
with open(gt_json) as f:
    gt_raw = json.load(f)
name_to_id = {im['file_name']: im['id'] for im in gt_raw['images']}
gt_cat_ids = {c['id'] for c in gt_raw['categories']}   # e.g., {1,2}

# ---- load Ultralytics predictions
with open(pred_json) as f:
    preds = json.load(f)                 # list of dicts

# Detect whether prediction category ids are 0-based (e.g., {0,1}) and map to GT
pred_cat_ids = {int(p['category_id']) for p in preds}
needs_plus_one = (min(pred_cat_ids) == 0 and min(gt_cat_ids) == 1)

results = []
skipped_no_image = 0
for d in preds:
    # Ultralytics writes the filename into image_id (string). Map to numeric id.
    fname = d.get('image_id') or d.get('file_name')
    fname = fname + '.tif'
    if fname not in name_to_id:
        skipped_no_image += 1
        continue

    cat_id = int(d['category_id'])
    if needs_plus_one:
        cat_id = cat_id + 1

    rec = {
        'image_id': name_to_id[fname],
        'category_id': cat_id,
        'score': float(d['score']),
    }
    # keep whichever you want to evaluate: bbox and/or segm
    if 'bbox' in d:
        rec['bbox'] = [float(x) for x in d['bbox']]     # [x,y,w,h]
    if 'segmentation' in d:
        # pycocotools accepts COCO RLE dict: {'size':[H,W], 'counts': <str or bytes>}
        rec['segmentation'] = d['segmentation']
    results.append(rec)

print(f"Converted {len(results)} detections "
      f"(skipped {skipped_no_image} with unknown filenames).")

# ---- write COCO-results file
out_json.parent.mkdir(parents=True, exist_ok=True)
with open(out_json, 'w') as f:
    json.dump(results, f)


loading annotations into memory...
Done (t=0.03s)
creating index...
index created!
Converted 2300 detections (skipped 0 with unknown filenames).


In [73]:
import json
from pathlib import Path
import numpy as np
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

def eval_with_cocoeval(gt_json, yolo_preds_json, iou=0.75, iouType='segm'):
    """
    gt_json: COCO GT file (has images/annotations/categories)
    yolo_preds_json: Ultralytics predictions.json (list of det dicts, or a dict
                     from which we can pull a list)
    iouType: 'segm' for masks, 'bbox' for boxes
    """
    cocoGt = COCO(str(gt_json))

    # Build maps so we can turn filename strings into integer image IDs
    fname2id = {img["file_name"]: img["id"] for img in cocoGt.dataset["images"]}
    stem2id = {Path(k).stem: v for k, v in fname2id.items()}
    valid_img_ids = set(fname2id.values())
    valid_cat_ids = set(cocoGt.getCatIds())

    # Load predictions and normalize to a LIST of result dicts
    with open(yolo_preds_json) as f:
        data = json.load(f)
    dets = data["annotations"] if isinstance(data, dict) and "annotations" in data else data
    assert isinstance(dets, list), "Predictions must be a LIST of detection objects"

    norm = []
    for d in dets:
        # --- image_id: convert filename -> int id if needed
        iid = d.get("image_id")
        if isinstance(iid, str):
            iid = (
                fname2id.get(iid)
                or fname2id.get(iid + ".tif")
                or fname2id.get(iid + ".png")
                or stem2id.get(iid)
            )
            if iid is None:
                continue  # skip preds not in the GT split
        if iid not in valid_img_ids:
            continue

        # --- category_id: fix 0/1 vs 1/2 if necessary
        cid = int(d["category_id"])
        if cid not in valid_cat_ids and (cid + 1) in valid_cat_ids:
            cid += 1
        if cid not in valid_cat_ids:
            continue

        out = {
            "image_id": int(iid),
            "category_id": cid,
            "score": float(d["score"]),
        }
        if "bbox" in d:
            x, y, w, h = d["bbox"]
            out["bbox"] = [float(x), float(y), float(w), float(h)]  # [x,y,w,h]
        if "segmentation" in d:
            out["segmentation"] = d["segmentation"]  # RLE/polygons are OK as-is

        norm.append(out)

    # Feed detections to COCO (must be a LIST)
    cocoDt = cocoGt.loadRes(norm)

    # Evaluate at IoU = 0.75 only
    cocoEvaluation = COCOeval(cocoGt, cocoDt, iouType=iouType)
    cocoEvaluation.params.iouThrs = np.array([iou], dtype=np.float64)
    cocoEvaluation.evaluate()
    cocoEvaluation.accumulate()
    cocoEvaluation.summarize()

# ---- use it ----
gt = REPO_ROOT/ "data/processed/JSONs/test_annotations_coco.json"
preds = REPO_ROOT/"runs/segment/Yolo8sTest19/predictions.json"

print("== BOUNDS (bbox) at IoU=0.75 ==")
eval_with_cocoeval(gt, preds, iou=0.75, iouType="bbox")

print("\n== MASKS (segm) at IoU=0.75 ==")
eval_with_cocoeval(gt, preds, iou=0.75, iouType="segm")


== BOUNDS (bbox) at IoU=0.75 ==
loading annotations into memory...
Done (t=0.03s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.00s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=0.33s).
Accumulating evaluation results...
DONE (t=0.00s).
 Average Precision  (AP) @[ IoU=0.75:0.75 | area=   all | maxDets=100 ] = 0.056
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = -1.000
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.056
 Average Precision  (AP) @[ IoU=0.75:0.75 | area= small | maxDets=100 ] = 0.093
 Average Precision  (AP) @[ IoU=0.75:0.75 | area=medium | maxDets=100 ] = -1.000
 Average Precision  (AP) @[ IoU=0.75:0.75 | area= large | maxDets=100 ] = -1.000
 Average Recall     (AR) @[ IoU=0.75:0.75 | area=   all | maxDets=  1 ] = 0.002
 Average Recall     (AR) @[ IoU=0.75:0.75 | area=   all | maxDets= 10 ] = 0.021
 Average Recall     (AR) @[ Io

In [65]:
import json, numpy as np
from pathlib import Path
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

# ---- paths
REPO_ROOT = Path.cwd().parent  # adjust if needed
groundtruth_json   = REPO_ROOT/'data/processed/JSONs/test_annotations_coco.json'
predictions_json = REPO_ROOT/'runs/segment/Yolo8sTest19/predictions_coco.json'  # <- your run

In [66]:
coco_groundtruth = COCO(groundtruth_json)

loading annotations into memory...
Done (t=0.04s)
creating index...
index created!


In [75]:
with open(predictions_json) as f:
    detections = json.load(f)
coco_detections = coco_groundtruth.loadRes(detections)

Loading and preparing results...
DONE (t=0.00s)
creating index...
index created!


In [89]:
Evaluation = COCOeval(coco_groundtruth, coco_detections)

Evaluation.params.iouThrs = np.array([0.75], dtype=np.float64)   # evaluate only at IoU=0.75
Evaluation.params.maxDets = [100, 200, 4000]                     # change max detections per image
Evaluation.params.iouType = 'segm'
Evaluation.params.useCats = False
Evaluation.params.catIds = coco_groundtruth.getCatIds()

Evaluation.evaluate()
Evaluation.accumulate()
Evaluation.summarize()

Running per image evaluation...
Evaluate annotation type *segm*
DONE (t=0.45s).
Accumulating evaluation results...
DONE (t=0.00s).
 Average Precision  (AP) @[ IoU=0.75:0.75 | area=   all | maxDets=100 ] = 0.049
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=4000 ] = -1.000
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=4000 ] = 0.049
 Average Precision  (AP) @[ IoU=0.75:0.75 | area= small | maxDets=4000 ] = 0.080
 Average Precision  (AP) @[ IoU=0.75:0.75 | area=medium | maxDets=4000 ] = -1.000
 Average Precision  (AP) @[ IoU=0.75:0.75 | area= large | maxDets=4000 ] = -1.000
 Average Recall     (AR) @[ IoU=0.75:0.75 | area=   all | maxDets=100 ] = 0.080
 Average Recall     (AR) @[ IoU=0.75:0.75 | area=   all | maxDets=200 ] = 0.080
 Average Recall     (AR) @[ IoU=0.75:0.75 | area=   all | maxDets=4000 ] = 0.080
 Average Recall     (AR) @[ IoU=0.75:0.75 | area= small | maxDets=4000 ] = 0.080
 Average Recall     (AR) @[ IoU=0.75:0.75 | area=medium | m

In [90]:
Evaluation = COCOeval(coco_groundtruth, coco_detections)

Evaluation.params.iouThrs = np.array([0.75], dtype=np.float64)   # evaluate only at IoU=0.75
Evaluation.params.maxDets = [100, 200, 4000]                     # change max detections per image
Evaluation.params.iouType = 'segm'
Evaluation.params.useCats = True
Evaluation.params.catIds = coco_groundtruth.getCatIds()

Evaluation.evaluate()
Evaluation.accumulate()
Evaluation.summarize()

Running per image evaluation...
Evaluate annotation type *segm*
DONE (t=0.34s).
Accumulating evaluation results...
DONE (t=0.00s).
 Average Precision  (AP) @[ IoU=0.75:0.75 | area=   all | maxDets=100 ] = 0.031
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=4000 ] = -1.000
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=4000 ] = 0.031
 Average Precision  (AP) @[ IoU=0.75:0.75 | area= small | maxDets=4000 ] = 0.058
 Average Precision  (AP) @[ IoU=0.75:0.75 | area=medium | maxDets=4000 ] = -1.000
 Average Precision  (AP) @[ IoU=0.75:0.75 | area= large | maxDets=4000 ] = -1.000
 Average Recall     (AR) @[ IoU=0.75:0.75 | area=   all | maxDets=100 ] = 0.057
 Average Recall     (AR) @[ IoU=0.75:0.75 | area=   all | maxDets=200 ] = 0.057
 Average Recall     (AR) @[ IoU=0.75:0.75 | area=   all | maxDets=4000 ] = 0.057
 Average Recall     (AR) @[ IoU=0.75:0.75 | area= small | maxDets=4000 ] = 0.057
 Average Recall     (AR) @[ IoU=0.75:0.75 | area=medium | m

In [93]:
precision = Evaluation.eval['precision']          # [T,R,K,A,M]
t = np.where(np.isclose(Evaluation.params.iouThrs, 0.75))[0][0]  # number of IoU thresholds. I chose IoU=0.75  -> T index
k = 0   # number of categories / choose category index (0..K-1)                 -> K index
a = 0   # choose area range (0=all)                      -> A index
m = -1  # choose the largest maxDets (usually last)      -> M index

pr  = precision[t, :, k, a, m]            # precision at each recall sample R
rec = Evaluation.params.recThrs           # the recall grid (x-axis)
mask = pr > -1                   # COCO fills missing with -1
rec, pr = rec[mask], pr[mask]
f1 = 2*pr*rec/(pr+rec+1e-12)

print('T (IoUs):', Evaluation.params.iouThrs)
print('A (areas):', Evaluation.params.areaRngLbl)
print('M (maxDets):', Evaluation.params.maxDets)
cats = coco_groundtruth.loadCats(Evaluation.params.catIds)
print('K (classes):', [c['name'] for c in cats])


T (IoUs): [0.75]
A (areas): ['all', 'small', 'medium', 'large']
M (maxDets): [100, 200, 4000]
K (classes): ['individual_tree', 'group_of_trees']


In [97]:
scores = Evaluation.eval['scores'][t, :, k, a, m][mask]  # confidence per PR point

scores
#That’s all those indices mean:

# t → which IoU threshold,

# ( : ) on axis R → sweep recall points for the PR curve,

# k → which class,

# a → which area bin,

# m → which maxDet setting.

array([0.98851, 0.96049, 0.94744, 0.93647, 0.92645, 0.90846, 0.87486,
       0.76867, 0.32044, 0.     , 0.     , 0.     , 0.     , 0.     ,
       0.     , 0.     , 0.     , 0.     , 0.     , 0.     , 0.     ,
       0.     , 0.     , 0.     , 0.     , 0.     , 0.     , 0.     ,
       0.     , 0.     , 0.     , 0.     , 0.     , 0.     , 0.     ,
       0.     , 0.     , 0.     , 0.     , 0.     , 0.     , 0.     ,
       0.     , 0.     , 0.     , 0.     , 0.     , 0.     , 0.     ,
       0.     , 0.     , 0.     , 0.     , 0.     , 0.     , 0.     ,
       0.     , 0.     , 0.     , 0.     , 0.     , 0.     , 0.     ,
       0.     , 0.     , 0.     , 0.     , 0.     , 0.     , 0.     ,
       0.     , 0.     , 0.     , 0.     , 0.     , 0.     , 0.     ,
       0.     , 0.     , 0.     , 0.     , 0.     , 0.     , 0.     ,
       0.     , 0.     , 0.     , 0.     , 0.     , 0.     , 0.     ,
       0.     , 0.     , 0.     , 0.     , 0.     , 0.     , 0.     ,
       0.     , 0.  

In [100]:
from pathlib import Path

REPO_ROOT

img_dir = Path(REPO_ROOT/ "data/processed/images/test")
lab_dir = Path(REPO_ROOT/"data/processed/labels/test")

img_stems = {p.stem for p in img_dir.glob("*.*")}     # *.tif, *.png, etc.
lab_stems = {p.stem for p in lab_dir.glob("*.txt")}

print("Labels with no image:", sorted(lab_stems - img_stems))
print("Images with no label:", sorted(img_stems - lab_stems))


Labels with no image: []
Images with no label: ['.DS_Store']


In [102]:
import cv2
from pathlib import Path

bad = []
for p in Path(REPO_ROOT/"data/processed/images/test").glob("*.*"):
    im = cv2.imread(str(p), cv2.IMREAD_UNCHANGED)
    if im is None:
        bad.append((p.name, "cv2.imread returned None"))
    else:
        h, w = im.shape[:2]
        if h == 0 or w == 0:
            bad.append((p.name, f"zero shape {im.shape}"))

print("Problem images:", bad)


Problem images: [('.DS_Store', 'cv2.imread returned None')]
