# Task 3 — YOLOv8 Enrichment (Run module: `src/yolo_detect.py`)

This notebook imports and runs:
- `D:\Python\Week 8\Shipping-a-Data-Product\src\yolo_detect.py`

It generates:
- `D:\Python\Week 8\Shipping-a-Data-Product\data\processed\yolo\detections.csv`

Then validates the CSV schema and prints quick summaries for the report.

## 0) Config

In [None]:
import os
import sys
from pathlib import Path

import pandas as pd

def resolve_project_root(default, env_var="SHIPPING_DATA_PRODUCT_ROOT"):
    """Return the project root, preferring an environment-variable override.

    Parameters
    ----------
    default : str or Path
        Location used when ``env_var`` is unset or contains only whitespace.
    env_var : str
        Name of the environment variable consulted for an override.

    Returns
    -------
    Path
        ``Path`` built from the (stripped) value of ``env_var`` when it is
        non-blank, otherwise ``Path(default)``.
    """
    override = os.environ.get(env_var, "").strip()
    return Path(override) if override else Path(default)


# Default checkout location. Set SHIPPING_DATA_PRODUCT_ROOT to run the notebook
# from another machine or directory without editing this cell.
# The literal is a raw string, so each separator is a single backslash.
PROJECT_ROOT = resolve_project_root(r"D:\Python\Week 8\Shipping-a-Data-Product")
SRC_DIR = PROJECT_ROOT / "src"

# Input images and the detections CSV passed to yolo_detect.run().
IMAGES_ROOT = PROJECT_ROOT / "data" / "raw" / "images"
CSV_PATH = PROJECT_ROOT / "data" / "processed" / "yolo" / "detections.csv"

# Values forwarded to yolo_detect.run() as model_path= and conf=.
MODEL_PATH = "yolov8n.pt"
CONF = 0.25

# Echo the resolved paths so a wrong root is visible before detection runs.
print("PROJECT_ROOT:", PROJECT_ROOT)
print("SRC_DIR:", SRC_DIR)
print("IMAGES_ROOT:", IMAGES_ROOT)
print("CSV_PATH:", CSV_PATH)


## 1) Import `yolo_detect` from `src/`

In [None]:
import importlib

# Add src/ to the front of the module search path (only if it is not already
# listed) so that `yolo_detect` can be imported by name.
src_dir_entry = str(SRC_DIR)
if src_dir_entry not in sys.path:
    sys.path.insert(0, src_dir_entry)

yolo_detect = importlib.import_module("yolo_detect")

# Report what was loaded and whether it exposes the run() entry point.
for label, value in (
    ("Imported module:", yolo_detect),
    ("Module file:", yolo_detect.__file__),
    ("Has run():", hasattr(yolo_detect, "run")),
):
    print(label, value)


## 2) Run YOLO detection (calls `yolo_detect.run`)

In [None]:
# Call the module's run() with the settings from the config cell.
run_kwargs = dict(
    images_root=IMAGES_ROOT,
    out_csv=CSV_PATH,
    model_path=MODEL_PATH,
    conf=CONF,
)
out_path = yolo_detect.run(**run_kwargs)

print("\nReturned path:", out_path)

# Only stat the file when it exists; otherwise report the size as None.
csv_exists = CSV_PATH.exists()
csv_size = CSV_PATH.stat().st_size if csv_exists else None
print("CSV exists:", csv_exists)
print("CSV size (bytes):", csv_size)


## 3) Validate CSV schema

Your module writes CSV with comma delimiter by default (`df.to_csv(out_csv, index=False)`).

Expected columns (12):
- image_path, channel_name, message_id, detected_class, confidence_score,
  bbox_x1, bbox_y1, bbox_x2, bbox_y2, image_category, model_name, inference_ts

In [None]:
# Required columns of detections.csv, in the expected order.
EXPECTED_COLS = [
    "image_path", "channel_name", "message_id",
    "detected_class", "confidence_score",
    "bbox_x1", "bbox_y1", "bbox_x2", "bbox_y2",
    "image_category", "model_name", "inference_ts",
]

df = pd.read_csv(CSV_PATH)  # comma-separated
actual_cols = list(df.columns)
print("Rows:", len(df))
print("Columns:", actual_cols)

# Compare both directions, preserving the order of the list being scanned.
missing = [col for col in EXPECTED_COLS if col not in actual_cols]
extra = [col for col in actual_cols if col not in EXPECTED_COLS]

print("Missing:", missing)
print("Extra:", extra)

# Only missing columns fail the check; extra columns are just reported above.
assert not missing, f"Missing required columns: {missing}"
df.head(10)

## 4) Quick summaries for report

In [None]:
print("Image category distribution:")
# dropna=False keeps rows with a missing category in the tally.
category_counts = df["image_category"].value_counts(dropna=False)
display(category_counts)


In [None]:
print("Top detected classes:")
# Ignore rows without a detected class, then count rows per class.
detections_with_class = df.dropna(subset=["detected_class"])
class_counts = (
    detections_with_class.groupby("detected_class").size().reset_index(name="n")
)
# Keep the 20 most frequent classes.
top_classes = class_counts.sort_values("n", ascending=False).head(20)
display(top_classes)


In [None]:
print("Channel visual content volume (unique images):")
# Collapse to one row per image so each image is counted once per channel.
img_level = df.drop_duplicates(subset=["image_path"])
images_per_channel = img_level.groupby("channel_name").size()
display(images_per_channel.sort_values(ascending=False))


## Notes
- If YOLO weights download is slow the first time, that is expected.
- If you see an error like 'No images found', confirm images exist under:
  `data/raw/images/<channel>/<message_id>.jpg`.
