In [5]:
# === Cell 0: Imports + Project Paths ===

import os
from pathlib import Path
import numpy as np
import pandas as pd
import cv2
from tqdm import tqdm
from skimage.feature import hog

# ---------------------------------------
# ✅ Base project folder
# ---------------------------------------
# The original checkout location stays the default, so existing runs behave
# exactly as before. Set the DS203_PROJECT_DIR environment variable to point
# the notebook at a different checkout without editing this cell.
_DEFAULT_BASE = r"c:\Users\aryan\OneDrive\Desktop\SEM 3\DS 203\DS203-Wildlife-Detection-Project"
BASE = Path(os.environ.get("DS203_PROJECT_DIR") or _DEFAULT_BASE)

# ---------------------------------------
# ✅ Data directories
# ---------------------------------------
DATA = BASE / "data"
PROC_DIR = DATA / "processed_images"      # processed 800x600 images
LABELS_CSV = DATA / "labels.csv"          # final labels file
SPLITS_DIR = DATA / "splits"              # where train/val/test CSVs are stored

# ---------------------------------------
# ✅ Output directory for feature arrays
# ---------------------------------------
# Created on the spot (including parents) so later np.save calls cannot fail
# on a missing folder.
FEAT_DIR = DATA / "features"
FEAT_DIR.mkdir(parents=True, exist_ok=True)

# ---------------------------------------
# ✅ Grid / cell parameters
# ---------------------------------------
IMG_W, IMG_H = 800, 600          # processed image size
CELL_W, CELL_H = IMG_W // 8, IMG_H // 8   # 8x8 grid -> 100x75 pixels per cell

# ---------------------------------------
# ✅ HOG + Histogram config
# ---------------------------------------
# Keyword arguments forwarded verbatim to skimage.feature.hog.
HOG_PARAMS = dict(
    orientations=9,
    pixels_per_cell=(8, 8),
    cells_per_block=(2, 2),
    block_norm='L2-Hys',
    feature_vector=True
)

HIST_BINS = 16   # bins per channel for color hist (RGB + HSV = 6*16 = 96 dims)

# ---------------------------------------
# ✅ Verify paths
# ---------------------------------------
print("BASE:        ", BASE)
print("DATA:        ", DATA, "exists:", DATA.exists())
print("Processed:   ", PROC_DIR, "exists:", PROC_DIR.exists())
print("Labels CSV:  ", LABELS_CSV, "exists:", LABELS_CSV.exists())
print("Splits DIR:  ", SPLITS_DIR, "exists:", SPLITS_DIR.exists())
print("Features DIR:", FEAT_DIR, "exists:", FEAT_DIR.exists())


BASE:         c:\Users\aryan\OneDrive\Desktop\SEM 3\DS 203\DS203-Wildlife-Detection-Project
DATA:         c:\Users\aryan\OneDrive\Desktop\SEM 3\DS 203\DS203-Wildlife-Detection-Project\data exists: True
Processed:    c:\Users\aryan\OneDrive\Desktop\SEM 3\DS 203\DS203-Wildlife-Detection-Project\data\processed_images exists: True
Labels CSV:   c:\Users\aryan\OneDrive\Desktop\SEM 3\DS 203\DS203-Wildlife-Detection-Project\data\labels.csv exists: True
Splits DIR:   c:\Users\aryan\OneDrive\Desktop\SEM 3\DS 203\DS203-Wildlife-Detection-Project\data\splits exists: True
Features DIR: c:\Users\aryan\OneDrive\Desktop\SEM 3\DS 203\DS203-Wildlife-Detection-Project\data\features exists: True


In [6]:
# === Cell 1: Load labels + splits ===
import pandas as pd

# Read the per-image label table
labels = pd.read_csv(LABELS_CSV)

# The rest of the notebook addresses the filename column as 'image';
# rename the first column whose lower-cased name is a known alias.
if "image" not in labels.columns:
    known_aliases = ["imagefilename","image_file_name","filename","image","image_name","file","file_name"]
    candidates = [col for col in labels.columns if col.lower() in known_aliases]
    if not candidates:
        raise ValueError("Couldn't find filename column in labels.csv")
    labels = labels.rename(columns={candidates[0]: "image"})

# One label column per grid cell is required: c01 .. c64
cell_cols = [f"c{i:02d}" for i in range(1, 65)]
missing = [col for col in cell_cols if col not in labels.columns]
if missing:
    raise ValueError(f"labels.csv missing columns: {missing[:5]} ...")


def _read_split(csv_name):
    """Return column 0 of a header-less CSV under SPLITS_DIR as a Python list."""
    return pd.read_csv(SPLITS_DIR / csv_name, header=None)[0].tolist()


# Image-name lists for each split (stored inside data/splits)
train_imgs = _read_split("train_images.csv")
val_imgs   = _read_split("val_images.csv")
test_imgs  = _read_split("test_images.csv")

print(f"Split sizes  | train={len(train_imgs)}  val={len(val_imgs)}  test={len(test_imgs)}")
print("Example files:", train_imgs[:3])


Split sizes  | train=383  val=21  test=22
Example files: ['CIMG0254.JPG', 'CIMG0191.JPG', 'CIMG0975.JPG']


In [7]:
# === Cell 2: Feature extraction (HOG + color histograms) ===
import numpy as np
import cv2
from skimage.feature import hog

def _to_bgr(img):
    """Normalise a decoded image array to 3-channel BGR.

    * ``None`` is passed straight through.
    * A 2-D (grayscale) array is expanded with ``COLOR_GRAY2BGR``.
    * A 4-channel array is converted with ``COLOR_BGRA2BGR``.
    * Any other array is returned unchanged.

    The dtype is never altered here.
    """
    if img is not None:
        if img.ndim == 2:
            img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
        elif img.shape[2] == 4:
            img = cv2.cvtColor(img, cv2.COLOR_BGRA2BGR)
    return img

def color_hist_feats(bgr, bins=HIST_BINS):
    """Per-channel colour histograms in RGB and HSV, concatenated.

    Parameters
    ----------
    bgr : ndarray
        3-channel BGR image. Assumed to be 8-bit, since every channel is
        binned over the fixed range (0, 255).
    bins : int
        Number of histogram bins per channel.

    Returns
    -------
    ndarray of float32, length ``6 * bins``
        Density-normalised histograms ordered [R | G | B | H | S | V].

    Notes
    -----
    Hue is taken from ``COLOR_BGR2HSV_FULL``. Plain ``COLOR_BGR2HSV`` stores
    8-bit hue as 0..179, so binning it over 0..255 leaves the top hue bins
    permanently empty. The ``_FULL`` variant rescales hue to 0..255 so every
    hue bin is reachable and all six channels share one range. Feature arrays
    produced with the half-range hue encoding are not comparable and should
    be regenerated.
    """
    rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
    hsv = cv2.cvtColor(bgr, cv2.COLOR_BGR2HSV_FULL)
    feats = []
    for space in (rgb, hsv):
        for ch in range(3):
            h, _ = np.histogram(space[:, :, ch], bins=bins, range=(0, 255), density=True)
            feats.append(h.astype(np.float32))
    return np.concatenate(feats)

def hog_feats(bgr, hog_params=HOG_PARAMS):
    """Compute the HOG descriptor of a BGR crop.

    The crop is converted to grayscale and handed to ``hog`` with
    ``hog_params`` expanded as keyword arguments; the result is returned
    as float32.
    """
    descriptor = hog(cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY), **hog_params)
    return descriptor.astype(np.float32)

def extract_features(cell_bgr):
    """Build the feature vector for one crop: colour histograms, then HOG.

    Returns ``None`` when the crop is ``None``; otherwise a 1-D float32
    array laid out as [color_hist (6 * HIST_BINS values) | HOG].
    """
    bgr = _to_bgr(cell_bgr)
    if bgr is None:
        return None
    parts = (
        color_hist_feats(bgr, bins=HIST_BINS),
        hog_feats(bgr, hog_params=HOG_PARAMS),
    )
    return np.concatenate(parts).astype(np.float32)


In [8]:
# === Cell 3: 8x8 grid slicing ===
def slice_cells(img):
    """Cut a processed image into its 8x8 grid of crops.

    Returns a list of 64 crops, each CELL_H rows by CELL_W columns, ordered
    row by row (all columns of grid row 0 first, then grid row 1, ...).

    Raises ValueError when the image is not exactly IMG_W x IMG_H.
    """
    h, w = img.shape[:2]
    if (w, h) != (IMG_W, IMG_H):
        raise ValueError(f"Processed image must be {IMG_W}x{IMG_H}, got {w}x{h}")
    return [
        img[row * CELL_H:(row + 1) * CELL_H, col * CELL_W:(col + 1) * CELL_W]
        for row in range(8)
        for col in range(8)
    ]


In [9]:
# === Cell 4: Build dataset (features X, labels y) for a list of images ===
from tqdm import tqdm

def build_dataset(image_list, labels_df):
    """Build the per-cell feature matrix and label vector for a list of images.

    Each image named in ``image_list`` is read from PROC_DIR, cut into 64 grid
    cells with ``slice_cells``, and every cell is turned into a feature vector
    with ``extract_features``. The label of each cell comes from columns
    c01..c64 of the image's (first) row in ``labels_df``.

    An image is skipped, and counted in a warning printed at the end, when:
      * its file does not exist under PROC_DIR,
      * ``cv2.imread`` cannot decode it (returns None), or
      * ``labels_df`` has no row for it.

    Parameters
    ----------
    image_list : iterable of str
        Image file names relative to PROC_DIR.
    labels_df : pandas.DataFrame
        Must contain an 'image' column and label columns c01..c64.

    Returns
    -------
    X : ndarray of float32
        One row per extracted cell.
    y : ndarray of int64
        One label per row of ``X``.

    Raises
    ------
    ValueError
        If no cell features could be extracted at all (nothing to stack),
        or - raised by ``slice_cells`` - if an image has an unexpected size.
    """
    # Column names are the same for every image, so build the list once.
    label_cols = [f"c{i:02d}" for i in range(1, 65)]

    X, y = [], []
    not_found = 0
    unreadable = 0
    unlabeled = 0

    for img_name in tqdm(image_list):
        img_path = PROC_DIR / img_name
        if not img_path.exists():
            not_found += 1
            continue

        img = cv2.imread(str(img_path))
        if img is None:
            # imread reports a decode failure by returning None instead of
            # raising; count it so the skip is visible in the summary.
            unreadable += 1
            continue

        cells = slice_cells(img)

        row = labels_df[labels_df["image"] == img_name]
        if row.empty:
            unlabeled += 1
            continue
        # If the image appears more than once in labels_df, the first row wins.
        cell_labels = row.iloc[0][label_cols].to_numpy(dtype=int)

        for cell_img, lab in zip(cells, cell_labels):
            feats = extract_features(cell_img)
            if feats is None:
                continue
            X.append(feats)
            y.append(lab)

    if not_found:
        print(f"[WARN] {not_found} files listed in split not found under processed_images/")
    if unreadable:
        print(f"[WARN] {unreadable} files could not be decoded by cv2.imread and were skipped")
    if unlabeled:
        print(f"[WARN] {unlabeled} files had no label row in labels.csv")

    if not X:
        # np.vstack([]) would raise a ValueError with an opaque message;
        # raise the same exception type with an actionable explanation.
        raise ValueError(
            "No features extracted: every listed image was missing, unreadable or unlabeled"
        )

    X = np.vstack(X).astype(np.float32)
    y = np.array(y, dtype=np.int64)
    return X, y


In [10]:
# === Cell 5: Run + save ===
# Extract per-cell features and labels for each split: train, val, test.
X_train, y_train = build_dataset(train_imgs, labels)
X_val, y_val = build_dataset(val_imgs, labels)
X_test, y_test = build_dataset(test_imgs, labels)

# Persist every array as its own .npy file under FEAT_DIR.
for npy_name, array in (
    ("X_train.npy", X_train),
    ("y_train.npy", y_train),
    ("X_val.npy", X_val),
    ("y_val.npy", y_val),
    ("X_test.npy", X_test),
    ("y_test.npy", y_test),
):
    np.save(FEAT_DIR / npy_name, array)

# Summary: output location, matrix shapes and mean label value per split.
print("Saved to:", FEAT_DIR)
print("Shapes  | X_train", X_train.shape, "| X_val", X_val.shape, "| X_test", X_test.shape)
print("PosRate | train", y_train.mean().round(4), "| val", y_val.mean().round(4), "| test", y_test.mean().round(4))


100%|██████████| 383/383 [01:58<00:00,  3.23it/s]
100%|██████████| 21/21 [00:06<00:00,  3.26it/s]
100%|██████████| 22/22 [00:06<00:00,  3.30it/s]


Saved to: c:\Users\aryan\OneDrive\Desktop\SEM 3\DS 203\DS203-Wildlife-Detection-Project\data\features
Shapes  | X_train (24512, 3264) | X_val (1344, 3264) | X_test (1408, 3264)
PosRate | train 0.2681 | val 0.2507 | test 0.2784


In [11]:
# === Cell 6: Sanity checks (optional) ===
# Run the pipeline on the first grid cell of the first training image and
# report the length of its feature vector.
sample = train_imgs[0]
im = cv2.imread(str(PROC_DIR / sample))
cells = slice_cells(im)
f0 = extract_features(cells[0])

print("One cell feature length:", len(f0))
# The vector must be 1-D and free of NaN/inf values.
assert f0.ndim == 1 and np.isfinite(f0).all(), "Feature vector has NaNs/inf"

# Quick look at the value distribution, each statistic rounded to 4 decimals.
f0_mean, f0_std, f0_min, f0_max = (
    stat.round(4) for stat in (f0.mean(), f0.std(), f0.min(), f0.max())
)
print("Feature stats | mean:", f0_mean, "std:", f0_std, "min:", f0_min, "max:", f0_max)


One cell feature length: 3264
Feature stats | mean: 0.1345 std: 0.0942 min: 0.0 max: 0.3534
