In [1]:
# manual_subset_v3.py
import math
import shutil
from pathlib import Path

import numpy as np
import pandas as pd

# ========= CONFIG =========
# Input: the v3 label table (edit the path if your layout differs).
LABELS_CSV = Path("../data/labels_v3.csv")

# Outputs: the manual-annotation sheet and the folder receiving image copies.
OUT_CSV = Path("../data/manual/v3_manual_subset.csv")
OUT_IMGDIR = Path("../data/manual/v3_manual_subset_images")

# Every rel_path value is resolved as IMAGES_ROOT / rel_path.
IMAGES_ROOT = Path("../data")

# Number of rows to extract; must lie between 80 and 100 inclusive.
N = 100

# Candidate stratification keys, most specific first. The first entry whose
# columns are all present in the label table is used; simpler ones are fallbacks.
PREFERRED_GROUP_COLS = [
    # multi-label style
    ["source_dataset", "race_multi", "gender_multi"],
    # categorical style
    ["source_dataset", "race_cat", "gender_cat"],
    ["race_multi", "gender_multi"],
    ["race_cat", "gender_cat"],
    ["source_dataset"],
]

# Columns carried over into the manual sheet; any that are absent are skipped.
KEEP_BASE_COLS = [
    "image_id",
    "rel_path",
    "source_dataset",
    "race_multi",
    "race_cat",
    "gender_multi",
    "gender_cat",
    "age_cat",
    "skin_tone",
    "split",
]

# Annotation columns added empty, to be filled in by a human annotator.
ANNOTATION_COLS = [
    "cultural_markers",
    "ambiguous_mixed",
    "prefer_not_to_label",
    "unknown_uncertain",
    "conf_race",
    "conf_gender",
    "conf_skin",
    "annotation_notes",
]
# ==========================

def pick_group_cols(df: pd.DataFrame):
    """Return the first stratification key set fully present in ``df``.

    Candidates are tried in the order they appear in ``PREFERRED_GROUP_COLS``.
    An empty list is returned when no candidate has all of its columns in
    ``df``, which callers treat as "do not stratify".
    """
    available = df.columns
    usable = (
        candidate
        for candidate in PREFERRED_GROUP_COLS
        if all(name in available for name in candidate)
    )
    return next(usable, [])  # no stratification

def stratified_sample(df: pd.DataFrame, group_cols, n_target: int, seed: int = 42):
    """Draw about ``n_target`` rows from ``df``, proportionally per group.

    Each distinct combination of values in ``group_cols`` (missing values form
    their own group) receives a quota proportional to its share of the rows.
    Quotas are floored, and the slots lost to flooring are handed to the groups
    with the largest fractional parts (largest-remainder allocation).

    All row bookkeeping is positional, so the function works for any index on
    ``df`` (non-default, non-monotonic, or with duplicate labels). The returned
    rows keep their original index labels.

    Args:
        df: Table to sample from.
        group_cols: List of column names to stratify by. An empty list means a
            plain random sample.
        n_target: Desired number of rows.
        seed: Random state passed to every ``sample`` call.

    Returns:
        A DataFrame with at most ``n_target`` rows of ``df`` (fewer only when
        ``df`` itself has fewer rows).
    """
    if not group_cols:
        # simple random sample if no group columns available
        return df.sample(min(n_target, len(df)), random_state=seed)

    # Group sizes; zero-row groups (possible with categorical keys) are dropped.
    counts = df.groupby(group_cols, dropna=False).size().reset_index(name="count")
    counts = counts[counts["count"] > 0].copy()

    total = counts["count"].sum()
    if total == 0:
        return df.sample(min(n_target, len(df)), random_state=seed)

    # Initial quota (floor) and the number of slots still to hand out.
    counts["quota_float"] = counts["count"] / total * n_target
    counts["quota"] = counts["quota_float"].apply(math.floor)
    remainder = n_target - counts["quota"].sum()

    # Give the remaining slots to the groups with the largest fractional parts.
    # A stable sort keeps tie-breaking deterministic (ties keep group order).
    counts["frac"] = counts["quota_float"] - counts["quota"]
    counts = counts.sort_values("frac", ascending=False, kind="mergesort")
    counts.iloc[:max(0, remainder), counts.columns.get_loc("quota")] += 1

    # Sample row *positions* within each group, capped at the group's size.
    n_rows = len(df)
    group_keys = counts[list(group_cols)].itertuples(index=False, name=None)
    picked = []
    for key, quota in zip(group_keys, counts["quota"]):
        mask = np.ones(n_rows, dtype=bool)
        for col_name, value in zip(group_cols, key):
            column = df[col_name]
            # NaN never compares equal to itself, so match missing keys via isna().
            hit = column.isna() if pd.isna(value) else column.eq(value)
            # Plain numpy booleans avoid any index alignment between mask and df.
            mask &= hit.to_numpy(dtype=bool, na_value=False)
        positions = np.flatnonzero(mask)
        take = min(int(quota), len(positions))
        if take > 0:
            picked.append(
                pd.Series(positions).sample(take, random_state=seed).to_numpy()
            )
    chosen = np.concatenate(picked) if picked else np.empty(0, dtype=np.intp)

    # If we fell short (a group was smaller than its quota), top up randomly
    # from the rows not chosen yet.
    shortfall = n_target - len(chosen)
    if shortfall > 0:
        leftover = np.setdiff1d(np.arange(n_rows), chosen)
        top_up_n = min(shortfall, len(leftover))
        if top_up_n > 0:
            extra = pd.Series(leftover).sample(top_up_n, random_state=seed).to_numpy()
            chosen = np.concatenate([chosen, extra])

    # If we overshot (shouldn't happen, but just in case), downsample.
    if len(chosen) > n_target:
        chosen = pd.Series(chosen).sample(n_target, random_state=seed).to_numpy()

    return df.iloc[chosen]

def main():
    """Build the manual-annotation subset: sample rows, write the sheet, copy images.

    Steps:
      1. Validate ``N`` and load ``LABELS_CSV``.
      2. Draw a stratified sample of ``N`` rows.
      3. Write ``OUT_CSV`` with the kept base columns plus blank annotation columns.
      4. Copy each referenced image into ``OUT_IMGDIR`` (flat, by file name).
      5. Print a short summary, including per-stratum counts.

    Raises:
        ValueError: if ``N`` is not between 80 and 100 inclusive.
        FileNotFoundError: if ``LABELS_CSV`` does not exist.
        KeyError: if the label table lacks ``image_id`` or ``rel_path``.
    """
    # Raise explicitly: an assert would be stripped when Python runs with -O.
    if not 80 <= N <= 100:
        raise ValueError(f"N must be between 80 and 100; got {N}")

    # Load labels_v3
    if not LABELS_CSV.exists():
        raise FileNotFoundError(f"Could not find {LABELS_CSV}")
    df = pd.read_csv(LABELS_CSV)

    # Minimal required columns
    for col in ["image_id", "rel_path"]:
        if col not in df.columns:
            raise KeyError(f"labels_v3.csv is missing required column: '{col}'")

    group_cols = pick_group_cols(df)
    subset = stratified_sample(df, group_cols, N)

    # Build manual sheet with base + blank annotation columns
    present_keep_cols = [c for c in KEEP_BASE_COLS if c in subset.columns]
    manual_df = subset[present_keep_cols].copy()
    for col in ANNOTATION_COLS:
        manual_df[col] = ""  # leave blank for human entry

    # Ensure output dirs
    OUT_CSV.parent.mkdir(parents=True, exist_ok=True)
    manual_df.to_csv(OUT_CSV, index=False)

    # Copy images into a side folder (optional; disable by commenting the block)
    OUT_IMGDIR.mkdir(parents=True, exist_ok=True)
    copied, missing, collisions = 0, 0, 0
    claimed = {}  # destination file name -> source path that owns it in this run
    for rel in manual_df["rel_path"]:
        # A blank CSV cell is read as NaN (a float); treat it as a missing image.
        if not isinstance(rel, str) or not rel.strip():
            missing += 1
            continue
        src = (IMAGES_ROOT / rel).resolve()
        # is_file() also rejects directories, which copy2 cannot handle.
        if not src.is_file():
            missing += 1
            continue
        # Images are stored flat by file name, so two different sources that
        # share a name would overwrite each other; report instead of hiding it.
        owner = claimed.setdefault(src.name, src)
        if owner != src:
            collisions += 1
            continue
        dst = OUT_IMGDIR / src.name
        if not dst.exists():
            shutil.copy2(src, dst)
        copied += 1

    # Print a short summary so you can sanity-check the balance
    print("✅ Wrote:", OUT_CSV)
    print(f"🧾 Rows: {len(manual_df)} | Copied images: {copied} | Missing files: {missing}")
    if collisions:
        print(f"Skipped {collisions} image(s) whose file name clashes with another sampled image")
    summary_cols = [c for c in group_cols if c in manual_df.columns]
    if summary_cols:
        print("\nStratification counts:")
        # dropna=False mirrors the sampling step, so NaN strata are shown too.
        print(manual_df.groupby(summary_cols, dropna=False).size())

if __name__ == "__main__":
    main()


✅ Wrote: ../data/manual/v3_manual_subset.csv
🧾 Rows: 100 | Copied images: 59 | Missing files: 41

Stratification counts:
source_dataset  race_cat    gender_cat
FairFace        Black       Female        13
                            Male          14
                White       Female        13
                            Male          19
UTKFace         Black       Female         3
                            Male           3
                EastAsian   Female         8
                            Male           7
                SouthAsian  Female         7
                            Male           7
                White       Female         3
                            Male           3
dtype: int64
