In [1]:
# build_subset_robust.py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

TARGET_TOTAL = 900        # rows wanted in the final subset
FAIRFACE_RATIO = 0.60     # 60/40 split FairFace/UTKFace
SEED = 42                 # shared random_state for every sampling / splitting call

# Races you care about (adjust if needed)
RACES = ['White','Black','EastAsian','SouthAsian','SoutheastAsian','MiddleEastern','Latino']

# 1) Load pool
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, so a column such as image_id cannot end up holding
# a mix of ints and strings.
# NOTE(review): the recorded cell output ends with a warning pointing at this
# call - presumably a DtypeWarning; confirm against the real pool.csv.
pool = pd.read_csv("../data/pool.csv", low_memory=False)
pool = pool[pool['race_cat'].isin(RACES)].copy()

# Sanity: make a unique key for safety (image_id should be unique already)
#assert pool['image_id'].is_unique, "image_id not unique; check pool building."

# 2) Decide per-source totals
N_ff  = int(TARGET_TOTAL * FAIRFACE_RATIO)
N_utk = TARGET_TOTAL - N_ff

# 3) Decide per-race quotas
#    Heuristic: equal quota per race overall, but races that only exist in FairFace
#    (e.g., SoutheastAsian, MiddleEastern, Latino) are pulled entirely from FairFace.
#    The first `remainder` races each get one extra row so the quotas sum to
#    TARGET_TOTAL exactly.
base_quota, remainder = divmod(TARGET_TOTAL, len(RACES))
per_race_total = {
    r: base_quota + (1 if i < remainder else 0)
    for i, r in enumerate(RACES)
}

# Row counts of the filtered pool: one row per source_dataset, one column per race_cat.
counts_by_source = pool.groupby(['source_dataset','race_cat']).size().unstack(fill_value=0)


def _available(source, race):
    """Return how many pool rows exist for (source, race); 0 if either label is absent."""
    if source in counts_by_source.index and race in counts_by_source.columns:
        return int(counts_by_source.at[source, race])
    return 0


# Which races are “FF-only” in practice? (UTK may have 0 count for some)
ff_only = [r for r in RACES if _available('UTKFace', r) == 0]

# 4) Split the per-race quota into per-source race quotas
per_race_ff  = {}
per_race_utk = {}
for r in RACES:
    t = per_race_total[r]
    ff_avail = _available('FairFace', r)
    if r in ff_only:
        # Nothing to draw from UTKFace: take as much of the quota as FairFace has.
        per_race_ff[r]  = min(t, ff_avail)
        per_race_utk[r] = 0
        continue

    # default: 60/40 split, but cap by availability
    utk_avail = _available('UTKFace', r)
    ff_q  = min(int(round(t * FAIRFACE_RATIO)), ff_avail)
    # UTKFace covers whatever FairFace did not, so a FairFace shortage is
    # already absorbed here (up to UTKFace availability).
    utk_q = min(t - ff_q, utk_avail)

    shortfall = t - (ff_q + utk_q)
    if shortfall > 0:
        # A shortfall means utk_q == utk_avail (UTKFace is exhausted for this
        # race), so only spare FairFace rows can close the gap.
        ff_q += min(shortfall, ff_avail - ff_q)

    per_race_ff[r], per_race_utk[r] = ff_q, utk_q

# 5) Initial per-race sampling by source
# NOTE: `rng` is not referenced by the code visible in this cell (sampling uses
# SEED directly); it is kept in case other cells rely on it.
rng = np.random.RandomState(SEED)
def sample_quota(df, n):
    """
    Return at most `n` rows of `df`.

    `n` is truncated to an int and negative values are treated as 0. When `df`
    has no more rows than the quota, a copy of the whole frame is returned;
    otherwise `n` rows are drawn with the module-level SEED as random_state.
    """
    quota = int(n)
    if quota < 0:
        quota = 0

    # Not enough rows to choose from: hand back everything (as a copy).
    if quota >= len(df):
        return df.copy()

    return df.sample(n=quota, random_state=SEED)

# Boolean masks over `pool`, built once and combined per (source, race) below.
is_fairface = pool['source_dataset'] == 'FairFace'
is_utkface  = pool['source_dataset'] == 'UTKFace'
race_masks  = {race: pool['race_cat'] == race for race in RACES}

# One draw per race and source, in RACES order, each capped at its quota.
picked_ff = [
    sample_quota(pool[is_fairface & race_masks[race]], per_race_ff[race])
    for race in RACES
]
picked_utk = [
    sample_quota(pool[is_utkface & race_masks[race]], per_race_utk[race])
    for race in RACES
]

# FairFace draws first, then UTKFace; keep the first row seen for each image_id.
subset = pd.concat([*picked_ff, *picked_utk], ignore_index=True).drop_duplicates('image_id')

# 6) Top-up if we’re short overall or per-source
def top_up(source_name, need, subset, pool, seed=42):
    """
    Add up to `need` rows from `pool` for a given `source_name`,
    excluding any image_id already in `subset`.

    Parameters
    ----------
    source_name : value matched against pool['source_dataset'].
    need        : number of extra rows wanted; <= 0 returns `subset` as-is.
    subset      : DataFrame of rows picked so far (must have 'image_id').
    pool        : DataFrame of all candidates ('image_id', 'source_dataset').
    seed        : random_state used for the draw.

    Returns
    -------
    `subset` itself when nothing can be added, otherwise a new re-indexed
    DataFrame with the extra rows appended. Neither input frame is modified
    and the image_id column keeps its original dtype.
    """
    if need <= 0:
        return subset

    # Compare ids as strings so an int/str mismatch between the two frames
    # cannot defeat the anti-join - but only for the comparison: the columns
    # themselves are left untouched so later dedup steps see consistent values.
    taken_ids = set(subset['image_id'].astype(str))
    pool_keys = pool['image_id'].astype(str)
    eligible = ((pool['source_dataset'] == source_name) & ~pool_keys.isin(taken_ids)).to_numpy()

    candidates = pool[eligible]
    # Keep one row per image_id so every sampled row adds a new id; otherwise
    # duplicates would be sampled and then dropped, leaving us short.
    candidates = candidates[~pool_keys[eligible].duplicated().to_numpy()]

    take = min(int(need), len(candidates))
    if take <= 0:
        return subset

    add = candidates.sample(n=take, random_state=seed)
    return pd.concat([subset, add], ignore_index=True).drop_duplicates('image_id')


def count_source(df, name):
    """Return how many rows of `df` have `source_dataset` equal to `name`."""
    matches = df['source_dataset'] == name
    return matches.sum()

def finalize_subset_exact(subset, pool, target_total, seed=42):
    """
    Ensure subset has exactly target_total rows.
    1) If short, fill from ANY remaining pool rows (no race/source constraints).
    2) If still short (pool exhausted), return best effort and warn.
    3) If over, downsample to exact target.

    Parameters
    ----------
    subset       : DataFrame of rows picked so far (must have 'image_id').
    pool         : DataFrame of all candidate rows (must have 'image_id').
    target_total : desired number of rows.
    seed         : random_state for both the fill and the downsample.

    Returns
    -------
    A DataFrame with unique image_id values and `target_total` rows, or fewer
    when the pool has no unused image_id left (a warning is printed then).
    """
    subset = subset.drop_duplicates('image_id').copy()
    deficit = target_total - len(subset)

    if deficit > 0:
        # Anti-join on the string form of image_id so int/str mismatches
        # between the two frames do not let already-picked rows back in.
        taken_ids = set(subset['image_id'].astype(str))
        remaining = pool[~pool['image_id'].astype(str).isin(taken_ids).to_numpy()]
        # One candidate row per image_id: sampling duplicates would only get
        # them dropped again below and leave the subset short even though
        # unused ids were still available.
        remaining = remaining[~remaining['image_id'].astype(str).duplicated().to_numpy()]

        if not remaining.empty:
            add = remaining.sample(n=min(deficit, len(remaining)), random_state=seed)
            subset = pd.concat([subset, add], ignore_index=True).drop_duplicates('image_id')
            deficit = target_total - len(subset)

        if deficit > 0:
            print(f"⚠️ Pool exhausted for final fill; short by {deficit}. Returning best available = {len(subset)}.")
            return subset  # cannot reach target; dataset limit

    if len(subset) > target_total:
        subset = subset.sample(n=target_total, random_state=seed)

    return subset


# Exact per-source targets: how many rows each source still owes after the
# per-race draw (zero or negative means nothing is added for that source).
need_ff  = N_ff  - count_source(subset, 'FairFace')
need_utk = N_utk - count_source(subset, 'UTKFace')

subset = top_up('FairFace', need_ff, subset, pool, seed=SEED)
subset = top_up('UTKFace',  need_utk, subset, pool, seed=SEED)


# If still short of overall TARGET_TOTAL (e.g., both sources exhausted some strata), fill from any remaining pool
still_short = TARGET_TOTAL - len(subset)
if still_short > 0:
    # `subset` was rebuilt with ignore_index=True, so its index labels no longer
    # identify pool rows; dropping by index would remove unrelated pool rows and
    # keep already-picked ones. Exclude by image_id instead (compared as strings
    # so an int/str dtype mismatch between the frames cannot hide a match).
    taken_ids = set(subset['image_id'].astype(str))
    remain = pool[~pool['image_id'].astype(str).isin(taken_ids)]
    if not remain.empty:
        fill = remain.sample(min(still_short, len(remain)), random_state=SEED)
        subset = pd.concat([subset, fill], ignore_index=True).drop_duplicates('image_id')

# Trim if we overshot due to rounding
if len(subset) > TARGET_TOTAL:
    subset = subset.sample(n=TARGET_TOTAL, random_state=SEED)

# Final sanity
# Force exact size using any remaining rows (no constraints)
subset = finalize_subset_exact(subset, pool, TARGET_TOTAL, seed=SEED)

ff_ct  = count_source(subset, 'FairFace')
utk_ct = count_source(subset, 'UTKFace')

if len(subset) != TARGET_TOTAL:
    # We tried everything; pool simply doesn't have enough unique rows left.
    print(f"⚠️ Subset has {len(subset)} rows; target {TARGET_TOTAL}. "
          f"This indicates pool exhaustion after filters/dedup.")
else:
    print(f"✅ Subset size: {len(subset)}  | FairFace: {ff_ct}  | UTKFace: {utk_ct}")


# 7) Train/val/test stratified split (by race)
# 70% train, then the remaining 30% is halved into val and test (15% each).
# NOTE: stratified splitting requires enough rows per race_cat in each step;
# train_test_split raises ValueError for a class that is too small.
train, temp = train_test_split(subset, test_size=0.30, random_state=SEED, stratify=subset['race_cat'])
val,   test = train_test_split(temp,   test_size=0.50, random_state=SEED, stratify=temp['race_cat'])

def tag(df, name):
    """Return a copy of `df` with a `split` column set to `name` on every row."""
    return df.assign(split=name)
# Label each split and stack them (train, then val, then test) into one table.
labelled_parts = [
    tag(part, label)
    for part, label in ((train, 'train'), (val, 'val'), (test, 'test'))
]
master = pd.concat(labelled_parts, ignore_index=True)

# Save
master.to_csv("../data/splits/master.csv", index=False)
print("✅ data/splits/master.csv written.")

# Report the composition of what was written, by source and by race.
for column in ('source_dataset', 'race_cat'):
    print(master[column].value_counts())


✅ Subset size: 900  | FairFace: 539  | UTKFace: 361
✅ data/splits/master.csv written.
source_dataset
FairFace    539
UTKFace     361
Name: count, dtype: int64
race_cat
White         342
Black         301
EastAsian     129
SouthAsian    128
Name: count, dtype: int64


  pool = pd.read_csv("../data/pool.csv")
