# Robust Subset Builder (FairFace/UTKFace)
_Auto-generated on 2025-11-08 05:53:19_

This notebook builds a stratified subset of size `TARGET_TOTAL` from a combined pool (`data/pool.csv`) with a target source split (e.g., 60% FairFace / 40% UTKFace). It allocates equal per-race quotas, divides each quota between the two sources according to what each source has available, tops up from the pool if a source or the overall total falls short, and freezes train/val/test splits stratified by race.

## Imports & Constants

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# --- Subset configuration ---------------------------------------------------
TARGET_TOTAL = 900        # number of rows wanted in the final subset
FAIRFACE_RATIO = 0.60     # 60/40 split FairFace/UTKFace
SEED = 42                 # passed as random_state to every sample/split call

# Race labels kept from the pool (adjust if needed)
RACES = [
    'White',
    'Black',
    'EastAsian',
    'SouthAsian',
    'SoutheastAsian',
    'MiddleEastern',
    'Latino',
]

## 1) Load pool and filter races

In [None]:
# Read the combined pool and keep only the rows whose race label is listed in RACES
pool = (
    pd.read_csv('data/pool.csv')
      .loc[lambda frame: frame['race_cat'].isin(RACES)]
      .copy()
)

# Sanity check: each image_id may appear only once in the filtered pool
assert pool['image_id'].is_unique, 'image_id not unique; check pool building.'
print('Pool shape after race filter:', pool.shape)
pool.head()

## 2) Decide per-source totals

In [None]:
# Per-source targets for the whole subset.
# The product is a float, so it is rounded rather than truncated: int() alone
# would turn e.g. 100 * 0.29 (= 28.999999999999996) into 28. Rounding also
# matches how the per-race FairFace quota is derived further down.
N_ff  = int(round(TARGET_TOTAL * FAIRFACE_RATIO))
# UTKFace receives whatever is left, so the two targets always sum to TARGET_TOTAL
N_utk = TARGET_TOTAL - N_ff
print('Target totals → FairFace:', N_ff, '| UTKFace:', N_utk)

## 3) Per-race quotas and availability by source

In [None]:
# Start every race at the same floor quota ...
per_race_total = dict.fromkeys(RACES, TARGET_TOTAL // len(RACES))
# ... then hand the leftover rows, one each, to the first races in RACES order
remainder = TARGET_TOTAL - sum(per_race_total.values())
for race in RACES[:remainder]:
    per_race_total[race] = per_race_total[race] + 1
print('Per-race totals:', per_race_total)

# Availability table: one row per source dataset, one column per race (0 where absent)
counts_by_source = (
    pool.groupby(['source_dataset', 'race_cat'])
        .size()
        .unstack(fill_value=0)
)
display(counts_by_source)

# Collect the races for which UTKFace contributes no rows at all
ff_only = []
for race in RACES:
    race_counts = counts_by_source.get(race, pd.Series())
    if race_counts.get('UTKFace', 0) == 0:
        ff_only.append(race)
print('FF-only races (no UTK availability):', ff_only)

## 4) Split per-race quota into per-source quotas

In [None]:
# Divide each race's quota between FairFace and UTKFace, capped by availability
per_race_ff, per_race_utk = {}, {}
for race in RACES:
    quota = per_race_total[race]
    avail = counts_by_source.get(race, pd.Series())
    ff_avail = avail.get('FairFace', 0)

    # Races without UTKFace rows draw their whole quota from FairFace
    if race in ff_only:
        per_race_ff[race] = min(quota, ff_avail)
        per_race_utk[race] = 0
        continue

    utk_avail = avail.get('UTKFace', 0)

    # First pass: aim for the configured ratio, limited by what each source has
    ff_take = min(int(round(quota * FAIRFACE_RATIO)), ff_avail)
    utk_take = min(quota - ff_take, utk_avail)

    # Second pass: if the race is still short, use spare FairFace rows first,
    # then spare UTKFace rows
    shortfall = quota - (ff_take + utk_take)
    if shortfall > 0:
        ff_spare = max(ff_avail - ff_take, 0)
        utk_spare = max(utk_avail - utk_take, 0)
        extra_ff = min(shortfall, ff_spare)
        ff_take += extra_ff
        shortfall -= extra_ff
        if shortfall > 0:
            utk_take += min(shortfall, utk_spare)

    per_race_ff[race] = ff_take
    per_race_utk[race] = utk_take

print('Per-race FairFace quotas:', per_race_ff)
print('Per-race UTKFace quotas:', per_race_utk)

## 5) Initial per-race sampling by source

In [None]:
# Seeded NumPy RandomState.
# NOTE(review): no visible cell references `rng`; the sampling calls in this
# notebook pass random_state=SEED directly — confirm whether it is still needed.
rng = np.random.RandomState(SEED)

def sample_quota(df, n):
    """Return at most `n` rows of `df`.

    `n` is converted to int and negative values are treated as 0. When `df`
    has no more rows than requested, a copy of the whole frame is returned;
    otherwise the rows are drawn with `random_state=SEED`.
    """
    quota = max(0, int(n))
    if quota < len(df):
        return df.sample(n=quota, random_state=SEED)
    return df.copy()

# For every race, draw the FairFace and the UTKFace quota independently
picked_ff = [
    sample_quota(
        pool[(pool.source_dataset == 'FairFace') & (pool.race_cat == race)],
        per_race_ff[race],
    )
    for race in RACES
]
picked_utk = [
    sample_quota(
        pool[(pool.source_dataset == 'UTKFace') & (pool.race_cat == race)],
        per_race_utk[race],
    )
    for race in RACES
]

# Stack all picks (FairFace frames first) and keep one row per image_id
subset = pd.concat(picked_ff + picked_utk, ignore_index=True).drop_duplicates('image_id')
print('Subset after initial sampling:', subset.shape)

## 6) Top up if short per source or overall

In [None]:
def top_up(source_name, need, already):
    """Draw up to `need` not-yet-selected rows of one source from `pool`.

    Parameters
    ----------
    source_name : value compared against ``pool['source_dataset']``.
    need : number of extra rows wanted; values <= 0 yield an empty frame.
    already : DataFrame of rows that are already selected. Their image ids are
        read from an ``'image_id'`` column when one exists, otherwise from the
        index (e.g. a frame produced by ``set_index('image_id')``).

    Returns
    -------
    DataFrame with the same columns as `pool` and at most `need` rows, taken
    from a shuffle of the eligible rows done with ``random_state=SEED``.
    """
    if need <= 0:
        # An empty slice of pool keeps the columns (including image_id)
        # identical to what the non-empty path returns.
        return pool.iloc[:0].copy()

    # Accept both an image_id-indexed frame and a plain frame with the column,
    # so an un-indexed argument is not silently compared against row numbers.
    taken_ids = already['image_id'] if 'image_id' in already.columns else already.index

    candidates = pool[pool.source_dataset == source_name]
    candidates = candidates[~candidates['image_id'].isin(taken_ids)]
    candidates = candidates.sample(frac=1.0, random_state=SEED)  # shuffle
    take = min(int(need), len(candidates))
    return candidates.iloc[:take].copy()

def count_source(df, name):
    """Count the rows of `df` whose `source_dataset` value equals `name`."""
    matches = df.source_dataset == name
    return matches.sum()

# N_ff / N_utk are the per-source targets set in the "Decide per-source totals"
# cell. They are reused here rather than recomputed so the two cells cannot
# drift apart.
need_ff  = N_ff  - count_source(subset, 'FairFace')
need_utk = N_utk - count_source(subset, 'UTKFace')

# Top up whichever source is below its target, FairFace first. Both shortfalls
# are measured up front: top_up only returns rows of the requested source, so
# topping up one source does not change the other's count.
for source_name, need in (('FairFace', need_ff), ('UTKFace', need_utk)):
    if need > 0:
        extra = top_up(source_name, need, subset.set_index('image_id'))
        subset = pd.concat([subset, extra], ignore_index=True).drop_duplicates('image_id')

# Fill to TARGET_TOTAL from any source if still short
still_short = TARGET_TOTAL - len(subset)
if still_short > 0:
    remain = pool[~pool['image_id'].isin(subset['image_id'])]
    fill = remain.sample(min(still_short, len(remain)), random_state=SEED)
    subset = pd.concat([subset, fill], ignore_index=True).drop_duplicates('image_id')

# Trim if overshot (possible when one source was already above its target
# while the other one got topped up)
if len(subset) > TARGET_TOTAL:
    subset = subset.sample(n=TARGET_TOTAL, random_state=SEED)

# Report the final composition, using the count_source helper defined earlier
# in this cell
ff_ct  = count_source(subset, 'FairFace')
utk_ct = count_source(subset, 'UTKFace')
print(f'✅ Subset size: {len(subset)}  | FairFace: {ff_ct}  | UTKFace: {utk_ct}')

## 7) Train/Val/Test stratified split (by race) and save

In [None]:
# 70% goes to train; the held-out 30% is then halved into val and test.
# Both cuts are stratified on race_cat and use the same seed.
train, temp = train_test_split(
    subset,
    test_size=0.30,
    stratify=subset['race_cat'],
    random_state=SEED,
)
val, test = train_test_split(
    temp,
    test_size=0.50,
    stratify=temp['race_cat'],
    random_state=SEED,
)

def tag(df, name):
    """Return a copy of `df` with a 'split' column set to `name` on every row."""
    tagged = df.copy()
    tagged['split'] = name
    return tagged

# Stack the three tagged splits into one table (train rows first, then val, then test)
master = pd.concat([tag(train,'train'), tag(val,'val'), tag(test,'test')], ignore_index=True)
out_path = 'data/splits/master.csv'
# to_csv does not create missing folders, so make sure data/splits/ exists first
Path(out_path).parent.mkdir(parents=True, exist_ok=True)
master.to_csv(out_path, index=False)
print('✅', out_path, 'written.')
master.head()

## 8) Diagnostics

In [None]:
# Rows per source dataset in the saved table
source_counts = master['source_dataset'].value_counts()
print('\nCounts by source in master:')
print(source_counts)

# Rows per race in the saved table
race_counts = master['race_cat'].value_counts()
print('\nCounts by race in master:')
print(race_counts)

# Race-by-split table of image counts; combinations with no rows show as 0
race_by_split = (
    master.pivot_table(index='race_cat', columns='split', values='image_id', aggfunc='count')
          .fillna(0)
          .astype(int)
)
print('\nRace × Split counts:')
print(race_by_split)