# Dataset Subset Builder
_Auto-generated on 2025-11-08 05:49:01_

This notebook loads a combined metadata pool (`data/pool.csv`) for UTKFace and FairFace, normalizes label taxonomies, applies simple quality filters, and samples a stratified subset with frozen train/val/test splits.

## Requirements & Files
- Input: `data/pool.csv` with columns:
  `image_id, source_dataset, rel_path, race_cat, gender_cat, age_num, width, height`
- Output: `data/splits/master.csv` with the final split assignments.

Adjust paths as needed for your environment.

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


## 1) Load combined pool

In [None]:
# Load the combined UTKFace + FairFace metadata pool.
# Every column listed here is read by a later cell (resolution filter,
# stratified sampling, or the final export), so fail fast with a clear message
# if one is absent instead of hitting a bare KeyError several cells later.
required_cols = [
    'image_id', 'source_dataset', 'rel_path', 'race_cat',
    'gender_cat', 'age_num', 'width', 'height',
]
pool = pd.read_csv('data/pool.csv')
missing_cols = [col for col in required_cols if col not in pool.columns]
if missing_cols:
    raise ValueError(f'data/pool.csv is missing required columns: {missing_cols}')
print('Loaded pool with shape:', pool.shape)
pool.head()

## 2) Normalize labels (unify race taxonomy)

In [None]:
# Collapse the UTKFace and FairFace race labels into one shared taxonomy.
# Labels that already use the target spelling (e.g. 'White', 'Black') need no
# entry: Series.replace leaves values without a mapping untouched.
# NOTE(review): 'East Asian' and 'Southeast Asian' are the spellings used in
# FairFace's published label files; confirm against how pool.csv was built.
race_map = {
    'Asian': 'EastAsian',
    'East Asian': 'EastAsian',
    'Indian': 'SouthAsian',
    'SEAsian': 'SoutheastAsian',
    'Southeast Asian': 'SoutheastAsian',
    'Middle Eastern': 'MiddleEastern',
    'Latino_Hispanic': 'Latino',
}
pool['race_cat'] = pool['race_cat'].replace(race_map)

# Age bins. pd.cut uses right-closed intervals by default, so the edges below
# give (-1, 17], (17, 29], (29, 44], (44, 59], (59, 200]; ages outside that
# range (or missing) get a NaN age_bin.
bins = [-1, 17, 29, 44, 59, 200]
labels = ['0-17', '18-29', '30-44', '45-59', '60+']
pool['age_bin'] = pd.cut(pool['age_num'], bins=bins, labels=labels)
pool[['race_cat', 'gender_cat', 'age_num', 'age_bin']].head()

## 3) Quality filter (min resolution 128×128)

In [None]:
# Keep only images that are at least 128 px on both sides, and report how many
# rows survived the filter.
before = len(pool)
big_enough = pool['width'].ge(128) & pool['height'].ge(128)
pool = pool.loc[big_enough].copy()
after = len(pool)
print(f'Filtered from {before} to {after} rows (min 128×128).')

## 4) Quotas & stratified sampling by race → gender → age_bin

In [None]:
# Overall subset size and the race categories that receive a quota.
target_total = 900
races = [
    'White',
    'Black',
    'EastAsian',
    'SouthAsian',
    'SoutheastAsian',
    'MiddleEastern',
    'Latino',
]
# Equal share per race, rounded down (900 // 7 == 128).
per_race = target_total // len(races)

def sample_strata(df_race, per_race, seed=42):
    """Pick up to ``per_race`` rows from ``df_race``, spread over gender and age bin.

    The quota is divided between the 'Male' and 'Female' values present in
    ``gender_cat`` (or treated as one group when that column is absent or all
    missing), and each gender's share is divided across the age bins present
    for that gender. Shortfalls are topped up first within the gender, then
    from any unused row of ``df_race``; an overshoot is randomly trimmed back
    to ``per_race``.

    Args:
        df_race: Rows to sample from; must have an ``age_bin`` column.
        per_race: Maximum number of rows to return.
        seed: ``random_state`` passed to every ``DataFrame.sample`` call.

    Returns:
        A DataFrame with at most ``per_race`` rows taken from ``df_race``.
    """

    def draw(frame, k):
        # Sample k rows, or the whole frame when it holds fewer than k.
        return frame.sample(min(k, len(frame)), random_state=seed)

    def top_up(chosen, source, quota):
        # Extend `chosen` towards `quota` using rows of `source` not yet chosen.
        deficit = quota - len(chosen)
        if deficit <= 0:
            return chosen
        unused = source.drop(chosen.index)
        extra = min(deficit, len(unused))
        if extra <= 0:
            return chosen
        return pd.concat([chosen, unused.sample(extra, random_state=seed)])

    # Decide which gender groups to stratify over.
    if 'gender_cat' in df_race and df_race['gender_cat'].notna().any():
        seen = df_race['gender_cat'].unique()
        genders = [g for g in ('Male', 'Female') if g in seen]
    else:
        genders = [None]  # single pseudo-group covering every row

    per_gender = int(np.ceil(per_race / max(len(genders), 1)))

    parts = []
    for gender in genders:
        rows_g = df_race if gender is None else df_race[df_race['gender_cat'] == gender]
        if rows_g.empty:
            continue

        age_bins = rows_g['age_bin'].dropna().unique().tolist()
        if not age_bins:
            # No usable age information: sample the gender group directly.
            parts.append(draw(rows_g, per_gender))
            continue

        # At least one row per bin, even when bins outnumber the gender quota.
        per_bin = max(1, per_gender // len(age_bins))
        draws = []
        for age_bin in age_bins:
            rows_b = rows_g[rows_g['age_bin'] == age_bin]
            if not rows_b.empty:
                draws.append(draw(rows_b, per_bin))

        chosen = pd.concat(draws) if draws else rows_g.head(0)
        parts.append(top_up(chosen, rows_g, per_gender))

    result = pd.concat(parts) if parts else df_race.head(0)
    result = top_up(result, df_race, per_race)

    # Rounding up per gender / per bin can overshoot the race quota.
    if len(result) > per_race:
        result = result.sample(per_race, random_state=seed)

    return result

# Sample each target race independently, then stack the per-race picks.
batches = []
for r in races:
    df_r = pool[pool['race_cat'] == r]
    if df_r.empty:
        print(f'[WARN] No rows for race {r}; skipping.')
        continue
    batches.append(sample_strata(df_r, per_race, seed=42))

# pd.concat on an empty list raises an opaque "No objects to concatenate";
# raise the same exception type with a message that points at the likely cause.
if not batches:
    raise ValueError(
        'No rows matched any target race; check that race_cat was normalized '
        'to the labels listed in `races`.'
    )

# reset_index(drop=True) renumbers the rows 0..n-1, so after this point
# subset's index no longer corresponds to pool's index labels.
subset = pd.concat(batches).drop_duplicates(subset=['image_id']).reset_index(drop=True)
print('Subset shape after race-based sampling:', subset.shape)


## 5) Top-up if total < target (fill from leftovers)

In [None]:
# If the per-race quotas did not reach target_total, fill the gap with random
# rows from the part of the pool that has not been selected yet.
if len(subset) < target_total:
    need = target_total - len(subset)
    # subset was re-indexed 0..n-1 when it was built, so its index labels do
    # not identify pool rows; exclude already-selected images by image_id.
    leftovers = pool[~pool['image_id'].isin(subset['image_id'])]
    # One candidate row per image so the requested count is not eroded by the
    # de-duplication below.
    leftovers = leftovers.drop_duplicates(subset=['image_id'])
    fill = leftovers.sample(min(need, len(leftovers)), random_state=42)
    subset = pd.concat([subset, fill]).drop_duplicates(subset=['image_id']).reset_index(drop=True)
print('Final subset size:', len(subset))

## 6) Train/Val/Test split (stratify by race where possible)

In [None]:
def _split_by_race(df, test_size, seed=42):
    """Split ``df`` in two, stratifying on ``race_cat`` when that is possible.

    train_test_split raises ValueError when stratification cannot be honoured
    (for example a race with a single row); in that case fall back to a plain
    random split with the same seed so the notebook still produces splits.
    """
    try:
        return train_test_split(
            df, test_size=test_size, random_state=seed, stratify=df['race_cat']
        )
    except ValueError as err:
        print(f'[WARN] Stratified split not possible ({err}); using an unstratified split.')
        return train_test_split(df, test_size=test_size, random_state=seed)


# 70% train, then the remaining 30% is halved into val and test (15% each).
train, temp = _split_by_race(subset, 0.30)
val, test = _split_by_race(temp, 0.50)


def tag(df, name):
    """Return a copy of ``df`` with a ``split`` column set to ``name``."""
    df = df.copy()
    df['split'] = name
    return df


master = pd.concat([tag(train, 'train'), tag(val, 'val'), tag(test, 'test')], ignore_index=True)
master_path = 'data/splits/master.csv'
# to_csv does not create missing parent directories, so make sure it exists.
Path(master_path).parent.mkdir(parents=True, exist_ok=True)
master[['image_id', 'source_dataset', 'rel_path', 'race_cat', 'gender_cat', 'age_num', 'age_bin', 'split']].to_csv(master_path, index=False)
print('Saved:', master_path)
master.head()

## 7) Quick diagnostics

In [None]:
# Number of rows assigned to each split.
print('\nCounts by split:')
split_sizes = master['split'].value_counts()
print(split_sizes)

# Race-by-split table of image counts; combinations with no rows show as 0.
print('\nCounts by race within split:')
race_by_split = master.pivot_table(
    index='race_cat',
    columns='split',
    values='image_id',
    aggfunc='count',
)
race_by_split = race_by_split.fillna(0).astype(int)
print(race_by_split)
