# Build Combined Dataset Pool
_Auto-generated on 2025-11-08 05:52:19_

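This notebook combines the UTKFace and FairFace datasets into a single unified metadata CSV file (`data/pool.csv`, written relative to the notebook's working directory). For UTKFace it parses age, gender, and race from each image filename and reads the image dimensions; for FairFace it reads the same attributes from the label CSVs. Both datasets are then given a consistent set of columns before being concatenated.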

## Imports and Setup

In [None]:
import os
import pandas as pd
from PIL import Image


## 1) Process UTKFace Dataset

In [None]:
# Build one metadata record per UTKFace image by parsing the attributes that
# are encoded in the filename and reading the image dimensions from disk.
utk_dir = '../Data/UTKFace'
utk_records = []
# Integer codes found in the filename -> label strings stored in the pool.
race_map = {0:'White', 1:'Black', 2:'EastAsian', 3:'SouthAsian', 4:'Other'}
gender_map = {0:'Male', 1:'Female'}

# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# of the resulting frame (and of pool.csv) stable between runs and machines.
for f in sorted(os.listdir(utk_dir)):
    if not f.endswith('.jpg'):
        continue
    # The first three underscore-separated fields are read as age, gender, race;
    # names with fewer than four fields are skipped as malformed.
    parts = f.split('_')
    if len(parts) < 4:
        continue
    try:
        age, gender, race = int(parts[0]), int(parts[1]), int(parts[2])
    except ValueError:
        # Non-numeric attribute fields: skip the file.
        continue
    # Codes missing from the maps yield None rather than dropping the row.
    race_cat, gender_cat = race_map.get(race), gender_map.get(gender)
    try:
        with Image.open(os.path.join(utk_dir, f)) as im:
            w, h = im.size
    except Exception:
        # Best effort: an unreadable or corrupt image keeps its row but has no
        # dimensions. Catching Exception (not a bare except) still lets
        # KeyboardInterrupt / SystemExit stop this long-running loop.
        w = h = None
    utk_records.append(dict(
        image_id=f'UTK_{os.path.splitext(f)[0]}',
        source_dataset='UTKFace',
        rel_path=f'UTKFace/{f}',
        race_cat=race_cat,
        gender_cat=gender_cat,
        age_num=age,
        width=w, height=h
    ))

utk_df = pd.DataFrame(utk_records)
print('UTKFace records:', len(utk_df))
utk_df.head()

## 2) Process FairFace Dataset

In [None]:
# Load the FairFace label CSVs and reshape them to the pool's column names.
csvs = [
    '../Data/FairFace/fairface_label_train.csv',
    '../Data/FairFace/fairface_label_val.csv',
    '../Data/FairFace/fairface_label_test.csv'
]

ff = pd.concat([pd.read_csv(c) for c in csvs], ignore_index=True)
ff = ff.rename(columns={'file':'rel_path','age':'age_cat','gender':'gender_cat','race':'race_cat'})
ff['source_dataset'] = 'FairFace'
# Derive the id from the whole relative path, not just the basename: if the
# label files list paths such as train/1.jpg and val/1.jpg (presumed layout --
# verify against your copy), a basename-only id would give both rows 'FF_1'.
# Paths without a folder component produce the same id as before.
ff['image_id'] = ff['rel_path'].apply(
    lambda x: 'FF_' + os.path.splitext(x)[0].replace('\\', '/').replace('/', '_')
)
ff['rel_path'] = ff['rel_path'].apply(lambda x: f'FairFace/{x}')
# Columns that the UTKFace records carry but these label files do not supply;
# left empty so both frames expose them.
ff['age_num'] = None
ff['width'] = None
ff['height'] = None
print('FairFace records:', len(ff))
ff.head()

## 3) Combine and Save Pool

In [None]:
# Stack both datasets into one frame and write it out as the pool CSV.
pool = pd.concat([utk_df, ff], ignore_index=True)

out_path = 'data/pool.csv'
# to_csv() does not create missing parent directories, so make sure it exists.
os.makedirs(os.path.dirname(out_path), exist_ok=True)
pool.to_csv(out_path, index=False)
print(f'✅ {out_path} written with {len(pool)} rows.')
# Preview a few rows; clamp the sample size so a tiny pool does not raise, and
# fix the seed so the displayed preview is the same on every run.
pool.sample(min(5, len(pool)), random_state=0)