# Build Combined Dataset Pool
_Auto-generated on 2025-11-08 05:52:19_

This notebook combines UTKFace and FairFace datasets into a single unified metadata CSV file (`data/pool.csv`). It extracts image attributes from filenames and FairFace CSVs, and ensures consistent column structure across datasets.

## Imports and Setup

In [1]:
import os
import pandas as pd
from PIL import Image


## 1) Process UTKFace Dataset

In [2]:
utk_dir = '../Data/UTKFace'
utk_records = []
race_map = {0:'White', 1:'Black', 2:'EastAsian', 3:'SouthAsian', 4:'Other'}
gender_map = {0:'Male', 1:'Female'}

for f in os.listdir(utk_dir):
    if not f.endswith('.jpg'): continue
    parts = f.split('_')
    if len(parts) < 4: continue
    try:
        age, gender, race = int(parts[0]), int(parts[1]), int(parts[2])
    except ValueError:
        continue
    race_cat, gender_cat = race_map.get(race), gender_map.get(gender)
    try:
        with Image.open(os.path.join(utk_dir,f)) as im:
            w,h = im.size
    except:
        w=h=None
    utk_records.append(dict(
        image_id=f'UTK_{os.path.splitext(f)[0]}',
        source_dataset='UTKFace',
        rel_path=f'UTKFace/{f}',
        race_cat=race_cat,
        gender_cat=gender_cat,
        age_num=age,
        width=w, height=h
    ))

utk_df = pd.DataFrame(utk_records)
print('UTKFace records:', len(utk_df))
utk_df.head()

UTKFace records: 0


## 2) Process FairFace Dataset

In [3]:
csvs = [
    '../Data/FairFace/train_labels.csv',
    '../Data/FairFace/val_labels.csv',
]

ff = pd.concat([pd.read_csv(c) for c in csvs], ignore_index=True)
ff.rename(columns={'file':'rel_path','age':'age_cat','gender':'gender_cat','race':'race_cat'}, inplace=True)
ff['source_dataset'] = 'FairFace'
ff['image_id'] = ff['rel_path'].apply(lambda x: 'FF_' + os.path.splitext(os.path.basename(x))[0])
ff['rel_path'] = ff['rel_path'].apply(lambda x: f'FairFace/{x}')
print('FairFace records:', len(ff))
ff.head()

FairFace records: 97698


Unnamed: 0,rel_path,age_cat,gender_cat,race_cat,service_test,source_dataset,image_id
0,FairFace/train/1.jpg,50-59,Male,East Asian,True,FairFace,FF_1
1,FairFace/train/2.jpg,30-39,Female,Indian,False,FairFace,FF_2
2,FairFace/train/3.jpg,3-9,Female,Black,False,FairFace,FF_3
3,FairFace/train/4.jpg,20-29,Female,Indian,True,FairFace,FF_4
4,FairFace/train/5.jpg,20-29,Female,Indian,True,FairFace,FF_5


## 3) Combine and Save Pool

In [4]:
pool = pd.concat([utk_df, ff], ignore_index=True)
pool.to_csv('../Data/pool.csv', index=False)
print(f'✅ data/pool.csv written with {len(pool)} rows.')
pool.sample(5)

✅ data/pool.csv written with 97698 rows.


Unnamed: 0,rel_path,age_cat,gender_cat,race_cat,service_test,source_dataset,image_id
44031,FairFace/train/44032.jpg,30-39,Male,Latino_Hispanic,False,FairFace,FF_44032
32222,FairFace/train/32223.jpg,3-9,Male,Black,False,FairFace,FF_32223
8487,FairFace/train/8488.jpg,20-29,Female,White,False,FairFace,FF_8488
13446,FairFace/train/13447.jpg,20-29,Female,Latino_Hispanic,True,FairFace,FF_13447
56397,FairFace/train/56398.jpg,10-19,Female,Middle Eastern,True,FairFace,FF_56398
