In [1]:
import os, pandas as pd
from PIL import Image

In [2]:
# Location of the UTKFace image folder, relative to the notebook.
utk_dir = "../data/UTKFace/UTKFace"

# Collects one dict per image; turned into a DataFrame once the scan is done.
utk_records = list()

# Integer race code -> category label.
race_map = {
    0: 'White',
    1: 'Black',
    2: 'EastAsian',
    3: 'SouthAsian',
    4: 'Other',
}

# Integer gender code -> category label.
gender_map = {
    0: 'Male',
    1: 'Female',
}

In [3]:
# Build one metadata record per UTKFace image.
# File names are parsed as "<age>_<gender>_<race>_<rest>.jpg"; the first three
# "_"-separated fields must be integers, everything after them is ignored.

# Start from an empty list so that re-running this cell does not append a
# second copy of every row to a list left over from a previous run.
utk_records = []

# NOTE: os.listdir returns entries in arbitrary order, so the row order of
# utk_df may differ between machines/filesystems.
for f in os.listdir(utk_dir):
    if not f.endswith(".jpg"):
        continue
    parts = f.split('_')
    if len(parts) < 4:
        # Too few fields to hold age, gender, race and a trailing suffix.
        continue
    try:
        age, gender, race = int(parts[0]), int(parts[1]), int(parts[2])
    except ValueError:
        # One of the label fields is not an integer; skip the file.
        continue

    # Codes that are not in the lookup tables become None instead of raising.
    race_cat, gender_cat = race_map.get(race), gender_map.get(gender)

    try:
        with Image.open(os.path.join(utk_dir, f)) as im:
            w, h = im.size
    except OSError:
        # Missing, unreadable or unidentifiable image (Pillow reports these as
        # OSError subclasses): keep the row but leave the size empty.
        # Anything else (e.g. KeyboardInterrupt, NameError) is allowed to
        # propagate instead of being silently swallowed.
        w = h = None

    utk_records.append(dict(
        image_id=f"UTK_{os.path.splitext(f)[0]}",
        source_dataset="UTKFace",
        rel_path=f"UTKFace/{f}",
        race_cat=race_cat,
        gender_cat=gender_cat,
        age_num=age,
        width=w, height=h
    ))

utk_df = pd.DataFrame(utk_records)
utk_df.head()

Unnamed: 0,image_id,source_dataset,rel_path,race_cat,gender_cat,age_num,width,height
0,UTK_9_1_2_20161219204347420.jpg.chip,UTKFace,UTKFace/9_1_2_20161219204347420.jpg.chip.jpg,EastAsian,Female,9,200,200
1,UTK_36_0_1_20170117163203851.jpg.chip,UTKFace,UTKFace/36_0_1_20170117163203851.jpg.chip.jpg,Black,Male,36,200,200
2,UTK_86_1_0_20170120225751953.jpg.chip,UTKFace,UTKFace/86_1_0_20170120225751953.jpg.chip.jpg,White,Female,86,200,200
3,UTK_26_1_0_20170116171048641.jpg.chip,UTKFace,UTKFace/26_1_0_20170116171048641.jpg.chip.jpg,White,Female,26,200,200
4,UTK_1_1_2_20161219154612988.jpg.chip,UTKFace,UTKFace/1_1_2_20161219154612988.jpg.chip.jpg,EastAsian,Female,1,200,200


In [4]:
# Load the FairFace train/val label sheets and reshape them to the pool schema
# (image_id, source_dataset, rel_path, *_cat, age_num, width, height).
csvs = ["../data/FairFace/train_labels.csv",
        "../data/FairFace/val_labels.csv"]


def _fairface_image_id(file_entry):
    """Return the pool image id for one entry of the FairFace ``file`` column.

    The directory part of the entry is kept in the id (path separators become
    "_"), e.g. "train/1.jpg" -> "FF_train_1" and "val/1.jpg" -> "FF_val_1".
    Using only the base name would map both of those to "FF_1", so ids would
    not be unique once the train and val sheets are concatenated.
    An entry without a directory part ("1.jpg") still yields "FF_1".
    """
    stem = os.path.splitext(file_entry)[0]
    return 'FF_' + stem.replace('\\', '/').replace('/', '_')


ff = pd.concat([pd.read_csv(c) for c in csvs], ignore_index=True)
ff = ff.rename(columns={'file': 'rel_path', 'age': 'age_cat',
                        'gender': 'gender_cat', 'race': 'race_cat'})
ff['source_dataset'] = 'FairFace'

# Derive the id from the raw `file` value *before* rel_path gets its
# "FairFace/" prefix on the next line.
ff['image_id'] = ff['rel_path'].apply(_fairface_image_id)
ff['rel_path'] = ff['rel_path'].apply(lambda x: f'FairFace/{x}')

# FairFace rows have no numeric age or image size here; the columns are added
# empty so the frame lines up with the UTKFace one.
ff['age_num'] = None
ff['width'] = None
ff['height'] = None

# NOTE(review): FairFace category labels are kept exactly as they appear in the
# CSVs — confirm they line up with the UTKFace labels before grouping on
# race_cat / gender_cat across datasets.


In [5]:
# Stack the UTKFace and FairFace frames into a single pool with a fresh
# 0..n-1 index, then persist it next to the source data.
frames = [utk_df, ff]
pool = pd.concat(frames, ignore_index=True)

# index=False: the positional index carries no information worth saving.
pool.to_csv("../data/pool.csv", index=False)

n_rows = len(pool)
print("✅ data/pool.csv written with", n_rows, "rows.")

✅ data/pool.csv written with 121403 rows.
