# Age and Gender classification
## Label Alignment and Merging Datasets

In [1]:
import os 
import sys
sys.path
sys.path.append('..')
import pandas as pd
from utils import gender_labels_UTK, race_labels_UTK, age_group_labels_UTK, race_labels_FairFace, gender_labels_FairFace, age_group_labels_Fairface 
from utils import gather_utkface_data, gather_fairface_data
from utils import map_fairface_to_utk,  map_age_to_group, copy_files

### Label Alignment

In [2]:
# Show the gender label dictionaries of both datasets, one per line, to
# confirm they use the same encoding.
print(gender_labels_UTK, gender_labels_FairFace, sep='\n')

{0: 'Male', 1: 'Female'}
{0: 'Male', 1: 'Female'}


In [3]:
# Show the age-group label dictionaries of both datasets, one per line, to
# confirm the buckets line up.
print(age_group_labels_UTK, age_group_labels_Fairface, sep='\n')

{0: '0-2', 1: '3-9', 2: '10-19', 3: '20-29', 4: '30-39', 5: '40-49', 6: '50-59', 7: '60-69', 8: '70+'}
{0: '0-2', 1: '3-9', 2: '10-19', 3: '20-29', 4: '30-39', 5: '40-49', 6: '50-59', 7: '60-69', 8: '70+'}


In [4]:
# Show the race label dictionaries of both datasets, one per line. These are
# the labels that differ between datasets and need an explicit mapping.
print(race_labels_UTK, race_labels_FairFace, sep='\n')

{0: 'White', 1: 'Black', 2: 'Asian', 3: 'Indian', 4: 'Others'}
{0: 'East Asian', 1: 'Indian', 2: 'Black', 3: 'White', 4: 'Middle Eastern', 5: 'Latino_Hispanic', 6: 'Southeast Asian'}


Loading the data and gathering labels for each dataset.

In [5]:
# UTKFace: image folder (relative to the notebook) and its gathered DataFrame.
utk_folder = '../data/cleaned_dataset_UTKFace'
utk_df = gather_utkface_data(utk_folder)

# FairFace: image folder (relative to the notebook) and its gathered DataFrame.
fairface_folder = '../data/FairFace/processed_images'
fairface_df = gather_fairface_data(fairface_folder)

In [6]:
# Add a 'race_mapped' column holding each FairFace race value passed through
# map_fairface_to_utk, so it can later be merged with UTKFace's 'race' column.
# NOTE(review): the exact category mapping lives in utils.map_fairface_to_utk.
fairface_df['race_mapped'] = fairface_df['race'].map(map_fairface_to_utk)

In [7]:
utk_df['age_group'] = utk_df['age'].apply(map_age_to_group)

In [8]:
# Stack UTKFace on top of the four label columns taken from FairFace.
# Columns present in only one frame ('race'/'age' for UTKFace, 'race_mapped'
# for FairFace) are filled with NaN for the other dataset's rows.
merged_df = pd.concat(
    objs=[
        utk_df,
        fairface_df.loc[:, ['file_path', 'gender', 'race_mapped', 'age_group']],
    ],
    ignore_index=True,
)

In [9]:
# Preview ten random rows of the merged frame. The fixed random_state keeps
# the displayed sample identical across "Restart Kernel -> Run All" runs.
merged_df.sample(10, random_state=42)

Unnamed: 0,file_path,gender,race,age,age_group,race_mapped
82406,../data/FairFace/processed_images\processed_0_...,1,,,3,4.0
58573,../data/FairFace/processed_images\processed_0_...,0,,,5,4.0
108272,../data/FairFace/processed_images\processed_0_...,1,,,4,1.0
68650,../data/FairFace/processed_images\processed_0_...,1,,,6,1.0
76205,../data/FairFace/processed_images\processed_0_...,1,,,3,4.0
3909,../data/cleaned_dataset_UTKFace\23_1_1_2017011...,1,1.0,23.0,3,
38693,../data/FairFace/processed_images\processed_0_...,0,,,0,0.0
85250,../data/FairFace/processed_images\processed_0_...,1,,,3,3.0
30656,../data/FairFace/processed_images\processed_0_...,1,,,2,4.0
97997,../data/FairFace/processed_images\processed_0_...,1,,,4,1.0


In [10]:
# UTKFace rows already have 'race'; FairFace rows only have 'race_mapped'.
# Fill the missing 'race' values from 'race_mapped' so one column covers both.
merged_df['race'] = merged_df['race'].fillna(merged_df['race_mapped'])

# Keep only the columns shared by both datasets after alignment.
merged_aligned_df = merged_df.loc[:, ['file_path', 'gender', 'race', 'age_group']]

Checking for missing values.

In [14]:
# Count missing entries per column; every count should be zero before the
# label columns are cast to integers.
na_counts = merged_aligned_df.isnull().sum()
print(na_counts)

file_path    0
gender       0
race         0
age_group    0
dtype: int64


In [20]:
# Cast the three label columns to integers in one pass. astype returns a new
# DataFrame, so the earlier column selection is not modified through a view.
# 'race' in particular shows up as float after the NaN-filled concat.
merged_aligned_df = merged_aligned_df.astype(
    {'gender': int, 'race': int, 'age_group': int}
)

print(merged_aligned_df.dtypes)

file_path    object
gender        int32
race          int32
age_group     int32
dtype: object


In [21]:
# Display the first rows of the aligned frame. A bare last expression uses the
# notebook's rich table rendering instead of print()'s plain-text repr.
merged_aligned_df.head()

                                           file_path  gender  race  age_group
0  ../data/cleaned_dataset_UTKFace\100_0_0_201701...       0     0          8
1  ../data/cleaned_dataset_UTKFace\100_0_0_201701...       0     0          8
2  ../data/cleaned_dataset_UTKFace\100_1_0_201701...       1     0          8
3  ../data/cleaned_dataset_UTKFace\100_1_0_201701...       1     0          8
4  ../data/cleaned_dataset_UTKFace\100_1_0_201701...       1     0          8


### Making Copy of Final Merged Dataset

In [22]:
# Target folder for the merged dataset, relative to the notebook.
destination_dir = '../data/merged/'

# Create the folder (and any missing parents); do nothing if it already exists.
os.makedirs(name=destination_dir, exist_ok=True)

# Hand the aligned frame and the target folder to the project's copy helper.
# NOTE(review): how files are named or de-duplicated is decided in utils.copy_files.
copy_files(merged_aligned_df, destination_dir)