## 1. Balance Age groups in YLFW (undersampling)

In [7]:
import pandas as pd
import seaborn as sns
sns.set()
import random
random.seed(42)


### 1.1 Young Group

In [8]:
y_df = pd.read_csv('../data/raw_full/YLFW/raw_ylfw_df.csv')

FileNotFoundError: [Errno 2] No such file or directory: '../data/raw_full/YLFW/raw_ylfw_df.csv'

In [52]:
y_df[y_df.children_agegroup == "7-9"].groupby("im_id").agg({'img_name': ['count']}).describe()

Unnamed: 0_level_0,img_name
Unnamed: 0_level_1,count
count,955.0
mean,1.732984
std,1.13682
min,1.0
25%,1.0
50%,1.0
75%,2.0
max,11.0


In [3]:
len(y_df)

9229

In [4]:
y_df.Age.describe()

count    9229.000000
mean        8.412829
std         7.504623
min         0.000000
25%         3.000000
50%         6.000000
75%        11.000000
max        42.000000
Name: Age, dtype: float64

In [5]:
y_df["children_agegroup"].value_counts()

children_agegroup
0-3      2665
4-6      2068
7-9      1655
10-12     923
13-15     459
16-18     383
Name: count, dtype: int64

### 1.2 Adult Group

In [6]:
a_df = pd.read_csv('../data/raw_full/RLFW/raw_rfw_df.csv')

In [7]:
len(a_df)

40411

In [8]:
a_df.Age.value_counts()

Age
26    1638
27    1622
25    1609
28    1505
24    1476
      ... 
2        2
3        2
85       2
88       1
90       1
Name: count, Length: 87, dtype: int64

In [9]:
# Add childrens age group to adult data
# Ensure 'Age' is of type integer
a_df['Age'] = a_df['Age'].astype(int)

# Define the age groups and corresponding labels
age_bins = [0, 4, 7, 10, 13, 16, 19]  # adding one more bin for 16-18
age_labels = ['0-3', '4-6', '7-9', '10-12', '13-15', '16-18']

# Update the DataFrame with age groupings
a_df['children_agegroup'] = pd.cut(a_df['Age'], bins=age_bins, labels=age_labels, right=False)
a_df["children_agegroup"].value_counts()


children_agegroup
16-18    452
13-15     92
10-12     82
7-9       44
4-6       24
0-3        4
Name: count, dtype: int64

In [None]:
a_df["children_agegroup"].isnsa

### 3. Transfer childrens age groups in a_df to y_df

In [10]:
a_data_children_agegroup = a_df[a_df.children_agegroup.notnull()]

In [63]:
a_data_children_agegroup.groupby("im_id").agg({'img_name': ['count']}).value_counts()

(img_name, count)
1                    263
2                     81
3                     49
4                     26
5                      3
7                      1
Name: count, dtype: int64

In [12]:
# Merge to ylfw
ylfw_witha = pd.concat([y_df,a_data_children_agegroup],ignore_index=True)
ylfw_witha[-10:]

Unnamed: 0,img_name,Age,ethnicity,id_class,children_agegroup,im_id,img_org_name,face_id
9917,m.02qpxkm,18,Asian,02qpxkm,16-18,02qpxkm,m.02qpxkm_0004,4.0
9918,m.01m49_z,14,Asian,01m49_z,13-15,01m49_z,m.01m49_z_0001,1.0
9919,m.0sgqn7h,9,Asian,0sgqn7h,7-9,0sgqn7h,m.0sgqn7h_0003,3.0
9920,m.0sgqn7h,17,Asian,0sgqn7h,16-18,0sgqn7h,m.0sgqn7h_0002,2.0
9921,m.0knwxrc,16,Asian,0knwxrc,16-18,0knwxrc,m.0knwxrc_0001,1.0
9922,m.0d5060,18,Asian,0d5060,16-18,0d5060,m.0d5060_0004,4.0
9923,m.0d5060,18,Asian,0d5060,16-18,0d5060,m.0d5060_0002,2.0
9924,m.05mz42q,18,Asian,05mz42q,16-18,05mz42q,m.05mz42q_0002,2.0
9925,m.0rpfv5x,17,Asian,0rpfv5x,16-18,0rpfv5x,m.0rpfv5x_0004,4.0
9926,m.048t9m,18,Asian,048t9m,16-18,048t9m,m.048t9m_0002,2.0


### 4. Undersample based on the minority class in the children's age group in YLFW
- Keep racial distribution as in minority class
- Delete based on IDs 

Distribution of age groups - based on IDS (obs be aware that each id have images in different age groups). Therefore we sample based on number of images per age group. 


In [13]:
# Standard deviation of age groups per identity
ylfw_witha.groupby("im_id").agg({'Age': ['min', 'max','std', 'count']}).sort_values(by=('Age', 'std'), ascending=False).head(5)

Unnamed: 0_level_0,Age,Age,Age,Age
Unnamed: 0_level_1,min,max,std,count
im_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Indian_87,0,30,21.213203,2
Asian_620,2,29,19.091883,2
Asian_570,2,35,18.502252,3
Asian_96,0,26,18.384776,2
African_44,6,32,18.384776,2


In [14]:
# Distribution of IDs per age group
ylfw_witha.groupby('children_agegroup').im_id.nunique().sort_values(ascending=False)

children_agegroup
4-6      1169
0-3      1123
7-9       987
10-12     670
16-18     650
13-15     428
Name: im_id, dtype: int64

In [15]:
# Distribution of age groups - based on number of images per age group
ylfw_witha.children_agegroup.value_counts().sort_values(ascending=False)

children_agegroup
0-3      2669
4-6      2092
7-9      1699
10-12    1005
16-18     835
13-15     551
Name: count, dtype: int64

In [16]:
# Take minority age group BASED ON N_IMAGES
min_agegroup = ylfw_witha.groupby('children_agegroup').img_name.count().sort_values(ascending=False).idxmin()
print(min_agegroup)

13-15


In [17]:
# Get racial distribution of number of images in minority age group
minority_agedf = ylfw_witha[ylfw_witha.children_agegroup == min_agegroup]
minority_agedf.groupby('ethnicity').img_name.count().sort_values(ascending=False)

ethnicity
African      164
Indian       158
Caucasian    117
Asian        112
Name: img_name, dtype: int64

In [18]:
minority_agedf

Unnamed: 0,img_name,Age,ethnicity,id_class,children_agegroup,im_id,img_org_name,face_id
22,African_634_22,15,African,634,13-15,African_634,,
60,Caucasian_346_12,14,Caucasian,346,13-15,Caucasian_346,,
62,African_699_1,13,African,699,13-15,African_699,,
164,African_412_47,13,African,412,13-15,African_412,,
168,Asian_489_46,13,Asian,489,13-15,Asian_489,,
...,...,...,...,...,...,...,...,...
9898,m.05lq0j,14,Asian,05lq0j,13-15,05lq0j,m.05lq0j_0003,3.0
9907,m.05fwlf,13,Asian,05fwlf,13-15,05fwlf,m.05fwlf_0002,2.0
9914,m.0bmd0_6,15,Asian,0bmd0_6,13-15,0bmd0_6,m.0bmd0_6_0002,2.0
9916,m.0bmd0_6,13,Asian,0bmd0_6,13-15,0bmd0_6,m.0bmd0_6_0001,1.0


I.e. Remove samples in the other age groups such that they have approximately the same amount of images and same racial distribution. This is done by sampling the same amount of images within each ethnicity group. 

In [19]:
print("min age group: ", min_agegroup, "\nnumber of images: ", len(minority_agedf),
      "\n\nracial distribution:", minority_agedf.groupby('ethnicity').img_name.count().sort_values(ascending=False))

min age group:  13-15 
number of images:  551 

racial distribution: ethnicity
African      164
Indian       158
Caucasian    117
Asian        112
Name: img_name, dtype: int64


OBS inden dette: lav ethnicity dist for hver age group, for måske også og se hvilken der er mest balanced, og så balancér ift denne?? eller er det too much? 

In [20]:
minority_agedf

Unnamed: 0,img_name,Age,ethnicity,id_class,children_agegroup,im_id,img_org_name,face_id
22,African_634_22,15,African,634,13-15,African_634,,
60,Caucasian_346_12,14,Caucasian,346,13-15,Caucasian_346,,
62,African_699_1,13,African,699,13-15,African_699,,
164,African_412_47,13,African,412,13-15,African_412,,
168,Asian_489_46,13,Asian,489,13-15,Asian_489,,
...,...,...,...,...,...,...,...,...
9898,m.05lq0j,14,Asian,05lq0j,13-15,05lq0j,m.05lq0j_0003,3.0
9907,m.05fwlf,13,Asian,05fwlf,13-15,05fwlf,m.05fwlf_0002,2.0
9914,m.0bmd0_6,15,Asian,0bmd0_6,13-15,0bmd0_6,m.0bmd0_6_0002,2.0
9916,m.0bmd0_6,13,Asian,0bmd0_6,13-15,0bmd0_6,m.0bmd0_6_0001,1.0


In [21]:
# get dist of images per ID in min group
minority_agedf.groupby("im_id").agg({'img_name': ['count']})

Unnamed: 0_level_0,img_name
Unnamed: 0_level_1,count
im_id,Unnamed: 1_level_2
01m49_z,1
01r4447,2
01vjy69,1
025wqb8,1
0264t6l,1
...,...
Indian_7,1
Indian_711,1
Indian_717,2
Indian_74,1


In [22]:
print("Average number of ims per id")
minority_agedf.groupby("im_id").agg({'img_name': ['count']}).mean()

Average number of ims per id


img_name  count    1.287383
dtype: float64

In [23]:
minority_agedf

Unnamed: 0,img_name,Age,ethnicity,id_class,children_agegroup,im_id,img_org_name,face_id
22,African_634_22,15,African,634,13-15,African_634,,
60,Caucasian_346_12,14,Caucasian,346,13-15,Caucasian_346,,
62,African_699_1,13,African,699,13-15,African_699,,
164,African_412_47,13,African,412,13-15,African_412,,
168,Asian_489_46,13,Asian,489,13-15,Asian_489,,
...,...,...,...,...,...,...,...,...
9898,m.05lq0j,14,Asian,05lq0j,13-15,05lq0j,m.05lq0j_0003,3.0
9907,m.05fwlf,13,Asian,05fwlf,13-15,05fwlf,m.05fwlf_0002,2.0
9914,m.0bmd0_6,15,Asian,0bmd0_6,13-15,0bmd0_6,m.0bmd0_6_0002,2.0
9916,m.0bmd0_6,13,Asian,0bmd0_6,13-15,0bmd0_6,m.0bmd0_6_0001,1.0


In [24]:
minority_agedf

Unnamed: 0,img_name,Age,ethnicity,id_class,children_agegroup,im_id,img_org_name,face_id
22,African_634_22,15,African,634,13-15,African_634,,
60,Caucasian_346_12,14,Caucasian,346,13-15,Caucasian_346,,
62,African_699_1,13,African,699,13-15,African_699,,
164,African_412_47,13,African,412,13-15,African_412,,
168,Asian_489_46,13,Asian,489,13-15,Asian_489,,
...,...,...,...,...,...,...,...,...
9898,m.05lq0j,14,Asian,05lq0j,13-15,05lq0j,m.05lq0j_0003,3.0
9907,m.05fwlf,13,Asian,05fwlf,13-15,05fwlf,m.05fwlf_0002,2.0
9914,m.0bmd0_6,15,Asian,0bmd0_6,13-15,0bmd0_6,m.0bmd0_6_0002,2.0
9916,m.0bmd0_6,13,Asian,0bmd0_6,13-15,0bmd0_6,m.0bmd0_6_0001,1.0


In [25]:
etnicities = list(ylfw_witha.ethnicity.unique())

# Save variables of frequency within each ethnicity
minority_etnicity_dist = minority_agedf.groupby('ethnicity').img_name.count().sort_values(ascending=False)
minority_n_ids = len(minority_agedf) # kommer automatisk at nå ca samme fordeling


# Get this distribution of frequency in ethnicities from each age group
ylfw_witha_balanced = minority_agedf.copy()
for agegroup in age_labels:
    if agegroup != min_agegroup:
        agegroup_df = ylfw_witha[ylfw_witha.children_agegroup == agegroup]
        nunique_ids = agegroup_df.im_id.nunique()
        #get number of ids based on percentage dist of the minority age group
        for e in etnicities:
            freq_e = minority_etnicity_dist[e]

            # randomly sample the images from the current age group from this ethnicity
            age_ethn_df = agegroup_df[agegroup_df.ethnicity == e]
            sample_eids = age_ethn_df.sample(n=freq_e, random_state=42)

            # add theese to the minority dataset
            ylfw_witha_balanced = pd.concat([ylfw_witha_balanced, sample_eids], ignore_index=True)


Check if balanced

In [26]:
# Age groups
ylfw_witha_balanced.children_agegroup.value_counts()

children_agegroup
13-15    551
0-3      551
4-6      551
7-9      551
10-12    551
16-18    551
Name: count, dtype: int64

In [89]:
# Race distribution
print("Min age group distribution", minority_agedf.groupby('ethnicity').img_name.count().sort_values(ascending=False))
print("\n0-3\n",ylfw_witha_balanced[ylfw_witha_balanced.children_agegroup == "0-3"].groupby('ethnicity').img_name.count().sort_values(ascending=False))
print("\n16-18",ylfw_witha_balanced[ylfw_witha_balanced.children_agegroup == "16-18"].groupby('ethnicity').img_name.count().sort_values(ascending=False))

Min age group distribution ethnicity
African      164
Indian       158
Caucasian    117
Asian        112
Name: img_name, dtype: int64

0-3
 ethnicity
African      164
Indian       158
Caucasian    117
Asian        112
Name: img_name, dtype: int64

16-18 ethnicity
African      164
Indian       158
Caucasian    117
Asian        112
Name: img_name, dtype: int64


In [88]:
ylfw_witha_balanced.children_agegroup.value_counts()

children_agegroup
13-15    551
0-3      551
4-6      551
7-9      551
10-12    551
16-18    551
Name: count, dtype: int64

In [90]:
ylfw_witha_balanced.to_csv('../master_thesis/data/child_balanced.csv', index=False)

## 2. Balance Data in adult group and children's group


Mht race?

In [28]:
# Racial distribution
ylfw_witha_balanced.ethnicity.value_counts(normalize=True)

ethnicity
African      0.297641
Indian       0.286751
Caucasian    0.212341
Asian        0.203267
Name: proportion, dtype: float64

In [29]:
# Remove childrens age group from adult data
a_df = pd.read_csv('../data/raw_full/RLFW/raw_rfw_df.csv')
a_df = a_df[a_df.Age > 18]

# Check racial distribution
a_df.ethnicity.value_counts(normalize=True)

ethnicity
African      0.256415
Indian       0.256062
Caucasian    0.250296
Asian        0.237227
Name: proportion, dtype: float64

In [84]:
etnicities = list(ylfw_witha_balanced.ethnicity.unique())

# Save variables of frequency within each ethnicity
etnicity_dist = ylfw_witha_balanced.groupby('ethnicity').img_name.count().sort_values(ascending=False)
a_balanced = pd.DataFrame()
for e in etnicities:
    freq_e = etnicity_dist[e]

    # randomly sample the images from the current age group from this ethnicity
    age_ethn_df = a_df[a_df.ethnicity == e]
    sample_eids = age_ethn_df.sample(n=freq_e, random_state=42)

    # add theese to the minority dataset
    a_balanced = pd.concat([a_balanced, sample_eids], ignore_index=True)


In [85]:
a_balanced.groupby('ethnicity').img_name.count().sort_values(ascending=False)

ethnicity
African      984
Indian       948
Caucasian    702
Asian        672
Name: img_name, dtype: int64

In [86]:
ylfw_witha_balanced.groupby('ethnicity').img_name.count().sort_values(ascending=False)


ethnicity
African      984
Indian       948
Caucasian    702
Asian        672
Name: img_name, dtype: int64

In [91]:
a_balanced.to_csv('../master_thesis/data/adults_balanced.csv', index=False)