## 1. Balance Age groups in YLFW (undersampling)

In [165]:
import pandas as pd
import seaborn as sns
sns.set()
import random
random.seed(42)


### 1.1 Young Group

In [166]:
y_df = pd.read_csv('../data/raw/YLFW_bench/raw_ylfw_df.csv') 

In [167]:
len(y_df)

34

In [168]:
y_df.Age.describe()

count    34.000000
mean      5.558824
std       4.300748
min       0.000000
25%       2.250000
50%       4.000000
75%       8.750000
max      18.000000
Name: Age, dtype: float64

In [169]:
y_df["children_agegroup"].value_counts() 

children_agegroup
0-3      13
4-6      10
7-9       5
10-12     3
13-15     2
16-18     1
Name: count, dtype: int64

### 1.2 Adult Group

In [170]:
a_df = pd.read_csv('../data/raw/RLFW_mini/raw_rfw_df.csv') 

In [171]:
len(a_df)

9

In [172]:
a_df.Age.value_counts() 

Age
45    3
47    3
50    3
Name: count, dtype: int64

In [173]:
# simulated data

# Define the age groups and corresponding labels
age_bins = [0, 45, 46, 47, 48, 49, 50]  # adding one more bin for 16-18
age_labels = ['0-3', '4-6', '7-9', '10-12', '13-15', '16-18']

# Update the DataFrame with age groupings
a_df['children_agegroup'] = pd.cut(a_df['Age'], bins=age_bins, labels=age_labels, right=False)


a_df["children_agegroup"].value_counts() 


children_agegroup
4-6      3
10-12    3
0-3      0
7-9      0
13-15    0
16-18    0
Name: count, dtype: int64

### 3. Transfer childrens age groups in a_df to y_df

In [174]:
a_df.children_agegroup

0      4-6
1      4-6
2      4-6
3    10-12
4    10-12
5    10-12
6      NaN
7      NaN
8      NaN
Name: children_agegroup, dtype: category
Categories (6, object): ['0-3' < '4-6' < '7-9' < '10-12' < '13-15' < '16-18']

In [175]:
a_data_children_agegroup = a_df[a_df.children_agegroup.notnull()]

In [176]:
# Merge to ylfw
ylfw_witha = pd.concat([y_df,a_data_children_agegroup],ignore_index=True)
ylfw_witha[-10:]

Unnamed: 0,img_name,Age,ethnicity,id_class,children_agegroup,im_id,face_id
30,Asian_14_84,2,Asian,14,0-3,Asian_14,
31,Asian_14_19,4,Asian,14,4-6,Asian_14,
32,Caucasian_204_27,5,Caucasian,204,4-6,Caucasian_204,
33,Indian_80_1,5,Indian,80,4-6,Indian_80,
34,m.0b0h05,45,Asian,0b0h05,4-6,0b0h05,2.0
35,m.0b0h05,45,Asian,0b0h05,4-6,0b0h05,3.0
36,m.0b0h05,45,Asian,0b0h05,4-6,0b0h05,1.0
37,m.0b0h05,47,Asian,0b0h05,10-12,0b0h05,2.0
38,m.0b0h05,47,Asian,0b0h05,10-12,0b0h05,3.0
39,m.0b0h05,47,Asian,0b0h05,10-12,0b0h05,1.0


### 4. Undersample based on the minority class in the children's age group in YLFW
- Keep racial distribution as in minority class
- Delete based on IDs 

Distribution of age groups - based on IDS (obs be aware that each id have images in different age groups). Therefore we sample based on number of images per age group. 


In [187]:
# Standard deviation of age groups per identity
ylfw_witha.groupby("im_id").agg({'Age': ['min', 'max','std', 'count']}).sort_values(by=('Age', 'std'), ascending=False).head(5)

Unnamed: 0_level_0,Age,Age,Age,Age
Unnamed: 0_level_1,min,max,std,count
im_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
African_0,7,18,7.778175,2
African_1,2,5,2.12132,2
Caucasian_204,1,5,2.04939,5
Asian_386,8,13,1.690309,8
Caucasian_386,2,4,1.154701,4


In [177]:
# Distribution of IDs per age group
ylfw_witha.groupby('children_agegroup').im_id.nunique().sort_values(ascending=False)

children_agegroup
4-6      7
0-3      6
10-12    2
13-15    2
7-9      2
16-18    1
Name: im_id, dtype: int64

In [178]:
# Distribution of age groups - based on number of images per age group
ylfw_witha.children_agegroup.value_counts()

children_agegroup
0-3      13
4-6      13
10-12     6
7-9       5
13-15     2
16-18     1
Name: count, dtype: int64

In [191]:
ylfw_witha.groupby('children_agegroup').img_name.count().sort_values(ascending=False)

children_agegroup
0-3      13
4-6      13
10-12     6
7-9       5
13-15     2
16-18     1
Name: img_name, dtype: int64

In [193]:
# Take minority age group BASED ON N_IMAGES
min_agegroup = ylfw_witha.groupby('children_agegroup').img_name.count().sort_values(ascending=False).idxmin()
print(min_agegroup)

16-18


In [196]:
### ONLY FOR MINI VERSION
synth_min_agegroup = ylfw_witha.children_agegroup.value_counts().idxmax()
print("synthetic min age group", synth_min_agegroup)

# Get racial distribution of number of images in minority age group
minority_agedf = ylfw_witha[ylfw_witha.children_agegroup == synth_min_agegroup]
minority_agedf.groupby('ethnicity').img_name.count().sort_values(ascending=False)

synthetic min age group 0-3


ethnicity
Caucasian    7
Asian        4
African      1
Indian       1
Name: img_name, dtype: int64

I.e. Remove samples in the other age groups such that they have approximately the same amount of images and same racial distribution. This is done by sampling the same amount of images within each ethnicity group. 

In [197]:
print("min age group: ", min_agegroup, "\nnumber of images: ", len(minority_agedf), 
      "\n\nracial distribution:", minority_agedf.groupby('ethnicity').img_name.count().sort_values(ascending=False))

min age group:  16-18 
number of images:  13 

racial distribution: ethnicity
Caucasian    7
Asian        4
African      1
Indian       1
Name: img_name, dtype: int64


OBS inden dette: lav ethnicity dist for hver age group, for måske også og se hvilken der er mest balanced, og så balancér ift denne?? eller er det too much? 

In [203]:
minority_agedf

Unnamed: 0,img_name,Age,ethnicity,id_class,children_agegroup,im_id,face_id
0,Caucasian_604_4,1,Caucasian,604,0-3,Caucasian_604,
4,Caucasian_204_1,1,Caucasian,204,0-3,Caucasian_204,
5,Asian_14_12,3,Asian,14,0-3,Asian_14,
6,Caucasian_386_9,2,Caucasian,386,0-3,Caucasian_386,
8,Indian_98_7,3,Indian,98,0-3,Indian_98,
12,Caucasian_204_25,1,Caucasian,204,0-3,Caucasian_204,
14,Asian_14_33,3,Asian,14,0-3,Asian_14,
18,Caucasian_604_15,1,Caucasian,604,0-3,Caucasian_604,
19,Asian_14_6,3,Asian,14,0-3,Asian_14,
20,Caucasian_604_20,0,Caucasian,604,0-3,Caucasian_604,


In [205]:
# get dist of images per ID in min group
minority_agedf.groupby("im_id").agg({'img_name': ['count']})

Unnamed: 0_level_0,img_name
Unnamed: 0_level_1,count
im_id,Unnamed: 1_level_2
African_1,1
Asian_14,4
Caucasian_204,2
Caucasian_386,2
Caucasian_604,3
Indian_98,1


In [206]:
print("Average number of ims per id")
minority_agedf.groupby("im_id").agg({'img_name': ['count']}).mean()

img_name  count    2.166667
dtype: float64

In [184]:
etnicities = list(ylfw_witha.ethnicity.unique())

minority_agedf = minority_agedf #REAL
minority_agedf = ylfw_witha[ylfw_witha.children_agegroup == "4-6"] #WHEN SIMULATING

# Save variables of frequency within each ethnicity
minority_etnicity_dist = minority_agedf.groupby('ethnicity').img_name.count().sort_values(ascending=False)
minority_n_ids = len(minority_agedf) # kommer automatisk at nå ca samme fordeling


# Get this distribution of frequency in ethnicities from each age group
ylfw_witha_balanced = minority_agedf.copy()
for agegroup in ylfw_witha.children_agegroup.unique():
    if agegroup != synth_min_agegroup:
        agegroup_df = ylfw_witha[ylfw_witha.children_agegroup == agegroup]
        nunique_ids = agegroup_df.im_id.nunique()
        #get number of ids based on percentage dist of the minority age group
        for e in etnicities:
            freq_e = minority_etnicity_dist[e]
                        
            # randomly sample the images from the current age group from this ethnicity 
            age_ethn_df = agegroup_df[agegroup_df.ethnicity == e]
            sample_eids = age_ethn_df.sample(n=freq_e, random_state=42)
            
            # add theese to the minority dataset
            ylfw_witha_balanced = pd.concat([ylfw_witha_balanced, agegroup_df], ignore_index=True)
    
        
 

## 2. Balance Data in adult group and children's group


OBS: Også mht race?

In [185]:
# lets say theres 20 ids in young age group

n_young_sample_ids = ylfw_witha_balanced.im_id.nunique()

a_df = pd.read_csv('../data/raw/RLFW_mini/raw_rfw_df.csv') 
a_df = a_df[a_df.Age < 18]

# random sample same numebr of ids from the adult dataset
a_ids = a_df.im_id.nunique()
a_sample_ids = random.sample(list(a_ids), n_young_sample_ids)

# new adult's age agroup
sample_eids_df = agegroup_df[agegroup_df.im_id.isin(sample_eids)]

print("N ids in young age group: ", n_young_sample_ids, "\nN ids in adult age group: ", len(a_sample_ids))





TypeError: 'int' object is not iterable