In [1]:
import pandas as pd
import seaborn as sns
sns.set()
import random
random.seed(42)


## 1. Balance Age groups in YLFW (undersampling)

### 1.1 Young Group

In [2]:
y_df = pd.read_csv('../data/raw_full/raw_ylfw_df.csv')

In [3]:
y_df[y_df.children_agegroup == "7-9"].groupby("im_id").agg({'img_name': ['count']}).describe()

Unnamed: 0_level_0,img_name
Unnamed: 0_level_1,count
count,955.0
mean,1.732984
std,1.13682
min,1.0
25%,1.0
50%,1.0
75%,2.0
max,11.0


In [4]:
len(y_df)

9229

In [5]:
y_df.Age.describe()

count    9229.000000
mean        8.412829
std         7.504623
min         0.000000
25%         3.000000
50%         6.000000
75%        11.000000
max        42.000000
Name: Age, dtype: float64

In [6]:
y_df["children_agegroup"].value_counts()

children_agegroup
0-3      2665
4-6      2068
7-9      1655
10-12     923
13-15     459
16-18     383
Name: count, dtype: int64

### 1.2 Adult Group

In [7]:
a_df = pd.read_csv('../data/raw_full/raw_rfw_df.csv')

In [8]:
len(a_df)

40227

In [9]:
a_df

Unnamed: 0,img_name,im_id,ethnicity,id_class,face_id,Age,children_agegroup
0,m.05syl08_0003,m.05syl08,Indian,05syl08,3,64,
1,m.05syl08_0002,m.05syl08,Indian,05syl08,2,62,
2,m.05syl08_0001,m.05syl08,Indian,05syl08,1,64,
3,m.05syl08_0004,m.05syl08,Indian,05syl08,4,61,
4,m.0jzt0s_0002,m.0jzt0s,Indian,0jzt0s,2,53,
...,...,...,...,...,...,...,...
40222,m.07rqxl_0002,m.07rqxl,Asian,07rqxl,2,44,
40223,m.07rqxl_0001,m.07rqxl,Asian,07rqxl,1,49,
40224,m.0d65nf_0002,m.0d65nf,Asian,0d65nf,2,53,
40225,m.0d65nf_0003,m.0d65nf,Asian,0d65nf,3,57,


In [10]:
a_df.Age.value_counts()

Age
26    1630
27    1614
25    1598
28    1496
24    1463
      ... 
2        2
3        2
85       2
88       1
90       1
Name: count, Length: 87, dtype: int64

In [11]:
# Add childrens age group to adult data
# Ensure 'Age' is of type integer
a_df['Age'] = a_df['Age'].astype(int)

# Define the age groups and corresponding labels
age_bins = [0, 4, 7, 10, 13, 16, 19]  # adding one more bin for 16-18
age_labels = ['0-3', '4-6', '7-9', '10-12', '13-15', '16-18']

# Update the DataFrame with age groupings
a_df['children_agegroup'] = pd.cut(a_df['Age'], bins=age_bins, labels=age_labels, right=False)
a_df["children_agegroup"].value_counts()


children_agegroup
16-18    452
13-15     92
10-12     82
7-9       43
4-6       24
0-3        4
Name: count, dtype: int64

In [12]:
a_df["children_agegroup"].isna().sum()

39530

### 3. Transfer childrens age groups in a_df to y_df

In [13]:
a_data_children_agegroup = a_df[a_df.children_agegroup.notnull()]

In [14]:
a_data_children_agegroup.groupby("im_id").agg({'img_name': ['count']}).value_counts()

(img_name, count)
1                    263
2                     81
3                     50
4                     25
5                      3
7                      1
Name: count, dtype: int64

In [16]:
# Merge to ylfw
ylfw_witha = pd.concat([y_df,a_data_children_agegroup],ignore_index=True).drop(columns=["face_id", "id_class"])
ylfw_witha[-10:]

Unnamed: 0,img_name,Age,ethnicity,id_class,children_agegroup,im_id,face_id
9916,m.02qpxkm_0004,18,Asian,02qpxkm,16-18,m.02qpxkm,4.0
9917,m.01m49_z_0001,14,Asian,01m49_z,13-15,m.01m49_z,1.0
9918,m.0sgqn7h_0003,9,Asian,0sgqn7h,7-9,m.0sgqn7h,3.0
9919,m.0sgqn7h_0002,17,Asian,0sgqn7h,16-18,m.0sgqn7h,2.0
9920,m.0knwxrc_0001,16,Asian,0knwxrc,16-18,m.0knwxrc,1.0
9921,m.0d5060_0004,18,Asian,0d5060,16-18,m.0d5060,4.0
9922,m.0d5060_0002,18,Asian,0d5060,16-18,m.0d5060,2.0
9923,m.05mz42q_0002,18,Asian,05mz42q,16-18,m.05mz42q,2.0
9924,m.0rpfv5x_0004,17,Asian,0rpfv5x,16-18,m.0rpfv5x,4.0
9925,m.048t9m_0002,18,Asian,048t9m,16-18,m.048t9m,2.0


In [115]:
ylfw_witha

Unnamed: 0,img_name,Age,ethnicity,id_class,children_agegroup,im_id,face_id
0,Caucasian_1008_18,8,Caucasian,1008,7-9,Caucasian_1008,
1,Asian_225_72,11,Asian,225,10-12,Asian_225,
2,Asian_238_1,3,Asian,238,0-3,Asian_238,
3,Caucasian_1110_17,4,Caucasian,1110,4-6,Caucasian_1110,
4,Caucasian_364_12,2,Caucasian,364,0-3,Caucasian_364,
...,...,...,...,...,...,...,...
9921,m.0d5060_0004,18,Asian,0d5060,16-18,m.0d5060,4.0
9922,m.0d5060_0002,18,Asian,0d5060,16-18,m.0d5060,2.0
9923,m.05mz42q_0002,18,Asian,05mz42q,16-18,m.05mz42q,2.0
9924,m.0rpfv5x_0004,17,Asian,0rpfv5x,16-18,m.0rpfv5x,4.0


### 4. Undersample based on the minority class in the children's age group in YLFW
- Keep racial distribution as in minority class
- Delete based on IDs 

Distribution of age groups - based on IDS (obs be aware that each id have images in different age groups). Therefore we sample based on number of images per age group. 


In [116]:
# Standard deviation of age groups per identity
ylfw_witha.groupby("im_id").agg({'Age': ['min', 'max','std', 'count']}).sort_values(by=('Age', 'std'), ascending=False).head(5)

Unnamed: 0_level_0,Age,Age,Age,Age
Unnamed: 0_level_1,min,max,std,count
im_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Indian_87,0,30,21.213203,2
Asian_620,2,29,19.091883,2
Asian_570,2,35,18.502252,3
Asian_96,0,26,18.384776,2
African_44,6,32,18.384776,2


In [117]:
# Distribution of IDs per age group
ylfw_witha.groupby('children_agegroup').im_id.nunique().sort_values(ascending=False)

children_agegroup
4-6      1169
0-3      1123
7-9       987
10-12     670
16-18     650
13-15     428
Name: im_id, dtype: int64

In [118]:
# Distribution of age groups - based on number of images per age group
ylfw_witha.children_agegroup.value_counts().sort_values(ascending=False)

children_agegroup
0-3      2669
4-6      2092
7-9      1698
10-12    1005
16-18     835
13-15     551
Name: count, dtype: int64

In [119]:
# Take minority age group BASED ON N_IMAGES
min_agegroup = ylfw_witha.groupby('children_agegroup').img_name.count().sort_values(ascending=False).idxmin()
print(min_agegroup)

13-15


In [120]:
# Get racial distribution of number of images in minority age group
minority_agedf = ylfw_witha[ylfw_witha.children_agegroup == min_agegroup]
minority_agedf.groupby('ethnicity').img_name.count().sort_values(ascending=False)

ethnicity
African      164
Indian       158
Caucasian    117
Asian        112
Name: img_name, dtype: int64

In [121]:
minority_agedf

Unnamed: 0,img_name,Age,ethnicity,id_class,children_agegroup,im_id,face_id
22,African_634_22,15,African,634,13-15,African_634,
60,Caucasian_346_12,14,Caucasian,346,13-15,Caucasian_346,
62,African_699_1,13,African,699,13-15,African_699,
164,African_412_47,13,African,412,13-15,African_412,
168,Asian_489_46,13,Asian,489,13-15,Asian_489,
...,...,...,...,...,...,...,...
9897,m.05lq0j_0003,14,Asian,05lq0j,13-15,m.05lq0j,3.0
9906,m.05fwlf_0002,13,Asian,05fwlf,13-15,m.05fwlf,2.0
9913,m.0bmd0_6_0002,15,Asian,0bmd0_6,13-15,m.0bmd0_6,2.0
9915,m.0bmd0_6_0001,13,Asian,0bmd0_6,13-15,m.0bmd0_6,1.0


I.e. Remove samples in the other age groups such that they have approximately the same amount of images and same racial distribution. This is done by sampling the same amount of images within each ethnicity group. 

In [122]:
print("min age group: ", min_agegroup, "\nnumber of images: ", len(minority_agedf),
      "\n\nracial distribution:", minority_agedf.groupby('ethnicity').img_name.count().sort_values(ascending=False))

min age group:  13-15 
number of images:  551 

racial distribution: ethnicity
African      164
Indian       158
Caucasian    117
Asian        112
Name: img_name, dtype: int64


OBS inden dette: lav ethnicity dist for hver age group, for måske også og se hvilken der er mest balanced, og så balancér ift denne?? eller er det too much? 

In [123]:
minority_agedf

Unnamed: 0,img_name,Age,ethnicity,id_class,children_agegroup,im_id,face_id
22,African_634_22,15,African,634,13-15,African_634,
60,Caucasian_346_12,14,Caucasian,346,13-15,Caucasian_346,
62,African_699_1,13,African,699,13-15,African_699,
164,African_412_47,13,African,412,13-15,African_412,
168,Asian_489_46,13,Asian,489,13-15,Asian_489,
...,...,...,...,...,...,...,...
9897,m.05lq0j_0003,14,Asian,05lq0j,13-15,m.05lq0j,3.0
9906,m.05fwlf_0002,13,Asian,05fwlf,13-15,m.05fwlf,2.0
9913,m.0bmd0_6_0002,15,Asian,0bmd0_6,13-15,m.0bmd0_6,2.0
9915,m.0bmd0_6_0001,13,Asian,0bmd0_6,13-15,m.0bmd0_6,1.0


In [124]:
# get dist of images per ID in min group
minority_agedf.groupby("im_id").agg({'img_name': ['count']}).describe()

Unnamed: 0_level_0,img_name
Unnamed: 0_level_1,count
count,428.0
mean,1.287383
std,0.622851
min,1.0
25%,1.0
50%,1.0
75%,1.0
max,4.0


In [125]:
print("Average number of ims per id")
minority_agedf.groupby("im_id").agg({'img_name': ['count']}).mean()

Average number of ims per id


img_name  count    1.287383
dtype: float64

In [126]:
minority_agedf

Unnamed: 0,img_name,Age,ethnicity,id_class,children_agegroup,im_id,face_id
22,African_634_22,15,African,634,13-15,African_634,
60,Caucasian_346_12,14,Caucasian,346,13-15,Caucasian_346,
62,African_699_1,13,African,699,13-15,African_699,
164,African_412_47,13,African,412,13-15,African_412,
168,Asian_489_46,13,Asian,489,13-15,Asian_489,
...,...,...,...,...,...,...,...
9897,m.05lq0j_0003,14,Asian,05lq0j,13-15,m.05lq0j,3.0
9906,m.05fwlf_0002,13,Asian,05fwlf,13-15,m.05fwlf,2.0
9913,m.0bmd0_6_0002,15,Asian,0bmd0_6,13-15,m.0bmd0_6,2.0
9915,m.0bmd0_6_0001,13,Asian,0bmd0_6,13-15,m.0bmd0_6,1.0


In [127]:
minority_agedf

Unnamed: 0,img_name,Age,ethnicity,id_class,children_agegroup,im_id,face_id
22,African_634_22,15,African,634,13-15,African_634,
60,Caucasian_346_12,14,Caucasian,346,13-15,Caucasian_346,
62,African_699_1,13,African,699,13-15,African_699,
164,African_412_47,13,African,412,13-15,African_412,
168,Asian_489_46,13,Asian,489,13-15,Asian_489,
...,...,...,...,...,...,...,...
9897,m.05lq0j_0003,14,Asian,05lq0j,13-15,m.05lq0j,3.0
9906,m.05fwlf_0002,13,Asian,05fwlf,13-15,m.05fwlf,2.0
9913,m.0bmd0_6_0002,15,Asian,0bmd0_6,13-15,m.0bmd0_6,2.0
9915,m.0bmd0_6_0001,13,Asian,0bmd0_6,13-15,m.0bmd0_6,1.0


In [128]:
etnicities = list(ylfw_witha.ethnicity.unique())

# Save variables of frequency within each ethnicity
minority_etnicity_dist = minority_agedf.groupby('ethnicity').img_name.count().sort_values(ascending=False)
minority_n_ids = len(minority_agedf) # kommer automatisk at nå ca samme fordeling


# Get this distribution of frequency in ethnicities from each age group
ylfw_witha_balanced = minority_agedf.copy()
for agegroup in age_labels:
    if agegroup != min_agegroup:
        agegroup_df = ylfw_witha[ylfw_witha.children_agegroup == agegroup]
        nunique_ids = agegroup_df.im_id.nunique()
        #get number of ids based on percentage dist of the minority age group
        for e in etnicities:
            freq_e = minority_etnicity_dist[e]

            # randomly sample the images from the current age group from this ethnicity
            age_ethn_df = agegroup_df[agegroup_df.ethnicity == e]
            sample_eids = age_ethn_df.sample(n=freq_e, random_state=42)

            # add theese to the minority dataset
            ylfw_witha_balanced = pd.concat([ylfw_witha_balanced, sample_eids], ignore_index=True)


Check if balanced

In [129]:
# Age groups
ylfw_witha_balanced.children_agegroup.value_counts()

children_agegroup
13-15    551
0-3      551
4-6      551
7-9      551
10-12    551
16-18    551
Name: count, dtype: int64

In [130]:
# Race distribution
print("Min age group distribution", minority_agedf.groupby('ethnicity').img_name.count().sort_values(ascending=False))
print("\n0-3\n",ylfw_witha_balanced[ylfw_witha_balanced.children_agegroup == "0-3"].groupby('ethnicity').img_name.count().sort_values(ascending=False))
print("\n16-18",ylfw_witha_balanced[ylfw_witha_balanced.children_agegroup == "16-18"].groupby('ethnicity').img_name.count().sort_values(ascending=False))

Min age group distribution ethnicity
African      164
Indian       158
Caucasian    117
Asian        112
Name: img_name, dtype: int64

0-3
 ethnicity
African      164
Indian       158
Caucasian    117
Asian        112
Name: img_name, dtype: int64

16-18 ethnicity
African      164
Indian       158
Caucasian    117
Asian        112
Name: img_name, dtype: int64


In [131]:
ylfw_witha_balanced.children_agegroup.value_counts()

children_agegroup
13-15    551
0-3      551
4-6      551
7-9      551
10-12    551
16-18    551
Name: count, dtype: int64

In [132]:
ylfw_witha_balanced.groupby("im_id").count().sort_values(by="img_name", ascending=False)

Unnamed: 0_level_0,img_name,Age,ethnicity,id_class,children_agegroup,face_id
im_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Indian_484,8,8,8,8,8,0
Indian_577,7,7,7,7,7,0
Indian_602,7,7,7,7,7,0
Indian_531,7,7,7,7,7,0
Indian_667,7,7,7,7,7,0
...,...,...,...,...,...,...
Caucasian_1115,1,1,1,1,1,0
Caucasian_1112,1,1,1,1,1,0
Caucasian_1108,1,1,1,1,1,0
Caucasian_1107,1,1,1,1,1,0


In [133]:
ylfw_witha_balanced.to_csv('../data/child_balanced.csv', index=False)

## 2. Balance Data in adult group and children's group


Mht race?

In [134]:
# Racial distribution
ylfw_witha_balanced.ethnicity.value_counts(normalize=True)

ethnicity
African      0.297641
Indian       0.286751
Caucasian    0.212341
Asian        0.203267
Name: proportion, dtype: float64

In [135]:
# Remove childrens age group from adult data
a_df = pd.read_csv('../data/raw_full/raw_rfw_df.csv')
a_df = a_df[a_df.Age > 18]

# Check racial distribution
a_df.ethnicity.value_counts(normalize=True)

ethnicity
Indian       0.256337
African      0.255932
Caucasian    0.250645
Asian        0.237086
Name: proportion, dtype: float64

In [136]:
etnicities = list(ylfw_witha_balanced.ethnicity.unique())

# Save variables of frequency within each ethnicity
etnicity_dist = ylfw_witha_balanced.groupby('ethnicity').img_name.count().sort_values(ascending=False)
a_balanced = pd.DataFrame()
for e in etnicities:
    freq_e = etnicity_dist[e]

    # randomly sample the images from the current age group from this ethnicity
    age_ethn_df = a_df[a_df.ethnicity == e]
    sample_eids = age_ethn_df.sample(n=freq_e, random_state=42)

    # add theese to the minority dataset
    a_balanced = pd.concat([a_balanced, sample_eids], ignore_index=True)


In [137]:
a_balanced.groupby('ethnicity').img_name.count().sort_values(ascending=False)

ethnicity
African      984
Indian       948
Caucasian    702
Asian        672
Name: img_name, dtype: int64

In [138]:
ylfw_witha_balanced.groupby('ethnicity').img_name.count().sort_values(ascending=False)


ethnicity
African      984
Indian       948
Caucasian    702
Asian        672
Name: img_name, dtype: int64

In [139]:
a_balanced

Unnamed: 0,img_name,im_id,ethnicity,id_class,face_id,Age,children_agegroup
0,m.0fw3k1_0002,m.0fw3k1,African,0fw3k1,2,20,
1,m.04zvg12_0002,m.04zvg12,African,04zvg12,2,24,
2,m.05mrf5k_0002,m.05mrf5k,African,05mrf5k,2,28,
3,m.07b34k_0003,m.07b34k,African,07b34k,3,44,
4,m.02xgm5_0002,m.02xgm5,African,02xgm5,2,34,
...,...,...,...,...,...,...,...
3301,m.0h949qw_0001,m.0h949qw,Indian,0h949qw,1,22,
3302,m.0bhcm0z_0003,m.0bhcm0z,Indian,0bhcm0z,3,32,
3303,m.0bgrs7_0002,m.0bgrs7,Indian,0bgrs7,2,20,
3304,m.0b4y2t_0003,m.0b4y2t,Indian,0b4y2t,3,38,


In [140]:
a_balanced.to_csv('../data/adults_balanced.csv', index=False)

# BALANCE DATA FINAL

**Balance på ID.** 

- For children: Behold så mange enrolled som der er i minority of fyld op med non-enrolled. For adults: Fyld op med halvdelen af images der skal være mated og så halvdelen af images der er non-mated. 

In [78]:

# Balance child data 
def balance_child_data(y_df, a_df, print_stats=False, random_state=42):
    """
    Input: raw df for ylfw and rfw
    Returns: csvs with equally balanced children and adults
    Original child_balanced has random state 42
    """
    
    ### 1. Undersample based on the minority class in the children's age group in YLFW
    # - Keep racial distribution as in minority class
    # - Delete based on IDs 
    # (obs be aware that each id have images in different age groups). I.e. The minority class will also have other ids in other age groups. 
    # Therefore we sample based on number of images per age group. 
    
    age_labels = ['0-3', '4-6', '7-9', '10-12', '13-15', '16-18']
    
    # Take minority age group BASED ON N_IMAGES, and make dataframe
    min_agegroup = y_df.groupby('children_agegroup').image_name.count().sort_values(ascending=False).idxmin()
    minority_agedf = y_df[y_df.children_agegroup == min_agegroup]
    

    # Remove samples in the other age groups such that they have approximately the same amount of images and same racial distribution. 
    # This is done by sampling the same amount of images within each ethnicity group. 

    # Save variables of frequency within each ethnicity
    etnicities = list(y_df.ethnicity.unique())
    minority_etnicity_dist = minority_agedf.groupby('ethnicity').image_name.count().sort_values(ascending=False)

    # Get this distribution of frequency in ethnicities from each age group
    ylfw_witha_balanced = minority_agedf.copy()
    for agegroup in age_labels:
        if agegroup != min_agegroup:
            agegroup_df = y_df[y_df.children_agegroup == agegroup]

            # Get number of ids based on percentage dist of the minority age group
            for e in etnicities:
                freq_e = minority_etnicity_dist[e]

                # Randomly sample the images from the current age group from this ethnicity
                age_ethn_df = agegroup_df[agegroup_df.ethnicity == e]
                sample_eids = age_ethn_df.sample(n=freq_e, random_state=random_state)

                # Add theese to the minority dataset - to create a balanced dataset with the other age groups
                ylfw_witha_balanced = pd.concat([ylfw_witha_balanced, sample_eids], ignore_index=True)
                
    if print_stats:
        # Print Race distribution
        print("minority age group from childrens data: ", min_agegroup, "\nnumber of images: ", len(minority_agedf),
                "\n\nracial distribution:", minority_agedf.groupby('ethnicity').image_name.count().sort_values(ascending=False),
                "\n\nAll new groups should have same distribution") 
                

        print("\nOther age group stats:","\n0-3\n",ylfw_witha_balanced[ylfw_witha_balanced.children_agegroup == "0-3"].groupby('ethnicity').image_name.count().sort_values(ascending=False))
        print("\n16-18",ylfw_witha_balanced[ylfw_witha_balanced.children_agegroup == "16-18"].groupby('ethnicity').image_name.count().sort_values(ascending=False))
                
        print("Balanced data?:", ylfw_witha_balanced.children_agegroup.value_counts())
        
    return ylfw_witha_balanced
        

## Example use:
#y_df = pd.read_csv('../data/raw_full/raw_ylfw_df.csv')
#a_df = pd.read_csv('../data/raw_full/raw_rfw_df.csv')

# save as csv
#ylfw_witha_balanced = balance_data(y_df, a_df, print_stats=True)
#ylfw_witha_balanced.to_csv('../data/child_balanced.csv', index=False)

In [79]:
random_states = [1,2,3,4,5,6,7,8,9,10]
children_all = pd.read_csv('../data/YLFW_full_info_excluding_adults.csv')
a_df = pd.read_csv('../data/RFW_full_info_excluding_children.csv')
children_balanced_df_1 = balance_child_data(children_all, a_df, print_stats=True, random_state=random_states[0])


minority age group from childrens data:  16-18 
number of images:  383 

racial distribution: ethnicity
African      148
Indian       134
Asian         65
Caucasian     36
Name: image_name, dtype: int64 

All new groups should have same distribution

Other age group stats: 
0-3
 ethnicity
African      148
Indian       134
Asian         65
Caucasian     36
Name: image_name, dtype: int64

16-18 ethnicity
African      148
Indian       134
Asian         65
Caucasian     36
Name: image_name, dtype: int64
Balanced data?: children_agegroup
16-18    383
0-3      383
4-6      383
7-9      383
10-12    383
13-15    383
Name: count, dtype: int64


In [209]:
from collections import Counter


In [299]:

# Balance adults data 
def balance_adults_data_enrolled(children_balanced_df_i, a_df, print_stats=False, random_state=42):
    """
    Input: adults full df and balanced child df at iteration i. Set random state equal to random state i for generation of children balanced df
    Returns: balanced adults df with equally balanced distribution of ethnicities and enrolled/non_enrolled ids as in children balanced df
    """
    
    random.seed(random_state)
    
    # Split in mated and non-mated ids
    c_mates = children_balanced_df_i.groupby("identity_name").agg({'identity_name': ['count']})
    c_enrolled_ids = c_mates[c_mates[('identity_name', 'count')] > 1].index
    c_non_enrolled_ids = c_mates[c_mates[('identity_name', 'count')] == 1].index
    
    
    a_mates = a_df.groupby("identity_name").agg({'identity_name': ['count']})
    a_enrolled_ids = a_mates[a_mates[('identity_name', 'count')] > 1].index

    # Get distribution to stratify on. 
    c_enrolled_df = children_balanced_df_i[children_balanced_df_i["identity_name"].isin(set(c_enrolled_ids))]
    c_enrolled_ethnicity = c_enrolled_df.groupby('ethnicity').identity_name.nunique().sort_values(ascending=False)
    c_non_enrolled_df = children_balanced_df_i[children_balanced_df_i["identity_name"].isin(set(c_non_enrolled_ids))]
    c_non_enrolled_ethnicity = c_non_enrolled_df.groupby('ethnicity').identity_name.nunique().sort_values(ascending=False)
    #print(c_enrolled_ethnicity)
    
    etnicities = list(children_balanced_df_i.ethnicity.unique())
    a_balanced = pd.DataFrame()
    for e in etnicities:
        # a_df of etnicity group e
        a_ethnicity_df = a_df[a_df.ethnicity == e]
        
        
        ## For enrolled ids:
        n_enrolled_e = c_enrolled_ethnicity[e] # number of enrolled ids in ethnicity e in children
        
        # Randomly sample this number of ids and corresponding images from a_df in etnicity group e        
        a_enrolled_ethnicity_ids = a_ethnicity_df[a_ethnicity_df["identity_name"].isin(set(a_enrolled_ids))].identity_name.unique()
        random_sample_enrolled_ids = random.sample(list(a_enrolled_ethnicity_ids), n_enrolled_e) # same size as enrolled ids in ethnicity e in children 
        
        #print("is child ids same as adults ids number", n_enrolled_e,len(random_sample_enrolled_ids)  )     
        
        a_enrolled_ethnicity_df = a_ethnicity_df[a_ethnicity_df["identity_name"].isin(set(random_sample_enrolled_ids))] # "final sampling"
        
        # Add theese to balanced adults dataset
        a_balanced = pd.concat([a_balanced, a_enrolled_ethnicity_df], ignore_index=True)
        
        ## ^god

 
        ## For non-enrolled ids:
        n_non_enrolled_e = c_non_enrolled_ethnicity[e] # number of enrolled ids in ethnicity e in children
        
        # identities allowed to sample from
        a_non_enrolled_ethnicity_ids = a_ethnicity_df[~a_ethnicity_df["identity_name"].isin(set(random_sample_enrolled_ids))].identity_name.unique()
        #print("all_ids with enrolled",len(set(a_ethnicity_df.identity_name)))

        #print("list without enrolled ids",len(set(a_non_enrolled_ethnicity_ids))) # fint - den er uden enrolled ids
     
        #print(list(a_non_enrolled_ethnicity_ids[:10]))
        # Shuffle the list
        random.shuffle(list(a_non_enrolled_ethnicity_ids))
        #print(a_non_enrolled_ethnicity_ids[:10])

        # Take the first n_non_en elements
        random_sample_non_enrolled_ids = a_non_enrolled_ethnicity_ids[:n_non_enrolled_e]
        
        # for each of these ids, take one image
        
        a_non_enrolled_ethnicity_ids = a_ethnicity_df[a_ethnicity_df["identity_name"].isin(random_sample_non_enrolled_ids)] # "final sampling"
        a_non_enrolled_ethnicity_image_names = a_non_enrolled_ethnicity_ids.groupby('identity_name')['image_name'].first().reset_index().image_name.unique()
        
        # Get org df with these img names
        a_non_enrolled_ethnicity_df = a_ethnicity_df[a_ethnicity_df["image_name"].isin(set(a_non_enrolled_ethnicity_image_names))] # "final sampling"

       
                
                
        print("FINAL", len(a_non_enrolled_ethnicity_df), n_non_enrolled_e)

        # Count occurrences of each element
        counts = Counter(a_non_enrolled_ethnicity_df.identity_name)

        # Get the number of duplicates
        num_duplicates = sum(count for count in counts.values() if count > 1)


        print("DDDUPS",num_duplicates)

        print("is child ids same as adults ids number non en", n_non_enrolled_e,len(a_non_enrolled_ethnicity_df)  )     

        
        # add theese to balanced adults dataset
        a_balanced = pd.concat([a_balanced, a_non_enrolled_ethnicity_df], ignore_index=True)
        
    if print_stats:
        a_bal_mates = a_balanced.groupby("identity_name").agg({'identity_name': ['count']})
        a_bal_enrolled_ids = a_bal_mates[a_bal_mates[('identity_name', 'count')] > 1].index
        a_bal_non_enrolled_ids = a_bal_mates[a_bal_mates[('identity_name', 'count')] == 1].index

        print("Balanced data between adults and children?:",
            "\n\nadults: ", a_balanced.groupby('ethnicity').identity_name.nunique().sort_values(ascending=False), 
            "\nnumber of enrolled, and non-enrolled ids (a): ", len(set(a_bal_enrolled_ids)), len(set(a_bal_non_enrolled_ids)),
            "\n\nchildren: ", children_balanced_df_i.groupby('ethnicity').identity_name.nunique().sort_values(ascending=False),
            "\nnumber of enrolled, and non-enrolled ids (c): ", len(set(c_enrolled_ids)), len(set(c_non_enrolled_ids)))

    return a_balanced

In [302]:
random_states = [1,2,3,4,5,6,7,8,9,10]
children_all = pd.read_csv('../data/YLFW_full_info_excluding_adults.csv')
a_df = pd.read_csv('../data/RFW_full_info_excluding_children.csv')
children_balanced_df_1 = balance_child_data(children_all, a_df, print_stats=False, random_state=random_states[0])


balance_adults_data_enrolled(children_balanced_df_1, a_df, print_stats=True, random_state=random_states[0])


FINAL 182 182
DDDUPS 0
is child ids same as adults ids number non en 182 182
FINAL 246 246
DDDUPS 0
is child ids same as adults ids number non en 246 246
FINAL 246 246
DDDUPS 0
is child ids same as adults ids number non en 246 246
FINAL 141 141
DDDUPS 0
is child ids same as adults ids number non en 141 141
Balanced data between adults and children?: 

adults:  ethnicity
African      500
Indian       444
Asian        265
Caucasian    175
Name: identity_name, dtype: int64 
number of enrolled, and non-enrolled ids (a):  569 815 

children:  ethnicity
African      500
Indian       444
Asian        265
Caucasian    175
Name: identity_name, dtype: int64 
number of enrolled, and non-enrolled ids (c):  569 815


Unnamed: 0,image_name,identity_name,ethnicity,Age,children_agegroup
0,m.013yvd_0001,m.013yvd,Asian,72,
1,m.013yvd_0002,m.013yvd,Asian,51,
2,m.013yvd_0003,m.013yvd,Asian,55,
3,m.013yvd_0004,m.013yvd,Asian,49,
4,m.013yvd_0005,m.013yvd,Asian,63,
...,...,...,...,...,...
2686,m.01lb3wy_0001,m.01lb3wy,Caucasian,21,
2687,m.01lgdc_0001,m.01lgdc,Caucasian,61,
2688,m.01lgg3s_0001,m.01lgg3s,Caucasian,39,
2689,m.01ljk0w_0001,m.01ljk0w,Caucasian,37,


In [310]:
a_df[a_df.identity_name == "m.03f25c1"]

Unnamed: 0,image_name,identity_name,ethnicity,Age,children_agegroup
29499,m.03f25c1_0001,m.03f25c1,Indian,22,
29500,m.03f25c1_0002,m.03f25c1,Indian,32,
29501,m.03f25c1_0003,m.03f25c1,Indian,75,
29502,m.03f25c1_0004,m.03f25c1,Indian,67,


In [312]:
a_df.groupby("identity_name")['Age'].agg(['count', 'std']).describe(percentiles=[0.25, 0.5, 0.75, 0.9])

Unnamed: 0,count,std
count,11336.0,10961.0
mean,3.211627,4.492621
std,1.044882,3.411003
min,1.0,0.0
25%,3.0,2.12132
50%,3.0,3.605551
75%,4.0,5.737305
90%,5.0,8.746428
max,8.0,26.870058


In [309]:
a_df.groupby("identity_name")['Age'].agg(['count', 'std']).reset_index().sort_values(by="std", ascending=False)

Unnamed: 0,identity_name,count,std
9887,m.0gh7fk,2,26.870058
3244,m.03f25c1,4,25.935818
147,m.019s7b,3,25.735838
3723,m.03v4c7,4,25.526130
3700,m.03sz56,2,25.455844
...,...,...,...
11215,m.0qft6yv,1,
11257,m.0r8ntwm,1,
11258,m.0r8x_gx,1,
11264,m.0rpftqy,1,
