## Resulting data frames to use for descriptive stats

This notebook loads the original data frames and removes identities whose per-identity age standard deviation is 5 or higher (adults) or 10 or higher (children).

Additionally, adult images with an OFIQ unified quality score at or below the 30th percentile are removed.

In [1]:
import os
import random
from collections import Counter

import numpy as np
import pandas as pd
import seaborn as sns

sns.set()
random.seed(42)


In [12]:
# Load the per-image info tables for children and adults.
# NOTE(review): the earlier comment said names missing from the MagFace results are
# removed, but this cell only reads the CSVs - presumably that filtering happened
# when the CSVs were produced; confirm.
children_info_csv = '../../data/image_info_csvs/YLFW_full_info_including_children_from_adults.csv'
adults_info_csv = '../../data/image_info_csvs/RFW_full_info_excluding_children.csv'

children_all = pd.read_csv(children_info_csv)
adults_all = pd.read_csv(adults_info_csv)

In [13]:
# Preview the children table (pandas truncates the rendered output).
children_all

Unnamed: 0,files_list,image_name,enrolled,identity_name,ethnicity,Age,Identity,UnifiedQualityScore.scalar,HeadPoseYaw.scalar,HeadPosePitch.scalar,HeadPoseRoll.scalar,children_agegroup
0,African_0/African_0_0.png,African_0_0,enrolled,African_0,African,7,African_0,34.0,100.0,93.0,100.0,7-9
1,African_1/African_1_11.png,African_1_11,enrolled,African_1,African,5,African_1,65.0,100.0,99.0,100.0,4-6
2,African_1/African_1_4.png,African_1_4,enrolled,African_1,African,2,African_1,23.0,95.0,99.0,100.0,1-3
3,African_10/African_10_1.png,African_10_1,enrolled,African_10,African,5,African_10,50.0,100.0,97.0,100.0,4-6
4,African_10/African_10_3.png,African_10_3,enrolled,African_10,African,6,African_10,25.0,90.0,100.0,100.0,4-6
...,...,...,...,...,...,...,...,...,...,...,...,...
7812,Indian_m.0h9448x/m.0h9448x_0002.jpg,m.0h9448x_0002,enrolled,m.0h9448x,Indian,14,m.0h9448x,86.0,100.0,100.0,100.0,13-15
7813,Indian_m.0hncksb/m.0hncksb_0003.jpg,m.0hncksb_0003,enrolled,m.0hncksb,Indian,11,m.0hncksb,20.0,96.0,94.0,100.0,10-12
7814,Indian_m.0j63px2/m.0j63px2_0003.jpg,m.0j63px2_0003,enrolled,m.0j63px2,Indian,12,m.0j63px2,88.0,99.0,98.0,100.0,10-12
7815,Indian_m.0j63px2/m.0j63px2_0004.jpg,m.0j63px2_0004,enrolled,m.0j63px2,Indian,15,m.0j63px2,24.0,64.0,98.0,99.0,13-15


In [14]:
# Preview the adults table (pandas truncates the rendered output).
adults_all

Unnamed: 0,files_list,image_name,identity_name,enrolled,ethnicity,Age,Identity,UnifiedQualityScore.scalar,HeadPoseYaw.scalar,HeadPosePitch.scalar,HeadPoseRoll.scalar,children_agegroup
0,African_m.010lz5/m.010lz5_0001.jpg,m.010lz5_0001,m.010lz5,enrolled,African,38,m.010lz5,35,100,100,100,
1,African_m.010lz5/m.010lz5_0002.jpg,m.010lz5_0002,m.010lz5,enrolled,African,29,m.010lz5,40,76,81,97,
2,African_m.010lz5/m.010lz5_0003.jpg,m.010lz5_0003,m.010lz5,enrolled,African,37,m.010lz5,34,99,76,100,
3,African_m.011y5k/m.011y5k_0001.jpg,m.011y5k_0001,m.011y5k,enrolled,African,40,m.011y5k,5,90,100,100,
4,African_m.011y5k/m.011y5k_0002.jpg,m.011y5k_0002,m.011y5k,enrolled,African,43,m.011y5k,51,100,100,100,
...,...,...,...,...,...,...,...,...,...,...,...,...
39355,Indian_m.0vshsz9/m.0vshsz9_0003.jpg,m.0vshsz9_0003,m.0vshsz9,enrolled,Indian,25,m.0vshsz9,76,99,96,100,
39356,Indian_m.0_4pw/m.0_4pw_0001.jpg,m.0_4pw_0001,m.0_4pw,enrolled,Indian,38,m.0_4pw,28,100,100,100,
39357,Indian_m.0_4pw/m.0_4pw_0002.jpg,m.0_4pw_0002,m.0_4pw,enrolled,Indian,64,m.0_4pw,16,90,100,100,
39358,Indian_m.0_4pw/m.0_4pw_0003.jpg,m.0_4pw_0003,m.0_4pw,enrolled,Indian,37,m.0_4pw,24,89,99,100,


### OFIQ

#### Stratify by age

**Adults age std - before**

In [15]:
# Adults: spread of the per-identity age standard deviation BEFORE filtering.

# Number of images per adult identity.
a_mates = adults_all.groupby("identity_name").agg({'identity_name': ['count']})

# Identities with more than one image; a std over a single image is undefined.
has_mates_a = a_mates[('identity_name', 'count')] > 1
enrolled_identity_names_a = a_mates[has_mates_a].index

adults_with_mates = adults_all[adults_all.identity_name.isin(enrolled_identity_names_a)]
adults_with_mates.groupby("identity_name").Age.std().describe(percentiles=[.25, .5, .75, .9, .95])

count    11238.000000
mean         4.527498
std          3.340651
min          0.000000
25%          2.217356
50%          3.701351
75%          5.737305
90%          8.717798
95%         11.212343
max         34.648232
Name: Age, dtype: float64

**Remove adult identities whose age standard deviation is 5 or higher.**

In [16]:
# Per-identity age std for adults with more than one image, largest first.
adults_with_mates = adults_all[adults_all.identity_name.isin(enrolled_identity_names_a)]
adults_age_std = adults_with_mates.groupby("identity_name").Age.std().sort_values(ascending=False)

# NOTE: despite its name, `adults_below_5std` holds the identities whose age std
# is >= 5, i.e. the identities that are removed on the next line.
adults_below_5std = adults_age_std.loc[adults_age_std.ge(5)].index

# Keep every row whose identity is not flagged. Identities with a single image are
# never in `adults_age_std`, so they are kept as well.
keep_adult_row = ~adults_all.identity_name.isin(adults_below_5std)
final_filtered_adults_df = adults_all[keep_adult_row]


In [22]:
# Sizes, in order: all adult rows, identities with a std, identities removed, rows kept.
print(*(len(obj) for obj in (adults_all, adults_age_std, adults_below_5std, final_filtered_adults_df)))

39360 11238 3675 26263


In [17]:
# Sanity check: per-identity age std of the adults that remain after filtering.
adults_age_std_after = final_filtered_adults_df.groupby("identity_name").Age.std()
adults_age_std_after.describe(percentiles=[.25, .5, .75, .9, .95])

count    7563.000000
mean        2.751549
std         1.255654
min         0.000000
25%         1.732051
50%         2.774887
75%         3.774917
90%         4.509250
95%         4.725816
max         4.996666
Name: Age, dtype: float64

**Children age deviation - before**

In [18]:

# Children: spread of the per-identity age standard deviation BEFORE filtering.

# Number of images per child identity.
c_mates = children_all.groupby("identity_name").agg({'identity_name': ['count']})

# Identities with more than one image; a std over a single image is undefined.
has_mates_c = c_mates[('identity_name', 'count')] > 1
enrolled_identity_names_c = c_mates[has_mates_c].index

children_with_mates = children_all[children_all.identity_name.isin(enrolled_identity_names_c)]
children_with_mates.groupby("identity_name").Age.std().describe(percentiles=[.25, .5, .75, .9, .95, .99])

count    2098.000000
mean        1.401640
std         1.181351
min         0.000000
25%         0.577350
50%         1.154701
75%         2.061553
90%         2.886751
95%         3.703566
99%         5.132960
max         9.899495
Name: Age, dtype: float64

**Remove child identities whose age standard deviation is 10 or higher**

In [19]:
# Per-identity age std for children with more than one image, largest first.
children_with_mates = children_all[children_all.identity_name.isin(enrolled_identity_names_c)]
children_age_std = children_with_mates.groupby("identity_name").Age.std().sort_values(ascending=False)

# NOTE: despite its name, `children_below_10std` holds the identities whose age std
# is >= 10, i.e. the identities that are removed on the next line.
children_below_10std = children_age_std.loc[children_age_std.ge(10)].index

# Keep every row whose identity is not flagged (single-image identities included).
keep_child_row = ~children_all.identity_name.isin(children_below_10std)
final_filtered_children_df = children_all[keep_child_row]


In [20]:
# Sanity check: per-identity age std of the children that remain after filtering.
children_age_std_after = final_filtered_children_df.groupby("identity_name").Age.std()
children_age_std_after.describe(percentiles=[.25, .5, .75, .9, .95])

count    2098.000000
mean        1.401640
std         1.181351
min         0.000000
25%         0.577350
50%         1.154701
75%         2.061553
90%         2.886751
95%         3.703566
max         9.899495
Name: Age, dtype: float64

# Removing the worst OFIQ samples 

In [11]:
# Show the working directory: the relative '../../data/...' paths used in this
# notebook are resolved against it. `os` comes from the notebook's import cell.
os.getcwd()

'/mnt/c/Dokumenter/Dokumenter/UNI/Master/Thesis/GitHub_Repo/Master_Thesis/notebooks/data_processing'

In [12]:
# Read the OFIQ result tables for children and adults; both are semicolon-separated.
ofiq_read_options = {'sep': ';'}
OFIQ_children = pd.read_csv('../../data/OFIQ_results/OFIQ_children_all.csv', **ofiq_read_options)
OFIQ_adults = pd.read_csv('../../data/OFIQ_results/table_adults_all_in_one.csv', **ofiq_read_options)

In [13]:
# Build the merge key: the last '/'-separated component of `Filename`,
# with its file extension stripped.
adult_file_basenames = OFIQ_adults.Filename.str.split('/').str[-1]
OFIQ_adults['image_name'] = adult_file_basenames.apply(lambda basename: os.path.splitext(basename)[0])

In [14]:
# Preview the 30th percentile of the adult OFIQ unified quality score.
adult_quality_scores = OFIQ_adults['UnifiedQualityScore.scalar']
np.percentile(adult_quality_scores, q=30)

25.0

In [21]:
# Percentile of the adult OFIQ unified quality score used as the cut-off.
# (numpy is already imported as `np` in the notebook's import cell.)
OFIQ_ADULT_CUTOFF_PERCENTILE = 30

# nanpercentile ignores missing scores. Plain np.percentile returns NaN as soon as
# one score is missing, and the comparison below would then silently drop every row.
threshold = np.nanpercentile(OFIQ_adults['UnifiedQualityScore.scalar'], OFIQ_ADULT_CUTOFF_PERCENTILE)

# Keep only the images scoring strictly above the cut-off.
OFIQ_adults_filtered = OFIQ_adults[OFIQ_adults['UnifiedQualityScore.scalar'] > threshold]


In [22]:
# Attach OFIQ_adults_filtered['UnifiedQualityScore.scalar'] to final_filtered_adults_df,
# matching rows on image_name. With a left merge, images that did not pass the OFIQ
# filter end up with NaN in that column.
#
# Any 'UnifiedQualityScore.scalar' column already present on the left frame is dropped
# first. Otherwise pandas renames the two clashing columns with '_x'/'_y' suffixes and
# later lookups of "UnifiedQualityScore.scalar" raise KeyError.
# NOTE(review): if image_name is not unique in OFIQ_adults_filtered, the merge will
# duplicate rows - confirm uniqueness.
final_filtered_adults_df_final = (
    final_filtered_adults_df
    .drop(columns='UnifiedQualityScore.scalar', errors='ignore')
    .merge(OFIQ_adults_filtered[['image_name', 'UnifiedQualityScore.scalar']], on='image_name', how='left')
)


In [23]:
# Count the adult rows that have no OFIQ score after the merge.
merged_adult_scores = final_filtered_adults_df_final["UnifiedQualityScore.scalar"]
merged_adult_scores.isna().sum()

7323

In [24]:
# Drop the children age-group column from the adults frame.
# errors="ignore" keeps this self-reassigning cell re-runnable: without it, a second
# execution raises KeyError because the column is already gone.
final_filtered_adults_df_final = final_filtered_adults_df_final.drop(columns="children_agegroup", errors="ignore")

In [25]:
# Drop the rows with a missing OFIQ score (the rows with a bad OFIQ score).
# `subset` limits the check to the score column; a bare dropna() would also discard
# rows that merely have a NaN in some unrelated column.
final_filtered_adults_df_final = final_filtered_adults_df_final.dropna(subset=["UnifiedQualityScore.scalar"])

# Final filtered csvs

In [21]:
# Row counts of the children and adults frames after age-std filtering.
# NOTE(review): this reports `final_filtered_adults_df`, not the OFIQ-filtered
# `final_filtered_adults_df_final` - confirm which one is meant.
print(*map(len, (final_filtered_children_df, final_filtered_adults_df)))

7817 26263


In [33]:
# List the columns of the filtered children table.
final_filtered_children_df.columns

Index(['files_list', 'image_name', 'enrolled', 'identity_name', 'ethnicity',
       'Age', 'Identity', 'UnifiedQualityScore.scalar', 'HeadPoseYaw.scalar',
       'HeadPosePitch.scalar', 'HeadPoseRoll.scalar', 'children_agegroup'],
      dtype='object')

In [34]:
# Keep the children images whose OFIQ unified quality score is strictly above the median.
child_quality_scores = final_filtered_children_df['UnifiedQualityScore.scalar']
above_median = child_quality_scores > child_quality_scores.median()
high_OFIQ = final_filtered_children_df[above_median]

In [36]:
# Median OFIQ unified quality score of the filtered children images.
children_scores = final_filtered_children_df['UnifiedQualityScore.scalar']
children_scores.median()

24.0

In [35]:
# Preview the children rows whose OFIQ score is above the median.
high_OFIQ

Unnamed: 0,files_list,image_name,enrolled,identity_name,ethnicity,Age,Identity,UnifiedQualityScore.scalar,HeadPoseYaw.scalar,HeadPosePitch.scalar,HeadPoseRoll.scalar,children_agegroup
0,African_0/African_0_0.png,African_0_0,enrolled,African_0,African,7,African_0,34.0,100.0,93.0,100.0,7-9
1,African_1/African_1_11.png,African_1_11,enrolled,African_1,African,5,African_1,65.0,100.0,99.0,100.0,4-6
3,African_10/African_10_1.png,African_10_1,enrolled,African_10,African,5,African_10,50.0,100.0,97.0,100.0,4-6
4,African_10/African_10_3.png,African_10_3,enrolled,African_10,African,6,African_10,25.0,90.0,100.0,100.0,4-6
5,African_10/African_10_4.png,African_10_4,enrolled,African_10,African,6,African_10,71.0,100.0,100.0,100.0,4-6
...,...,...,...,...,...,...,...,...,...,...,...,...
7809,Indian_m.0h1fnvm/m.0h1fnvm_0003.jpg,m.0h1fnvm_0003,enrolled,m.0h1fnvm,Indian,12,m.0h1fnvm,40.0,70.0,97.0,99.0,10-12
7810,Indian_m.0h1fnvm/m.0h1fnvm_0004.jpg,m.0h1fnvm_0004,enrolled,m.0h1fnvm,Indian,6,m.0h1fnvm,28.0,100.0,99.0,100.0,4-6
7811,Indian_m.0h9448x/m.0h9448x_0001.jpg,m.0h9448x_0001,enrolled,m.0h9448x,Indian,14,m.0h9448x,53.0,99.0,100.0,100.0,13-15
7812,Indian_m.0h9448x/m.0h9448x_0002.jpg,m.0h9448x_0002,enrolled,m.0h9448x,Indian,14,m.0h9448x,86.0,100.0,100.0,100.0,13-15


In [39]:
# Export the above-median children subset.
# NOTE(review): this file name ends in `BIBLE`, while the children/adults exports in
# this notebook use `BIBEL` - confirm which spelling downstream readers expect.
high_ofiq_csv = '../../data/image_info_csvs/final_filtered_canonical_df_BIBLE.csv'
high_OFIQ.to_csv(high_ofiq_csv)

In [38]:
# Number of distinct identities in the above-median subset
# (dropna=False counts a missing name as a value, like len(unique()) does).
high_OFIQ['identity_name'].nunique(dropna=False)

1741

In [37]:

# Number of images per identity in the above-median subset.
counts = high_OFIQ['identity_name'].value_counts()

# Identities with exactly one image vs. identities with several images.
appears_once = counts.eq(1)
appears_repeatedly = counts.gt(1)
names_once = counts.loc[appears_once].count()
names_more_than_once = counts.loc[appears_repeatedly].count()

print(names_once, names_more_than_once)

730 1011


In [23]:
# Export the age-std-filtered children and adults frames.
# NOTE(review): the adults export uses `final_filtered_adults_df`, i.e. WITHOUT the
# OFIQ filter held in `final_filtered_adults_df_final` - confirm this is intended.
children_out_csv = '../../data/image_info_csvs/final_filtered_children_df_BIBEL.csv'
adults_out_csv = '../../data/image_info_csvs/final_filtered_adults_df_BIBEL.csv'

final_filtered_children_df.to_csv(children_out_csv)
final_filtered_adults_df.to_csv(adults_out_csv)