## Resulting data frames to use for descriptive stats

This notebook loads the original data frames and removes every identity whose age standard deviation across its images is 10 or higher, for both adults and children.

Additionally, adult images with an OFIQ score below the 20th percentile are removed.

In [1]:
# Imports and global configuration for the notebook.
import pandas as pd
import seaborn as sns
sns.set()  # switch plots to seaborn's default theme
import random
random.seed(42)  # seeds Python's built-in `random` module only; NumPy is not seeded here
import numpy as np
from collections import Counter


In [2]:
# Per-image info tables, read as-is from disk (no rows are dropped in this cell).
# Judging by the file names, one holds the children images and the other the
# adult images -- TODO confirm against the data documentation.
children_all = pd.read_csv(
    '../../data/image_info_csvs/YLFW_full_info_including_children_from_adults.csv'
)
adults_all = pd.read_csv(
    '../../data/image_info_csvs/RFW_full_info_excluding_children.csv'
)

### OFIQ

#### Stratify by age

**Adults age std - before**

In [3]:
# ADULTS STD DEVIATIONS - BEFORE

# Row count per identity, kept as a one-column frame labelled ('identity_name', 'count').
a_mates = adults_all.groupby("identity_name").agg({'identity_name': ['count']})
# Identities that occur on more than one row, i.e. those for which a std can be computed.
enrolled_identity_names_a = a_mates.loc[a_mates[('identity_name', 'count')].gt(1)].index

# Summary of the per-identity standard deviation of Age for those identities.
(
    adults_all.loc[adults_all["identity_name"].isin(enrolled_identity_names_a)]
    .groupby("identity_name")["Age"]
    .std()
    .describe(percentiles=[.25, .5, .75, .9, .95])
)

count    11238.000000
mean         4.527498
std          3.340651
min          0.000000
25%          2.217356
50%          3.701351
75%          5.737305
90%          8.717798
95%         11.212343
max         34.648232
Name: Age, dtype: float64

**Remove adult identities with an age standard deviation of 10 or above.**

In [4]:
# Per-identity std of Age (identities with more than one row only), largest first.
adults_age_std = (
    adults_all.loc[adults_all["identity_name"].isin(enrolled_identity_names_a)]
    .groupby("identity_name")["Age"]
    .std()
    .sort_values(ascending=False)
)
# NOTE(review): despite its name, `adults_below_5std` holds the identities whose
# age std is 10 or more, i.e. the ones to discard. The threshold applied here is
# 10, not 5. The name is kept because the following cell refers to it.
adults_below_5std = adults_age_std.loc[adults_age_std.ge(10)].index
# Keep every row whose identity is not on the discard list. Identities with a
# single row never get a std above, so they are always kept.
final_filtered_adults_df = adults_all.loc[
    ~adults_all["identity_name"].isin(adults_below_5std)
]


In [5]:
# Sizes, in order: all adult rows, identities with a computed age std,
# identities discarded by the std filter, rows left after filtering.
print(*map(len, (adults_all, adults_age_std, adults_below_5std, final_filtered_adults_df)))

39360 11238 772 36728


In [6]:
# Sanity check: per-identity age std distribution after the filter.
(
    final_filtered_adults_df
    .groupby("identity_name")["Age"]
    .std()
    .describe(percentiles=[.25, .5, .75, .9, .95])
)

count    10466.000000
mean         3.869532
std          2.213228
min          0.000000
25%          2.121320
50%          3.535534
75%          5.202563
90%          7.090486
95%          8.185353
max          9.989995
Name: Age, dtype: float64

**Children age std - before**

In [7]:

# CHILDREN STD DEVIATIONS - BEFORE

# Row count per identity, kept as a one-column frame labelled ('identity_name', 'count').
c_mates = children_all.groupby("identity_name").agg({'identity_name': ['count']})
# Identities that occur on more than one row, i.e. those for which a std can be computed.
enrolled_identity_names_c = c_mates.loc[c_mates[('identity_name', 'count')].gt(1)].index

# Summary of the per-identity standard deviation of Age for those identities.
(
    children_all.loc[children_all["identity_name"].isin(enrolled_identity_names_c)]
    .groupby("identity_name")["Age"]
    .std()
    .describe(percentiles=[.25, .5, .75, .9, .95, .99])
)

count    2098.000000
mean        1.401640
std         1.181351
min         0.000000
25%         0.577350
50%         1.154701
75%         2.061553
90%         2.886751
95%         3.703566
99%         5.132960
max         9.899495
Name: Age, dtype: float64

**Remove children identities with an age standard deviation of 10 or above**

In [8]:
# Per-identity std of Age (identities with more than one row only), largest first.
children_age_std = (
    children_all.loc[children_all["identity_name"].isin(enrolled_identity_names_c)]
    .groupby("identity_name")["Age"]
    .std()
    .sort_values(ascending=False)
)
# NOTE(review): despite its name, `children_below_10std` holds the identities
# whose age std is 10 or more, i.e. the ones to discard. The name is left
# unchanged so any other reference to it keeps working.
children_below_10std = children_age_std.loc[children_age_std.ge(10)].index
# Keep every row whose identity is not on the discard list.
final_filtered_children_df = children_all.loc[
    ~children_all["identity_name"].isin(children_below_10std)
]


In [9]:
# Sanity check: per-identity age std distribution after the filter.
(
    final_filtered_children_df
    .groupby("identity_name")["Age"]
    .std()
    .describe(percentiles=[.25, .5, .75, .9, .95])
)

count    2098.000000
mean        1.401640
std         1.181351
min         0.000000
25%         0.577350
50%         1.154701
75%         2.061553
90%         2.886751
95%         3.703566
max         9.899495
Name: Age, dtype: float64

# Removing the worst OFIQ samples 

In [11]:
# OFIQ results for the children images (semicolon-separated file).
# NOTE(review): `OFIQ_children` is not referenced by the other cells shown in
# this notebook -- confirm whether this load is still needed.
OFIQ_children = pd.read_csv(
    '../../data/OFIQ_results/OFIQ_children_all.csv',
    sep=';',
)
# OFIQ_adults = pd.read_csv('../../data/OFIQ_results/table_adults_all_in_one.csv', sep=';')

# Final filtered csvs

In [15]:
# Mean of the unified quality score column over the age-filtered children rows.
final_filtered_children_df.loc[:, 'UnifiedQualityScore.scalar'].mean()

28.390942816937443

In [16]:
# Rows of the age-filtered children frame whose unified quality score is
# strictly greater than that frame's mean score.
high_OFIQ = final_filtered_children_df.loc[
    final_filtered_children_df['UnifiedQualityScore.scalar'].gt(
        final_filtered_children_df['UnifiedQualityScore.scalar'].mean()
    )
]

In [20]:
# Write the above-mean-quality children subset to disk. The row index is written
# too, since `index` is left at its default.
# NOTE(review): the file name ends in `_BIBLE` while the adults export in this
# notebook uses `_BIBEL` -- confirm which spelling downstream code expects.
high_OFIQ.to_csv('../../data/image_info_csvs/final_filtered_canonical_df_BIBLE.csv')

In [21]:

# Number of rows each identity contributes to the high-quality subset.
counts = high_OFIQ['identity_name'].value_counts()

# Identities represented by exactly one row
names_once = (counts == 1).sum()

# Identities represented by two or more rows
names_more_than_once = (counts > 1).sum()

print(names_once, names_more_than_once)

700 834


In [19]:

# Adult table read from the "OFIQ worst dropped" file. Only its `identity_name`
# and `image_name` columns are used in the cells that follow.
OFIQ_filtered_adults = pd.read_csv(
    '../../data/image_info_csvs/Image_pairs_similarityscores_OFIQ_worst_dropped.csv'
)


In [20]:
# Distinct identities and distinct images in the OFIQ-filtered adult table
# (missing values, if any, count as one distinct value).
print(OFIQ_filtered_adults['identity_name'].nunique(dropna=False))
print(OFIQ_filtered_adults['image_name'].nunique(dropna=False))

5433
11119


In [21]:

# Intersect the two adult filters: keep the age-std-filtered rows whose image
# also appears in the OFIQ-filtered table.
nyeste_måske = final_filtered_adults_df.loc[
    final_filtered_adults_df['image_name'].isin(OFIQ_filtered_adults['image_name'])
]

# Distinct identities and distinct images that remain.
print(nyeste_måske['identity_name'].nunique(dropna=False))
print(nyeste_måske['image_name'].nunique(dropna=False))

5139
10547


In [22]:
# Show the final adults frame (the rendered output is truncated by pandas).
nyeste_måske

Unnamed: 0,files_list,image_name,identity_name,enrolled,ethnicity,Age,Identity,UnifiedQualityScore.scalar,HeadPoseYaw.scalar,HeadPosePitch.scalar,HeadPoseRoll.scalar,children_agegroup
25,African_m.012mh_/m.012mh__0004.jpg,m.012mh__0004,m.012mh_,enrolled,African,24,m.012mh_,51,97,85,100,
28,African_m.012mk7/m.012mk7_0001.jpg,m.012mk7_0001,m.012mk7,enrolled,African,33,m.012mk7,23,100,99,100,
31,African_m.012mmb/m.012mmb_0001.jpg,m.012mmb_0001,m.012mmb,enrolled,African,31,m.012mmb,44,93,95,100,
32,African_m.012mmb/m.012mmb_0002.jpg,m.012mmb_0002,m.012mmb,enrolled,African,30,m.012mmb,21,100,98,100,
34,African_m.012qsp/m.012qsp_0001.jpg,m.012qsp_0001,m.012qsp,enrolled,African,25,m.012qsp,48,99,97,100,
...,...,...,...,...,...,...,...,...,...,...,...,...
39340,Indian_m.0sgg_cr/m.0sgg_cr_0004.jpg,m.0sgg_cr_0004,m.0sgg_cr,enrolled,Indian,29,m.0sgg_cr,35,100,97,100,
39345,Indian_m.0tj9/m.0tj9_0001.jpg,m.0tj9_0001,m.0tj9,enrolled,Indian,49,m.0tj9,75,100,96,100,
39353,Indian_m.0vshsz9/m.0vshsz9_0001.jpg,m.0vshsz9_0001,m.0vshsz9,enrolled,Indian,34,m.0vshsz9,87,96,100,100,
39354,Indian_m.0vshsz9/m.0vshsz9_0002.jpg,m.0vshsz9_0002,m.0vshsz9,enrolled,Indian,36,m.0vshsz9,81,96,100,100,


In [23]:
# Write the final adults frame (age-std filter intersected with the
# OFIQ-filtered image list) to disk. The children export below is disabled.
# final_filtered_children_df.to_csv('../../data/image_info_csvs/final_filtered_children_df_BIBEL.csv')
nyeste_måske.to_csv('../../data/image_info_csvs/final_filtered_adults_df_BIBEL.csv')

In [11]:
# final_filtered_adults_df.to_csv('../../data/image_info_csvs/final_filtered_adults_df_BIBEL.csv')