## Resulting data frames to use for descriptive stats

This notebook loads the original data frames and removes identities whose age standard deviation is 5 or above (adults) or 10 or above (children).

Additionally, adult images whose OFIQ unified quality score is at or below the 20th percentile are removed.

In [1]:
# Standard library
import random
from collections import Counter

# Third-party
import numpy as np
import pandas as pd
import seaborn as sns

# Notebook-wide setup: apply seaborn's default plot theme and fix the seed of
# Python's built-in `random` module.
sns.set()
random.seed(42)


In [2]:
# Per-image metadata tables: one for the children data, one for the adults data.
# NOTE(review): an earlier comment said names missing from the MagFace results are
# removed here; nothing in this cell filters rows, so that presumably happened when
# these CSVs were produced — confirm.
children_all, adults_all = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/image_info_csvs/YLFW_full_info_excluding_adults.csv',
        '../../data/image_info_csvs/RFW_full_info_excluding_children.csv',
    )
)

### OFIQ

#### Stratify by age

**Adults age std - before**

In [3]:
# ADULTS: distribution of the per-identity age standard deviation BEFORE filtering.

# Row count per identity; the single column is labelled ('identity_name', 'count').
a_mates = adults_all.groupby("identity_name").agg({'identity_name': ['count']})

# "Enrolled" identities are the ones that occur on more than one row.
has_several_rows_a = a_mates[('identity_name', 'count')] > 1
enrolled_identity_names_a = a_mates.loc[has_several_rows_a].index

# Age std per enrolled identity, summarised with a few upper percentiles.
enrolled_adult_rows_before = adults_all.loc[adults_all["identity_name"].isin(enrolled_identity_names_a)]
adults_age_std_before = enrolled_adult_rows_before.groupby("identity_name")["Age"].std()
adults_age_std_before.describe(percentiles=[.25, .5, .75, .9, .95])

count    10961.000000
mean         4.492621
std          3.411003
min          0.000000
25%          2.121320
50%          3.605551
75%          5.737305
90%          8.746428
95%         11.313708
max         26.870058
Name: Age, dtype: float64

**Remove adult identities with an age standard deviation of 5 or above.**

In [4]:
# Age std per enrolled identity (identities with more than one row), largest first.
enrolled_adult_rows = adults_all.loc[adults_all["identity_name"].isin(enrolled_identity_names_a)]
adults_age_std = (
    enrolled_adult_rows
    .groupby("identity_name")["Age"]
    .std()
    .sort_values(ascending=False)
)

# NOTE: despite its name, this index lists the identities to DISCARD,
# i.e. those whose age std is 5 or more.
adult_std_too_large = adults_age_std >= 5
adults_below_5std = adults_age_std.loc[adult_std_too_large].index

# Keep every row whose identity is not on the discard list. Identities that are not
# enrolled never enter the std series above, so their rows are kept as well.
keep_adult_row = ~adults_all["identity_name"].isin(adults_below_5std)
final_filtered_adults_df = adults_all.loc[keep_adult_row]


In [5]:
# Sanity check: summary of the per-identity age std that remains after the filter.
remaining_adult_age_std = final_filtered_adults_df.groupby("identity_name")["Age"].std()
remaining_adult_age_std.describe(percentiles=[.25, .5, .75, .9, .95])

count    7445.000000
mean        2.694945
std         1.276123
min         0.000000
25%         1.732051
50%         2.645751
75%         3.651484
90%         4.500000
95%         4.725816
max         5.000000
Name: Age, dtype: float64

**Children age std - before**

In [6]:

# CHILDREN: distribution of the per-identity age standard deviation BEFORE filtering.

# Row count per identity; the single column is labelled ('identity_name', 'count').
c_mates = children_all.groupby("identity_name").agg({'identity_name': ['count']})

# "Enrolled" identities are the ones that occur on more than one row.
has_several_rows_c = c_mates[('identity_name', 'count')] > 1
enrolled_identity_names_c = c_mates.loc[has_several_rows_c].index

# Age std per enrolled identity, summarised up to the 99th percentile.
enrolled_children_rows_before = children_all.loc[children_all["identity_name"].isin(enrolled_identity_names_c)]
children_age_std_before = enrolled_children_rows_before.groupby("identity_name")["Age"].std()
children_age_std_before.describe(percentiles=[.25, .5, .75, .9, .95, .99])

count    2482.000000
mean        2.163802
std         2.274560
min         0.000000
25%         0.707107
50%         1.414214
75%         2.828427
90%         4.932883
95%         6.501401
99%        10.606602
max        21.213203
Name: Age, dtype: float64

**Remove children identities with an age standard deviation of 10 or above**

In [7]:
# Age std per enrolled identity (identities with more than one row), largest first.
enrolled_children_rows = children_all.loc[children_all["identity_name"].isin(enrolled_identity_names_c)]
children_age_std = (
    enrolled_children_rows
    .groupby("identity_name")["Age"]
    .std()
    .sort_values(ascending=False)
)

# NOTE: despite its name, this index lists the identities to DISCARD,
# i.e. those whose age std is 10 or more.
child_std_too_large = children_age_std >= 10
children_below_10std = children_age_std.loc[child_std_too_large].index

# Keep every row whose identity is not on the discard list.
keep_child_row = ~children_all["identity_name"].isin(children_below_10std)
final_filtered_children_df = children_all.loc[keep_child_row]


In [8]:
# Sanity check: summary of the per-identity age std that remains after the filter.
remaining_children_age_std = final_filtered_children_df.groupby("identity_name")["Age"].std()
remaining_children_age_std.describe(percentiles=[.25, .5, .75, .9, .95])

count    2451.000000
mean        2.018940
std         1.852467
min         0.000000
25%         0.707107
50%         1.414214
75%         2.652036
90%         4.582576
95%         5.852350
max         9.899495
Name: Age, dtype: float64

# Removing the worst OFIQ samples 

In [9]:
# `os` is imported here and used again further down (os.path.splitext).
import os
# Display the current working directory; the relative '../../data/...' paths used in
# this notebook are resolved against it.
os.getcwd()

'/mnt/c/Dokumenter/Dokumenter/UNI/Master/Thesis/GitHub_Repo/Master_Thesis/notebooks/data_processing'

In [10]:
# OFIQ result tables for children and adults; both files are read with ';' as the
# field separator.
OFIQ_children = pd.read_csv(
    '../../data/OFIQ_results/OFIQ_children_all.csv',
    sep=';',
)
OFIQ_adults = pd.read_csv(
    '../../data/OFIQ_results/table_adults_all_in_one.csv',
    sep=';',
)

In [11]:
# Build `image_name` from `Filename`: take the part after the last '/', then strip
# the file extension. The result is stored as a new column on OFIQ_adults.
ofiq_adult_basenames = OFIQ_adults['Filename'].str.split('/').str[-1]
OFIQ_adults['image_name'] = ofiq_adult_basenames.map(
    lambda basename: os.path.splitext(basename)[0]
)

In [12]:
# Median (50th percentile) of the adults' OFIQ unified quality score, for reference.
adult_quality_scores_all = OFIQ_adults['UnifiedQualityScore.scalar']
np.percentile(adult_quality_scores_all, 50)

41.0

In [13]:
import numpy as np

# Cut-off: the 20th percentile of the adults' unified quality score.
adult_quality_scores = OFIQ_adults['UnifiedQualityScore.scalar']
threshold = np.percentile(adult_quality_scores, 20)

# Strictly-greater comparison: rows scoring exactly at the threshold are dropped
# together with the ones below it.
OFIQ_adults_filtered = OFIQ_adults.loc[adult_quality_scores > threshold]


In [14]:
# Attach the OFIQ unified quality score to each adult row, matching on `image_name`.
# The left join keeps every row of final_filtered_adults_df; rows with no match in
# OFIQ_adults_filtered get NaN in 'UnifiedQualityScore.scalar'.
ofiq_adult_scores = OFIQ_adults_filtered[['image_name', 'UnifiedQualityScore.scalar']]
final_filtered_adults_df_final = pd.merge(
    final_filtered_adults_df,
    ofiq_adult_scores,
    how='left',
    on='image_name',
)


In [15]:
# Number of adult rows that ended up without an OFIQ score after the left join.
merged_adult_scores = final_filtered_adults_df_final["UnifiedQualityScore.scalar"]
merged_adult_scores.isnull().sum()

5094

In [16]:
# Remove the `children_agegroup` column from the adults table.
final_filtered_adults_df_final = final_filtered_adults_df_final.drop(
    "children_agegroup",
    axis="columns",
)

In [17]:
# Drop only the rows that have no OFIQ score, i.e. rows that found no match in the
# quality-filtered OFIQ table during the left join. Restricting `dropna` to the score
# column avoids also discarding rows whose only missing values sit in unrelated
# metadata columns, which a bare `dropna()` would do.
final_filtered_adults_df_final = final_filtered_adults_df_final.dropna(
    subset=["UnifiedQualityScore.scalar"]
)

# Final filtered csvs

In [18]:
# Persist the final tables.
# Children: filtered on per-identity age std only.
final_filtered_children_df.to_csv('../../data/image_info_csvs/final_filtered_children_df_BIBEL.csv')
# Adults: write the frame that also has the OFIQ quality filter applied
# (`final_filtered_adults_df_final`). Writing `final_filtered_adults_df` here would
# silently discard the OFIQ filtering performed in the preceding section.
final_filtered_adults_df_final.to_csv('../../data/image_info_csvs/final_filtered_adults_df_BIBEL.csv')