# Mated adults 



In [1]:
import os

def list_files_from_subfolders(root_dir):
    """
    Collect file paths from the folders under ``root_dir``, split by how many
    subdirectories each of those folders contains.

    Expected layout: ``root_dir/<folder>/<subdirectory>/<file>``. Folders without any
    subdirectory are ignored, and so are files that sit directly inside a folder.

    Parameters:
        root_dir (str): Directory whose immediate children are the folders to scan.

    Returns:
        tuple of (list of str, list of str): ``(all_files, single_files)``.
            ``all_files`` holds the files of folders with more than one subdirectory,
            ``single_files`` the files of folders with exactly one subdirectory.
            Each entry is the path ``<folder>/<subdirectory>/<file>`` relative to
            ``root_dir``. Both lists are empty when ``root_dir`` is not a directory.
    """
    all_files = []     # files below folders that have several subdirectories
    single_files = []  # files below folders that have exactly one subdirectory

    if not os.path.isdir(root_dir):
        print(f"The path {root_dir} is not a valid directory.")
        # Keep the same tuple shape as the normal path: callers index the result with
        # [0] / [1], which would raise IndexError on a bare empty list.
        return all_files, single_files

    for item in os.listdir(root_dir):
        item_path = os.path.join(root_dir, item)
        if not os.path.isdir(item_path):
            continue

        subdirs = [d for d in os.listdir(item_path) if os.path.isdir(os.path.join(item_path, d))]
        if not subdirs:
            continue

        # The subdirectory count decides which of the two result lists receives the files.
        target = all_files if len(subdirs) > 1 else single_files
        for subdir in subdirs:
            subdir_path = os.path.join(item_path, subdir)
            target.extend(
                os.path.join(item, subdir, f)
                for f in os.listdir(subdir_path)
                if os.path.isfile(os.path.join(subdir_path, f))
            )

    return all_files, single_files



def list_files_from_folders_with_multiple_files_child(root_dir):
    """
    Split the files found one level below ``root_dir`` by how many files share a folder.

    Parameters:
        root_dir (str): Directory whose immediate subfolders are scanned.

    Returns:
        tuple of (list of str, list of str): ``(folder_files_list, single_files_list)``.
            The first list holds ``"<folder>/<file>"`` strings for folders containing more
            than one file, the second the same strings for folders containing exactly one
            file. Folders without files and non-directory entries are skipped.
    """
    multi_file_entries = []
    lone_file_entries = []

    for folder_name in os.listdir(root_dir):
        folder_path = os.path.join(root_dir, folder_name)
        if not os.path.isdir(folder_path):
            continue

        # Only regular files count; nested directories are not descended into.
        names = [
            entry
            for entry in os.listdir(folder_path)
            if os.path.isfile(os.path.join(folder_path, entry))
        ]
        if not names:
            continue

        bucket = multi_file_entries if len(names) > 1 else lone_file_entries
        bucket.extend(f"{folder_name}/{file}" for file in names)

    return multi_file_entries, lone_file_entries


In [2]:
def extract_unique_identifiers(file_list):
    """
    Reduce each file path to its bare file name (no directories, no extension).

    Parameters:
        file_list (list): File paths as strings.

    Returns:
        list: One identifier per input path, in input order. Repeated names are kept;
            no de-duplication is performed.
    """
    # basename drops the directories, splitext drops only the final extension.
    return [os.path.splitext(os.path.basename(path))[0] for path in file_list]


def drop_after_zeros(s):
    """
    Cut a string at the first run of three zeros, also removing the underscore that
    most closely precedes that run (and everything after it).

    Parameters:
        s (str): The input string.

    Returns:
        str: ``s`` unchanged when it contains no ``'000'``; otherwise the part before the
            last underscore that occurs ahead of the first ``'000'``, or the part before
            the ``'000'`` itself when no underscore precedes it.
    """
    zeros_at = s.find('000')
    if zeros_at == -1:
        return s

    # Search for the underscore only in the text ahead of the zeros.
    cut_at = s.rfind('_', 0, zeros_at)
    if cut_at == -1:
        return s[:zeros_at]
    return s[:cut_at]

import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import os
import json
import numpy as np
def load_data(path):
    """
    Load a JSON mapping of image name -> age into a DataFrame and summarise it.

    Parameters:
        path (str): Path to a JSON file holding a single object whose keys become the
            DataFrame index and whose values become the 'Age' column.

    Returns:
        tuple: ``(df, nans, describe, value_counts)`` where
            ``df`` has the columns 'Age' and 'Identity', with rows lacking an age removed
            and the remaining rows sorted by ascending 'Age';
            ``nans`` is the number of missing ages found before they were dropped;
            ``describe`` is ``DataFrame.describe()`` of the cleaned data;
            ``value_counts`` is ``DataFrame.value_counts()`` of the cleaned data.
    """
    # The context manager closes the file even when json.load raises.
    with open(path, encoding='utf-8') as f:
        data = json.load(f)
    df = pd.DataFrame.from_dict(data, orient='index', columns=['Age'])

    # Count the missing ages first, then drop those rows.
    nans = df['Age'].isna().sum()
    df = df.dropna()

    # Both summaries are taken before 'Identity' is added, so they cover 'Age' only.
    describe = df.describe()
    value_counts = df.value_counts()

    # Identity = index label without its last '_<suffix>' part. `n` is passed by keyword
    # because pandas 2.x no longer accepts it positionally.
    df['Identity'] = df.index.str.rsplit('_', n=1).str[0]

    df = df.sort_values(by='Age')

    return df, nans, describe, value_counts

import pandas as pd

def drop_number_after_last_underscore(df, column_name):
    """
    Strip the final ``_<suffix>`` from every string in one column of a DataFrame.

    The column is overwritten in place on the DataFrame that was passed in. A value
    that contains no underscore becomes the empty string.

    Parameters:
        df (pd.DataFrame): The DataFrame containing the column to process.
        column_name (str): The name of the column to process.

    Returns:
        pd.DataFrame: The same DataFrame object, with the column modified.

    Raises:
        ValueError: If ``column_name`` is not a column of ``df``.
    """
    if column_name not in df.columns:
        raise ValueError(f"The column {column_name} does not exist in the DataFrame.")

    # rpartition yields the text before the last underscore, or '' when there is none.
    df[column_name] = df[column_name].apply(lambda value: value.rpartition('_')[0])
    return df


In [3]:
# Age-estimation outputs. The directory prefix is stated once so that relocating the data
# only requires editing these two constants.
AGE_ESTIMATIONS_DIR = '/mnt/c/Dokumenter/Dokumenter/UNI/Master/Thesis/GitHub_Repo/Master_Thesis/data/age_estimations'
RFW_AGE_DIR = os.path.join(AGE_ESTIMATIONS_DIR, 'data sendt fra Gabi senest')

# load_data returns (df, nans, describe, value_counts); only the DataFrame is used here.
df_YLFW = load_data(os.path.join(AGE_ESTIMATIONS_DIR, 'YLFW_output_data_precroped_all_age.json'))[0]
df_RFW_african = load_data(os.path.join(RFW_AGE_DIR, 'output_african.json'))[0]
df_RFW_asian = load_data(os.path.join(RFW_AGE_DIR, 'output_asian.json'))[0]
df_RFW_caucasian = load_data(os.path.join(RFW_AGE_DIR, 'output_caucasian.json'))[0]
df_RFW_indian = load_data(os.path.join(RFW_AGE_DIR, 'output_indian.json'))[0]

# Stack the four RFW subsets, store their ages as integers and order them by age.
df_RFW = pd.concat([df_RFW_african, df_RFW_asian, df_RFW_caucasian, df_RFW_indian])
df_RFW['Age'] = df_RFW['Age'].astype(int)
df_RFW = df_RFW.sort_values(by='Age', ascending=True)

# Combined age table; the index (image name) is copied into a column so that the
# per-image tables can be merged with it on 'image_name'.
age_df = pd.concat([df_YLFW, df_RFW])
age_df['image_name'] = age_df.index

# Mated adults

In [4]:

# Build the per-image table for the adults listed as 'enrolled' and write it to CSV.
adults_dir = '/mnt/c/Dokumenter/Dokumenter/UNI/Master/Thesis/GitHub_Repo/Master_Thesis/data/raw_full/adults/'
# Element 0 = files below folders that contain more than one subdirectory.
files_list = list_files_from_subfolders(adults_dir)[0]
image_names = extract_unique_identifiers(files_list)
identity_names = [drop_after_zeros(name) for name in image_names]

DF = pd.DataFrame(
    {'files_list': files_list,
     'image_name': image_names,
     'identity_name': identity_names,
     'enrolled' : 'enrolled'})
# The ethnicity is the text before the first underscore of the relative path.
DF['ethnicity'] = DF['files_list'].str.split('_', n=1).str[0]

OFIQ_a = pd.read_csv('/mnt/c/Dokumenter/Dokumenter/UNI/Master/Thesis/GitHub_Repo/Master_Thesis/data/OFIQ_results/adults_all_final.csv', sep=';')
OFIQ_c = pd.read_csv('/mnt/c/Dokumenter/Dokumenter/UNI/Master/Thesis/GitHub_Repo/Master_Thesis/data/OFIQ_results/children_all_final.csv', sep=';')
# Derive image_name exactly as DF['image_name'] is derived (base name without its final
# extension) so the merge keys agree. Cutting at the first '.' reduced names such as
# 'm.0hnbj9t_0001.jpg' to 'm', and cutting at '.jpg' left any other extension in place.
OFIQ_a['image_name'] = extract_unique_identifiers(OFIQ_a['Filename'])
OFIQ_c['image_name'] = extract_unique_identifiers(OFIQ_c['Filename'])
OFIQ = pd.concat([OFIQ_a, OFIQ_c])

# Left merges keep every listed image, even one without an age or a quality score.
final_adult = pd.merge(DF, age_df, on='image_name', how='left')
final_adult = final_adult.merge(OFIQ[['image_name', 'UnifiedQualityScore.scalar']],
                                on='image_name',
                                how='left')
final_adult.to_csv('mated_adults_image_info.csv', index=False)

In [5]:
# Display the mated-adults table that the previous cell wrote to 'mated_adults_image_info.csv'.
final_adult

Unnamed: 0,files_list,image_name,identity_name,enrolled,ethnicity,Age,Identity,UnifiedQualityScore.scalar
0,African_m.012mh_/m.012mh__0001.jpg/m.012mh__00...,m.012mh__0001,m.012mh_,enrolled,African,24,m.012mh_,59.0
1,African_m.012mh_/m.012mh__0005.jpg/m.012mh__00...,m.012mh__0005,m.012mh_,enrolled,African,24,m.012mh_,46.0
2,African_m.01c_3f/m.01c_3f_0002.jpg/m.01c_3f_00...,m.01c_3f_0002,m.01c_3f,enrolled,African,43,m.01c_3f,19.0
3,African_m.01c_3f/m.01c_3f_0003.jpg/m.01c_3f_00...,m.01c_3f_0003,m.01c_3f,enrolled,African,33,m.01c_3f,49.0
4,African_m.01jq08_/m.01jq08__0001.jpg/m.01jq08_...,m.01jq08__0001,m.01jq08_,enrolled,African,41,m.01jq08_,23.0
...,...,...,...,...,...,...,...,...
768,Indian_m.0nhj8xl/m.0nhj8xl_0004.jpg/m.0nhj8xl_...,m.0nhj8xl_0004,m.0nhj8xl,enrolled,Indian,25,m.0nhj8xl,17.0
769,Indian_m.0q2y9_v/m.0q2y9_v_0002.jpg/m.0q2y9_v_...,m.0q2y9_v_0002,m.0q2y9_v,enrolled,Indian,25,m.0q2y9_v,44.0
770,Indian_m.0q2y9_v/m.0q2y9_v_0003.jpg/m.0q2y9_v_...,m.0q2y9_v_0003,m.0q2y9_v,enrolled,Indian,30,m.0q2y9_v,4.0
771,Indian_m.0tj9/m.0tj9_0002.jpg/m.0tj9_0002.jpg,m.0tj9_0002,m.0tj9,enrolled,Indian,38,m.0tj9,24.0


# Nonmated adults


In [6]:
# Build the per-image table for the adults listed as 'non_enrolled' and write it to CSV.
adults_dir = '/mnt/c/Dokumenter/Dokumenter/UNI/Master/Thesis/GitHub_Repo/Master_Thesis/data/raw_full/adults/'
# Element 1 = files below folders that contain exactly one subdirectory.
files_list = list_files_from_subfolders(adults_dir)[1]
image_names = extract_unique_identifiers(files_list)
identity_names = [drop_after_zeros(name) for name in image_names]

DF = pd.DataFrame(
    {'files_list': files_list,
     'image_name': image_names,
     'identity_name': identity_names,
     'enrolled' : 'non_enrolled'})
# The ethnicity is the text before the first underscore of the relative path.
DF['ethnicity'] = DF['files_list'].str.split('_', n=1).str[0]

OFIQ_a = pd.read_csv('/mnt/c/Dokumenter/Dokumenter/UNI/Master/Thesis/GitHub_Repo/Master_Thesis/data/OFIQ_results/adults_all_final.csv', sep=';')
OFIQ_c = pd.read_csv('/mnt/c/Dokumenter/Dokumenter/UNI/Master/Thesis/GitHub_Repo/Master_Thesis/data/OFIQ_results/children_all_final.csv', sep=';')
# Derive image_name exactly as DF['image_name'] is derived (base name without its final
# extension) so the merge keys agree. Cutting at the first '.' reduced names such as
# 'm.0hnbj9t_0001.jpg' to 'm', and cutting at '.jpg' left any other extension in place.
OFIQ_a['image_name'] = extract_unique_identifiers(OFIQ_a['Filename'])
OFIQ_c['image_name'] = extract_unique_identifiers(OFIQ_c['Filename'])
OFIQ = pd.concat([OFIQ_a, OFIQ_c])

# The two left merges are chained so that `final_adult` (the mated table built earlier)
# is not overwritten by a non-mated intermediate result.
final_adult_non = (
    pd.merge(DF, age_df, on='image_name', how='left')
    .merge(OFIQ[['image_name', 'UnifiedQualityScore.scalar']],
           on='image_name',
           how='left')
)
final_adult_non.to_csv('nonmated_adults_image_info.csv', index=False)

In [7]:
# Display the nonmated-adults table that the previous cell wrote to 'nonmated_adults_image_info.csv'.
final_adult_non

Unnamed: 0,files_list,image_name,identity_name,enrolled,ethnicity,Age,Identity,UnifiedQualityScore.scalar
0,African_m.015pz3/m.015pz3_0003.jpg/m.015pz3_00...,m.015pz3_0003,m.015pz3,non_enrolled,African,56,m.015pz3,51.0
1,African_m.015q3m/m.015q3m_0004.jpg/m.015q3m_00...,m.015q3m_0004,m.015q3m,non_enrolled,African,35,m.015q3m,17.0
2,African_m.0183qt/m.0183qt_0006.jpg/m.0183qt_00...,m.0183qt_0006,m.0183qt,non_enrolled,African,39,m.0183qt,32.0
3,African_m.01f97r/m.01f97r_0003.jpg/m.01f97r_00...,m.01f97r_0003,m.01f97r,non_enrolled,African,57,m.01f97r,53.0
4,African_m.01flf5/m.01flf5_0003.jpg/m.01flf5_00...,m.01flf5_0003,m.01flf5,non_enrolled,African,33,m.01flf5,1.0
...,...,...,...,...,...,...,...,...
2724,Indian_m.0qfr7xm/m.0qfr7xm_0002.jpg/m.0qfr7xm_...,m.0qfr7xm_0002,m.0qfr7xm,non_enrolled,Indian,23,m.0qfr7xm,8.0
2725,Indian_m.0r8l061/m.0r8l061_0002.jpg/m.0r8l061_...,m.0r8l061_0002,m.0r8l061,non_enrolled,Indian,21,m.0r8l061,8.0
2726,Indian_m.0r8ntwm/m.0r8ntwm_0001.jpg/m.0r8ntwm_...,m.0r8ntwm_0001,m.0r8ntwm,non_enrolled,Indian,23,m.0r8ntwm,10.0
2727,Indian_m.0rpj1kv/m.0rpj1kv_0003.jpg/m.0rpj1kv_...,m.0rpj1kv_0003,m.0rpj1kv,non_enrolled,Indian,38,m.0rpj1kv,73.0


# Mated children

In [13]:
# Build the per-image table for the children listed as 'enrolled'.
child_dir = '/mnt/c/Dokumenter/Dokumenter/UNI/Master/Thesis/GitHub_Repo/Master_Thesis/data/raw_full/children/'
# Element 0 = files from folders that contain more than one file.
files_list_child = list_files_from_folders_with_multiple_files_child(child_dir)[0]

image_names = extract_unique_identifiers(files_list_child)

DF_child = pd.DataFrame(
    {'files_list': files_list_child,
     'image_name': image_names,
     'enrolled' : 'enrolled'})

# Identity = image name without its last '_<suffix>' part.
DF_child['identity_name'] = DF_child['image_name'].apply(lambda name: name.rpartition('_')[0])
# Ethnicity = text before the first underscore of the '<folder>/<file>' string.
DF_child['ethnicity'] = DF_child['files_list'].str.split('_', n=1).str[0]

final_child = pd.merge(DF_child, age_df, on='image_name', how='left')
# OFIQ is the quality-score table assembled in the adults cells; they must have run first.
final_child_mated = final_child.merge(OFIQ[['image_name', 'UnifiedQualityScore.scalar']],
                                on='image_name',
                                how='left')


In [14]:
# Write the mated-children table to disk; index=False keeps the DataFrame index out of the CSV.
final_child_mated.to_csv('mated_children_image_info.csv', index=False)

# Nonmated children

In [15]:
# Build the per-image table for the children listed as 'non_enrolled' and write it to CSV.
child_dir = '/mnt/c/Dokumenter/Dokumenter/UNI/Master/Thesis/GitHub_Repo/Master_Thesis/data/raw_full/children/'
# Element 1 = files from folders that contain exactly one file.
files_list_child = list_files_from_folders_with_multiple_files_child(child_dir)[1]

image_names = extract_unique_identifiers(files_list_child)

DF_child = pd.DataFrame(
    {'files_list': files_list_child,
     'image_name': image_names,
     'enrolled' : 'non_enrolled'})

# Identity = image name without its last '_<suffix>' part.
DF_child['identity_name'] = DF_child['image_name'].apply(lambda name: name.rpartition('_')[0])
# Ethnicity = text before the first underscore of the '<folder>/<file>' string.
DF_child['ethnicity'] = DF_child['files_list'].str.split('_', n=1).str[0]

# `final_child` (age merge only, no quality score) is kept as a separate name because
# the following cell displays it.
final_child = pd.merge(DF_child, age_df, on='image_name', how='left')
# OFIQ is the quality-score table assembled in the adults cells; they must have run first.
final_child_non = final_child.merge(OFIQ[['image_name', 'UnifiedQualityScore.scalar']],
                                on='image_name',
                                how='left')
final_child_non.to_csv('nonmated_children_image_info.csv', index=False)

In [16]:
# NOTE(review): final_child is the intermediate age merge from the cell above, so this view
# has no 'UnifiedQualityScore.scalar' column; final_child_non is the frame written to the CSV.
final_child

Unnamed: 0,files_list,image_name,enrolled,identity_name,ethnicity,Age,Identity
0,African_0/African_0_1.png,African_0_1,non_enrolled,African_0,African,18,African_0
1,African_1/African_1_11.png,African_1_11,non_enrolled,African_1,African,5,African_1
2,African_108/African_108_9.png,African_108_9,non_enrolled,African_108,African,0,African_108
3,African_110/African_110_0.png,African_110_0,non_enrolled,African_110,African,14,African_110
4,African_116/African_116_3.png,African_116_3,non_enrolled,African_116,African,3,African_116
...,...,...,...,...,...,...,...
1213,Indian_m.0hnbj9t/m.0hnbj9t_0001.jpg,m.0hnbj9t_0001,non_enrolled,m.0hnbj9t,Indian,18,m.0hnbj9t
1214,Indian_m.0hncksb/m.0hncksb_0001.jpg,m.0hncksb_0001,non_enrolled,m.0hncksb,Indian,18,m.0hncksb
1215,Indian_m.0j4c85h/m.0j4c85h_0006.jpg,m.0j4c85h_0006,non_enrolled,m.0j4c85h,Indian,17,m.0j4c85h
1216,Indian_m.0k3208/m.0k3208_0001.jpg,m.0k3208_0001,non_enrolled,m.0k3208,Indian,18,m.0k3208
