# Mated adults 



In [1]:
import os

def list_files_from_subfolders(root_dir):
    """
    List the files found two levels below root_dir, restricted to first-level
    folders that hold more than one subdirectory.

    Every directory directly under root_dir is inspected. When it contains more
    than one subdirectory, each regular file located directly inside those
    subdirectories is reported. Directory listings are sorted because
    os.listdir returns entries in arbitrary order; sorting keeps the result
    (and anything exported from it) reproducible between runs.

    Parameters:
        root_dir (str): The directory whose immediate subdirectories are scanned.

    Returns:
        list of str: Paths relative to root_dir, built with os.path.join as
        '<folder>/<subfolder>/<file>'. The list is empty when root_dir is not a
        directory (a message is printed in that case).
    """
    all_files = []  # Relative paths collected so far

    # Bail out early (with a message) instead of letting os.listdir raise
    if not os.path.isdir(root_dir):
        print(f"The path {root_dir} is not a valid directory.")
        return all_files

    for item in sorted(os.listdir(root_dir)):
        item_path = os.path.join(root_dir, item)
        # Loose files directly under root_dir are ignored
        if not os.path.isdir(item_path):
            continue

        subdirs = [
            d for d in sorted(os.listdir(item_path))
            if os.path.isdir(os.path.join(item_path, d))
        ]
        # Only folders with more than one subdirectory qualify
        if len(subdirs) <= 1:
            continue

        for subdir in subdirs:
            subdir_path = os.path.join(item_path, subdir)
            # Keep regular files only; deeper directories are not descended into
            all_files.extend(
                os.path.join(item, subdir, f)
                for f in sorted(os.listdir(subdir_path))
                if os.path.isfile(os.path.join(subdir_path, f))
            )

    return all_files



def list_files_from_folders_with_multiple_files_child(root_dir):
    """
    List the files of every folder directly under root_dir that contains more
    than one file.

    Directory listings are sorted because os.listdir returns entries in
    arbitrary order; sorting keeps the result reproducible between runs.

    Parameters:
        root_dir (str): The directory whose immediate subfolders are scanned.

    Returns:
        list of str: One '<folder>/<file>' string per file of each qualifying
        folder. The list is empty when root_dir is not a directory (a message
        is printed in that case, matching list_files_from_subfolders).
    """
    folder_files_list = []  # '<folder>/<file>' strings collected so far

    # Same guard as list_files_from_subfolders: report and return instead of raising
    if not os.path.isdir(root_dir):
        print(f"The path {root_dir} is not a valid directory.")
        return folder_files_list

    for folder_name in sorted(os.listdir(root_dir)):
        folder_path = os.path.join(root_dir, folder_name)
        # Loose files directly under root_dir are ignored
        if not os.path.isdir(folder_path):
            continue

        files = [
            f for f in sorted(os.listdir(folder_path))
            if os.path.isfile(os.path.join(folder_path, f))
        ]
        # Folders holding a single file (or none) are skipped
        if len(files) > 1:
            folder_files_list.extend(f"{folder_name}/{file}" for file in files)

    return folder_files_list


In [2]:
def extract_unique_identifiers(file_list):
    """
    Reduce each file path to its bare file name (no directories, no extension).

    The output has one entry per input path, in the same order; repeated names
    are kept, so the result stays aligned with file_list.

    Parameters:
        file_list (list): A list of strings containing file paths.

    Returns:
        list: The extension-less base name of every path in file_list.
    """
    def _stem(path):
        # Drop the directory part first, then the final extension
        filename = os.path.basename(path)
        stem, _extension = os.path.splitext(filename)
        return stem

    return [_stem(path) for path in file_list]


In [3]:
def drop_after_zeros(s):
    """
    Cut the input string at the first run of three consecutive zeros.

    Parameters:
        s (str): The input string.

    Returns:
        str: Everything before the first '000'; the zeros and whatever follows
        them are discarded. The string is returned unchanged when it contains
        no '000'.
    """
    # partition yields (text_before, separator, text_after); when '000' is
    # absent the first element is the whole string, which covers both cases.
    prefix, _zeros, _rest = s.partition('000')
    return prefix


In [4]:
import pandas as pd

In [5]:


# Location of the adult images on disk
adults_dir = '/mnt/c/Dokumenter/Dokumenter/UNI/Master/Thesis/GitHub_Repo/Master_Thesis/data/raw_full/adults/'

# Relative paths of the qualifying images, their extension-less file names,
# and the identity label obtained by cutting each name at its first '000'
files_list = list_files_from_subfolders(adults_dir)
image_names = extract_unique_identifiers(files_list)
identity_names = [drop_after_zeros(name) for name in image_names]

# One row per image; the scalar 'enrolled' is broadcast to every row
DF = pd.DataFrame({
    'files_list': files_list,
    'image_name': image_names,
    'identity_name': identity_names,
    'enrolled': 'enrolled',
})

# Ethnicity is the text before the first underscore of the relative path
# (e.g. 'African_m.012mh_/...' -> 'African'); blank or non-string entries give None
DF['ethnicity'] = DF['files_list'].apply(
    lambda path: path.split('_')[0] if (isinstance(path, str) and path.split()) else None
)

# Persist the table without the index column
DF.to_csv('mated_adults_image_info.csv', index=False)

In [18]:
# Preview the first rows of the adults table (head() shows five rows by default)
DF.head()

Unnamed: 0,files_list,image_name,identity_name,enrolled,ethnicity
0,African_m.012mh_/m.012mh__0001.jpg/m.012mh__00...,m.012mh__0001,m.012mh__,enrolled,African
1,African_m.012mh_/m.012mh__0005.jpg/m.012mh__00...,m.012mh__0005,m.012mh__,enrolled,African
2,African_m.01c_3f/m.01c_3f_0002.jpg/m.01c_3f_00...,m.01c_3f_0002,m.01c_3f_,enrolled,African
3,African_m.01c_3f/m.01c_3f_0003.jpg/m.01c_3f_00...,m.01c_3f_0003,m.01c_3f_,enrolled,African
4,African_m.01jq08_/m.01jq08__0001.jpg/m.01jq08_...,m.01jq08__0001,m.01jq08__,enrolled,African


In [26]:
# Location of the children images on disk
child_dir = '/mnt/c/Dokumenter/Dokumenter/UNI/Master/Thesis/GitHub_Repo/Master_Thesis/data/raw_full/children/'
# The children folder is scanned with the flat-folder helper (files directly
# inside each folder, '<folder>/<file>' strings) rather than list_files_from_subfolders.
files_list_child = list_files_from_folders_with_multiple_files_child(child_dir)


In [None]:
import pandas as pd

def drop_number_after_last_underscore(df, column_name):
    """
    Strip the final underscore-separated segment from every string in a column.

    The column is overwritten in place on the DataFrame that was passed in, and
    that same DataFrame object is returned. A value without any underscore
    becomes the empty string.

    Parameters:
        df (pd.DataFrame): The DataFrame containing the column to process.
        column_name (str): The name of the column to process.

    Returns:
        pd.DataFrame: The input DataFrame, with the column modified.

    Raises:
        ValueError: If column_name is not a column of df.
    """
    # Reject unknown columns before touching the frame
    if column_name not in df.columns:
        raise ValueError(f"The column {column_name} does not exist in the DataFrame.")

    def _without_last_segment(value):
        # Everything left of the final '_'; '' when there is no underscore
        return value.rpartition('_')[0]

    df[column_name] = df[column_name].apply(_without_last_segment)
    return df

# Example DataFrame
# Small demonstration frame with '<ethnicity>_<id>_<image number>' style names
df = pd.DataFrame(
    {'identity_name': ['African_109_0', 'African_109_3', 'African_113_0', 'African_113_12', 'African_113_23']}
)

# Strip the trailing image number from each name; the helper edits the column
# in place, so df_modified refers to the same frame as df
df_modified = drop_number_after_last_underscore(df, 'identity_name')
print(df_modified)


In [27]:
# Extension-less file names of the children images
image_names = extract_unique_identifiers(files_list_child)

# Identity label per image: the name cut at its first '000'
identity_names = [drop_after_zeros(name) for name in image_names]


In [40]:
# Build the children table from the children's own file list. The identity is
# the image name without its last underscore-separated segment (the image
# number); it is derived from image_names directly because DF_child does not
# exist yet while its constructor arguments are being evaluated.
DF_child = pd.DataFrame(
    {'files_list': files_list_child,
     'image_name': image_names,
     'identity_name': [
         '_'.join(name.split('_')[:-1]) if isinstance(name, str) and name.split() else None
         for name in image_names
     ],
     'enrolled': 'enrolled'})

# Ethnicity is the text before the first underscore of the '<folder>/<file>' string;
# blank or non-string entries give None
DF_child['ethnicity'] = DF_child['files_list'].apply(lambda x: x.split('_')[0] if isinstance(x, str) and x.split() else None)

# Persist the table without the index column
DF_child.to_csv('mated_children_image_info.csv', index=False)