# Data Context

We are using the AR face database, which is public and free to access. To enable detailed testing and
model building, the AR face images have been manually labelled with 22 facial features on each
face. The 22 points chosen are consistent across all images. This labelled database contains face
images of 136 persons (76 men & 60 women). Images feature frontal view faces with different facial
expressions and illumination conditions.

# Data Format

- Male images are stored as: m-xx-yy.pts
- Females as: w-xx-yy.pts
- 'xx' is a unique person identifier (from "00" to "76" for men and from "00" to "60" for
women). 'yy' specifies the expression or lighting condition. Its values have the following
meanings:

```text
1: Neutral expression
2: Smile
3: Anger
5: Left light on
```

# Extract Workload

The core focus of the extract workload is to group the data by gender and by facial
expression/emotion. To do so, we compile each gender into its own folder, break that folder
down by facial expression/emotion, and tag every entry with its unique person identifier so
that the individual is represented accurately. The result of this extraction workflow is written
to a folder called `ex_res`, where the compiled data is stored as CSV file(s).

## Verify FaceMarkupARDatabase


In [1]:
import hashlib
import os

def hash_file(file_path):
    """Return the SHA-256 hex digest of the file at *file_path*.

    The file is opened in binary mode and consumed in 64 KiB pieces, so the
    whole file never has to be held in memory at once.
    """
    chunk_size = 65536  # 64 KiB per read
    digest = hashlib.sha256()
    with open(file_path, 'rb') as stream:
        # Two-argument iter() keeps calling read() until it returns b'' (EOF).
        for chunk in iter(lambda: stream.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()

def hash_folder(folder_path):
    """Return one SHA-256 hex digest summarising every file under *folder_path*.

    Each file found by ``os.walk`` is hashed with ``hash_file`` and the
    resulting hex strings are fed, one after another, into a second SHA-256.
    Only file contents contribute; file and directory names do not.

    NOTE(review): entries are consumed in whatever order ``os.walk`` yields
    them, so the digest may depend on the directory listing order of the
    platform -- confirm before comparing digests across machines.
    """
    combined = hashlib.sha256()
    # Lazily flatten the walk into a stream of full file paths.
    file_paths = (
        os.path.join(current_dir, name)
        for current_dir, _subdirs, names in os.walk(folder_path)
        for name in names
    )
    for path in file_paths:
        combined.update(hash_file(path).encode('utf-8'))
    return combined.hexdigest()

# Location of the downloaded dataset, relative to the working directory.
folder_path = "../../FaceMarkupARDatabase"
# Folder digest the dataset is expected to produce.
expected_hash = "a3b9a9a41f515586fb41411eb6b184dd3d291d7a04f2d3d0a8527181d1ded25a"
folder_hash = hash_folder(folder_path)

# Abort before any extraction work if the dataset on disk is not the expected one.
if folder_hash == expected_hash:
    print("FaceMarkupARDatabase has been verified")
else:
    raise ValueError(f"The hash of the folder does not match the expected value. Expected: {expected_hash}, Actual: {folder_hash}")

FaceMarkupARDatabase has been verified


## Create Extract Results Folder At Project Root

In [2]:
import os

def create_folder_structure(root_dir=None):
    """Create, or verify, the ``ex_res`` output tree.

    The tree is ``ex_res/<gender>/<condition>`` for the genders ``female`` and
    ``male`` and the conditions ``anger``, ``left_light``, ``neutral`` and
    ``smile``. If ``ex_res`` does not exist yet the whole tree is created;
    otherwise every expected folder is checked and nothing is modified.

    Args:
        root_dir: Directory that should contain ``ex_res``. Defaults to the
            current working directory.

    Raises:
        Exception: If ``ex_res`` already exists but one of the expected
            folders is missing or is not a directory.
    """
    if root_dir is None:
        root_dir = os.getcwd()

    ex_res_folder = os.path.join(root_dir, "ex_res")

    # Both genders share the same set of condition subfolders.
    genders = ("female", "male")
    conditions = ("anger", "left_light", "neutral", "smile")
    leaf_folders = [
        os.path.join(ex_res_folder, gender, condition)
        for gender in genders
        for condition in conditions
    ]

    if not os.path.exists(ex_res_folder):
        # makedirs also creates ex_res and the gender folder on the way down.
        for leaf in leaf_folders:
            os.makedirs(leaf)
        print("Folder structure created successfully.")
        return

    for leaf in leaf_folders:
        # isdir (rather than exists) rejects a regular file sitting where a
        # folder is required; a missing gender folder fails this check too.
        if not os.path.isdir(leaf):
            raise Exception("Folder structure is not complete. Please delete the 'ex_res' folder and run the script again to recreate it.")
    print("Folder structure is complete.")

create_folder_structure()

Folder structure is complete.


## Compile each facial expression by gender and emotion

We want to compile each facial expression into the folder structure below. Each folder will contain CSV file(s) that gather all of the entries for one gender and one facial expression in a single location. Each of these micro-repositories of our dataset will later be transformed, and potentially combined with others, depending on the model type or visualization being generated.

```text
└───ex_res
    ├───female
    │   ├───anger
    │   ├───left_light
    │   ├───neutral
    │   └───smile
    └───male
        ├───anger
        ├───left_light
        ├───neutral
        └───smile
```

In [3]:
from dataclasses import dataclass
from typing import List

@dataclass
class Point:
    """A single 2-D coordinate read from a ``.pts`` file."""
    x: float
    y: float


def read_points_from_file(file_name):
    """Parse the coordinates stored in a ``.pts`` file.

    Blank lines and lines starting with ``version``, ``n_points``, ``{`` or
    ``}`` are treated as file structure and skipped. Leading and trailing
    whitespace is ignored when making that decision. Every remaining line must
    hold exactly two whitespace-separated numbers.

    Args:
        file_name: Path of the ``.pts`` file to read.

    Returns:
        The points in the order they appear in the file.

    Raises:
        ValueError: If a data line does not contain exactly two numbers.
    """
    points: List[Point] = []
    with open(file_name, 'r') as file:
        for line in file:
            stripped = line.strip()

            # str.strip() never returns None; a blank line becomes '' instead,
            # so test for emptiness to keep blank lines out of the float parsing.
            if not stripped:
                continue

            # Ignore the header and brace lines that surround the coordinates.
            if stripped.startswith(('version', 'n_points', '{', '}')):
                continue

            x, y = map(float, stripped.split())
            points.append(Point(x, y))

    return points

In [4]:
from enum import Enum
import pandas as pd

class Emotion(Enum):
    """Expression / lighting condition of a face image.

    Each value is the numeric 'yy' code listed in the Data Format section of
    this notebook. There is no member with value 4.
    """
    NEUTRAL = 1  # neutral expression
    SMILE = 2  # smile
    ANGER = 3  # anger
    LEFT_LIGHT = 5  # left light on

def convert_data_to_df(gender: str, person_id: str, expression: str, face_points: List[Point]):
    """Build and print the column names for one face record.

    The columns are ``gender``, ``person_id`` and ``expression``, followed by
    one ``p_<n>_x`` column per point and then one ``p_<n>_y`` column per point,
    with ``n`` counted from 1.

    NOTE(review): despite its name this function currently only prints the
    column names; it builds no DataFrame and returns None.
    """
    count = len(face_points)
    column_names = (
        ['gender', 'person_id', 'expression']
        + [f'p_{i+1}_x' for i in range(count)]
        + [f'p_{i+1}_y' for i in range(count)]
    )

    print(column_names)


In [5]:
# Folder holding the 22-point .pts files, relative to the working directory.
database_path = "../../FaceMarkupARDatabase/points_22"


def traverse_facial_expressions():
    """Walk ``database_path`` and pass every ``.pts`` file to ``convert_data_to_df``.

    File names are split on ``-`` into gender, person identifier and
    expression. The expression part still carries the file extension, so only
    its first two characters are kept. ``dummy.pts`` is ignored.

    Raises:
        ValueError: If a ``.pts`` file name (other than ``dummy.pts``) does
            not split into exactly three ``-`` separated parts.
    """
    for dirpath, _, files in os.walk(database_path):
        for file in files:
            # Skip only the dummy file itself; other .pts files that share its
            # folder must still be processed.
            if file == 'dummy.pts' or not file.endswith(".pts"):
                continue

            # Extract Gender & Person Unique ID & Expression
            gender, person_id, expression = file.split('-')
            # Drop the file extension that trails the two-digit expression code.
            expression = expression[:2]
            facial_expression_points: List[Point] = read_points_from_file(os.path.join(dirpath, file))
            convert_data_to_df(gender, person_id, expression, facial_expression_points)
                

traverse_facial_expressions()

['gender', 'person_id', 'expression', 'p_1_x', 'p_2_x', 'p_3_x', 'p_4_x', 'p_5_x', 'p_6_x', 'p_7_x', 'p_8_x', 'p_9_x', 'p_10_x', 'p_11_x', 'p_12_x', 'p_13_x', 'p_14_x', 'p_15_x', 'p_16_x', 'p_17_x', 'p_18_x', 'p_19_x', 'p_20_x', 'p_21_x', 'p_22_x', 'p_1_y', 'p_2_y', 'p_3_y', 'p_4_y', 'p_5_y', 'p_6_y', 'p_7_y', 'p_8_y', 'p_9_y', 'p_10_y', 'p_11_y', 'p_12_y', 'p_13_y', 'p_14_y', 'p_15_y', 'p_16_y', 'p_17_y', 'p_18_y', 'p_19_y', 'p_20_y', 'p_21_y', 'p_22_y']
['gender', 'person_id', 'expression', 'p_1_x', 'p_2_x', 'p_3_x', 'p_4_x', 'p_5_x', 'p_6_x', 'p_7_x', 'p_8_x', 'p_9_x', 'p_10_x', 'p_11_x', 'p_12_x', 'p_13_x', 'p_14_x', 'p_15_x', 'p_16_x', 'p_17_x', 'p_18_x', 'p_19_x', 'p_20_x', 'p_21_x', 'p_22_x', 'p_1_y', 'p_2_y', 'p_3_y', 'p_4_y', 'p_5_y', 'p_6_y', 'p_7_y', 'p_8_y', 'p_9_y', 'p_10_y', 'p_11_y', 'p_12_y', 'p_13_y', 'p_14_y', 'p_15_y', 'p_16_y', 'p_17_y', 'p_18_y', 'p_19_y', 'p_20_y', 'p_21_y', 'p_22_y']
['gender', 'person_id', 'expression', 'p_1_x', 'p_2_x', 'p_3_x', 'p_4_x', 'p_5_x

### FacialExpression Datum Definitions

In [15]:
from dataclasses import dataclass
from typing import List

@dataclass
class FacialExpression:
    """One labelled face: whose it is, its condition, and its points.

    Construction fails with ValueError unless exactly 22 points are supplied.
    """
    # NOTE(review): the data files use the prefixes 'm' and 'w' -- confirm
    # which codes are expected here.
    Gender: str
    PersonIdentifier: str  # e.g. '001'
    Emotion: Emotion
    Points: List[Point]

    def __post_init__(self):
        # Runs right after the generated __init__; reject incomplete faces.
        point_count = len(self.Points)
        if point_count == 22:
            return
        raise ValueError("Facial expression must contain exactly 22 points.")

# Example usage:
# points = [Point(x=328.444, y=275.496), Point(x=434.921, y=275.029), Point(x=331.713, y=401.121),
#           Point(x=427.449, y=400.187), Point(x=271.936, y=270.826), Point(x=356.464, y=254.014),
#           Point(x=388.221, y=255.882), Point(x=494.698, y=268.491), Point(x=263.997, y=274.095),
#           Point(x=301.825, y=277.831), Point(x=349.459, y=277.364), Point(x=411.104, y=278.298),
#           Point(x=459.673, y=277.831), Point(x=515.246, y=276.897), Point(x=368.606, y=340.41),
#           Point(x=355.53, y=351.151), Point(x=391.957, y=350.217), Point(x=374.253, y=395.527),
#           Point(x=374.253, y=416.925), Point(x=373.276, y=483.314), Point(x=280.342, y=404.39),
#           Point(x=499.835, y=402.522)]

# Example usage:
# facial_expression = FacialExpression(Gender='f', PersonIdentifier='001', Emotion=Emotion.NEUTRAL, Points=points)
# print(facial_expression)