# Feature Engineering

Using the results of the extract workload, we have our minimally preprocessed data that we can use to craft the following features.

1. **Eye length ratio**: Distance between points 9 and 10 (or between points 11 and 12, whichever is larger) over the distance between points 8 and 13.
2. **Eye distance ratio**: Distance between the centers of the two eyes (points 0 and 1) over the distance between points 8 and 13.
3. **Nose ratio**: Distance between points 15 and 16 over the distance between points 20 and 21.
4. **Lip size ratio**: Distance between points 2 and 3 over the distance between points 17 and 18.
5. **Lip length ratio**: Distance between points 2 and 3 over the distance between points 20 and 21.
6. **Eye-brow length ratio**: Distance between points 4 and 5 (or between points 6 and 7, whichever is larger) over the distance between points 8 and 13.
7. **Aggressive ratio**: Distance between points 10 and 19 over the distance between points 20 and 21.

## Fetch Extract Workload Results

In [6]:
import pandas as pd

# Path (relative to this notebook) of the CSV produced by the extract workload.
ex_file_path = "../extract/ex_res/ex_res.csv"

# Load the extract results into a dataframe and preview the first five rows.
ex_df = pd.read_csv(filepath_or_buffer=ex_file_path)
ex_df.head(n=5)

Unnamed: 0,index,gender,person_id,neutral,smile,anger,left_light,p_0_x,p_0_y,p_1_x,...,p_17_x,p_17_y,p_18_x,p_18_y,p_19_x,p_19_y,p_20_x,p_20_y,p_21_x,p_21_y
0,0,1,m061,0,1,0,0,337.065,268.794,450.991,...,398.359,366.73,397.026,400.041,398.359,464.666,286.432,378.722,511.618,375.391
1,1,1,m061,1,0,0,0,327.346,262.923,441.29,...,386.599,374.627,387.108,393.943,389.141,462.566,280.87,383.777,497.921,383.777
2,2,1,m061,0,0,0,1,325.671,257.013,443.241,...,385.823,364.557,386.734,383.696,386.734,455.696,275.544,379.139,489.722,377.316
3,3,1,m061,0,0,1,0,323.919,295.817,439.65,...,380.946,420.353,380.946,442.577,380.107,497.088,292.471,414.902,485.775,421.192
4,4,1,m066,1,0,0,0,331.813,243.564,455.946,...,390.842,362.18,389.509,375.857,388.926,465.606,262.461,373.525,516.556,377.605


## Craft Features from Extract Dataframe

In [7]:
import numpy as np
import os

def distance(x1, y1, x2, y2):
    """Return the Euclidean distance between points (x1, y1) and (x2, y2).

    The arithmetic is element-wise, so the arguments may be scalars or
    equally shaped NumPy arrays / pandas Series.
    """
    delta_x = x2 - x1
    delta_y = y2 - y1
    return np.sqrt(np.square(delta_x) + np.square(delta_y))

# Columns copied unchanged from the extract dataframe, followed by the seven
# engineered ratio features. This is also the column order of the output CSV.
passthrough_columns = ['gender', 'person_id', 'neutral', 'smile', 'anger', 'left_light']
column_names = passthrough_columns + [
    'EyeLengthRatio', 'EyeDistanceRatio', 'NoseRatio', 'LipSizeRatio',
    'LipLengthRatio', 'EyeBrowLengthRatio', 'AggressiveRatio',
]


def _point_distance(df, a, b):
    """Return a Series with the per-row distance between points a and b.

    Reads the columns p_<a>_x, p_<a>_y, p_<b>_x and p_<b>_y of df and hands
    them to distance() as whole columns, so the result is computed with
    vectorized arithmetic instead of a Python-level call per row.
    """
    return distance(df[f'p_{a}_x'], df[f'p_{a}_y'], df[f'p_{b}_x'], df[f'p_{b}_y'])


# Distances that appear in more than one ratio are computed once and reused.
dist_8_13 = _point_distance(ex_df, 8, 13)
dist_20_21 = _point_distance(ex_df, 20, 21)
dist_2_3 = _point_distance(ex_df, 2, 3)

# Start from a copy of the pass-through columns so the feature frame shares
# ex_df's index and keeps the original dtypes.
ex_features_df = ex_df[passthrough_columns].copy()

# Eye length ratio: Distance between points 9 and 10 (or distance between points 11 and 12, whichever is larger) over distance between points 8 and 13.
# np.maximum is the element-wise counterpart of max() for whole columns.
ex_features_df['EyeLengthRatio'] = np.maximum(
    _point_distance(ex_df, 9, 10),
    _point_distance(ex_df, 11, 12),
) / dist_8_13

# Eye distance ratio: distance between center of two eyes (points 0 and points 1) over distance between points 8 and 13
ex_features_df['EyeDistanceRatio'] = _point_distance(ex_df, 0, 1) / dist_8_13

# Nose ratio: Distance between points 15 and 16 over distance between 20 and 21.
ex_features_df['NoseRatio'] = _point_distance(ex_df, 15, 16) / dist_20_21

# Lip size ratio: Distance between points 2 and 3 over distance between 17 and 18.
ex_features_df['LipSizeRatio'] = dist_2_3 / _point_distance(ex_df, 17, 18)

# Lip length ratio: Distance between points 2 and 3 over distance between 20 and 21.
ex_features_df['LipLengthRatio'] = dist_2_3 / dist_20_21

# Eye-brow length ratio: Distance between points 4 and 5 (or distance between points 6 and 7 whichever is larger) over distance between 8 and 13.
ex_features_df['EyeBrowLengthRatio'] = np.maximum(
    _point_distance(ex_df, 4, 5),
    _point_distance(ex_df, 6, 7),
) / dist_8_13

# Aggressive ratio: Distance between points 10 and 19 over distance between 20 and 21.
ex_features_df['AggressiveRatio'] = _point_distance(ex_df, 10, 19) / dist_20_21

# Make the documented column order explicit before writing.
ex_features_df = ex_features_df[column_names]

# Persist the unscaled features next to the notebook; the dataframe index is
# written as a column labelled 'index'.
ex_features_df.to_csv(os.path.join(os.getcwd(), "original_features.csv"), index=True, index_label='index')

In [8]:
# Read the features CSV back from the working directory and preview the
# first five rows as a sanity check.
test_df = pd.read_csv(filepath_or_buffer="original_features.csv")
test_df.head(n=5)

Unnamed: 0,index,gender,person_id,neutral,smile,anger,left_light,EyeLengthRatio,EyeDistanceRatio,NoseRatio,LipSizeRatio,LipLengthRatio,EyeBrowLengthRatio,AggressiveRatio
0,0,1,m061,0,1,0,0,0.194331,0.472633,0.153858,3.137626,0.464459,0.330827,0.894086
1,1,1,m061,1,0,0,0,0.198848,0.493716,0.142875,4.646419,0.413642,0.341821,0.941084
2,2,1,m061,0,0,0,1,0.234807,0.540105,0.144736,4.566581,0.408518,0.373072,0.928303
3,3,1,m061,0,0,1,0,0.267124,0.518789,0.143158,3.834783,0.440649,0.392359,1.011202
4,4,1,m066,1,0,0,0,0.183555,0.465034,0.135613,7.93617,0.429144,0.308485,0.874718


## Model Ready Dataframe

We still need some pre-processing before our feature-engineered dataframe is ready for model training.

Our feature values will also work best when they are all brought onto a common scale, such as [0,1] via MinMax scaling or zero mean and unit variance via Z-Score scaling.
From visualizing each of the coordinates, I did not see an extreme skew in any of them, and as such,
I ended up using Z-Score Scaling via the StandardScaler.

- We will encode person_id by numerical labels to represent each person.
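- Then, the seven ratio feature columns (`EyeLengthRatio`, `EyeDistanceRatio`, `NoseRatio`, `LipSizeRatio`, `LipLengthRatio`, `EyeBrowLengthRatio`, `AggressiveRatio`) will be scaled via the StandardScaler from sklearn, which uses z-score scaling.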
    

In [10]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Ratio features that get standardized; all other columns are left untouched.
columns_to_scale = ['EyeLengthRatio', 'EyeDistanceRatio', 'NoseRatio', 'LipSizeRatio', 'LipLengthRatio', 'EyeBrowLengthRatio', 'AggressiveRatio']

# Work on a copy so ex_features_df keeps the unscaled ratios.
model_ready_df = ex_features_df.copy()

# NOTE(review): LabelEncoder is imported above, but person_id is not encoded
# in this cell; it is written to model_ready.csv as-is.

# Fit the scaler on the ratio columns and overwrite them with the scaled values.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(model_ready_df[columns_to_scale])
model_ready_df[columns_to_scale] = scaled_values

# Persist the model-ready frame next to the notebook; the dataframe index is
# written as a column labelled 'index'.
model_ready_path = os.path.join(os.getcwd(), "model_ready.csv")
model_ready_df.to_csv(model_ready_path, index=True, index_label='index')
model_ready_df

Unnamed: 0,gender,person_id,neutral,smile,anger,left_light,EyeLengthRatio,EyeDistanceRatio,NoseRatio,LipSizeRatio,LipLengthRatio,EyeBrowLengthRatio,AggressiveRatio
0,1,m061,0,1,0,0,-0.904499,0.321063,-0.184364,-0.415108,0.036263,-0.734205,-1.083244
1,1,m061,1,0,0,0,-0.689475,1.268312,-0.771988,1.115821,-1.054635,-0.322105,-0.306320
2,1,m061,0,0,0,1,1.022126,3.352544,-0.672417,1.034812,-1.164627,0.849298,-0.517595
3,1,m061,0,0,1,0,2.560423,2.394853,-0.756831,0.292277,-0.474880,1.572250,0.852805
4,1,m066,1,0,0,0,-1.417437,-0.020343,-1.160558,4.453840,-0.721856,-1.571687,-1.403420
...,...,...,...,...,...,...,...,...,...,...,...,...,...
504,0,w035,1,0,0,0,0.397592,-0.927034,0.979556,0.098176,0.040218,-0.384020,-0.285400
505,0,w019,0,1,0,0,-0.008132,-0.582822,-0.720324,-0.046376,0.405714,-1.649490,-1.553974
506,0,w019,0,0,1,0,1.216665,-0.747395,-0.958332,0.037289,-1.299745,-0.781771,-0.726962
507,0,w019,1,0,0,0,0.357238,-0.874948,-0.937050,0.272577,-0.880769,-0.808017,-0.674302
