# Feature Engineering

Using the results of the extract workload, we have our minimally preprocessed data that we can use to craft the following features.

1. **Eye length ratio**: Distance between points 9 and 10 (or distance between points 11 and 12, whichever is larger) over distance between points 8 and 13.
2. **Eye distance ratio**: Distance between the centers of the two eyes (points 0 and 1) over distance between points 8 and 13.
3. **Nose ratio**: Distance between points 15 and 16 over distance between 20 and 21.
4. **Lip size ratio**: Distance between points 2 and 3 over distance between 17 and 18.
5. **Lip length ratio**: Distance between points 2 and 3 over distance between 20 and 21.
6. **Eye-brow length ratio**: Distance between points 4 and 5 (or distance between points 6 and 7 whichever is larger) over distance between 8 and 13.
7. **Aggressive ratio**: Distance between points 10 and 19 over distance between 20 and 21.

## Fetch Extract Workload Results

In [42]:
import pandas as pd

# Relative location of the CSV written by the extract workload.
ex_file_path = "../extract/ex_res/ex_res.csv"

# Load the extracted landmark coordinates and preview the first five rows.
ex_df = pd.read_csv(filepath_or_buffer=ex_file_path)
ex_df.head(n=5)

Unnamed: 0,index,gender,person_id,neutral,smile,anger,left_light,p_0_x,p_1_x,p_2_x,...,p_12_y,p_13_y,p_14_y,p_15_y,p_16_y,p_17_y,p_18_y,p_19_y,p_20_y,p_21_y
0,0,1,m001,1,0,0,0,328.444,275.496,434.921,...,374.253,395.527,374.253,416.925,373.276,483.314,280.342,404.39,499.835,402.522
1,1,1,m001,0,1,0,0,344.026,270.09,449.912,...,385.07,389.178,386.006,421.015,390.18,491.438,281.39,393.009,511.685,397.247
2,2,1,m001,0,0,1,0,329.17,291.426,436.553,...,376.119,414.293,378.616,440.265,380.614,505.195,279.723,418.289,501.982,416.291
3,3,1,m001,0,0,0,1,345.098,260.392,451.765,...,387.765,367.059,387.765,393.412,387.765,468.706,286.118,383.373,509.49,389.647
4,4,1,m002,1,0,0,0,327.193,263.025,437.671,...,386.155,367.268,386.155,394.529,388.628,472.462,282.728,382.655,499.719,377.464


## Craft Features from Extract Dataframe

In [44]:
import numpy as np
import os

def distance(x1, y1, x2, y2):
    """Return the Euclidean distance between points (x1, y1) and (x2, y2).

    The arguments may be plain numbers or array-likes (for example pandas
    columns); array-likes are processed element-wise, so whole columns can
    be passed in a single call.

    ``np.hypot`` is used instead of squaring the deltas by hand so that
    very large or very small coordinates do not overflow or underflow in
    the intermediate squares.
    """
    return np.hypot(x2 - x1, y2 - y1)

# Output schema: the metadata columns carried over from ex_df unchanged,
# followed by the seven engineered ratio features.
column_names = ['gender', 'person_id', 'neutral', 'smile', 'anger', 'left_light',
                'EyeLengthRatio', 'EyeDistanceRatio', 'NoseRatio', 'LipSizeRatio',
                'LipLengthRatio', 'EyeBrowLengthRatio', 'AggressiveRatio']

# reindex returns a new frame that keeps ex_df's row index, copies the
# metadata columns that already exist in ex_df, and creates the feature
# columns as NaN placeholders in the order given by column_names. This avoids
# building an empty, index-less frame and relying on the first assignment to
# give it an index.
ex_features_df = ex_df.reindex(columns=column_names)

def _landmark_distance(points_df, first, second):
    """Return the per-row distance between landmarks `first` and `second`.

    Reads the ``p_<n>_x`` / ``p_<n>_y`` columns of ``points_df`` and hands
    whole columns to ``distance``, so all rows are computed in one
    vectorized call instead of a row-by-row ``DataFrame.apply``.
    """
    return distance(points_df[f'p_{first}_x'], points_df[f'p_{first}_y'],
                    points_df[f'p_{second}_x'], points_df[f'p_{second}_y'])


# Denominators shared by several ratios; computed once and reused.
_dist_8_13 = _landmark_distance(ex_df, 8, 13)
_dist_20_21 = _landmark_distance(ex_df, 20, 21)

# The lip ratios share the same numerator (points 2 and 3).
_dist_2_3 = _landmark_distance(ex_df, 2, 3)

# NOTE: np.maximum is the element-wise counterpart of the built-in max() used
# per row before. The only difference is that a NaN in either operand always
# yields NaN, so a missing landmark gives a missing feature.

# Eye length ratio: Distance between points 9 and 10 (or distance between points 11 and 12, whichever is larger) over distance between points 8 and 13.
ex_features_df['EyeLengthRatio'] = np.maximum(_landmark_distance(ex_df, 9, 10),
                                              _landmark_distance(ex_df, 11, 12)) / _dist_8_13

# Eye distance ratio: distance between center of two eyes (points 0 and 1) over distance between points 8 and 13.
ex_features_df['EyeDistanceRatio'] = _landmark_distance(ex_df, 0, 1) / _dist_8_13

# Nose ratio: Distance between points 15 and 16 over distance between 20 and 21.
ex_features_df['NoseRatio'] = _landmark_distance(ex_df, 15, 16) / _dist_20_21

# Lip size ratio: Distance between points 2 and 3 over distance between 17 and 18.
ex_features_df['LipSizeRatio'] = _dist_2_3 / _landmark_distance(ex_df, 17, 18)

# Lip length ratio: Distance between points 2 and 3 over distance between 20 and 21.
ex_features_df['LipLengthRatio'] = _dist_2_3 / _dist_20_21

# Eye-brow length ratio: Distance between points 4 and 5 (or distance between points 6 and 7, whichever is larger) over distance between 8 and 13.
ex_features_df['EyeBrowLengthRatio'] = np.maximum(_landmark_distance(ex_df, 4, 5),
                                                  _landmark_distance(ex_df, 6, 7)) / _dist_8_13

# Aggressive ratio: Distance between points 10 and 19 over distance between 20 and 21.
ex_features_df['AggressiveRatio'] = _landmark_distance(ex_df, 10, 19) / _dist_20_21

# Persist the unscaled features next to the notebook; the row index is
# written out as a column labelled 'index'.
ex_features_df.to_csv(
    os.path.join(os.getcwd(), "original_features.csv"),
    index_label='index',
    index=True,
)

In [48]:
# Read the saved feature file back in and preview the first five rows
# as a quick check of what was written.
test_df = pd.read_csv(filepath_or_buffer="original_features.csv")
test_df.head(n=5)

Unnamed: 0,index,gender,person_id,neutral,smile,anger,left_light,EyeLengthRatio,EyeDistanceRatio,NoseRatio,LipSizeRatio,LipLengthRatio,EyeBrowLengthRatio,AggressiveRatio
0,0,1,m001,1,0,0,0,3.163498,3.317317,0.362315,1.181994,1.999351,5.760029,0.657338
1,1,1,m001,0,1,0,0,4.069242,4.037923,0.231978,1.253686,1.815324,6.049963,0.726452
2,2,1,m001,0,0,1,0,2.105411,2.212617,0.595227,1.005074,2.160023,4.248729,0.695194
3,3,1,m001,0,0,0,1,3.334016,3.359626,0.166311,1.442655,1.749672,4.753959,0.758484
4,4,1,m002,1,0,0,0,3.957777,3.886707,0.241837,1.400323,1.804426,5.725302,0.655954


## Model Ready Dataframe

Our feature-engineered dataframe still needs some pre-processing before it is ready for model training.

Our feature values will also perform best when brought onto a common scale, for example [0, 1] via Min-Max scaling or zero mean and unit variance via Z-score scaling.
When visualizing each of the coordinates, I did not see an extreme skew in any of them, and as such,
I chose Z-score scaling via the StandardScaler.

- We will encode person_id by numerical labels to represent each person.
- Then, the seven ratio feature columns will be scaled via the StandardScaler from sklearn, which uses z-score scaling.
    

In [55]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Work on a copy so ex_features_df keeps the unscaled ratios.
model_ready_df = ex_features_df.copy()

# Replace each person_id with a numerical label.
label_encoder = LabelEncoder()
model_ready_df['person_id'] = label_encoder.fit_transform(model_ready_df['person_id'])

# Ratio features to be z-score scaled; the metadata columns are left as-is.
columns_to_scale = [
    'EyeLengthRatio',
    'EyeDistanceRatio',
    'NoseRatio',
    'LipSizeRatio',
    'LipLengthRatio',
    'EyeBrowLengthRatio',
    'AggressiveRatio',
]
scaler = StandardScaler()
model_ready_df[columns_to_scale] = scaler.fit_transform(model_ready_df[columns_to_scale])

# Save the model-ready frame next to the notebook, then display it.
model_ready_df.to_csv(
    os.path.join(os.getcwd(), "model_ready.csv"),
    index_label='index',
    index=True,
)
model_ready_df

Unnamed: 0,gender,person_id,neutral,smile,anger,left_light,EyeLengthRatio,EyeDistanceRatio,NoseRatio,LipSizeRatio,LipLengthRatio,EyeBrowLengthRatio,AggressiveRatio
0,1,0,1,0,0,0,-0.360257,-0.012008,-0.059605,-0.585311,-0.086159,0.403678,-0.248377
1,1,0,0,1,0,0,0.540739,0.727561,-0.232151,-0.360523,-0.293977,0.589782,-0.073556
2,1,0,0,0,1,0,-1.412798,-1.145776,0.248735,-1.140044,0.095284,-0.566405,-0.152623
3,1,0,0,0,0,1,-0.190633,0.031415,-0.319083,0.231989,-0.368117,-0.242105,0.007466
4,1,1,1,0,0,0,0.429858,0.572366,-0.219099,0.099257,-0.306285,0.381387,-0.251876
...,...,...,...,...,...,...,...,...,...,...,...,...,...
504,0,132,0,0,0,1,-0.299856,-0.476798,0.359475,-1.355140,0.916980,0.078428,0.092393
505,0,133,0,0,1,0,-0.142799,-0.216933,0.163394,-0.981917,0.479130,0.578372,-0.237788
506,0,133,0,0,0,1,2.536279,2.041188,-0.213379,-0.775697,0.108159,1.750352,-0.224619
507,0,134,0,0,1,0,0.174808,-0.060616,-0.020451,-0.648059,0.384633,0.478642,-0.165981
