# 3 Extract facial data from videos.

We use a range of libraries to extract facial data from the videos. The main library is [DeepFace](https://github.com/serengil/deepface) but we also considered FER - [Facial Expression Recognition](https://github.com/justinshenk/fer).

DeepFace is a framework that wraps several popular face recognition models, accessible as a single API. The face-detector backends it supports include:

backends = [ 'opencv', 'retinaface', 'mtcnn', 'ssd', 'dlib', 'mediapipe', 'yolov8', 'centerface']

For demographics (age, gender, race and emotion), it's unclear how many backends are available or whether DeepFace has its own models. However, performance can depend on the recognition model used.


`pip install deepface`



## 3.1 DeepFace



In [None]:
from deepface import DeepFace
import pandas as pd
import pprint
import cv2
import os
import time
import matplotlib.pyplot as plt

import sys

# Put the parent directory on the import path so the `src` package imported
# below can be resolved (presumably this notebook lives one level below the
# project root — confirm if the notebook is moved).
project_root = os.path.join("..")
sys.path.append(project_root)

from src.processors.face_processor import extract_faces_from_video, get_facial_stats
from src.utils.io_utils import getProcessedVideos, saveProcessedVideos

## Get paths for data

In [None]:
# Add these to your imports
from src.config import PATH_CONFIG
from src.utils.notebook_utils import display_config_info, ensure_dir_exists

# Input and output locations are read from the shared project configuration.
videos_in, data_out = PATH_CONFIG['videos_in'], PATH_CONFIG['data_out']

# Make sure the output directory is there; a truthy result from
# ensure_dir_exists is reported as the directory having just been created.
if ensure_dir_exists(data_out):
    print(f"Created output directory: {data_out}")

# Show the paths in use under the "Processing Configuration" heading.
display_config_info(videos_in, data_out, "Processing Configuration")


## Full data

In [None]:
# Names of the face detection models built into deepface.
# NOTE: the two lists are not identical — `detector_backends` contains
# 'centerface' while `backends` contains 'yunet'.
detector_backends = [
    'opencv',
    'retinaface',
    'mtcnn',
    'ssd',
    'dlib',
    'mediapipe',
    'yolov8',
    'centerface',
]
backends = ['opencv', 'ssd', 'dlib', 'mtcnn', 'retinaface', 'mediapipe', 'yolov8', 'yunet']

In [None]:
# Fetch the per-video tracking table via getProcessedVideos(data_out); the
# cells below iterate over its rows and write result paths back into it
# (presumably a pandas DataFrame, given the .head()/.iterrows() usage).
processedvideos = getProcessedVideos(data_out)
# Preview the first rows of the table.
processedvideos.head()

## Method A: Process All Videos at Once

In [None]:
from src.processors.face_processor import process_video_faces

# Method A: hand each video to process_video_faces in a single call and record
# the output paths it returns ("faces", "normed", "matched") in processedvideos.

# Add a column to track Method A processing if it doesn't exist
if "Face_Processing_Complete" not in processedvideos.columns:
    processedvideos["Face_Processing_Complete"] = False

# Parameters
force_process = True  # Set to True to reprocess already processed videos
backend = "retinaface"        # Face detection backend to use
skip_frames = 0        # Process every frame (set higher to skip frames)

# Process each video
for index, row in processedvideos.iterrows():
    # A missing/NaN flag counts as "not processed yet".
    already_done = row.get("Face_Processing_Complete")
    if not force_process and not pd.isnull(already_done) and already_done:
        print(f"Already processed {row['VideoID']} (skipping)")
        continue

    video_path = os.path.join(videos_in, row["VideoID"])

    # Get video metadata for normalization
    video_metadata = {
        "Height": row["Height"],
        "Width": row["Width"]
    }

    # Load pose data if available for matching
    # TODO - maybe make this work later.
    poses_df = None
    # if not pd.isnull(row.get("Keypoints.file")) and os.path.exists(row["Keypoints.file"]):
    #     try:
    #         poses_df = pd.read_csv(row["Keypoints.file"])
    #         print(f"Loaded pose data for {row['VideoID']} with {len(poses_df)} records")
    #     except Exception as e:
    #         print(f"Error loading pose data: {e}")

    # Process the video with all steps
    print(f"Processing {row['VideoID']} using Method A...")
    try:
        results = process_video_faces(
            video_path=video_path,
            output_dir=data_out,
            video_metadata=video_metadata,
            poses_df=poses_df,
            skip_frames=skip_frames,
            backend=backend,
            force_process=force_process
        )

        # Update the dataframe with results; each path is recorded only when
        # the corresponding entry in `results` is truthy.
        if results["faces"]:
            processedvideos.at[index, "Faces.file"] = results["faces"]
            processedvideos.at[index, "Faces.when"] = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())

        if results["normed"]:
            processedvideos.at[index, "Faces.normed"] = results["normed"]

        if results["matched"]:
            processedvideos.at[index, "Faces.matched"] = results["matched"]

        processedvideos.at[index, "Face_Processing_Complete"] = True
        print(f"✅ Completed processing for {row['VideoID']}")

        # Save after every successful video so progress survives an
        # interrupted run.
        saveProcessedVideos(processedvideos, data_out)
    except Exception as e:
        # Best effort: report the failure and carry on with the next video.
        print(f"❌ Error processing {row['VideoID']}: {e}")


# Display updated dataframe. The Faces.* columns may not exist yet (this cell
# only assigns them when process_video_faces returns a path), so show just the
# columns that are present instead of raising KeyError.
summary_columns = [
    column
    for column in ("VideoID", "Faces.file", "Faces.normed", "Face_Processing_Complete")
    if column in processedvideos.columns
]
processedvideos[summary_columns].head()

## METHOD B - Step by step

## First Detect faces and save info to csv (emotion, gender, age, race)

In [None]:
# Method B, step 1: run extract_faces_from_video on each video and write the
# detections to "<video stem>_faces_<backend>.csv" inside data_out.
forceProcess = False
backend = "retinaface"  # Change to the desired backend
features = ['emotion','age','gender']

for index, row in processedvideos.iterrows():
    if forceProcess or pd.isnull(row.get("Face_Processing_Complete")) or not row.get("Face_Processing_Complete"):
        video_path = os.path.join(videos_in, row["VideoID"])
        video_name = os.path.basename(video_path)
        base_name = os.path.splitext(video_name)[0]
        faces_path = os.path.join(data_out, f"{base_name}_faces_{backend}.csv")

        # Process the video with DeepFace
        print(f"Processing {row['VideoID']} using DeepFace...")
        try:
            results = extract_faces_from_video(
                video_path=video_path,
                output_file=faces_path,
                backend=backend,
                skip_frames=0,
                features=features
            )
            # Update the dataframe with results
            if results["faces"]:
                processedvideos.at[index, "Faces.file"] = results["faces"]
                processedvideos.at[index, "Faces.when"] = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())

            processedvideos.at[index, "Face_Processing_Complete"] = True
            print(f"✅ Completed processing for {row['VideoID']}")

            # Persist after each successful video (as Method A does) so the
            # recorded paths are not lost if the run is interrupted.
            saveProcessedVideos(processedvideos, data_out)

        except Exception as e:
            # Best effort: report the failure and carry on with the next video.
            print(f"❌ Error processing {row['VideoID']}: {str(e)}")

    else:
        print(f"Already processed {row['VideoID']} (skipping)")




## Match face detection with pose detection

In [None]:
# Echo the list of detector backend names defined earlier in the notebook.
backends


In [None]:
# Add Faces.matched column to processedvideos if it doesn't exist
if "Faces.matched" not in processedvideos.columns:
    processedvideos["Faces.matched"] = None

# Process each video to match faces with poses
forceProcess = False

# NOTE(review): match_faces_to_poses is not imported in any visible cell. Until
# it is imported, every call below raises NameError, which the broad `except`
# reports as "Error matching faces ...". Confirm which module provides it and
# add the import.

for index, row in processedvideos.iterrows():
    # Use .get so a table without these columns is skipped instead of raising
    # KeyError before the try block is reached.
    faces_file = row.get("Faces.file")
    keypoints_file = row.get("Keypoints.file")
    already_matched = not pd.isnull(row["Faces.matched"])
    has_inputs = not pd.isnull(faces_file) and not pd.isnull(keypoints_file)

    if (forceProcess or not already_matched) and has_inputs:
        try:
            # Load faces and keypoints dataframes
            faces_df = pd.read_csv(faces_file)
            poses_df = pd.read_csv(keypoints_file)

            # Match faces to poses
            matched_df = match_faces_to_poses(faces_df, poses_df)

            # Save matched data next to the faces file as "<stem>_matched.csv"
            stemname = os.path.splitext(faces_file)[0]
            matched_path = f"{stemname}_matched.csv"
            matched_df.to_csv(matched_path, index=False)

            # Update record
            processedvideos.at[index, "Faces.matched"] = matched_path
            print(f"Matched faces to poses for {row['VideoID']}")
        except Exception as e:
            # Best effort: report the failure and carry on with the next video.
            print(f"Error matching faces for {row['VideoID']}: {e}")
    elif already_matched:
        print(f"Already matched faces for {row['VideoID']}")

# Persist the updated table once all rows have been visited.
saveProcessedVideos(processedvideos, data_out)

## Normalize facial coordinates

In [None]:
def normalize_facial_keypoints(faces_df, height, width):
    """
    Scale facial landmark coordinates by the video dimensions.

    Columns named by ``utils.getfacecols()`` as x coordinates are divided by
    ``width`` and those named as y coordinates by ``height``. Listed columns
    that are absent from ``faces_df`` are ignored, and the input frame is left
    untouched.

    NOTE(review): ``utils`` is not imported in the visible part of this
    notebook — confirm where ``getfacecols`` comes from.

    Args:
        faces_df (DataFrame): DataFrame with facial keypoints
        height (int): Video height
        width (int): Video width

    Returns:
        DataFrame: Copy of ``faces_df`` with the landmark columns rescaled
    """
    # Work on a copy so the caller's frame is never modified
    scaled = faces_df.copy()

    # Column names holding the x and y landmark coordinates
    x_columns, y_columns = utils.getfacecols()

    # x columns are scaled by width first, then y columns by height
    for column_names, divisor in ((x_columns, width), (y_columns, height)):
        for name in column_names:
            if name not in scaled.columns:
                continue
            scaled[name] = scaled[name] / divisor

    return scaled

# Process each video to normalize facial coordinates, writing the result next
# to the faces file as "<stem>_normed.csv" and recording it in Faces.normed.
forceNormalize = False

for index, row in processedvideos.iterrows():
    # Use .get so a table without these columns is skipped instead of raising
    # KeyError before the try block is reached.
    faces_file = row.get("Faces.file")
    already_normed = not pd.isnull(row.get("Faces.normed"))

    if (forceNormalize or not already_normed) and not pd.isnull(faces_file):
        try:
            # Load faces dataframe
            faces_df = pd.read_csv(faces_file)

            # Normalize coordinates
            normed_df = normalize_facial_keypoints(faces_df, row["Height"], row["Width"])

            # Save normalized data
            stemname = os.path.splitext(faces_file)[0]
            normed_path = f"{stemname}_normed.csv"
            normed_df.to_csv(normed_path, index=False)

            # Update record
            processedvideos.at[index, "Faces.normed"] = normed_path
            print(f"Normalized facial data for {row['VideoID']}")
        except Exception as e:
            # Best effort: report the failure and carry on with the next video.
            print(f"Error normalizing faces for {row['VideoID']}: {e}")
    elif already_normed:
        print(f"Already normalized faces for {row['VideoID']}")

# saveProcessedVideos is imported directly from src.utils.io_utils; no `utils`
# module is imported in this notebook, so it is called by its bare name.
saveProcessedVideos(processedvideos, data_out)

## Analyze Facial Data

In [None]:
# Print summary statistics for each video's faces file and, when an emotion
# distribution is available, chart it as a bar plot.
for index, row in processedvideos.iterrows():
    faces_file = row["Faces.file"]

    # Skip videos with no recorded faces file or whose file is not on disk
    if pd.isnull(faces_file) or not os.path.exists(faces_file):
        continue

    try:
        # Load the detections and summarise them
        detections = pd.read_csv(faces_file)
        summary = get_facial_stats(detections)

        print(f"\nFacial Stats for {row['VideoID']}:")
        print(f"Total faces detected: {summary['total_faces']}")
        print(f"Frames with faces: {summary['unique_frames']}")
        print(f"Average faces per frame: {summary['avg_faces_per_frame']:.2f}")

        # The emotion breakdown is optional in the stats
        if 'emotion_distribution' in summary:
            emotion_scores = summary['emotion_distribution']

            print("\nEmotion distribution:")
            for label, score in emotion_scores.items():
                print(f"  {label}: {score:.2f}")

            # One bar per emotion
            plt.figure(figsize=(10, 5))
            plt.bar(emotion_scores.keys(), emotion_scores.values())
            plt.title(f"Emotion Distribution for {row['VideoID']}")
            plt.ylabel("Average Score")
            plt.xticks(rotation=45)
            plt.tight_layout()
            plt.show()

        print("-" * 50)
    except Exception as e:
        # Report the problem and move on to the next video
        print(f"Error analyzing facial data for {row['VideoID']}: {e}")