# NOT WORKING AT THE MOMENT (APRIL 2025)

_ANNOTATIONS NOT APPEARING ON VIDEOS IN THE FIFTYONE GUI_
_AUDIO NOT PLAYING IN THE FIFTYONE GUI_

# 7 Visualise activity in a video.



We have extracted all the features we plan to use. Overlaying these on the video was useful,
but watching annotated videos is inefficient and not always informative.

To help with understanding, we build a few tools that let us see at a glance what happens over time.

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ultralytics
import fiftyone as fo
import logging
import sys

# Set up logging: show INFO and above, each record prefixed with a timestamp and its level name
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [None]:
# Add project root to path and import utils.
# The path is made absolute so the entry keeps resolving if the working directory
# changes later, and the membership check stops re-runs of this cell from
# appending the same entry over and over.
project_root = os.path.abspath(os.path.join(".."))
if project_root not in sys.path:
    sys.path.append(project_root)

from src.config import PATH_CONFIG
from src.utils.io_utils import getProcessedVideos, saveProcessedVideos, getFaceData, getSpeechData, getKeyPoints, getVideoProperty
from src.utils.notebook_utils import display_config_info, ensure_dir_exists
from src.utils.keypoint_utils import normalize_keypoints
from src.processors.keypoint_processor import process_keypoints_for_modeling
from src.processors.face_processor import normalize_facial_keypoints, match_faces_to_poses

# Get paths from config
videos_in = PATH_CONFIG['videos_in']
data_out = PATH_CONFIG['data_out']

# Ensure output directory exists (a truthy return is reported as "created")
if ensure_dir_exists(data_out):
    print(f"Created output directory: {data_out}")

# Display configuration information
display_config_info(videos_in, data_out, "Processing Configuration")

# Load the table of processed videos from the output directory and preview it
processedvideos = getProcessedVideos(data_out)
processedvideos.head()

# 7.1 Use FiftyOne for examining videos

FiftyOne is a powerful tool for visualizing, exploring, and analyzing media datasets. We'll use it to view our videos with all the extracted features overlaid.

Reference: https://docs.voxel51.com/user_guide/dataset_creation/index.html

## 7.1.1 Define helper functions

These functions will help us manage our FiftyOne dataset and add annotations.

In [None]:
def idx2person(idx):
    """Convert a person index or name into a display label.

    Args:
        idx: Person identifier. Numbers are truncated with ``int()``; strings
            are compared after stripping whitespace and lower-casing.

    Returns:
        "Child" for 0 / "0" / "child", "Adult" for 1 / "1" / "adult", and
        "Unknown" for anything else, including None, NaN and infinities.
    """
    if isinstance(idx, str):
        key = idx.strip().lower()
    else:
        try:
            key = int(idx)
        except (TypeError, ValueError, OverflowError):
            # None, NaN (missing cells in a DataFrame) and +/-inf cannot be
            # turned into an index; label them instead of raising.
            return "Unknown"

    if key in (0, "0", "child"):
        return "Child"
    if key in (1, "1", "adult"):
        return "Adult"
    return "Unknown"

def framerange_from_timestamps(timestamps, fps, max_frames):
    """Map a (start, end) pair of timestamps onto 1-based frame numbers.

    Each timestamp is multiplied by ``fps``, truncated and shifted by one.
    The start frame is never below 1 and the end frame never above ``max_frames``.
    """
    first_frame = int(timestamps[0] * fps) + 1
    if first_frame < 1:
        first_frame = 1

    last_frame = int(timestamps[1] * fps) + 1
    if last_frame > max_frames:
        last_frame = max_frames

    return first_frame, last_frame

def xyxy2ltwh(bbox):
    """Turn a corner-style box [x1, y1, x2, y2] into [left, top, width, height]."""
    left, top = bbox[0], bbox[1]
    right, bottom = bbox[2], bbox[3]
    return [left, top, right - left, bottom - top]

In [None]:
def create_dataset(videos_dir, dataset_name="BabyJokes"):
    """Create a new FiftyOne video dataset, or load the existing one of that name.

    When no dataset called ``dataset_name`` exists, every ``.mp4`` / ``.avi`` /
    ``.mov`` file directly inside ``videos_dir`` is added as a sample, metadata
    is computed, frames are initialised, and the sample- and frame-level fields
    used by the annotation helpers are declared.

    Args:
        videos_dir: Directory containing the video files.
        dataset_name: Name of the FiftyOne dataset to load or create.

    Returns:
        The loaded or newly created ``fo.Dataset``.

    Raises:
        OSError: If ``videos_dir`` cannot be listed. This is raised before the
            dataset is created, so no empty dataset is left behind.
    """
    logging.info(f"Creating/Loading FiftyOne dataset: {dataset_name}")

    # An existing dataset is returned as-is; nothing below is re-run for it.
    if dataset_name in fo.list_datasets():
        logging.info(f"Loading existing dataset: {dataset_name}")
        return fo.load_dataset(dataset_name)

    # List the videos BEFORE creating the dataset: if the directory is missing,
    # we fail here instead of registering an empty dataset that later calls
    # would silently load.
    # The match is case-insensitive so camera-style names such as "CLIP.MP4" are
    # picked up, and the list is sorted so the sample order is reproducible.
    video_extensions = (".mp4", ".avi", ".mov")
    video_paths = sorted(
        os.path.join(videos_dir, name)
        for name in os.listdir(videos_dir)
        if name.lower().endswith(video_extensions)
    )
    if not video_paths:
        logging.warning(f"No video files found in {videos_dir}")

    logging.info(f"Creating new dataset: {dataset_name}")
    dataset = fo.Dataset(dataset_name)

    logging.info(f"Adding videos from {videos_dir}")
    for video_path in video_paths:
        # Best effort: one unreadable file must not stop the others being added.
        try:
            dataset.add_sample(fo.Sample(filepath=video_path))
            logging.info(f"Added video {video_path} to dataset")
        except Exception as e:
            logging.error(f"Error adding video {video_path}: {str(e)}")

    # Ensure frames are extracted and metadata is computed
    logging.info("Extracting frames and computing metadata...")
    dataset.compute_metadata()
    dataset.ensure_frames()

    # Sample-level fields describing each video as a whole
    dataset.add_sample_field("VideoID", fo.StringField, description="Video identifier")
    dataset.add_sample_field("JokeType", fo.StringField, description="What joke is being told?")
    dataset.add_sample_field("HowFunny", fo.StringField, description="How funny is the joke?")
    dataset.add_sample_field("LaughYesNo", fo.BooleanField, description="Did the child laugh?")
    dataset.add_sample_field("ChildSide", fo.IntField, description="Is the child on left (-1) or right (1) of adult or on lap (0)?")

    # Frame-level field for people detections
    dataset.add_frame_field(
        "People",
        fo.EmbeddedDocumentField,
        embedded_doc_type=fo.Detections,
        description="People detections",
    )

    # Frame-level field for pose keypoints
    dataset.add_frame_field(
        "Poses",
        fo.EmbeddedDocumentField,
        embedded_doc_type=fo.Keypoints,
        description="Human pose keypoints",
    )

    # Frame-level field for the dominant emotion
    dataset.add_frame_field(
        "DominantEmotion",
        fo.StringField,
        description="Dominant facial emotion",
    )

    # Frame-level field for speech captions
    dataset.add_frame_field(
        "Captions",
        fo.StringField,
        description="Speech captions",
    )

    logging.info(f"Created dataset with {len(dataset)} videos")
    return dataset

In [None]:
def add_metadata_to_samples(dataset, processedvideos):
    """Copy per-video metadata from the processed-videos table onto each sample.

    A sample is matched to a table row by comparing the base name of its
    ``filepath`` with the ``VideoID`` column. For the first matching row,
    ``VideoID``, ``JokeType`` and ``HowFunny`` are stored as text and
    ``LaughYesNo`` as a bool (True only when the cell equals "Yes").

    Args:
        dataset: Iterable of samples exposing ``filepath``, item assignment and ``save()``.
        processedvideos: DataFrame with the columns named above.

    Returns:
        The same ``dataset`` object, for chaining.
    """
    logging.info("Adding metadata to samples...")

    def as_text(value):
        """Return the cell as a plain ``str``, or None when the cell is missing."""
        # Numeric ratings arrive as numpy scalars and empty cells as NaN;
        # neither is a valid value for a string field.
        if pd.isna(value):
            return None
        return str(value)

    for sample in dataset:
        try:
            videoname = os.path.basename(sample.filepath)
            matches = processedvideos[processedvideos["VideoID"] == videoname]
            if matches.empty:
                logging.warning(f"Video {videoname} not found in processed videos.")
                continue

            # Read each cell from its own column so pandas does not upcast values
            # the way a mixed-dtype row Series could.
            sample["VideoID"] = as_text(matches["VideoID"].iloc[0])
            sample["JokeType"] = as_text(matches["JokeType"].iloc[0])
            sample["HowFunny"] = as_text(matches["HowFunny"].iloc[0])
            # bool() guarantees a native bool even if the comparison yields a numpy bool
            sample["LaughYesNo"] = bool(matches["LaughYesNo"].iloc[0] == "Yes")
            sample.save()
            logging.info(f"Added metadata to {videoname}")
        except Exception as e:
            # Best effort per sample; logging.exception appends the traceback.
            logging.exception(f"Error adding metadata to {sample.filepath}: {e}")
    return dataset

In [None]:
def add_people_bounding_boxes(dataset, processedvideos):
    """Add people bounding boxes and pose keypoints as frame-level labels.

    For every sample, the keypoint table returned by ``getKeyPoints`` is looked
    up by the video's file name. Rows are matched to frames with
    ``frame_number - 1``, i.e. the table's ``frame`` column is treated as
    0-based while FiftyOne frame numbers start at 1. Boxes are written to the
    ``People`` frame field and keypoints to ``Poses``.

    FiftyOne expects box and keypoint coordinates relative to the frame size
    (values in [0, 1]) and keypoints as (x, y) pairs.
    NOTE(review): a row whose box has any coordinate above 1 is assumed to be in
    pixels and is divided by the frame width/height from the sample metadata;
    confirm against how the keypoint table was produced.

    Args:
        dataset: FiftyOne video dataset whose frames have been initialised.
        processedvideos: Table passed through to ``getKeyPoints``.

    Returns:
        The same ``dataset`` object, for chaining.
    """
    logging.info("Adding people bounding boxes and poses...")
    bbox_columns = ("bbox.x1", "bbox.y1", "bbox.x2", "bbox.y2")

    for sample in dataset:
        try:
            videoname = os.path.basename(sample.filepath)
            keypoints = getKeyPoints(processedvideos, videoname)
            if keypoints is None or keypoints.empty:
                logging.warning(f"No keypoints found for {videoname}")
                continue

            # Frame size is only available once metadata has been computed.
            metadata = getattr(sample, "metadata", None)
            frame_width = getattr(metadata, "frame_width", None)
            frame_height = getattr(metadata, "frame_height", None)
            can_rescale = bool(frame_width) and bool(frame_height)

            # Split the table by frame once instead of re-filtering it for every frame.
            rows_by_frame = {frame_idx: rows for frame_idx, rows in keypoints.groupby("frame")}

            # Keypoint columns are expected as x_0, y_0, x_1, y_1, ...
            keypoint_columns = [col for col in keypoints.columns if col.startswith(("x_", "y_"))]
            num_keypoints = len(keypoint_columns) // 2

            for frame_number, frame in sample.frames.items():
                rows = rows_by_frame.get(frame_number - 1)
                if rows is None:
                    continue

                detections = []
                pose_keypoints = []
                for _, row in rows.iterrows():
                    # Skip rows without a complete box
                    if any(col not in row or pd.isna(row[col]) for col in bbox_columns):
                        continue
                    # float() turns numpy scalars into native values before storage
                    x1, y1, x2, y2 = (float(row[col]) for col in bbox_columns)

                    # Relative coordinates never exceed 1, so larger values are taken as pixels.
                    in_pixels = can_rescale and max(x1, y1, x2, y2) > 1
                    x_div, y_div = (frame_width, frame_height) if in_pixels else (1, 1)

                    label = idx2person(row["person"])
                    confidence = row.get("confidence", 0.9)
                    detections.append(
                        fo.Detection(
                            label=label,
                            bounding_box=xyxy2ltwh([x1 / x_div, y1 / y_div, x2 / x_div, y2 / y_div]),
                            confidence=None if pd.isna(confidence) else float(confidence),
                        )
                    )

                    # Add keypoints if available, scaled the same way as the box.
                    points = []
                    for i in range(num_keypoints):
                        x_col, y_col = f"x_{i}", f"y_{i}"
                        if x_col not in row or y_col not in row:
                            continue
                        if pd.isna(row[x_col]) or pd.isna(row[y_col]):
                            continue
                        # (x, y) pairs only: a third "visibility" element is not
                        # part of FiftyOne's keypoint format.
                        points.append((float(row[x_col]) / x_div, float(row[y_col]) / y_div))
                    if points:
                        pose_keypoints.append(fo.Keypoint(label=label, points=points))

                if detections:
                    frame["People"] = fo.Detections(detections=detections)
                if pose_keypoints:
                    frame["Poses"] = fo.Keypoints(keypoints=pose_keypoints)
            sample.save()
            logging.info(f"Added people detections and poses to {videoname}")
        except Exception as e:
            # Best effort per sample; logging.exception appends the traceback.
            logging.exception(f"Error processing {sample.filepath}: {e}")
    return dataset

In [None]:
def add_speech_annotations(dataset, processedvideos):
    """Write speech transcript text onto the frames each segment covers.

    For every sample, the speech data returned by ``getSpeechData`` is looked up
    by the video's file name. Each entry of its ``"segments"`` list is converted
    from start/end times to a frame range via ``framerange_from_timestamps``,
    and the segment text is stored in the ``Captions`` field of every existing
    frame in that range. Samples without speech data, or without ``frame_rate``
    and ``total_frame_count`` in their metadata, are skipped with a warning.

    Returns the same ``dataset`` object.
    """
    logging.info("Adding speech annotations...")

    for sample in dataset:
        try:
            videoname = os.path.basename(sample.filepath)

            transcript = getSpeechData(processedvideos, videoname)
            if transcript is None or "segments" not in transcript:
                logging.warning(f"No speech data found for {videoname}")
                continue

            # Both timing values are needed to translate seconds into frame numbers
            video_info = getattr(sample, "metadata", None)
            timing_known = (
                video_info is not None
                and "frame_rate" in video_info
                and "total_frame_count" in video_info
            )
            if not timing_known:
                logging.warning(f"Missing metadata for {videoname}")
                continue

            fps = video_info["frame_rate"]
            max_frames = video_info["total_frame_count"]

            for segment in transcript["segments"]:
                time_span = [segment["start"], segment["end"]]
                first_frame, last_frame = framerange_from_timestamps(time_span, fps, max_frames)
                # The end frame is inclusive; later segments overwrite earlier ones on shared frames
                for frame_number in range(first_frame, last_frame + 1):
                    if frame_number not in sample.frames:
                        continue
                    sample.frames[frame_number]["Captions"] = segment["text"]

            sample.save()
            logging.info(f"Added speech annotations to {videoname}")
        except Exception as e:
            logging.error(f"Error adding speech to {sample.filepath}: {str(e)}")
            import traceback
            logging.error(traceback.format_exc())
    return dataset

In [None]:
def add_dominant_emotion(dataset, processedvideos):
    """Add the dominant facial emotion of every face to each frame.

    For every sample, the face table returned by ``getFaceData`` is looked up by
    the video's file name. Rows are matched to frames with ``frame_number - 1``.
    For each distinct ``face_id`` in a frame, the first row supplies the
    ``dominant_emotion`` and the ``x``/``y``/``w``/``h`` face box.

    Results are written to two frame fields:
      * ``Faces`` - one detection per face, labelled "<person>: <emotion>".
        A separate field is used so the ``People`` boxes written by
        ``add_people_bounding_boxes`` are not overwritten.
      * ``DominantEmotion`` - the same labels joined with "; " as plain text.

    FiftyOne expects boxes as [left, top, width, height] relative to the frame
    size. NOTE(review): a box reaching beyond 1 is assumed to be in pixels and
    is divided by the frame width/height from the sample metadata; confirm
    against how the face table was produced.

    Args:
        dataset: FiftyOne video dataset whose frames have been initialised.
        processedvideos: Table passed through to ``getFaceData``.

    Returns:
        The same ``dataset`` object, for chaining.
    """
    logging.info("Adding dominant facial emotions...")

    for sample in dataset:
        try:
            videoname = os.path.basename(sample.filepath)
            emotions = getFaceData(processedvideos, videoname)
            if emotions is None or emotions.empty:
                logging.warning(f"No emotion data found for {videoname}")
                continue

            # Frame size is only available once metadata has been computed.
            metadata = getattr(sample, "metadata", None)
            frame_width = getattr(metadata, "frame_width", None)
            frame_height = getattr(metadata, "frame_height", None)
            can_rescale = bool(frame_width) and bool(frame_height)

            # Split the table by frame once instead of re-filtering it for every frame.
            faces_by_frame = {frame_idx: rows for frame_idx, rows in emotions.groupby("frame")}

            for frame_number, frame in sample.frames.items():
                frame_emotions = faces_by_frame.get(frame_number - 1)
                if frame_emotions is None:
                    continue

                detections = []
                # sort=False keeps faces in order of first appearance; rows with a
                # missing face_id are dropped by groupby.
                for face_id, face_rows in frame_emotions.groupby("face_id", sort=False):
                    face = face_rows.iloc[0]
                    dominant_emotion = face["dominant_emotion"]

                    box = [face["x"], face["y"], face["w"], face["h"]]
                    if any(pd.isna(value) for value in box):
                        continue
                    # float() turns numpy scalars into native values before storage
                    x, y, w, h = (float(value) for value in box)

                    # Relative boxes never reach beyond 1, so larger values are taken as pixels.
                    if can_rescale and max(x + w, y + h) > 1:
                        x, w = x / frame_width, w / frame_width
                        y, h = y / frame_height, h / frame_height

                    detections.append(
                        fo.Detection(
                            label=f"{idx2person(face_id)}: {dominant_emotion}",
                            bounding_box=[x, y, w, h],  # already [left, top, width, height]
                        )
                    )

                if detections:
                    frame["Faces"] = fo.Detections(detections=detections)
                    frame["DominantEmotion"] = "; ".join(d.label for d in detections)
            sample.save()
            logging.info(f"Added dominant emotions to {videoname}")
        except Exception as e:
            # Best effort per sample; logging.exception appends the traceback.
            logging.exception(f"Error adding dominant emotions to {sample.filepath}: {e}")
    return dataset

## 7.1.2 Create or load FiftyOne dataset

Let's check if we have an existing dataset. If not, we'll create one.

In [None]:
# Name under which FiftyOne stores this notebook's dataset
DATASET_NAME = "BabyJokes"

# Switch: True removes any dataset of this name so it is rebuilt from scratch below;
# False reuses the stored one
delete_existing = True

# Show which datasets FiftyOne currently knows about
datasets = fo.list_datasets()
print(f"Available datasets: {datasets}")

if DATASET_NAME in datasets and delete_existing:
    print(f"Deleting existing dataset: {DATASET_NAME}")
    fo.delete_dataset(DATASET_NAME)
    # Refresh the listing so it reflects the deletion
    datasets = fo.list_datasets()

# Build the dataset from the input videos, or load it if it is still there
dataset = create_dataset(videos_in, DATASET_NAME)

## 7.1.3 Add metadata and annotations to the dataset

Now let's populate our dataset with all the metadata and annotations:

In [None]:
# Run the annotation steps in order; each takes the dataset plus the
# processed-videos table and hands the dataset back.
for annotate in (
    add_metadata_to_samples,
    add_people_bounding_boxes,
    add_speech_annotations,
    add_dominant_emotion,
):
    dataset = annotate(dataset, processedvideos)

# Summarise the dataset now that every step has run
print(dataset)

## 7.1.4 View dataset in FiftyOne App

Now we can visualize our dataset with all annotations in the FiftyOne App.

In [None]:
# Create views restricted to videos whose frames carry annotations.
# `ViewField` (conventionally aliased to F) builds the filter expressions; a field
# is referenced by calling it, e.g. F("Captions").
from fiftyone import ViewField as F

# Annotations in this notebook live on frames, so the filters are applied with
# match_frames(), which by default also drops samples left with no matching frame.
# "Captions" and "DominantEmotion" are the frame fields declared in create_dataset;
# a count of 0 means no frame has a value in that field yet.

# Filter samples that have speech captions on at least one frame
temp_view = dataset.match_frames(F("Captions").exists())
print(f"Found {len(temp_view)} samples with speech temporal detections")

# Filter samples that have an emotion on at least one frame
emotion_view = dataset.match_frames(F("DominantEmotion").exists())
print(f"Found {len(emotion_view)} samples with emotion temporal detections")

# Samples with either kind of frame-level annotation
combined_view = dataset.match_frames(
    F("Captions").exists() | F("DominantEmotion").exists()
)
print(f"Found {len(combined_view)} samples with any temporal detections")

# To show only these samples in the App, run this once a session has been launched:
# session.view = combined_view

In [None]:
# Launch the FiftyOne App to visualize our dataset.
# `session` is bound up front so that a failed launch leaves it as None instead of
# undefined, letting later cells test it rather than hit a NameError.
session = None
try:
    session = fo.launch_app(dataset)
except Exception as e:
    # logging.exception records the message at ERROR level with the traceback attached
    logging.exception(f"Error launching FiftyOne app: {e}")