# 7 Visualise activity in a video.

We have extracted all the features we plan to use. Overlaying these on the video was useful,
but watching annotated videos is inefficient and not always informative.

To help with understanding, we build a few tools that let us see at a glance what happens over time.

In [1]:
import os
import utils
import calcs
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ultralytics

In [2]:
# Folder layout, all relative to the notebook's parent directory.
videos_in = os.path.join("..", "LookitLaughter.test")
demo_data, temp_out, data_out, videos_out = (
    os.path.join("..", "data", leaf)
    for leaf in ("demo", "0_temp", "1_interim", "2_final")
)

metadata_file = "_LookitLaughter.xlsx"

# Two raw videos used for testing throughout the notebook.
VIDEO_FILE, VIDEO_FILE2 = (
    os.path.join(videos_in, clip)
    for clip in (
        "2UWdXP.joke1.rep2.take1.Peekaboo.mp4",
        "2UWdXP.joke2.rep1.take1.NomNomNom.mp4",
    )
)
# Audio and speech files stored in data_out under the first test video's name.
AUDIO_FILE, SPEECH_FILE = (
    os.path.join(data_out, derived)
    for derived in (
        "2UWdXP.joke1.rep2.take1.Peekaboo.wav",
        "2UWdXP.joke1.rep2.take1.Peekaboo.json",
    )
)

testset = [VIDEO_FILE, VIDEO_FILE2]

# Table with one row per video, loaded via the project's utils helper.
processedvideos = utils.getprocessedvideos(data_out)
processedvideos.head()

Found existing processedvideos.xlsx


Unnamed: 0,VideoID,ChildID,JokeType,JokeNum,JokeRep,JokeTake,HowFunny,LaughYesNo,Frames,FPS,...,Speech.file,Speech.when,Objects.file,Objects.when,Understand.file,Understand.when,Faces.normed,Keypoints.normed,annotatedVideo,annotated.when
0,2UWdXP.joke1.rep2.take1.Peekaboo.mp4,2UWdXP,Peekaboo,1,2,1,Slightly funny,No,217,14.29891,...,../data/1_interim/2UWdXP.joke1.rep2.take1.Peek...,2023-09-20 16:58:38,,,,,../data/1_interim/2UWdXP.joke1.rep2.take1.Peek...,../data/1_interim/2UWdXP.joke1.rep2.take1.Peek...,../data/2_final/2UWdXP.joke1.rep2.take1.Peekab...,2024-02-16 11:03:50
1,2UWdXP.joke1.rep3.take1.Peekaboo.mp4,2UWdXP,Peekaboo,1,3,1,Slightly funny,No,152,14.359089,...,../data/1_interim/2UWdXP.joke1.rep3.take1.Peek...,2023-09-20 16:58:39,,,,,../data/1_interim/2UWdXP.joke1.rep3.take1.Peek...,../data/1_interim/2UWdXP.joke1.rep3.take1.Peek...,../data/2_final/2UWdXP.joke1.rep3.take1.Peekab...,2024-02-16 11:03:51
2,2UWdXP.joke2.rep1.take1.NomNomNom.mp4,2UWdXP,NomNomNom,2,1,1,Funny,No,95,13.241315,...,../data/1_interim/2UWdXP.joke2.rep1.take1.NomN...,2023-09-20 16:58:40,,,,,../data/1_interim/2UWdXP.joke2.rep1.take1.NomN...,../data/1_interim/2UWdXP.joke2.rep1.take1.NomN...,../data/2_final/2UWdXP.joke2.rep1.take1.NomNom...,2024-02-16 11:03:52
3,2UWdXP.joke2.rep2.take1.NomNomNom.mp4,2UWdXP,NomNomNom,2,2,1,Slightly funny,No,97,14.213813,...,../data/1_interim/2UWdXP.joke2.rep2.take1.NomN...,2023-09-20 16:58:40,,,,,../data/1_interim/2UWdXP.joke2.rep2.take1.NomN...,../data/1_interim/2UWdXP.joke2.rep2.take1.NomN...,../data/2_final/2UWdXP.joke2.rep2.take1.NomNom...,2024-02-16 11:03:53
4,2UWdXP.joke2.rep3.take1.NomNomNom.mp4,2UWdXP,NomNomNom,2,3,1,Slightly funny,No,133,14.223092,...,../data/1_interim/2UWdXP.joke2.rep3.take1.NomN...,2023-09-20 16:58:48,,,,,../data/1_interim/2UWdXP.joke2.rep3.take1.NomN...,../data/1_interim/2UWdXP.joke2.rep3.take1.NomN...,../data/2_final/2UWdXP.joke2.rep3.take1.NomNom...,2024-02-16 11:03:54


# 7.1 Use Voxel51 and PytorchVideo for examining videos

Voxel51 seems to be a useful tool for looking at training data (and trained predictions).

Let's start with the minimal implementation. Just viewing videos.

https://docs.voxel51.com/user_guide/dataset_creation/index.html



In [3]:
import fiftyone as fo

### 7.1.1 Is dataset already created?

In [4]:
# List the names of the datasets FiftyOne currently knows about.
fo.list_datasets()

['LookitLaughter.test']

### 7.1.2 Populate a FiftyOne dataset with our videos and labels.

In [5]:
# Remove only this notebook's dataset so that it can be rebuilt from scratch.
# The glob "*" would delete every FiftyOne dataset in the database,
# including ones that have nothing to do with this project.
fo.delete_datasets("LookitLaughter.test")

In [6]:
# Create a dataset from the directory of raw videos (videos_in, defined with the other paths).
dataset = fo.Dataset.from_videos_dir(videos_in)
dataset.ensure_frames()

dataset.name = 'LookitLaughter.test'

# Declare every sample-level field that is filled in from processedvideos,
# so the schema carries a type and description for each of them.
dataset.add_sample_field("VideoID", fo.StringField, description="File name of the video")
dataset.add_sample_field("JokeType", fo.StringField, description="What joke is being told?")
dataset.add_sample_field("HowFunny", fo.StringField, description="How funny is the joke?")
dataset.add_sample_field("LaughYesNo",  fo.BooleanField, description="Did the child laugh?")

 100% |███████████████████| 54/54 [44.9ms elapsed, 0s remaining, 1.2K samples/s]   
Computing metadata...
 100% |███████████████████| 54/54 [1.1s elapsed, 0s remaining, 48.5 samples/s]         


Now let's see if we can add our metadata classifications. Recall that each video demos one joke type `[Peekaboo,TearingPaper,NomNomNom,ThatsNotAHat,ThatsNotACat]`, has a rating of how funny the baby found it `[Not Funny, Slightly Funny, Funny, Extremely Funny]`, and records whether they laughed `[Yes, No]`.


In [7]:
# Copy the joke type, funniness rating and laugh yes/no from processedvideos onto each sample.
for sample in dataset:
    # The bare file name (no directories) is the key into processedvideos, system independent.
    videoname = os.path.basename(sample.filepath)
    matches = processedvideos[processedvideos["VideoID"] == videoname]
    if matches.empty:
        # No metadata row for this video: leave it unlabelled rather than
        # aborting the whole loop with an IndexError.
        print(f"No metadata found for {videoname}, skipping.")
        continue
    sample["VideoID"] = matches["VideoID"].iloc[0]
    sample["JokeType"] = matches["JokeType"].iloc[0]
    sample["HowFunny"] = matches["HowFunny"].iloc[0]
    # The spreadsheet stores "Yes"/"No"; the dataset field is a boolean.
    sample["LaughYesNo"] = bool(matches["LaughYesNo"].iloc[0] == "Yes")
    sample.save()

Let's add the frame by frame annotations directly onto the videos inside fiftyone

In [8]:
# Let's start with people bounding boxes: one detection per person per frame.

for sample in dataset:
    # retrieve people bounding boxes from the (normalised) keypoints file
    keypoints = utils.readKeyPointsFromCSV(processedvideos, sample.filepath, normed=True)

    for _, row in keypoints.iterrows():
        # NOTE(review): the +1 presumably converts 0-based CSV frame numbers to
        # FiftyOne's 1-based frame numbers - confirm against the CSV writer.
        framenumber = int(row["frame"] + 1)
        person = row["person"]
        bbox = [row["bbox.x1"], row["bbox.y1"], row["bbox.x2"], row["bbox.y2"]]
        # convert corner coordinates to the left/top/width/height form
        bbox51 = calcs.xyxy2ltwh(bbox)
        if sample.frames[framenumber]:
            frame = sample.frames[framenumber]
        else:
            frame = fo.Frame()
        # the detection is stored in a frame field named after the person
        frame[person] = fo.Detection(label=person, bounding_box=bbox51)
        sample.frames[framenumber] = frame
        #TODO fiftyone add keypoints not well documented and i can't get it to work.
        #frame[person + "KeyPoints"] =
        #fo.KeypointSkeleton()
        #TODO can't see how to add timesynced captions either!

    # Save once per video, after all its frames are filled in, instead of
    # writing to the database after every single keypoint row.
    sample.save()

## 7.2 View dataset in Voxel51 GUI

In [9]:
# Open the FiftyOne app on the dataset and keep the session handle for later cells.
# NOTE(review): address="0.0.0.0" presumably makes the app reachable from other
# machines on the network - confirm that this exposure is intended.
session = fo.launch_app(dataset, address="0.0.0.0")

In [10]:
# Show what is currently selected in the app (an empty list when nothing is ticked).
print(session.selected)

[]


In [12]:
# session.selected lists the dataset samples ticked in the UI; each entry can be
# used to look the sample up in the dataset.
if session.selected:
    print(dataset[session.selected[0]])
else:
    print("No samples selected. Click the checkbox in the top left of each video to select it.")

No samples selected. Click the checkbox in the top left of each video to select it.


# 7.3 Draw annotated timeline for a selected video

A group of visualisations to see what happens in a video. 

In each frame let's find the `centre of gravity` for each person (the average of all the high-confidence marker points). This is handy for time series visualisation. For example plotting the cog.x for each person over time shows how they move closer and further from each other. 

Let's get the keypoint data and calculate the centre of gravity for each person in each frame.

In [None]:
# Frame rate of the second test video, used to convert frame numbers to seconds.
FPS = utils.getVideoProperty(processedvideos, VIDEO_FILE2, "FPS")
# Highest frame number of that same video. Read its keypoints explicitly: the
# `keypoints` variable left over from the bounding-box loop belongs to whichever
# dataset sample was processed last, not to VIDEO_FILE2.
xmax = utils.readKeyPointsFromCSV(processedvideos, VIDEO_FILE2)["frame"].max()

# Plot colour plus an arousal/valence pair for each emotion label.
emotionColors = {
    "angry": {"color": "red", "arousal": 0.7, "valence": -0.7},
    "afraid": {"color": "orange", "arousal": 0.8, "valence": -0.2},
    "happy": {"color": "yellow", "arousal": 0.2, "valence": 0.9},
    "neutral": {"color": "gray", "arousal": 0, "valence": 0},
    "sad": {"color": "blue", "arousal": -0.2, "valence": -0.9},
}
# Display name for each face index used in the emotion plots (0 -> child, 1 -> adult).
who = ["child", "adult"]

In [None]:
# Switches for the individual timeline panels.
plotCoGrav = True
plotSpeech = True
plotEmotions = True

# Booleans sum as 0/1, so this is the number of panels to stack vertically.
subplots = sum([plotCoGrav, plotSpeech, plotEmotions])

if not session.selected:
    # exit() would shut down the notebook kernel, so just report and draw nothing.
    print("No video selected")
else:
    VideoID = dataset[session.selected[0]]["VideoID"]
    keypoints = utils.readKeyPointsFromCSV(processedvideos, VideoID)

    # this bit of pandas magic calculates average x and y for all the rows.
    # NOTE(review): columns 8..58 are presumably the keypoint values - confirm against the CSV layout.
    keypoints[["cogx", "cogy"]] = keypoints.apply(
        lambda row: calcs.rowcogs(row.iloc[8:59]), axis=1, result_type='expand')

    # Split by person up front: every panel below (and later cells) uses these.
    child = keypoints[keypoints["person"] == "child"]
    adult = keypoints[keypoints["person"] == "adult"]

    # Frame rate of the *selected* video, read from the FPS column of processedvideos;
    # fall back to the notebook-level FPS if the video has no row there.
    fps_match = processedvideos.loc[processedvideos["VideoID"] == VideoID, "FPS"]
    video_fps = float(fps_match.iloc[0]) if not fps_match.empty else FPS
    # Length of the video in seconds, shared x-range of all panels.
    duration = keypoints["frame"].max() / video_fps

    # going to add a subplot for each of the above flags
    plt.figure(figsize=(20, 5 * max(subplots, 1)))
    plt.suptitle("Video Time Line Plots")
    pltidx = 0

    if plotCoGrav:
        pltidx += 1
        ax = plt.subplot(subplots, 1, pltidx)
        ax.set_xlabel("Time (seconds)")
        ax.set_ylabel("Horizontal Position")
        ax.set_xlim(0, duration)
        # centre of gravity per frame; frame numbers are converted to seconds to match the axis
        childplot = ax.plot(child["frame"] / video_fps, child["cogx"], c="red", alpha=0.5)
        adultplot = ax.plot(adult["frame"] / video_fps, adult["cogx"], c="blue", alpha=0.5)
        ax.legend(['child', 'adult'], loc='upper left')

    if plotSpeech:
        pltidx += 1
        ax2 = plt.subplot(subplots, 1, pltidx)
        ax2.set_xlabel("Time (seconds)")
        ax2.set_ylabel("Identified Speech")
        ax2.set_xlim(0, duration)
        speechjson = utils.getSpeechData(processedvideos, VideoID)
        if speechjson is not None:
            nsegs = len(speechjson["segments"])
            ax2.set_ylim(0, nsegs)
            # One filled box per speech segment, first segment on the top row,
            # labelled with its text.
            # NOTE(review): assumes seg["start"]/seg["end"] are in seconds - confirm.
            for idx, seg in enumerate(speechjson["segments"]):
                bottom = nsegs - idx - 1
                top = nsegs - idx
                ax2.fill([seg["start"], seg["end"], seg["end"], seg["start"]],
                         [bottom, bottom, top, top], 'r', alpha=0.5)
                ax2.text(seg["start"], nsegs - idx - .5, seg["text"])

    if plotEmotions:
        pltidx += 1
        ax3 = plt.subplot(subplots, 1, pltidx)
        ax3.set_xlabel("Time (seconds)")
        ax3.set_ylabel("Detected Emotion")
        ax3.set_xlim(0, duration)
        emotions = utils.getFaceData(processedvideos, VideoID)
        if emotions is not None:
            # One horizontal row of dots per person, colour coded by emotion.
            for index, person in enumerate(who):
                ems = emotions[emotions["index"] == index]
                # key gives the emotion name, data the rows carrying that emotion
                for key, data in ems.groupby('emotion'):
                    # scatter needs a colour, not the whole emotionColors entry
                    colour = emotionColors.get(key, {}).get("color", "black")
                    ax3.scatter(data["frame"] / video_fps, [index] * len(data),
                                label=f"{person}: {key}", c=colour, alpha=0.5, s=100)
            ax3.set_yticks(range(len(who)))
            ax3.set_yticklabels(who)
            ax3.set_ylim(-0.5, len(who) - 0.5)
            if not emotions.empty:
                # show legend with emotion colours
                ax3.legend(loc='best')

    plt.show()




In [None]:
# Out of curiosity: horizontal nose position of the child (red) and the adult (blue), frame by frame.

childnoseplot, adultnoseplot = (
    plt.plot(person_rows["frame"], person_rows["nose.x"], c=colour, alpha=0.5)
    for person_rows, colour in ((child, "red"), (adult, "blue"))
)
plt.legend(['child', 'adult'], loc='upper left')

In [None]:

#suppress non plot outputs

# Same nose plot with labelled axes; ending on plt.show() keeps the cell output to the figure only.
childnoseplot, adultnoseplot = (
    plt.plot(person_rows["frame"], person_rows["nose.x"], c=colour, alpha=0.5)
    for person_rows, colour in ((child, "red"), (adult, "blue"))
)
plt.legend(['child', 'adult'], loc='upper left')
plt.xlabel('Time (s)')
plt.ylabel('X position of nose (horizontal pixels)')
# One tick per whole second: tick positions are frame numbers (every FPS frames),
# tick labels are the matching second counts.
plt.xticks(
    ticks=np.arange(0, child["frame"].max(), FPS),
    labels=np.arange(0, child["frame"].max()/FPS, 1),
)
plt.show()


Let's plot the captions.
Go through the speechjson. For each speech segment, add a horizontal line with the text. Start and end times come from the speechjson.

Now let's do a timeline for the emotions of the participants.
We'll experiment to find best visualisation. 
Note this assumes that faces are correctly assigned to the correct individuals.
TODO - Code that uses bounding boxes to assign faces to individuals.

First we will try a 'scatter' graph. Color coded for each emotion. 

In [None]:
videoname = os.path.basename(VIDEO_FILE2)

emotions = utils.getFaceData(processedvideos, videoname)
# constant y value so each person's emotions line up on one horizontal row
emotions["ticker"] = 1

# One row of axes per person, sharing the frame axis.
fig, ax = plt.subplots(2, 1, sharex=True)

for index in range(2):
    # who is the person we are plotting
    ax[index].set_title(who[index] + " emotions")
    ax[index].set_xlim(0, xmax)
    ems = emotions[emotions["index"] == index]
    # key gives the emotion name, data gives the actual values (also labels)
    for key, data in ems.groupby('emotion'):
        # scatter needs a colour, not the whole emotionColors entry;
        # emotions missing from the table are drawn in black.
        colour = emotionColors.get(key, {}).get("color", "black")
        # plot scatter plot of emotion occurrences
        ax[index].scatter(data["frame"], data["ticker"], label=key, c=colour, alpha=0.5, s=100)
    if not ems.empty:
        # show legend with emotion colours on every panel that has data
        ax[index].legend(loc='best')

plt.show()



In [None]:
# Playback of the annotated video: a play/pause button and a slider step through the
# frames, and the slider read-out doubles as the frame counter.
# Frames are decoded with OpenCV and shown in an ipywidgets Image.

import cv2
import ipywidgets as widgets
from IPython.display import display

annotatedVideo = utils.getVideoProperty(processedvideos, VIDEO_FILE2, "annotatedVideo")

# Only open the file when the lookup produced a usable path; the value may be None
# (or some other non-path placeholder - unverified) when no annotated video exists.
capture = None
if isinstance(annotatedVideo, str) and os.path.isfile(annotatedVideo):
    capture = cv2.VideoCapture(annotatedVideo)

if capture is None or not capture.isOpened():
    print("No annotated video available for", VIDEO_FILE2)
else:
    # Take the frame count and frame rate from the file itself instead of hard-coding them.
    nframes = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
    last_frame = max(nframes - 1, 0)
    native_fps = capture.get(cv2.CAP_PROP_FPS)
    interval_ms = int(1000 / native_fps) if native_fps > 0 else 1000

    # The "video player" is an image widget that is refreshed with the current frame.
    videoPlayer = widgets.Image(format='jpeg')

    def update_video(change):
        """Show the frame whose number is given by change["new"]."""
        capture.set(cv2.CAP_PROP_POS_FRAMES, change["new"])
        ok, frame = capture.read()
        if ok:
            videoPlayer.value = cv2.imencode('.jpg', frame)[1].tobytes()

    slider = widgets.IntSlider(min=0, max=last_frame, step=1, value=0, description="Frame")
    play = widgets.Play(min=0, max=last_frame, step=1, value=0, interval=interval_ms)
    # Keep the play button and the slider on the same frame number.
    widgets.jslink((play, 'value'), (slider, 'value'))
    # The slider follows the play button, so observing the slider covers both controls.
    slider.observe(update_video, names='value')

    # Show the first frame straight away.
    update_video({"new": 0})

    # create a VBox to hold the video player, slider, and play button
    widget_box = widgets.VBox([videoPlayer, widgets.HBox([play, slider])])

    # display the widget box
    display(widget_box)



In [None]:
# Show the annotated-video value that was looked up for VIDEO_FILE2 in the previous cell.
annotatedVideo


In [None]:
import cv2
import numpy as np
import ipywidgets as wd
from IPython.display import display, clear_output

# Open the second test video for frame-by-frame viewing.
video = cv2.VideoCapture(VIDEO_FILE2)


# Initialize the widget for displaying the video frame
image_widget = wd.Image(format='jpeg')

# Function to update the displayed frame
def update_frame(change):
    """Decode the frame numbered change["new"] and show it in image_widget."""
    frame_index = change["new"]
    video.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
    ok, frame = video.read()
    if not ok:
        # Nothing could be decoded (bad path or unreadable frame): keep the current image.
        return
    image_widget.value = cv2.imencode('.jpg', frame)[1].tobytes()

# Create an IntSlider for frame selection
frame_slider = wd.IntSlider(min=0, max=max(int(video.get(cv2.CAP_PROP_FRAME_COUNT)) - 1, 0), value=0)
frame_slider.observe(update_frame, names='value')

# Display the initial frame by calling the callback with the slider's current value.
update_frame({"new": frame_slider.value})

# Show the slider and video frame
display(wd.VBox([frame_slider, image_widget]))
