# 09 Classify videos by joke type and visualise with FiftyOne

Each video demonstrates a single joke type, which we have recorded as metadata. Can we train a classifier to predict the joke type from the movement data?

Each video shows one joke from a set of five possibilities [Peekaboo,TearingPaper,NomNomNom,ThatsNotAHat,ThatsNotACat].

We will use TensorFlow to train a classifier to predict the joke type from the movement data.
In order to see what the classifier is doing, we will use [FiftyOne](https://docs.voxel51.com/) to visualise the data and predictions.

In [None]:
import os
import fiftyone as fo
import pandas as pd
import numpy as np

# Make numpy values easier to read.
np.set_printoptions(precision=4, suppress=True)

import tensorflow as tf
from tensorflow.keras import layers

# local imports
import utils
import display
import calcs

## 9.1 Load the data

### Either use the small demo set
Consists of 54 videos from 4 families (parent and baby), each demoing five jokes three times. (Some are missing.)

In [2]:
# Input/output locations for the small demo set, all relative to this notebook's folder.
# NOTE: running the "full set" cell afterwards overwrites every name here except demo_data.
videos_in = os.path.join("..","LookitLaughter.test")
demo_data = os.path.join("..","data", "demo")
temp_out = os.path.join("..","data","0_temp")
data_out = os.path.join("..","data","1_interim")
videos_out = os.path.join("..","data","2_final")

### Or the Full set

Consists of 1425 videos from 90 families (parent and baby), each demoing approximately five jokes three times. There are some repetitions and omissions.

In [3]:
# Input/output locations for the full set, which lives two levels above this notebook.
# NOTE: this replaces the demo-set values of the same names if that cell was run first.
videos_in = os.path.join("..","..","LookitLaughter.full.videos")
temp_out = os.path.join("..","..","LookitLaughter.full.data","0_temp")
data_out = os.path.join("..","..","LookitLaughter.full.data","1_interim")
videos_out = os.path.join("..","..","LookitLaughter.full.data","2_final")


In [None]:
# Load the table of processed videos from data_out and report the range of video lengths.
processedVideos = utils.getProcessedVideos(data_out)
frameCounts = processedVideos['Frames']
minFrames, maxFrames = frameCounts.min(), frameCounts.max()
print(f"We have {len(processedVideos)} processed videos.")
print(f"Min Frames: {minFrames}\nMax Frames: {maxFrames}")
# last expression of the cell: preview the first rows
processedVideos.head()

In [None]:
# Use matplotlib to draw a histogram of the number of frames per video (40 bins).

import matplotlib.pyplot as plt

plt.hist(processedVideos['Frames'], bins=40)

## 9.2 Use Voxel51 and PyTorchVideo for examining videos

Voxel51 seems to be a useful tool for looking at training data (and trained predictions).

Let's start with the minimal implementation. Just viewing videos.

https://docs.voxel51.com/user_guide/dataset_creation/index.html



### 9.2.1 Is dataset already created?

FiftyOne may already have a dataset created. Let's check, and reload it if so.

In [None]:
# Reuse the first dataset FiftyOne already knows about, if there is one.
datasets = fo.list_datasets()
if datasets:
    print("Loading saved datasets: ", datasets[0])
    dataset = fo.load_dataset(datasets[0])
else:
    print("No datasets found. Load in step 7.1.2")

### 7.1.2 Populate a FiftyOne dataset with our videos and labels.

Either there is no existing dataset or we want to rebuild it.

In [7]:
# Optional: delete datasets before rebuilding.
# WARNING: the "*" pattern is passed as a glob, so this is expected to remove every
# FiftyOne dataset on this machine, not only the one built in this notebook. Skip this
# cell if other datasets must be kept.
fo.delete_datasets("*")

In [None]:
# Create a dataset from a directory of videos
datasetName = 'LookitLaughter.full'

# We are (re)building, so drop any earlier dataset with this name first;
# otherwise assigning the name below fails because it is already taken.
if fo.dataset_exists(datasetName):
    fo.delete_dataset(datasetName)

dataset = fo.Dataset.from_videos_dir(videos_in)
# Make sure every sample has frame documents so frame-level labels can be attached later.
dataset.ensure_frames()

dataset.name = datasetName
# FiftyOne datasets are non-persistent by default and are removed when the session ends.
# Persist this one so the "is dataset already created?" check can find it next time.
dataset.persistent = True

# Sample-level metadata fields, filled in from processedVideos in the next cell.
dataset.add_sample_field("JokeType", fo.StringField, description="What joke is being told?")
dataset.add_sample_field("HowFunny", fo.StringField, description="How funny is the joke?")
dataset.add_sample_field("LaughYesNo",  fo.BooleanField, description="Did the child laugh?")

Now let's see if we can add our metadata classifications. Recall that each video demos one joke type `[Peekaboo,TearingPaper,NomNomNom,ThatsNotAHat,ThatsNotACat]`, has a rating of how funny the baby found it `[Not Funny, Slightly Funny, Funny, Extremely Funny]`, and records whether they laughed `[Yes, No]`.


In [None]:
# Copy the joke type, funniness rating and laugh flag from processedVideos onto each sample.
for sample in dataset:
    # The bare file name (no directories, OS independent) is the key into processedVideos.
    videoname = os.path.basename(sample.filepath)
    matches = processedVideos.loc[processedVideos["VideoID"] == videoname]
    if matches.empty:
        print(f"Video {videoname} not found in processed videos.")
        continue

    # Values are stored as plain string/boolean fields rather than fo.Classification labels.
    # When several rows share a VideoID the first one wins.
    sample["VideoID"] = matches["VideoID"].iloc[0]
    sample["JokeType"] = matches["JokeType"].iloc[0]
    sample["HowFunny"] = matches["HowFunny"].iloc[0]
    sample["LaughYesNo"] = (matches["LaughYesNo"].iloc[0] == "Yes")

    sample.save()

Let's add the frame-by-frame annotations directly onto the videos inside FiftyOne.

In [None]:
# Let's start with people bounding boxes

for sample in dataset:
    # Retrieve people bounding boxes from the keypoints file for this video.
    try:
        keypoints = utils.readKeyPointsFromCSV(processedVideos, sample.filepath, normed=True)
    except FileNotFoundError:
        print(f"Error reading keypoints for {sample.filepath}")
        continue

    for _, row in keypoints.iterrows():
        # The +1 shifts the keypoint frame index onto FiftyOne's 1-based frame numbers
        # (presumably the CSV index is 0-based; confirm against the keypoint files).
        # int() guards against the index arriving as a float from the CSV.
        framenumber = int(row["frame"]) + 1
        person = row["person"]
        bbox = [row["bbox.x1"], row["bbox.y1"], row["bbox.x2"], row["bbox.y2"]]
        # FiftyOne wants [left, top, width, height]; assumes the normed coordinates
        # are already relative to the frame size - TODO confirm.
        bbox51 = calcs.xyxy2ltwh(bbox)

        # sample.frames hands back the existing frame or creates an empty one on first
        # access, so no explicit existence check is needed.
        frame = sample.frames[framenumber]
        frame[person] = fo.Detection(label=person, bounding_box=bbox51)
        sample.frames[framenumber] = frame
        #TODO fiftyone add keypoints not well documented and i can't get it to work.
        #frame[person + "KeyPoints"] =
        #fo.KeypointSkeleton()
        #TODO can't see how to add timesynced captions either!

    # Write all of this video's frame labels in one go, rather than once per keypoint row.
    sample.save()

dataset.save()

In [None]:
# Open the FiftyOne app on the dataset; keep the session handle for later interaction.
session = fo.launch_app(dataset)
# When running inside Docker, launch_app needs an explicit address and port:
# session = fo.launch_app(dataset, address="0.0.0.0", port=5151)

## 9.3 Load and preprocess the data

1. Load normed movement data. 
2. Exclude videos shorter than a min length
3. Pad all sequences to the same max length. 
4. Interpolate missing values (up to last frame of real data).
5. Replace final missing values with zeros.
6. Add to tf.data.Dataset.

In [None]:
def createMovementDataset(processedVideos, minFrames = 0, maxFrames = None, ragged = False):
    """
    Creates a movement dataset from processed videos.

    One time series is built per row of processedVideos: the normed keypoints CSV
    named in the row is loaded, passed through utils.padMovementData,
    utils.interpolateMovementData and utils.flattenMovementDataset, and any values
    still missing afterwards are set to zero.

    Args:
        processedVideos (pandas.DataFrame): A DataFrame containing processed video data.
            Must provide the columns "Frames", "Keypoints.normed" (path to a CSV file)
            and "Joke.Label".
        minFrames (int, optional): Rows with fewer than minFrames frames are excluded.
            Defaults to 0 (include all videos).
        maxFrames (int, optional): Length passed to utils.padMovementData so that all
            sequences share one length. Defaults to the max of all videos.
        ragged (bool, optional): Whether to create a ragged tensor. Defaults to False
            (TODO: not implemented yet)

    Returns:
        tf.data.Dataset: A TensorFlow Dataset of (features, label) pairs, one per
        included video, with features as float32 tensors.

    Raises:
        NotImplementedError: If ragged is True.
    """
    if ragged:
        raise NotImplementedError("Ragged tensors not implemented yet.")
    if maxFrames is None:
        maxFrames = processedVideos["Frames"].max()

    dataset = []
    labels = []
    # for each row of processedVideos, we add one timeseries to the dataset
    for _, r in processedVideos.iterrows():
        # Filter before touching the disk: excluded videos should not cost a CSV read
        # (or fail on a missing file).
        if r["Frames"] < minFrames:
            continue

        df = pd.read_csv(r['Keypoints.normed'])
        df = utils.padMovementData(df, maxFrames)
        df = utils.interpolateMovementData(df)
        # whatever is still missing after interpolation becomes zero
        df = df.replace(np.nan, 0)
        df = utils.flattenMovementDataset(df)

        dataset.append(tf.convert_to_tensor(df.values, dtype=tf.float32))
        labels.append(r["Joke.Label"])

    return tf.data.Dataset.from_tensor_slices((dataset, labels))

In [None]:
# Only train on videos with at least 100 frames, using 1000 as the common sequence length.
trainMinFrames = 100
trainMaxFrames = 1000
tfdataset = createMovementDataset(processedVideos, trainMinFrames, trainMaxFrames)

# Shuffle before the 80/20 split: the examples arrive in processedVideos row order, so an
# unshuffled split would just hold out the last rows of the table. A fixed seed keeps the
# split reproducible between runs.
train, test = tf.keras.utils.split_dataset(tfdataset, left_size=0.8, shuffle=True, seed=42)

In [None]:
# Get the first element of the dataset so we can grab its dimensions.
keyPoints = next(iter(train))[0]

# One output unit per joke type.
# Assumes "Joke.Label" holds non-negative integer class codes - TODO confirm.
numJokeTypes = int(processedVideos["Joke.Label"].max()) + 1

# Let's build a simple model: an LSTM summarises each sequence, followed by a softmax
# over the joke types (this is a classification task, not a regression).
model = tf.keras.Sequential([
    layers.Input(shape=(keyPoints.shape[0], keyPoints.shape[1])),
    layers.LSTM(8),
    layers.Dense(numJokeTypes, activation="softmax"),
])

# Compile the model. Sparse categorical cross-entropy takes the integer labels directly.
# The old sample_weight_mode='temporal' argument is dropped: there is one label per
# video, not one per time step.
model.compile(loss=tf.losses.SparseCategoricalCrossentropy(),
              optimizer=tf.optimizers.Adam(),
              metrics=[tf.metrics.SparseCategoricalAccuracy()])

# Train the model
model.fit(train.batch(32), epochs=20)


In [None]:
# Batch the held-out split once and reuse it for scoring and for prediction.
testBatches = test.batch(32)

# loss and metrics on the test split
model.evaluate(testBatches)

# model outputs for every test video, kept for inspection in the next cell
predictions = model.predict(testBatches)

In [None]:
# Display the model outputs computed for the test split in the previous cell.
predictions

In [None]:
# `labels` only ever existed inside createMovementDataset, so rebuild the true labels
# from the test split (elements are (features, label) pairs) to compare with `predictions`.
labels = np.array([label.numpy() for _, label in test])
labels