# Video Annotator
## Video understanding demo

**Note**:
- read the `data` section of the README first.
- this notebook assumes that you have already
    - downloaded data
    - placed it in the directory expected by `videoannotator/config.py`

In [18]:
import json
import numpy as np
import pandas as pd
from IPython.display import HTML

from videoannotator import data, models, io, config as cfg

## helpers

In [19]:
def get_model(label: str) -> models.LogisticRegression:
    """
    Fit a classifier on the aggregated annotations available for `label`.

    The aggregated dataset for `label` is loaded through the `data` module and
    a `models.LogisticRegression` (scored by average precision) is fitted on
    its `x` / `y` attributes. The fitted model is returned.
    """
    dataset = data.get_aggregate_labeled_dataset(label=label)
    model = models.LogisticRegression(scoring='average_precision')
    model.fit(dataset.x, dataset.y)
    return model

def get_score(label: str, key: str) -> float:
    """
    Return the score of the `label` classifier for the shot identified by `key`.

    The classifier is trained on first use and stored in the notebook-level
    `clfs` dict, so later calls for the same label reuse it.
    """
    if label not in clfs:
        # lazily train and remember the model for this label
        clfs[label] = get_model(label=label)
    embedding = data.get_embeddings_dict()[key]
    classifier = clfs[label]
    # `[None, :]` adds a leading axis so a single embedding is passed as a batch of one
    prediction = classifier.predict_proba(embedding[None, :])
    return prediction.item()

def get_score_all_labels(key: str) -> dict:
    """
    Map every label in `cfg.LABELS` to its score for the shot `key`.
    """
    scores = {}
    for label in cfg.LABELS:
        scores[label] = get_score(label=label, key=key)
    return scores

def get_color(val: float) -> str:
    """
    Build the CSS `color` rule used for table cell formatting.

    Values above 0.5 get a green tone whose alpha is `val`; all other values
    get a red tone whose alpha is `1 - val`.
    """
    is_high = val > 0.5
    red, green = (0, 128) if is_high else (255, 0)
    alpha = val if is_high else 1 - val
    return f'color: rgba({red}, {green}, 0, {alpha})'

## load a few models

In [40]:
# labels whose models are trained up front
labels = ('action', 'establishing-shots', 'day', 'car-chase')
# label -> fitted classifier; `get_score` adds further labels to this dict on demand
clfs = {name: get_model(label=name) for name in labels}

In [21]:
# the full set of available labels is at cfg.LABELS;
# show them alphabetically as one comma-separated string
', '.join(sorted(cfg.LABELS))

'action, aerial, alcohol, anger, animal, car-chase, character-focus, closeup, cowboy-shot, day, drama, drugs, dutch-angle, establishing-shots, extreme-close-up, extreme-wide-shot, eye-level, fantasy, fight, golden-hour, gore, group-shot, handheld, happy, high-angle, horror, insert-shot, interior, interview, intimacy, jump-scare, laughter, low-angle, medium, nudity, object, over-the-shoulder-shot, overhead-shot, pan, point-of-view-shot, romance, run, sad, scared, sci-fi, shutter-shot, single-shot, slow-motion, smoking, static-shot, tilt-shot, timelapse, two-shot, violence, wide, zoom'

## load a few sample embeddings

In [22]:
# read the sample embeddings; the `with` block closes the file handle as soon
# as parsing finishes instead of leaving it open until garbage collection
with open('sample-embeddings.json') as f:
    samples = json.load(f)
# title -> embedding, with each embedding converted to a numpy array
samples = {k: np.array(v) for k, v in samples.items()}

## score the sample embeddings

In [41]:
def _get_record(title, emb):
    """
    Build one table row for a sample video.

    The row holds the title, an `<img>` tag pointing at the thumbnail listed
    for `title` in the notebook-level `pngs` dict, and one `<label>_score`
    entry for each classifier currently in `clfs`.
    """
    record = {
        'title': title,
        'video_thumbnail': f'<img src="images/{pngs[title]}">',
    }
    for label, clf in clfs.items():
        # `[None, :]` adds a leading axis so the single embedding is scored as a batch of one
        record[f'{label}_score'] = clf.predict_proba(emb[None, :]).item()
    return record

In [42]:
# thumbnail file name (under images/) for each sample title; `_get_record` looks titles up here
pngs = {'Operation Varsity Blues': 'varsity.png', '6 Underground': '6.png', 'Leave the world behind': 'leave.png'}
# one row per sample; title and thumbnail HTML become the row index so only score columns remain
df = pd.DataFrame(
    _get_record(title, emb)
    for title, emb in samples.items()
).set_index(['title', 'video_thumbnail'])        
# from here on `df` is a pandas Styler (not a DataFrame): cells are colored by
# `get_color` and displayed with two decimals
# NOTE(review): recent pandas releases deprecate `Styler.applymap` in favour of
# `Styler.map` — confirm against the installed pandas version
df = df.style.applymap(get_color).format(lambda x: f'{x:.2f}')
HTML(df.to_html(escape=False))

Unnamed: 0_level_0,Unnamed: 1_level_0,action_score,establishing-shots_score,day_score,car-chase_score
title,video_thumbnail,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Operation Varsity Blues,,0.18,0.99,0.84,0.01
6 Underground,,0.73,0.3,0.69,0.78
Leave the world behind,,0.36,0.12,0.11,0.04


## bring your own CLIP4Clip (C4C) embeddings

Use [CLIP4Clip](https://github.com/ArrowLuo/CLIP4Clip) to extract video embeddings from your own videos, and then apply any of the trained models to them.

In [25]:
# fixed seed so the placeholder embedding is reproducible across runs
np.random.seed(0)
# placeholder input: a random array of shape (1, 512)
your_emb = np.random.rand(1, 512)  # replace with your own

In [26]:
label = 'car-chase'  # pick from any model in `cfg.LABELS`
# trains a fresh model for `label` (it is not stored in `clfs`) and scores the embedding
get_model(label=label).predict_proba(your_emb).item()

0.10548243605868816

## apply any model to any shot in the corpus from the Condensed Movies Dataset
More details at [this link](https://www.robots.ox.ac.uk/~vgg/data/condensed-movies/)

In [27]:
df_shots = pd.read_csv(io.PATHS_STATIC.shot_data).set_index('key')
df_shots.head()

Unnamed: 0_level_0,yt_id,desc,genre,frame_in,frame_out,start,end,fps,cluster_index
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
HnnBvemHrWA|0,HnnBvemHrWA,Frankie and Johnny - He Just Asked Her Out,Comedy,0,82,0.0,3.420311,23.974427,8
gOJJm_cSRds|3987,gOJJm_cSRds,The Wild Bunch - Bank Shootout,Action,3987,4007,166.302203,167.136425,23.974427,6
mDViU8OSRkA|2222,mDViU8OSRkA,Non-Stop - How's Your Daughter? Scene,,2222,2284,92.675917,95.261833,23.976024,3
egB-SG97EcI|805,egB-SG97EcI,Kalifornia - I Think I Gotta Kill You Scene,Crime,805,825,33.57519,34.409357,23.976037,0
uGsWYV2bWAc|1013,uGsWYV2bWAc,Kill Bill: Vol. 1 - The Bride vs. Gogo,Action,1013,1041,42.250542,43.418375,23.976024,7


### pick a key from the dataframe above

In [28]:
# first row of the `df_shots.head()` output; any value from the `key` index can be used
key = 'HnnBvemHrWA|0'

In [29]:
# score of the 'action' classifier for the selected shot
get_score(label='action', key=key)

0.15079263042877716

In [30]:
# 'closeup' is not among the preloaded models, so `get_score` trains it on this first call
get_score(label='closeup', key=key)

0.8293545496789174

In [31]:
# score of the 'establishing-shots' classifier for the selected shot
get_score(label='establishing-shots', key=key)

0.02759443746631655

## apply all models to a shot in the corpus

In [32]:
def get_link(key: str) -> HTML:
    """
    Return an HTML anchor that opens the shot `key` on YouTube.

    The shot's row is looked up in the notebook-level `df_shots` frame; the
    link uses its `yt_id` and its `start` value converted to an integer as the
    `t` offset.
    """
    shot = df_shots.loc[key]
    url = f'https://youtu.be/{shot.yt_id}?t={int(shot.start)}'
    anchor = f'<a target="_blank" rel="noopener noreferrer" href="{url}">click here to watch</a>'
    return HTML(anchor)

In [33]:
# score every label in cfg.LABELS against the shot, then
# sort by score descending and round to two decimals for display
pd.Series(get_score_all_labels(key=key)).sort_values(ascending=False).round(2)

eye-level                 0.89
interior                  0.83
closeup                   0.83
character-focus           0.75
single-shot               0.65
static-shot               0.55
medium                    0.44
fantasy                   0.39
object                    0.39
low-angle                 0.37
drama                     0.34
violence                  0.33
horror                    0.31
over-the-shoulder-shot    0.31
day                       0.27
scared                    0.22
smoking                   0.22
anger                     0.20
happy                     0.17
handheld                  0.16
animal                    0.15
action                    0.15
sad                       0.12
tilt-shot                 0.11
pan                       0.10
insert-shot               0.09
cowboy-shot               0.08
alcohol                   0.07
zoom                      0.07
intimacy                  0.06
dutch-angle               0.06
slow-motion               0.05
run     

In [34]:
# get the link to watch: renders a clickable anchor for the selected shot
get_link(key=key)