First we import all the libraries we need.

In [14]:
#!c1.32
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
from IPython.display import Video
import cv2
import math
import torch
import glob
from tqdm.auto import tqdm
from multiprocessing import Pool, cpu_count
import time
import argparse
import logging
from sys import exit
import shutil

from timm.models import create_model, apply_test_time_pool
from timm.data import ImageDataset, create_loader, resolve_data_config
from timm.utils import AverageMeter, setup_default_logging

We do some setup here. First we define `debug`. If `debug` is set, we do not actually want to train the model; we are just checking that everything works. That means a small number of epochs and only a small part of each video to train on.

We will try different error tolerances for each type of event and tune them as a hyperparameter. Instead of giving the model the single timestamp at which an event happens, we give it a window of total width `err_tol` centred on that timestamp, [event_timestamp - err_tol/2, event_timestamp + err_tol/2], and train it on that. For now only the first tolerance in each list is used.

Since the dataset is very small, we define the train/validation split manually. Once everything is ready we will try different combinations manually and keep the one that gives the best result.

In [41]:
#!c1.32
# Debug mode: only check that the pipeline runs end to end, so keep it short.
debug = False
epochs = 3 if debug else 20

# Candidate tolerance windows (in seconds) per event type.
err_tol = {
    'challenge': [0.30, 0.40, 0.50, 0.60, 0.70],
    'play': [0.15, 0.20, 0.25, 0.30, 0.35],
    'throwin': [0.15, 0.20, 0.25, 0.30, 0.35],
}

# Hand-picked validation / training split of the video ids.
video_id_split = {
    'val': [
        '3c993bd2_0',
        '3c993bd2_1',
    ],
    'train': [
        '1606b0e6_0',
        '1606b0e6_1',
        '35bd9041_0',
        '35bd9041_1',
        '407c5a9e_1',
        '4ffd5986_0',
        '9a97dae4_1',
        'cfbe2e94_0',
        'cfbe2e94_1',
        'ecf251d4_0',
    ],
}

# Event types that get replaced by start_/end_ windows.
event_names = ['challenge', 'throwin', 'play']

We then load the data and replace every row of the form [id, timestamp, event_type, attr] (except for the types `start` and `end`) with two rows: [id, timestamp - err_tol/2, start_<event_type>, attr] and [id, timestamp + err_tol/2, end_<event_type>, attr].

In [19]:
#!c1.32
# Load the event annotations. The loop below addresses the columns by
# position: 0 = video_id, 1 = time, 2 = event, 3 = event_attributes.
df = pd.read_csv("../dfl-bundesliga-data-shootout/train.csv")

additional_events = []
ordered = df.sort_values(['video_id', 'time', 'event', 'event_attributes'])
for row in ordered.values:
    event = row[2]
    if event not in err_tol:
        # `start` / `end` markers carry no tolerance and are kept unchanged
        continue
    # half of the first tolerance candidate goes on each side of the timestamp
    half_width = err_tol[event][0] / 2
    additional_events.extend([
        [row[0], row[1] - half_width, 'start_' + event, row[3]],
        [row[0], row[1] + half_width, 'end_' + event, row[3]],
    ])

window_rows = pd.DataFrame(additional_events, columns=df.columns)
df = pd.concat([df, window_rows])
# Drop the original point events listed in event_names (challenge, throwin,
# play); what remains is start, end and the new start_<event> / end_<event>
# rows, ordered by video and time.
df = df[~df['event'].isin(event_names)].sort_values(['video_id', 'time'])
df

Unnamed: 0,video_id,time,event,event_attributes
0,1606b0e6_0,200.265822,start,
0,1606b0e6_0,201.000000,start_challenge,['ball_action_forced']
1,1606b0e6_0,201.300000,end_challenge,['ball_action_forced']
2,1606b0e6_0,202.765822,end,
3,1606b0e6_0,210.124111,start,
...,...,...,...,...
11214,ecf251d4_0,3058.072895,end,
11215,ecf251d4_0,3068.280519,start,
8762,ecf251d4_0,3069.472000,start_throwin,['pass']
8763,ecf251d4_0,3069.622000,end_throwin,['pass']


In the next cell we go over every video and split it into images. Each image is assigned to one of 4 classes. If a frame falls between the start and end of an event, the image of that frame is assigned to that event's class. If a frame doesn't fall within any event, we assign it to the class `background`, which means no event is happening in this frame.

Also, instead of extracting the i-th frame as is, we stack three grayscale frames, the (i-1)-th, i-th and (i+1)-th, and treat the result as the i-th frame.

In [42]:
#!c1.32
def _interval_label(event):
    """Map the event that opens an interval to its image class.

    Returns ``None`` for a plain ``end`` row, whose interval is not exported.
    """
    if event == 'start':
        return 'background'
    if event == 'end':
        # should use as background?
        return None
    start_or_end, name = event.split('_', 1)
    # after an `end_<event>` row the event is over, so we are in background
    return 'background' if start_or_end == 'end' else name


def _read_gray_frame(cap, frame_num):
    """Seek to `frame_num` and return it in grayscale, or None if unreadable."""
    cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
    ret, frame = cap.read()
    if not ret:
        return None
    return cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)


def extract_training_images(args):
    """Write the labelled training images of one video to disk.

    For every pair of consecutive annotation rows of the video, the frames
    between the two timestamps are written as JPEG files to
    ``../work/split_images/{split}/{label}/{video_id}_{frame:06}.jpg``.

    The label comes from the row that opens the interval:

    * ``start``         -> ``background``
    * ``start_<event>`` -> ``<event>``
    * ``end_<event>``   -> ``background``
    * ``end``           -> the interval is skipped

    Each image stacks the grayscale frames i-1, i and i+1 as its three
    channels. Event intervals export every frame, background intervals only
    every 10th frame. A sample is skipped when any of its three frames
    cannot be read.

    Reads the module-level ``df`` (annotations) and ``debug`` (restricts the
    video to its first 10 annotation rows).

    Args:
        args: ``(video_id, split)`` pair, passed as a single argument
            (presumably so the function can be used with ``Pool.map``).

    Raises:
        IOError: if the video file cannot be opened.
    """
    video_id, split = args
    video_path = f"../dfl-bundesliga-data-shootout/train/{video_id}.mp4"
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            raise IOError(f"could not open video {video_path}")
        # frames per second, used to convert timestamps into frame indices
        fps = cap.get(cv2.CAP_PROP_FPS)

        # keep only the annotation rows of this video
        df_video = df[df.video_id == video_id]
        if debug:
            df_video = df_video.head(10)
        print(split, video_id, df_video.shape)

        events = df_video[['time', 'event']].values
        # walk over consecutive rows: each pair delimits one interval
        for (crr_time, crr_event), (nxt_time, _) in zip(events[:-1], events[1:]):
            crr_status = _interval_label(crr_event)
            if crr_status is None:
                continue

            crr_frame = int(math.ceil(crr_time * fps))
            nxt_frame = int(math.ceil(nxt_time * fps))

            result_dir = f"../work/split_images/{split}/{crr_status}"
            os.makedirs(result_dir, exist_ok=True)

            # background intervals are only sampled every 10th frame
            step = 10 if crr_status == 'background' else 1
            for frame_num in range(crr_frame, nxt_frame, step):
                # previous, current and next frame become the three channels
                channels = [
                    _read_gray_frame(cap, n)
                    for n in (frame_num - 1, frame_num, frame_num + 1)
                ]
                if any(channel is None for channel in channels):
                    # a neighbour is missing (e.g. at the end of the video)
                    continue
                frame = np.stack(channels, axis=-1)

                out_file = f'{result_dir}/{video_id}_{frame_num:06}.jpg'
                cv2.imwrite(out_file, frame)
    finally:
        # always hand the decoder / file handle back
        cap.release()

# Extract the images of every video in every split, one video at a time.
for split, video_ids in video_id_split.items():
    for video_id in video_ids:
        extract_training_images([video_id, split])
print('done')

train 1606b0e6_0 (1396, 4)
train 1606b0e6_1 (1756, 4)
train 35bd9041_0 (1486, 4)
train 35bd9041_1 (1292, 4)
train 407c5a9e_1 (1208, 4)
train 4ffd5986_0 (1094, 4)
train 9a97dae4_1 (1028, 4)
train cfbe2e94_0 (1128, 4)
train cfbe2e94_1 (1048, 4)
train ecf251d4_0 (1366, 4)
done


In [34]:
#!c1.32
# Sanity check: print every [video_id, split] pair in processing order.
for split, video_ids in video_id_split.items():
    for video_id in video_ids:
        print([video_id, split])

['3c993bd2_0', 'val']
['3c993bd2_1', 'val']
['1606b0e6_0', 'train']
['1606b0e6_1', 'train']
['35bd9041_0', 'train']
['35bd9041_1', 'train']
['407c5a9e_1', 'train']
['4ffd5986_0', 'train']
['9a97dae4_1', 'train']
['cfbe2e94_0', 'train']
['cfbe2e94_1', 'train']
['ecf251d4_0', 'train']


Training was causing memory problems; we used the following line to alleviate the problem.

In [43]:
#!c1.32
# Ask PyTorch to release the GPU memory it is holding in its cache before
# training starts; added because training was running into memory problems.
torch.cuda.empty_cache()

We use the pretrained tf_efficientnet_b0_ap model from the timm library and train it on our images using the train script from the timm library.

In [None]:
#!g1.4
# Fine-tune the pretrained tf_efficientnet_b0_ap model on the extracted images
# (4 classes) with timm's train.py.
# NOTE(review): --epochs is hard-coded to 20; the `epochs` variable from the
# setup cell is not passed through, so `debug` does not shorten this run.
# NOTE(review): the images are written under ../work/split_images/ but read
# here from work/split_images/ -- confirm the working directory.
# NOTE(review): --img-size 32 is passed together with --input-size 3 720 1280;
# the logged data config reports input_size (3, 720, 1280) -- confirm that
# --img-size is intended.
%run ../image_models/train.py work/split_images/ \
    -b 16 \
    --input-size 3 720 1280 \
    --img-size 32 \
    --amp \
    --epochs 20 \
    --pretrained \
    --num-classes 4 \
    --model tf_efficientnet_b0_ap \
    --experiment dfl-benchmark-training-fix-extract-images \
    --bce-loss \
    --cooldown-epochs 0 \
    --drop 0.2 \
    --mixup 0.2 \
    --color-jitter 0.6


Training with a single process on 1 GPUs.
Loading pretrained weights from url (https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b0_ap-f262efe1.pth)
Downloading: "https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b0_ap-f262efe1.pth" to /tmp/xdg_cache/torch/hub/checkpoints/tf_efficientnet_b0_ap-f262efe1.pth
Model tf_efficientnet_b0_ap created, param count:4012672
Data processing configuration for current model + dataset:
	input_size: (3, 720, 1280)
	interpolation: bicubic
	mean: (0.5, 0.5, 0.5)
	std: (0.5, 0.5, 0.5)
	crop_pct: 0.875
Using native Torch AMP. Training in mixed precision.
Scheduled epochs: 20
Test: [   0/2388]  Time: 1.496 (1.496)  Loss:  0.2366 (0.2366)  Acc@1: 100.0000 (100.0000)  Acc@5: 100.0000 (100.0000)
Test: [  50/2388]  Time: 0.088 (0.303)  Loss:  0.1694 (0.1647)  Acc@1: 100.0000 (100.0000)  Acc@5: 100.0000 (100.0000)
Test: [ 100/2388]  Time: 0.492 (0.294)  Loss:  0.1604 (0

We take the training checkpoints and average the weights of the last few using the script provided by timm, then save the result as a model so we don't have to retrain it every time.

In [None]:
#!c1.32
# Average the weights of the checkpoints in the experiment's output directory
# with timm's avg_checkpoints.py and store the result as a single model file.
# NOTE(review): the --input path is relative to the current working
# directory -- confirm it matches where train.py wrote its output.
%run ../image_models/avg_checkpoints.py --input output/train/dfl-benchmark-training-fix-extract-images \
    --output ../model/tf_efficientnet_b0_ap-456-fix.pt

=> Extracting metric from checkpoint 'output/train/dfl-benchmark-training-fix-extract-images/checkpoint-10.pth.tar'
=> Extracting metric from checkpoint 'output/train/dfl-benchmark-training-fix-extract-images/checkpoint-11.pth.tar'
=> Extracting metric from checkpoint 'output/train/dfl-benchmark-training-fix-extract-images/checkpoint-12.pth.tar'
=> Extracting metric from checkpoint 'output/train/dfl-benchmark-training-fix-extract-images/checkpoint-13.pth.tar'
=> Extracting metric from checkpoint 'output/train/dfl-benchmark-training-fix-extract-images/checkpoint-14.pth.tar'
=> Extracting metric from checkpoint 'output/train/dfl-benchmark-training-fix-extract-images/checkpoint-15.pth.tar'
=> Extracting metric from checkpoint 'output/train/dfl-benchmark-training-fix-extract-images/checkpoint-16.pth.tar'
=> Extracting metric from checkpoint 'output/train/dfl-benchmark-training-fix-extract-images/checkpoint-17.pth.tar'
=> Extracting metric from checkpoint 'output/train/dfl-benchmark-trainin