## Libraries & Dependencies

In [1]:
import os
import json
from pathlib import Path
import subprocess
import whisper # pip install openai-whisper
import sys
import torch 

### Step 1
Find all unique labels from game annotations

In [None]:
# Collect every distinct annotation label found across all game folders.
data_root = Path.cwd() / "data" / "raw"
unique_labels = set()

for game_dir in data_root.iterdir():
    json_path = game_dir / "Labels-v2.json"
    # Ignore zips / stray files, and game folders without an annotation file.
    if not (game_dir.is_dir() and json_path.exists()):
        continue

    with json_path.open() as f:
        payload = json.load(f)

    # Missing or empty labels are dropped; everything else goes into the set.
    candidate_labels = (ann.get("label") for ann in payload.get("annotations", []))
    unique_labels.update(filter(None, candidate_labels))

print(f"Found {len(unique_labels)} unique labels")
print("-" * 22)
for label in sorted(unique_labels):
    print(label)


Found 17 unique labels
----------------------
Ball out of play
Clearance
Corner
Direct free-kick
Foul
Goal
Indirect free-kick
Kick-off
Offside
Penalty
Red card
Shots off target
Shots on target
Substitution
Throw-in
Yellow card
Yellow->red card


### Step 2
Manually pick highlight vs. non-highlight labels for binary classification

In [3]:
# Labels hand-picked as "highlight" events; every other label found in
# Step 1 is treated as a non-highlight.
highlight_labels = sorted(
    (
        "Goal",
        "Penalty",
        "Shots on target",
        "Shots off target",
        "Direct free-kick",
        "Yellow->red card",
        "Red card",
    )
)

non_highlight_labels = sorted(unique_labels.difference(highlight_labels))

# Print both groups, each under a heading underlined to the heading's width.
for heading, group in (
    ("\nHighlight Labels:", highlight_labels),
    ("\nNon-Highlight Labels:", non_highlight_labels),
):
    print(heading)
    print("-" * len(heading.strip()))
    for label in group:
        print(label)


Highlight Labels:
-----------------
Direct free-kick
Goal
Penalty
Red card
Shots off target
Shots on target
Yellow->red card

Non-Highlight Labels:
---------------------
Ball out of play
Clearance
Corner
Foul
Indirect free-kick
Kick-off
Offside
Substitution
Throw-in
Yellow card


### Step 3:
Load the labels and split events by each half

In [47]:
def load_events(label_json_path):
    """Load a game's annotation file and split its events by half.

    Parameters
    ----------
    label_json_path : str or path-like
        Path to a JSON file with a top-level "annotations" list. Each
        annotation must carry a "gameTime" string ("1 - mm:ss" or
        "2 - mm:ss") and a "position" value convertible to ``int``.

    Returns
    -------
    tuple[list[dict], list[dict]]
        ``(half1, half2)``: the annotations whose "gameTime" starts with
        "1 -" and "2 -" respectively, in file order. Their "position" is
        converted to ``int`` in place. Annotations belonging to neither
        half are dropped.

    Raises
    ------
    KeyError
        If "annotations", "gameTime" or "position" is missing.
    """
    # Context manager so the handle is closed even if parsing fails; the
    # encoding is pinned so the result does not depend on the OS default.
    with open(label_json_path, encoding="utf-8") as f:
        data = json.load(f)
    events = data["annotations"]

    # Split by half based on "gameTime": "1 - mm:ss" or "2 - mm:ss"
    half1 = [e for e in events if e["gameTime"].startswith("1 -")]
    half2 = [e for e in events if e["gameTime"].startswith("2 -")]

    # Convert "position" to int milliseconds
    for e in half1 + half2:
        e["position"] = int(e["position"])

    return half1, half2

In [48]:
# Example: parse the annotations of one game (swap in any game directory name).
game_dir_name = "2016-02-07 - 19-00 Chelsea 1 - 1 Manchester United"
game_dir = "data/raw/" + game_dir_name
label_json_path = game_dir + "/Labels-v2.json"

half1_events, half2_events = load_events(label_json_path)
for ordinal, half_events in (("1st", half1_events), ("2nd", half2_events)):
    print(f"Number of events in {ordinal} half:", len(half_events))

Number of events in 1st half: 106
Number of events in 2nd half: 130


In [31]:
# Peek at the first five first-half events to sanity-check the parsed structure.
half1_events[:5]

[{'gameTime': '1 - 00:00',
  'label': 'Kick-off',
  'position': 32,
  'team': 'home',
  'visibility': 'not shown'},
 {'gameTime': '1 - 01:12',
  'label': 'Ball out of play',
  'position': 72033,
  'team': 'not applicable',
  'visibility': 'visible'},
 {'gameTime': '1 - 01:16',
  'label': 'Throw-in',
  'position': 76339,
  'team': 'away',
  'visibility': 'visible'},
 {'gameTime': '1 - 02:07',
  'label': 'Ball out of play',
  'position': 127454,
  'team': 'not applicable',
  'visibility': 'visible'},
 {'gameTime': '1 - 02:18',
  'label': 'Throw-in',
  'position': 138486,
  'team': 'away',
  'visibility': 'visible'}]

### Step 4:
Filter to highlight events only

In [32]:
def filter_highlight_events(events, highlight_labels):
    """Return the events whose "label" is one of ``highlight_labels``.

    Parameters
    ----------
    events : iterable of dict
        Annotation dicts. Entries without a "label" key are skipped rather
        than raising, matching how the label scan treats them.
    highlight_labels : iterable of str
        Labels that count as highlights. Any iterable works, including a
        one-shot iterator.

    Returns
    -------
    list[dict]
        The matching events, in their original order (same dict objects).
    """
    # Materialise once: constant-time membership tests, and a generator
    # argument is not consumed by the first lookup.
    wanted = frozenset(highlight_labels)
    return [e for e in events if e.get("label") in wanted]

# Keep only the highlight events of each half.
half1_h_events, half2_h_events = (
    filter_highlight_events(half_events, highlight_labels)
    for half_events in (half1_events, half2_events)
)

for ordinal, kept in (("1st", half1_h_events), ("2nd", half2_h_events)):
    print(f"Number of highlight events in {ordinal} half:", len(kept))

Number of highlight events in 1st half: 7
Number of highlight events in 2nd half: 20


In [33]:
# Peek at the first five highlight events of the first half.
half1_h_events[:5]

[{'gameTime': '1 - 12:33',
  'label': 'Shots on target',
  'position': 753206,
  'team': 'away',
  'visibility': 'visible'},
 {'gameTime': '1 - 23:05',
  'label': 'Shots off target',
  'position': 1385603,
  'team': 'away',
  'visibility': 'visible'},
 {'gameTime': '1 - 25:35',
  'label': 'Shots off target',
  'position': 1535540,
  'team': 'home',
  'visibility': 'visible'},
 {'gameTime': '1 - 31:00',
  'label': 'Shots off target',
  'position': 1860822,
  'team': 'home',
  'visibility': 'visible'},
 {'gameTime': '1 - 34:00',
  'label': 'Shots off target',
  'position': 2040531,
  'team': 'home',
  'visibility': 'visible'}]

### Step 5:
Build clip metadata per half

In [34]:
def label_clip(start_ms, end_ms, highlight_events):
    """
    Binary label for one clip window.

    Returns 1 when at least one event in ``highlight_events`` has its
    "position" inside the half-open interval [start_ms, end_ms), else 0.
    """
    hit = any(
        start_ms <= event["position"] < end_ms
        for event in highlight_events
    )
    return int(hit)

In [35]:
CLIP_LEN_MS = 8000 # 8s = 8000 ms

def build_clips_for_half(game_dir, half_id, highlight_events, half_duration_ms,
                         clip_len_ms=None):
    """Build the metadata records for all fixed-length clips of one half.

    The half is tiled with back-to-back windows starting at 0 ms. Each
    window is labelled with ``label_clip`` and paired with the output paths
    its media/transcript files will be written to.

    Parameters
    ----------
    game_dir : str or path-like
        Game directory; its last path component prefixes every clip id.
    half_id : int
        Half number, used in ids and output paths.
    highlight_events : list of dict
        Highlight events of this half (each with an integer "position").
    half_duration_ms : int
        Duration of the half's video in milliseconds.
    clip_len_ms : int, optional
        Window length in milliseconds. Defaults to the module-level
        ``CLIP_LEN_MS`` (looked up at call time).

    Returns
    -------
    list[dict]
        One record per window with keys "clip_id", "half", "start_ms",
        "end_ms", "highlight" and the four "*_path" entries. The last
        window's "end_ms" is ``start + clip_len_ms`` and is not clamped,
        so it can exceed ``half_duration_ms``.

    Raises
    ------
    ValueError
        If the clip length is not positive.
    """
    if clip_len_ms is None:
        clip_len_ms = CLIP_LEN_MS
    if clip_len_ms <= 0:
        raise ValueError(f"clip_len_ms must be positive, got {clip_len_ms}")

    game_name = Path(game_dir).name
    clips = []

    for idx, start in enumerate(range(0, half_duration_ms, clip_len_ms)):
        end = start + clip_len_ms
        label = label_clip(start, end, highlight_events)

        # Common prefix of the files belonging to this clip.
        out_prefix = f"{game_dir}/processed/half{half_id}/clip_{idx:04d}"

        clips.append({
            "clip_id": f"{game_name}_half{half_id}_clip{idx:04d}",
            "half": half_id,
            "start_ms": start,
            "end_ms": end,
            "highlight": label,

            # where files will go:
            "full_clip_path":  f"{out_prefix}_full.mkv",
            "video_only_path": f"{out_prefix}_video.mkv",
            "audio_path":      f"{out_prefix}_audio.wav",
            "transcript_path": f"{out_prefix}_speech.txt",
        })

    return clips

In [36]:
def get_video_duration_ms(video_path):
    """Return the duration of a video file in whole milliseconds.

    Runs ``ffprobe`` on ``video_path`` and reads ``format.duration``
    (seconds) from its JSON output.

    Parameters
    ----------
    video_path : str or path-like
        Video file to probe.

    Returns
    -------
    int
        Duration rounded to the nearest millisecond.

    Raises
    ------
    RuntimeError
        If ffprobe exits with a non-zero status, or its output carries no
        usable duration.
    """
    cmd = [
        "ffprobe",
        "-v", "error",  # keep stderr limited to real errors so it can be reported
        "-print_format", "json",
        "-show_format",
        str(video_path),
    ]
    result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    if result.returncode != 0:
        raise RuntimeError(f"ffprobe failed for {video_path}:\n{result.stderr}")

    try:
        duration_sec = float(json.loads(result.stdout)["format"]["duration"])
    except (ValueError, KeyError, TypeError) as exc:
        raise RuntimeError(f"ffprobe returned no duration for {video_path}") from exc

    # round() rather than int(): the float product can land just below the
    # true millisecond value and truncation would lose 1 ms.
    return round(duration_sec * 1000)

In [37]:
# Locate the 224p recording of each half and probe how long it runs.
half1_path, half2_path = (f"{game_dir}/{half_no}_224p.mkv" for half_no in (1, 2))

half1_duration_ms, half2_duration_ms = map(
    get_video_duration_ms, (half1_path, half2_path)
)

(half1_duration_ms, half2_duration_ms)

(2700003, 3020003)

In [38]:
# Tile each half into labelled clip records.
half1_clips = build_clips_for_half(game_dir, 1, half1_h_events, half1_duration_ms)
half2_clips = build_clips_for_half(game_dir, 2, half2_h_events, half2_duration_ms)

(len(half1_clips), len(half2_clips))

(338, 378)

### Step 6:
Store labeled clip metadata as JSON

In [39]:
# Persist the clip records of both halves (first half first) in one JSON file.
output_dir = f"{game_dir}/processed"
os.makedirs(output_dir, exist_ok=True)

clips_json_path = f"{output_dir}/clips.json"
all_clips = half1_clips + half2_clips
with open(clips_json_path, "w") as f:
    json.dump(all_clips, f, indent=4)

print("Saved:", clips_json_path)

Saved: data/raw/2016-02-07 - 19-00 Chelsea 1 - 1 Manchester United/processed/clips.json


### Step 7:
Extract the clips using the metadata created

In [40]:
import subprocess

def run_ffmpeg(cmd):
    """Execute ``cmd`` as a subprocess, capturing stdout/stderr as text.

    Returns the completed-process object on exit status 0; otherwise raises
    ``RuntimeError`` carrying the captured stderr.
    """
    completed = subprocess.run(cmd, capture_output=True, text=True)
    if completed.returncode == 0:
        return completed
    raise RuntimeError(f"ffmpeg failed:\n{completed.stderr}")

def cut_full_clip(input_video, start_ms, out_path):
    """Cut a CLIP_LEN_MS-long clip (video + audio) starting at ``start_ms``.

    Both streams are stream-copied, so nothing is re-encoded (no libx264 /
    aac encoder needed). An existing ``out_path`` is overwritten.
    """
    seek_s, length_s = start_ms / 1000, CLIP_LEN_MS / 1000

    ffmpeg_args = ["ffmpeg", "-v", "error", "-y"]
    ffmpeg_args += ["-ss", str(seek_s), "-i", input_video, "-t", str(length_s)]
    ffmpeg_args += ["-c:v", "copy", "-c:a", "copy"]  # copy streams as-is
    ffmpeg_args.append(out_path)

    run_ffmpeg(ffmpeg_args)

def cut_video_only(input_video, start_ms, out_path):
    """Cut a CLIP_LEN_MS-long, audio-free clip starting at ``start_ms``.

    The video stream is stream-copied (no re-encode) and audio is dropped
    with ``-an``. An existing ``out_path`` is overwritten.
    """
    seek_s, length_s = start_ms / 1000, CLIP_LEN_MS / 1000

    ffmpeg_args = ["ffmpeg", "-v", "error", "-y"]
    ffmpeg_args += ["-ss", str(seek_s), "-i", input_video, "-t", str(length_s)]
    ffmpeg_args += ["-c:v", "copy", "-an"]  # keep video untouched, strip audio
    ffmpeg_args.append(out_path)

    run_ffmpeg(ffmpeg_args)

def cut_audio_only(input_video, start_ms, out_path):
    """Extract CLIP_LEN_MS of audio starting at ``start_ms``.

    Video is dropped (``-vn``) and the audio is written as mono
    (``-ac 1``) at 16 kHz (``-ar 16000``). An existing ``out_path`` is
    overwritten.

    Parameters
    ----------
    input_video : str
        Source video file.
    start_ms : int
        Clip start offset in milliseconds.
    out_path : str
        Destination audio file.

    Raises
    ------
    RuntimeError
        Propagated from ``run_ffmpeg`` when ffmpeg exits non-zero.
    """
    start = start_ms / 1000
    dur = CLIP_LEN_MS / 1000
    cmd = [
        # "-v error" matches the other cut_* helpers: stderr then holds only
        # real errors, which keeps the RuntimeError message readable.
        "ffmpeg", "-v", "error", "-y",
        "-ss", str(start),
        "-i", input_video,
        "-t", str(dur),
        "-vn",
        "-ac", "1",
        "-ar", "16000",
        out_path,
    ]
    run_ffmpeg(cmd)

In [41]:
# Materialise every clip described in clips.json: the full clip, a
# video-only copy, a mono audio track, and an empty transcript file.
with open(f"{game_dir}/processed/clips.json") as f:
    clips = json.load(f)

for clip in clips:
    half, start_ms = clip["half"], clip["start_ms"]
    input_mkv = f"{game_dir}/{half}_224p.mkv"

    outdir = f"{game_dir}/processed/half{half}"
    os.makedirs(outdir, exist_ok=True)

    # Same source and offset for all three variants; only the cutter and
    # the destination differ.
    for cutter, path_key in (
        (cut_full_clip, "full_clip_path"),
        (cut_video_only, "video_only_path"),
        (cut_audio_only, "audio_path"),
    ):
        cutter(input_mkv, start_ms, clip[path_key])

    # Placeholder that the ASR step fills in later.
    with open(clip["transcript_path"], "w") as t:
        t.write("")

print("All clips generated.")


All clips generated.


In [42]:
# Report which interpreter and which PyTorch / CUDA build this kernel uses.
print('Python executable:', sys.executable)
print('PyTorch version:', torch.__version__)
print('Torch CUDA available:', torch.cuda.is_available())
print('Torch CUDA version (built with):', torch.version.cuda)

# Device queries are wrapped so a failure is reported instead of stopping the cell.
try:
    cuda_device_count = torch.cuda.device_count()
    print('CUDA device count:', cuda_device_count)
    if cuda_device_count >= 1:
        print('Current CUDA device name:', torch.cuda.get_device_name(0))
except Exception as err:
    print('Could not query CUDA device name:', err)

Python executable: c:\Users\golde\Documents\UMD\CMSC498K - Multimodal Deep Learning\.venv\Scripts\python.exe
PyTorch version: 2.8.0+cu126
Torch CUDA available: True
Torch CUDA version (built with): 12.6
CUDA device count: 1
Current CUDA device name: NVIDIA GeForce RTX 4060 Laptop GPU


In [None]:
# Load the "medium" Whisper checkpoint on the GPU when CUDA is available, else on CPU.
asr_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
asr_model = whisper.load_model("medium", device=asr_device)

100%|█████████████████████████████████████| 1.42G/1.42G [00:20<00:00, 76.0MiB/s]


In [50]:
# Load the "small" Whisper checkpoint on the GPU when CUDA is available, else on CPU.
asr_small_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
asr_model_small = whisper.load_model("small", device=asr_small_device)

In [53]:
def transcribe_audio(model, audio_path):
    """Run Whisper ASR on an audio file and return the text.

    Parameters
    ----------
    model
        Object exposing ``transcribe(audio_path, verbose=...)`` that returns
        a mapping with a "text" entry (e.g. a loaded Whisper model).
    audio_path : str
        Audio file to transcribe.

    Returns
    -------
    str
        The transcript with surrounding whitespace stripped; may be empty.

    Notes
    -----
    Prints progress and the resulting transcript ("[EMPTY TRANSCRIPT]" when
    nothing was recognised) to stdout.
    """
    print("\nRunning Whisper ASR...")
    result = model.transcribe(audio_path, verbose=True)
    transcript = result["text"].strip()

    print("\nTRANSCRIPT:")
    print(transcript if transcript else "[EMPTY TRANSCRIPT]")
    return transcript

In [None]:
# Transcribe every extracted audio clip of the current game with the medium
# Whisper model and store the text in the clip's transcript file.
metadata_path = f"{game_dir}/processed/clips.json"
# Context manager so the metadata file handle is closed right after parsing.
with open(metadata_path, encoding="utf-8") as f:
    clips = json.load(f)

for clip in clips:
    audio_path = clip["audio_path"]
    text_path  = clip["transcript_path"]

    # skip if audio missing (or empty); the placeholder transcript stays as-is
    if not os.path.exists(audio_path) or os.path.getsize(audio_path) == 0:
        print("Missing audio:", audio_path)
        continue

    # run ASR
    transcript = transcribe_audio(asr_model, audio_path)

    # write transcript
    with open(text_path, "w", encoding="utf-8") as f:
        f.write(transcript)

print("ASR transcription complete!")

ASR transcription complete!


### Step 8:
Run the extraction and labeling process for the remaining games

In [49]:
# List the game folders under data/raw. os.listdir returns entries in
# arbitrary order, so the names are sorted to make the processing order
# (and anything that indexes into this list) deterministic across machines.
data_raw_path = "data/raw"
folder_names = sorted(
    name
    for name in os.listdir(data_raw_path)
    if os.path.isdir(os.path.join(data_raw_path, name))
)
print(folder_names)

['2016-02-07 - 19-00 Chelsea 1 - 1 Manchester United', '2016-02-13 - 20-30 Chelsea 5 - 1 Newcastle Utd', '2016-02-14 - 19-15 Manchester City 1 - 2 Tottenham', '2016-02-27 - 18-00 Southampton 1 - 2 Chelsea', '2016-03-01 - 22-45 Norwich 1 - 2 Chelsea', '2016-03-02 - 23-00 Liverpool 3 - 0 Manchester City', '2016-03-05 - 18-00 Chelsea 1 - 1 Stoke City', '2016-03-05 - 18-00 Manchester City 4 - 0 Aston Villa', '2016-03-19 - 18-00 Chelsea 2 - 2 West Ham', '2016-03-20 - 19-00 Manchester City 0 - 1 Manchester United', '2016-04-09 - 17-00 Swansea 1 - 0 Chelsea', '2016-04-09 - 19-30 Manchester City 2 - 1 West Brom', '2016-04-23 - 17-00 Bournemouth 1 - 4 Chelsea', '2016-05-07 - 17-00 Sunderland 3 - 2 Chelsea']


In [54]:
# Run the whole pipeline (labels -> clip metadata -> media extraction -> ASR)
# for every game except the one already processed step by step above.
for folder in folder_names:
    # Skip the already-processed game by name rather than by list position,
    # so the result does not depend on the directory listing order.
    if folder == game_dir_name:
        continue

    # --- get events for each half of the game ---
    game_dir = f"data/raw/{folder}"
    label_json_path = f"{game_dir}/Labels-v2.json"

    half1_events, half2_events = load_events(label_json_path)

    # --- extract highlight events for each half ---
    half1_h_events = filter_highlight_events(half1_events, highlight_labels)
    half2_h_events = filter_highlight_events(half2_events, highlight_labels)

    # --- get half durations ---
    half1_path = f"{game_dir}/1_224p.mkv"
    half2_path = f"{game_dir}/2_224p.mkv"

    half1_duration_ms = get_video_duration_ms(half1_path)
    half2_duration_ms = get_video_duration_ms(half2_path)

    # --- build clip metadata for each half ---
    half1_clips = build_clips_for_half(
        game_dir, half_id=1,
        highlight_events=half1_h_events,
        half_duration_ms=half1_duration_ms
    )

    half2_clips = build_clips_for_half(
        game_dir, half_id=2,
        highlight_events=half2_h_events,
        half_duration_ms=half2_duration_ms
    )

    # --- save clip metadata to JSON ---
    output_dir = f"{game_dir}/processed"
    os.makedirs(output_dir, exist_ok=True)
    metadata_path = f"{output_dir}/clips.json"

    with open(metadata_path, "w") as f:
        json.dump(half1_clips + half2_clips, f, indent=4)

    print(f"Saved clip metadata for {folder} @ {output_dir}/clips.json")

    # --- generate clips (full, video-only, audio-only) and empty transcript files ---
    # Read the metadata back once; the same records drive extraction and ASR.
    with open(metadata_path) as f:
        clips = json.load(f)

    for clip in clips:
        half = clip["half"]
        start_ms = clip["start_ms"]

        input_mkv = f"{game_dir}/{half}_224p.mkv"

        outdir = f"{game_dir}/processed/half{half}"
        os.makedirs(outdir, exist_ok=True)

        # full clip
        cut_full_clip(input_mkv, start_ms, clip["full_clip_path"])

        # video-only
        cut_video_only(input_mkv, start_ms, clip["video_only_path"])

        # audio-only
        cut_audio_only(input_mkv, start_ms, clip["audio_path"])

        # empty transcript placeholder
        with open(clip["transcript_path"], "w") as t:
            t.write("")  # placeholder for future ASR

    # Report the game's output root: the loop variable `outdir` only names
    # the last half handled and is undefined/stale when there are no clips.
    print(f"All clips generated for {folder}. @ {output_dir}")

    # --- run ASR on audio clips and save transcripts ---
    for clip in clips:
        audio_path = clip["audio_path"]
        text_path  = clip["transcript_path"]

        # skip if audio missing
        if not os.path.exists(audio_path) or os.path.getsize(audio_path) == 0:
            print("Missing audio:", audio_path)
            continue

        # run ASR (small model for the batch run)
        transcript = transcribe_audio(asr_model_small, audio_path)

        # write transcript
        with open(text_path, "w", encoding="utf-8") as f:
            f.write(transcript)

    print(f"ASR transcription complete for {folder}!")
    print("\n")
    print("=" * 40)
    print("\n")


Saved clip metadata for 2016-02-13 - 20-30 Chelsea 5 - 1 Newcastle Utd @ data/raw/2016-02-13 - 20-30 Chelsea 5 - 1 Newcastle Utd/processed/clips.json
All clips generated for 2016-02-13 - 20-30 Chelsea 5 - 1 Newcastle Utd. @ data/raw/2016-02-13 - 20-30 Chelsea 5 - 1 Newcastle Utd/processed/half2

Running Whisper ASR...
Detecting language using up to the first 30 seconds. Use `--language` to specify the language
Detected language: English
[00:00.000 --> 00:05.000]  No Chelsea and slowly etched their way up the lead table since then.
[00:05.000 --> 00:08.000]  Newcastle remain in a precarious position.

TRANSCRIPT:
No Chelsea and slowly etched their way up the lead table since then. Newcastle remain in a precarious position.

Running Whisper ASR...
Detecting language using up to the first 30 seconds. Use `--language` to specify the language
Detected language: English
[00:00.000 --> 00:03.680]  Only goal difference keeping them out of the bottom three
[00:05.280 --> 00:07.900]  However, yo