In [1]:
import os
import json
import shutil
import re
import random
from collections import Counter, defaultdict

### Step 1:
Set up the combined dataset of video/audio/text clips

In [2]:
# Input root: one folder per game, each expected to contain a processed/ subfolder.
RAW_ROOT = "data/raw"
# Output root: the single unified dataset assembled by this notebook.
FINAL_ROOT = "data/full_processed"

# One output subfolder per modality. exist_ok=True makes the cell safe to re-run.
for modality in ("video", "audio", "text"):
    os.makedirs(f"{FINAL_ROOT}/{modality}", exist_ok=True)

print("Paths ready.")

Paths ready.


### Step 2:
Normalize clip names so they are unique across games (old: clip_000 -> new: {game_id}_half{half}_clip_{index:04d}) and save the renamed files

In [3]:
def normalize_game_name(raw):
    """
    Turn a raw game folder name into an identifier made only of ASCII
    letters, digits and underscores.

    Steps, in order:
      1. strip leading/trailing whitespace;
      2. replace every " - " separator with "_";
      3. collapse each remaining run of whitespace into a single "_";
      4. drop every character that is not A-Z, a-z, 0-9 or "_"
         (this is what removes the hyphens inside dates and times).

    Example:
        "2016-02-07 - 19-00 Chelsea 1 - 1 Manchester United"
        -> "20160207_1900_Chelsea_1_1_Manchester_United"
    """
    name = raw.strip().replace(" - ", "_")

    # Apply the two regex rewrites in sequence: whitespace first, then the
    # character whitelist.
    for pattern, replacement in ((r'\s+', '_'), (r'[^A-Za-z0-9_]', '')):
        name = re.sub(pattern, replacement, name)

    return name

In [4]:
# Collect the game directories directly under RAW_ROOT; sorting keeps the
# processing order deterministic across runs.
game_folders = sorted(
    entry
    for entry in os.listdir(RAW_ROOT)
    if os.path.isdir(os.path.join(RAW_ROOT, entry))
)

# Map each raw folder name to its normalized game ID.
game_id_map = {folder: normalize_game_name(folder) for folder in game_folders}

# Display the mapping as the cell output.
game_id_map

{'2016-02-07 - 19-00 Chelsea 1 - 1 Manchester United': '20160207_1900_Chelsea_1_1_Manchester_United',
 '2016-02-13 - 20-30 Chelsea 5 - 1 Newcastle Utd': '20160213_2030_Chelsea_5_1_Newcastle_Utd',
 '2016-02-14 - 19-15 Manchester City 1 - 2 Tottenham': '20160214_1915_Manchester_City_1_2_Tottenham',
 '2016-02-27 - 18-00 Southampton 1 - 2 Chelsea': '20160227_1800_Southampton_1_2_Chelsea',
 '2016-03-01 - 22-45 Norwich 1 - 2 Chelsea': '20160301_2245_Norwich_1_2_Chelsea',
 '2016-03-02 - 23-00 Liverpool 3 - 0 Manchester City': '20160302_2300_Liverpool_3_0_Manchester_City',
 '2016-03-05 - 18-00 Chelsea 1 - 1 Stoke City': '20160305_1800_Chelsea_1_1_Stoke_City',
 '2016-03-05 - 18-00 Manchester City 4 - 0 Aston Villa': '20160305_1800_Manchester_City_4_0_Aston_Villa',
 '2016-03-19 - 18-00 Chelsea 2 - 2 West Ham': '20160319_1800_Chelsea_2_2_West_Ham',
 '2016-03-20 - 19-00 Manchester City 0 - 1 Manchester United': '20160320_1900_Manchester_City_0_1_Manchester_United',
 '2016-04-09 - 17-00 Swansea 1 -

In [5]:
def build_base_name(game_id, half_id, idx):
    """
    Build the file stem (no directory, no extension) used for a clip.

    Format: ``<game_id>_half<half_id>_clip_<idx>``, where ``idx`` is an
    integer zero-padded to at least four digits.
    """
    stem = f"{game_id}_half{half_id}_clip_{idx:04d}"
    return stem

In [6]:
# Merge every game's clips into FINAL_ROOT under globally unique file names
# and build one combined metadata list (one record per clip).
master_metadata = []

# (clip_name, source_path) for every source file that was not found. These
# are reported at the end so gaps in the copied dataset do not pass silently:
# the metadata still points at the destination path, and later cells that
# copy from those paths would otherwise fail without any hint of the cause.
missing_sources = []

for game_name in game_folders:
    game_id = game_id_map[game_name]
    game_dir = os.path.join(RAW_ROOT, game_name)

    per_game_meta_path = os.path.join(game_dir, "processed", "clips.json")

    with open(per_game_meta_path, "r", encoding="utf-8") as f:
        clips = json.load(f)

    print(f"Processing {game_id}  |  {len(clips)} clips")

    # idx is the clip's position in this game's clips.json (it does not
    # restart per half), so every base name is unique within the game.
    for idx, clip in enumerate(clips):
        half = clip["half"]
        base = build_base_name(game_id, half, idx)

        # old paths, used exactly as stored in the per-game clips.json
        old_video = clip["video_only_path"]
        old_audio = clip["audio_path"]
        old_text  = clip["transcript_path"]

        # new paths
        new_video = f"{FINAL_ROOT}/video/{base}.mkv"
        new_audio = f"{FINAL_ROOT}/audio/{base}.wav"
        new_text  = f"{FINAL_ROOT}/text/{base}.txt"

        # copy what exists; remember what does not
        for src, dst in ((old_video, new_video),
                         (old_audio, new_audio),
                         (old_text,  new_text)):
            if os.path.exists(src):
                shutil.copy(src, dst)
            else:
                missing_sources.append((base, src))

        # build new metadata record
        master_metadata.append({
            "game_id": game_id,
            "original_game_name": game_name,
            "clip_name": base,
            "half": half,
            "start_ms": clip["start_ms"],
            "end_ms": clip["end_ms"],
            "highlight": clip["highlight"],

            # new paths
            "video": new_video,
            "audio": new_audio,
            "text": new_text,
        })

# Surface skipped files (first few only, to keep the output readable).
if missing_sources:
    print(f"WARNING: {len(missing_sources)} source file(s) were missing and were not copied.")
    for clip_name, src in missing_sources[:10]:
        print(f"  {clip_name}: {src}")
    if len(missing_sources) > 10:
        print(f"  ... and {len(missing_sources) - 10} more")

Processing 20160207_1900_Chelsea_1_1_Manchester_United  |  716 clips
Processing 20160213_2030_Chelsea_5_1_Newcastle_Utd  |  677 clips
Processing 20160214_1915_Manchester_City_1_2_Tottenham  |  684 clips
Processing 20160227_1800_Southampton_1_2_Chelsea  |  700 clips
Processing 20160301_2245_Norwich_1_2_Chelsea  |  689 clips
Processing 20160302_2300_Liverpool_3_0_Manchester_City  |  676 clips
Processing 20160305_1800_Chelsea_1_1_Stoke_City  |  685 clips
Processing 20160305_1800_Manchester_City_4_0_Aston_Villa  |  682 clips
Processing 20160319_1800_Chelsea_2_2_West_Ham  |  714 clips
Processing 20160320_1900_Manchester_City_0_1_Manchester_United  |  676 clips
Processing 20160409_1700_Swansea_1_0_Chelsea  |  676 clips
Processing 20160423_1700_Bournemouth_1_4_Chelsea  |  686 clips
Processing 20160507_1700_Sunderland_3_2_Chelsea  |  743 clips


Save the new combined clips.json with all metadata

In [7]:
# Write the merged metadata list next to the copied media folders.
final_json = f"{FINAL_ROOT}/clips.json"

with open(final_json, "w", encoding="utf-8") as out_file:
    json.dump(master_metadata, out_file, indent=2)

# Report where the file went and how many records it holds.
clip_total = len(master_metadata)
print("Saved:", final_json)
print("Total clips:", clip_total)

Saved: data/full_processed/clips.json
Total clips: 9004


In [8]:
# Sanity check: pick one record at random, show it, and confirm that its
# video, audio and text files all exist on disk.
sample = random.choice(master_metadata)
print(json.dumps(sample, indent=2))
print(*(os.path.exists(sample[modality]) for modality in ("video", "audio", "text")))

{
  "game_id": "20160301_2245_Norwich_1_2_Chelsea",
  "original_game_name": "2016-03-01 - 22-45 Norwich 1 - 2 Chelsea",
  "clip_name": "20160301_2245_Norwich_1_2_Chelsea_half2_clip_0357",
  "half": 2,
  "start_ms": 48000,
  "end_ms": 56000,
  "highlight": 0,
  "video": "data/full_processed/video/20160301_2245_Norwich_1_2_Chelsea_half2_clip_0357.mkv",
  "audio": "data/full_processed/audio/20160301_2245_Norwich_1_2_Chelsea_half2_clip_0357.wav",
  "text": "data/full_processed/text/20160301_2245_Norwich_1_2_Chelsea_half2_clip_0357.txt"
}
True True True


### Step 3:
Find current class imbalance ratio

In [9]:
# Load the combined metadata written earlier in the notebook.
with open(final_json, "r", encoding="utf-8") as f:
    clips = json.load(f)

# Count highlights (1) and non-highlights (0)
labels = [c["highlight"] for c in clips]
counter = Counter(labels)

num_neg = counter.get(0, 0)
num_pos = counter.get(1, 0)
total   = num_neg + num_pos

print("Total clips:", total)
print("Non-Highlights (0):", num_neg)
print("Highlights (1):", num_pos)

print("\nPercentage breakdown:")
# Guard the division: an empty metadata file (or one with no 0/1 labels)
# would otherwise raise ZeroDivisionError here.
if total > 0:
    print(f"Non-highlights: {num_neg/total*100:.2f}%")
    print(f"Highlights:     {num_pos/total*100:.2f}%")
else:
    print("No labelled clips found.")

print("\nImbalance ratio (neg : pos):")
if num_pos > 0:
    print(f"{num_neg / num_pos:.2f} : 1")
else:
    print("No positive clips at all.")


Total clips: 9004
Non-Highlights (0): 8701
Highlights (1): 303

Percentage breakdown:
Non-highlights: 96.63%
Highlights:     3.37%

Imbalance ratio (neg : pos):
28.72 : 1


Randomly downsample to 4:1 neg:pos ratio and save paths

In [10]:
FINAL_ROOT = "data/full_processed"
final_json = f"{FINAL_ROOT}/clips.json"

with open(final_json, "r", encoding="utf-8") as f:
    clips = json.load(f)

# Separate positives and negatives
positives = [c for c in clips if c["highlight"] == 1]
negatives = [c for c in clips if c["highlight"] == 0]

num_pos = len(positives)
num_neg = len(negatives)

# Without positives there is nothing to balance against, and the ratio
# below would divide by zero.
if num_pos == 0:
    raise ValueError("Cannot balance the dataset: no highlight (positive) clips found.")

print("Original ratio:", num_neg/num_pos, ": 1")

# Target ratio = 4:1, capped at the number of negatives actually available
# (random sampling without replacement raises if asked for more than exist).
target_neg = min(4 * num_pos, num_neg)

# Use a dedicated, seeded generator so the balanced subset is the same on
# every run and the global `random` state is left untouched.
rng = random.Random(42)

# Downsample negatives
negatives_downsampled = rng.sample(negatives, target_neg)

balanced_clips = positives + negatives_downsampled
rng.shuffle(balanced_clips)

print("New totals:")
print("Positives:", len(positives))
print("Negatives:", len(negatives_downsampled))
print("Ratio:", len(negatives_downsampled)/len(positives), ": 1")

# Save to new balanced metadata
with open(f"{FINAL_ROOT}/clips_balanced.json", "w", encoding="utf-8") as f:
    json.dump(balanced_clips, f, indent=2)

print("\nSaved balanced dataset to clips_balanced.json")


Original ratio: 28.716171617161717 : 1
New totals:
Positives: 303
Negatives: 1212
Ratio: 4.0 : 1

Saved balanced dataset to clips_balanced.json


### Step 4:
Set up train/val/test directories

In [11]:
# Reload the balanced metadata from disk; `clips` now refers to the
# balanced subset rather than the full dataset.
FULL_ROOT = "data/full_processed"
balanced_path = f"{FULL_ROOT}/clips_balanced.json"

with open(balanced_path, "r", encoding="utf-8") as f:
    clips = json.load(f)

clip_count = len(clips)
print("Total clips:", clip_count)

Total clips: 1515


In [12]:
# Root folder of the train/val/test layout.
SPLIT_ROOT = "data/final"

splits = ["train", "val", "test"]

# Create <SPLIT_ROOT>/<split>/<modality> for every split and modality.
# exist_ok=True makes the cell safe to re-run.
split_dirs = (
    f"{SPLIT_ROOT}/{s}/{sub}"
    for s in splits
    for sub in ("video", "audio", "text")
)
for split_dir in split_dirs:
    os.makedirs(split_dir, exist_ok=True)

### Step 5:
Split by game_id rather than by individual clip

(we don't want clips from the same game to appear in more than one split, e.g. in both train and test, because that would leak game-specific information into evaluation)

In [13]:
# Bucket the clips by the game they came from.
clips_by_game = defaultdict(list)

for record in clips:
    bucket = clips_by_game[record["game_id"]]
    bucket.append(record)

# Sorted game IDs, so the order does not depend on the order of `clips`.
games = sorted(clips_by_game)
print("Total games:", len(games))

Total games: 13


In [14]:
# Split at the game level: shuffle the game IDs reproducibly, then cut the
# list at 70% and 85% of its length into train / val / test.
random.seed(42)

# Always start from the sorted game list. random.shuffle works in place, so
# shuffling the list left over from a previous run of this cell would give
# a different split despite the fixed seed.
games = sorted(clips_by_game.keys())
random.shuffle(games)

num_games = len(games)
train_end = int(0.7 * num_games)
val_end   = int(0.85 * num_games)

train_games = games[:train_end]
val_games   = games[train_end:val_end]
test_games  = games[val_end:]

print("Train:", len(train_games))
print("Val:", len(val_games))
print("Test:", len(test_games))


Train: 9
Val: 2
Test: 2


### Step 6:
Copy over the split clips into train/val/test and save metadata for each

In [15]:
def copy_clip(clip, split_name):
    """
    Copy a clip's video, audio and text files into the folder of one split.

    Parameters:
        clip: metadata dict with at least the keys "clip_name", "video",
              "audio" and "text" (the last three are source file paths).
        split_name: name of the split subfolder under SPLIT_ROOT.

    Returns:
        A shallow copy of ``clip`` whose "video", "audio" and "text" entries
        point at the copied files. The input dict is not modified.
    """
    stem = clip["clip_name"]  # shared file stem for all three modalities

    # (metadata key / subfolder name, file extension) per modality
    layout = (("video", ".mkv"), ("audio", ".wav"), ("text", ".txt"))

    # Resolve every source and destination first, then copy.
    sources = [clip[kind] for kind, _ in layout]
    targets = [
        f"{SPLIT_ROOT}/{split_name}/{kind}/{stem}{ext}"
        for kind, ext in layout
    ]

    for src, dst in zip(sources, targets):
        shutil.copy(src, dst)

    # Same record, with the three paths redirected to the split folder.
    relocated = dict(clip)
    for (kind, _), dst in zip(layout, targets):
        relocated[kind] = dst
    return relocated


In [16]:
# Copy every clip of every game into the folder of the split its game was
# assigned to, collecting the updated metadata records per split.
train_meta = [
    copy_clip(clip, "train")
    for game in train_games
    for clip in clips_by_game[game]
]

val_meta = [
    copy_clip(clip, "val")
    for game in val_games
    for clip in clips_by_game[game]
]

test_meta = [
    copy_clip(clip, "test")
    for game in test_games
    for clip in clips_by_game[game]
]

print("Train clips:", len(train_meta))
print("Val clips:", len(val_meta))
print("Test clips:", len(test_meta))


Train clips: 1072
Val clips: 233
Test clips: 210


In [17]:
# Write one clips.json per split, next to that split's media folders.
for split_name, records in (
    ("train", train_meta),
    ("val", val_meta),
    ("test", test_meta),
):
    with open(f"{SPLIT_ROOT}/{split_name}/clips.json", "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2)

print("Metadata written successfully.")

Metadata written successfully.
