In [1]:
import json
import cv2
import os
import numpy as np
from tqdm.notebook import tqdm
import glob
import shutil
from collections import defaultdict
import gc

# Root of the raw Kaggle dataset and the folder this notebook writes into.
INPUT_DIR = '/kaggle/input/olympic-boxing-punch-classification-video-dataset/Olympic Boxing Punch Classification Video Dataset'
OUTPUT_DIR = '/kaggle/working/processed_dataset'

# Wipe any previous run so stale clips never mix with fresh ones.
if os.path.exists(OUTPUT_DIR):
    shutil.rmtree(OUTPUT_DIR)

# One folder per action class for the video clips ...
categories = ['Jab', 'Cross', 'Miss']
for cat in categories:
    os.makedirs(f"{OUTPUT_DIR}/{cat}", exist_ok=True)

# ... and one folder per hit status for the still images.
for _status in ("Land", "Miss"):
    os.makedirs(f"{OUTPUT_DIR}/punch_imgs/{_status}", exist_ok=True)

def _find_annotation_file(video_path):
    """Return the annotation JSON path for ``video_path``, or None if none exists.

    Candidates are tried in order: ``annotations.json`` next to the video,
    ``annotations.json`` one directory up, then a ``.json`` file sharing the
    video's base name.
    """
    video_dir = os.path.dirname(video_path)
    candidates = (
        os.path.join(video_dir, 'annotations.json'),
        os.path.join(os.path.dirname(video_dir), 'annotations.json'),
        # splitext swaps only the real extension, whatever its case.
        os.path.splitext(video_path)[0] + '.json',
    )
    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate
    return None


def _load_tracks(json_path):
    """Read the ``tracks`` list from an annotation file; return [] when unusable."""
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            raw_data = json.load(f)
    except (OSError, ValueError):
        # Unreadable file or malformed JSON / bad encoding: treat as "no annotations".
        return []

    # The payload is either a dict or a list whose first element is that dict.
    if isinstance(raw_data, list) and raw_data:
        raw_data = raw_data[0]
    if not isinstance(raw_data, dict):
        return []

    tracks = raw_data.get('tracks', [])
    return tracks if isinstance(tracks, list) else []


def _group_annotations(tracks):
    """Bucket annotations by their ``group`` id, keeping only those with a ``frame``."""
    grouped = defaultdict(list)
    for annotation in tracks:
        if not isinstance(annotation, dict):
            continue
        grp_id = annotation.get('group')
        if grp_id is not None and annotation.get('frame') is not None:
            grouped[grp_id].append(annotation)
    return grouped


def _cluster_by_gap(annotations, max_gap):
    """Split annotations into runs whose consecutive frames are < ``max_gap`` apart."""
    clusters = []
    for ann in sorted(annotations, key=lambda a: a['frame']):
        if clusters and ann['frame'] - clusters[-1][-1]['frame'] < max_gap:
            clusters[-1].append(ann)
        else:
            clusters.append([ann])
    return clusters


def _classify_label(label_text):
    """Map a raw label to ``(action_class, hit_status)``; ``(None, None)`` if unknown.

    'lew' (left, per the original note) maps to Jab and 'praw' (right) to Cross.
    Either one combined with 'chybienie' is classified as a Miss.
    """
    txt = label_text.lower()
    if 'lew' in txt:
        landed_class = "Jab"
    elif 'praw' in txt:
        landed_class = "Cross"
    else:
        return None, None

    if 'chybienie' in txt:
        return "Miss", "Miss"
    return landed_class, "Land"


def _export_clip(cap, start, length, save_path, img_path, frame_size, fps):
    """Write one resized clip (optional) plus its middle frame as a still image.

    Reads ``length + 2`` frames starting at ``start`` (same count as before).
    ``save_path`` may be None, in which case only the still image is written.
    Returns the number of frames written to the video file.
    """
    cap.set(cv2.CAP_PROP_POS_FRAMES, start)

    writer = None
    if save_path is not None:
        writer = cv2.VideoWriter(save_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, frame_size)
    can_write = writer is not None and writer.isOpened()

    mid_idx = length // 2
    frames_written = 0
    try:
        for i in range(length + 2):
            ret, frame = cap.read()
            if not ret:
                break

            frame_resized = cv2.resize(frame, frame_size)

            if can_write:
                writer.write(frame_resized)
                frames_written += 1

            if i == mid_idx:
                cv2.imwrite(img_path, frame_resized)
    finally:
        # Always finalise the container, even if resize/write raised.
        if writer is not None:
            writer.release()

    # A clip that received no frames is useless; do not leave it behind.
    if save_path is not None and frames_written == 0 and os.path.exists(save_path):
        os.remove(save_path)

    return frames_written


def process_video(video_path, max_gap=10, frame_size=(224, 224), fps=20):
    """Cut one annotated video into per-punch clips and middle-frame stills.

    Annotations are grouped by ``group`` id, split into clusters of nearby
    frames, and classified from the first non-empty ``label`` in each cluster.
    Clusters spanning fewer than 2 frames are skipped. For every kept cluster:

    * Jab / Cross clusters are written to ``{OUTPUT_DIR}/{class}/<name>.mp4``;
    * the middle frame is written to ``{OUTPUT_DIR}/punch_imgs/{Land|Miss}/<name>.jpg``
      (Miss clusters produce only this image, no video).

    Args:
        video_path: Path of the source video.
        max_gap: Frame gap at or above which a new cluster is started.
        frame_size: ``(width, height)`` that every output frame is resized to.
        fps: Frame rate declared in the written clips.

    Returns:
        Number of video clips actually written (clips with zero frames are
        neither kept nor counted). 0 when the annotations or the video cannot
        be used.
    """
    json_path = _find_annotation_file(video_path)
    if json_path is None:
        return 0

    tracks = _load_tracks(json_path)
    if not tracks:
        return 0

    raw_punches = _group_annotations(tracks)
    if not raw_punches:
        # Nothing to cut, so do not bother opening the video.
        return 0

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        return 0

    # splitext keeps dotted base names intact, so save names stay unique per video.
    video_name = os.path.splitext(os.path.basename(video_path))[0]
    files_generated = 0

    try:
        for grp_id, annotations in raw_punches.items():
            for cluster in _cluster_by_gap(annotations, max_gap):
                # Classification: the first non-empty label decides the class.
                label_text = next((ann['label'] for ann in cluster if ann.get('label')), "")
                if not label_text:
                    continue

                action_class, hit_status = _classify_label(label_text)
                if action_class is None:
                    continue

                # Clusters are sorted by frame, so the ends hold min and max.
                start = cluster[0]['frame']
                end = cluster[-1]['frame']
                length = end - start
                if length < 2:
                    continue

                save_name = f"{video_name}_grp{grp_id}_fr{start}"
                img_path = f"{OUTPUT_DIR}/punch_imgs/{hit_status}/{save_name}.jpg"
                save_path = None
                if action_class != "Miss":
                    save_path = f"{OUTPUT_DIR}/{action_class}/{save_name}.mp4"

                if _export_clip(cap, start, length, save_path, img_path, frame_size, fps) > 0:
                    files_generated += 1
    finally:
        cap.release()

    return files_generated

# Sorted so the processing order (and the smoke-test video) is reproducible.
video_files = sorted(glob.glob(f"{INPUT_DIR}/**/*.mp4", recursive=True))
print(f"Found {len(video_files)} videos. Processing (Time Clustering)...")

total = 0
remaining_videos = video_files

# Smoke-test on the first video; its result is reused below instead of
# processing the same file a second time.
if video_files:
    print("Testing first video...")
    c = process_video(video_files[0])
    print(f"Result: Generated {c} clips from first video.")
    total += c
    remaining_videos = video_files[1:]

print("Processing ALL videos...")
already_done = len(video_files) - len(remaining_videos)
for vid in tqdm(remaining_videos, total=len(video_files), initial=already_done):
    total += process_video(vid)

print(f"Final Check: Total clips generated: {total}")

Found 29 videos. Processing (Time Clustering)...
Testing first video...
Result: Generated 12 clips from first video.
Processing ALL videos...


  0%|          | 0/29 [00:00<?, ?it/s]

Final Check: Total clips generated: 510


In [2]:
import cv2
import os
import glob
from tqdm.notebook import tqdm

EXTERNAL_HOOKS_DIR = '/kaggle/input/hookdataset'

DEST_DIR = '/kaggle/working/processed_dataset/Hook'
IMG_DEST_DIR = '/kaggle/working/processed_dataset/punch_imgs/Land'

os.makedirs(DEST_DIR, exist_ok=True)
os.makedirs(IMG_DEST_DIR, exist_ok=True)

print(f" Phase 2: Injecting Hooks from {EXTERNAL_HOOKS_DIR}...")

# One recursive scan with a case-insensitive extension check: catches any
# casing of .mov/.mp4 and cannot list the same file twice. Sorted so the
# running index embedded in the output names is reproducible.
hook_files = sorted(
    path
    for path in glob.glob(f"{EXTERNAL_HOOKS_DIR}/**/*", recursive=True)
    if os.path.isfile(path) and os.path.splitext(path)[1].lower() in ('.mov', '.mp4')
)

if len(hook_files) == 0:
    print("Still 0! Listing folder content to debug:")
    # Guard the listing: a missing input folder is the likeliest cause of 0 hits.
    if os.path.isdir(EXTERNAL_HOOKS_DIR):
        print(os.listdir(EXTERNAL_HOOKS_DIR))
    else:
        print(f"{EXTERNAL_HOOKS_DIR} does not exist or is not a directory.")
else:
    print(f"Found {len(hook_files)} Hook videos. Processing...")

    count = 0
    for vid_path in tqdm(hook_files):
        cap = cv2.VideoCapture(vid_path)
        frames = []
        try:
            if not cap.isOpened():
                continue
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                # Resize
                frames.append(cv2.resize(frame, (224, 224)))
        finally:
            # Runs on the `continue` above and on errors as well.
            cap.release()

        # Skip videos with 5 frames or fewer.
        if len(frames) > 5:
            # splitext keeps dotted base names intact.
            vid_name = os.path.splitext(os.path.basename(vid_path))[0]

            # Save Video
            save_path = f"{DEST_DIR}/ext_hook_{count}_{vid_name}.mp4"
            out = cv2.VideoWriter(save_path, cv2.VideoWriter_fourcc(*'mp4v'), 20, (224, 224))
            try:
                if not out.isOpened():
                    # Writer could not be created: do not count this video.
                    continue
                for f in frames:
                    out.write(f)
            finally:
                out.release()

            # Save Image (middle frame of the video)
            mid_frame = frames[len(frames)//2]
            cv2.imwrite(f"{IMG_DEST_DIR}/ext_hook_{count}_{vid_name}.jpg", mid_frame)

            count += 1

    print(f"DONE! Successfully added {count} Hook videos.")

    # Per-class summary: counts directory entries in each class folder.
    print("\n" + "="*40)
    print(" FINAL DATASET READY FOR TRAINING:")
    total_all = 0
    for cat in ['Jab', 'Cross', 'Hook', 'Miss']:
        dir_path = f'/kaggle/working/processed_dataset/{cat}'
        if os.path.exists(dir_path):
            num = len(os.listdir(dir_path))
            print(f" {cat}: {num} videos")
            total_all += num
        else:
            print(f" {cat}: 0 (Folder created now)")

    print("="*40)
    print(f"GRAND TOTAL: {total_all} Samples.")

 Phase 2: Injecting Hooks from /kaggle/input/hookdataset...
Found 95 Hook videos. Processing...


  0%|          | 0/95 [00:00<?, ?it/s]

DONE! Successfully added 95 Hook videos.

 FINAL DATASET READY FOR TRAINING:
 Jab: 377 videos
 Cross: 133 videos
 Hook: 95 videos
 Miss: 0 videos
GRAND TOTAL: 605 Samples.


In [3]:
import shutil
import os
from IPython.display import FileLink

# Archive base name (without extension) and the folder whose contents get zipped.
output_filename = "Final_Boxing_Dataset_Ready"
dir_to_zip = "/kaggle/working/processed_dataset"

print("Zipping the dataset... This might take a minute...")

# make_archive appends the ".zip" suffix to base_name itself.
shutil.make_archive(base_name=output_filename, format='zip', root_dir=dir_to_zip)

print("Zipping Complete!")
print("Click the link below to download your dataset to your PC:")

# Last expression of the cell, so the notebook renders it as a clickable link.
FileLink(output_filename + ".zip")

Zipping the dataset... This might take a minute...
Zipping Complete!
Click the link below to download your dataset to your PC:
