In [55]:
import json
import shutil
import os
from pathlib import Path

In [58]:
base_path = Path(os.getcwd()).joinpath("data")
final_folders = [base_path / "final_1", base_path / "final_2", base_path / "final_3"]
output_path = base_path / "data"

In [59]:
# Create output directories
for split in ["train", "val", "test"]:
    split_path = output_path / split
    split_path.mkdir(parents=True, exist_ok=True)
    for modality in ["audio", "video", "text"]:
        (split_path / modality).mkdir(parents=True, exist_ok=True)

print("Created output directory structure")

Created output directory structure


In [61]:

# Combine data
for split in ["train", "val", "test"]:
    combined_clips = []
    
    # Collect from each final folder
    for final_folder in final_folders:
        source_split_path = final_folder / split
        clips_file = source_split_path / "clips.json"
        
        if clips_file.exists():
            print(f"Processing {final_folder.name}/{split}/clips.json")
            
            with open(clips_file, 'r') as f:
                clips = json.load(f)
            
            # Copy files and update paths
            for clip in clips:
                # Copy audio
                if clip.get("audio"):
                    # Extract just the filename from the path
                    audio_filename = clip["audio"].split("/")[-1]
                    src_audio = source_split_path / "audio" / audio_filename
                    dst_audio = output_path / split / "audio" / audio_filename
                    if src_audio.exists():
                        shutil.copy2(src_audio, dst_audio)
                    clip["audio"] = f"data/data/{split}/audio/{audio_filename}"
                
                # Copy video
                if clip.get("video"):
                    # Extract just the filename from the path
                    video_filename = clip["video"].split("/")[-1]
                    src_video = source_split_path / "video" / video_filename
                    dst_video = output_path / split / "video" / video_filename
                    if src_video.exists():
                        shutil.copy2(src_video, dst_video)
                    clip["video"] = f"data/data/{split}/video/{video_filename}"
                
                # Copy text
                if clip.get("text"):
                    # Extract just the filename from the path
                    text_filename = clip["text"].split("/")[-1]
                    src_text = source_split_path / "text" / text_filename
                    dst_text = output_path / split / "text" / text_filename
                    if src_text.exists():
                        shutil.copy2(src_text, dst_text)
                    clip["text"] = f"data/data/{split}/text/{text_filename}"
                
                combined_clips.append(clip)
    
    # Write combined clips.json
    output_clips_file = output_path / split / "clips.json"
    with open(output_clips_file, 'w') as f:
        json.dump(combined_clips, f, indent=2)
    
    print(f"Combined {split}: {len(combined_clips)} clips")

print("\nDone! Combined data saved to data/")



Processing final_1/train/clips.json
Processing final_2/train/clips.json
Processing final_2/train/clips.json
Processing final_3/train/clips.json
Processing final_3/train/clips.json
Combined train: 3113 clips
Processing final_1/val/clips.json
Combined train: 3113 clips
Processing final_1/val/clips.json
Processing final_2/val/clips.json
Processing final_2/val/clips.json
Processing final_3/val/clips.json
Processing final_3/val/clips.json
Combined val: 664 clips
Processing final_1/test/clips.json
Combined val: 664 clips
Processing final_1/test/clips.json
Processing final_2/test/clips.json
Processing final_2/test/clips.json
Processing final_3/test/clips.json
Processing final_3/test/clips.json
Combined test: 778 clips

Done! Combined data saved to data/
Combined test: 778 clips

Done! Combined data saved to data/


SANITY CHECK: Clip Count Verification
final_1/train: 1072 clips
final_1/val: 233 clips
final_1/test: 210 clips
final_2/train: 1194 clips
final_2/val: 308 clips
final_2/test: 303 clips
final_3/train: 847 clips
final_3/val: 123 clips
final_3/test: 265 clips

------------------------------------------------------------
Combined counts:
------------------------------------------------------------
data/data/train: 3113 clips
data/data/val: 664 clips
data/data/test: 778 clips

------------------------------------------------------------
Verification:
------------------------------------------------------------
train  - Expected:   3113 | Actual:   3113 | ‚úì
val    - Expected:    664 | Actual:    664 | ‚úì
test   - Expected:    778 | Actual:    778 | ‚úì

‚úì All counts match! Data combination successful!


In [63]:

# Import required libraries for spectrogram conversion using torchaudio
import torchaudio
import torchaudio.transforms as T
import matplotlib.pyplot as plt
import numpy as np
import torch

print("Imported torchaudio and related libraries")


Imported torchaudio and related libraries


In [65]:

# Function to convert WAV to mel-spectrogram JPG using torchaudio
def wav_to_melspectrogram_jpg(wav_path, jpg_path, sr=22050, n_mels=128, n_fft=2048, hop_length=512):
    """
    Convert a WAV file to a mel-spectrogram and save as JPG using torchaudio.
    
    Args:
        wav_path: Path to input WAV file
        jpg_path: Path to output JPG file
        sr: Sample rate
        n_mels: Number of mel bands
        n_fft: FFT window size
        hop_length: Number of samples between successive frames
    """
    try:
        # Load audio file
        waveform, sample_rate = torchaudio.load(wav_path)
        
        # Resample if necessary
        if sample_rate != sr:
            resampler = T.Resample(sample_rate, sr)
            waveform = resampler(waveform)
        
        # Convert to mono if stereo
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)
        
        # Create mel-spectrogram transform
        mel_spectrogram = T.MelSpectrogram(
            sample_rate=sr,
            n_mels=n_mels,
            n_fft=n_fft,
            hop_length=hop_length
        )
        
        # Compute mel-spectrogram
        mel_spec = mel_spectrogram(waveform)
        
        # Convert to dB scale
        mel_spec_db = T.AmplitudeToDB()(mel_spec)
        
        # Remove batch dimension and convert to numpy
        mel_spec_np = mel_spec_db.squeeze().numpy()
        
        # Create figure and plot spectrogram
        fig, ax = plt.subplots(figsize=(10, 4))
        im = ax.imshow(mel_spec_np, aspect='auto', origin='lower', cmap='viridis')
        ax.set_ylabel('Mel Frequency Bin')
        ax.set_xlabel('Time Frame')
        plt.tight_layout()
        
        # Save as JPG
        plt.savefig(jpg_path, format='jpg', bbox_inches='tight', pad_inches=0, dpi=100)
        plt.close()
        
        return True
    except Exception as e:
        print(f"Error converting {wav_path}: {str(e)}")
        return False

In [None]:
# Batch convert all WAV files to mel-spectrograms for all splits
print("=" * 60)
print("Converting all WAV files to mel-spectrograms...")
print("=" * 60)

for split in ["train", "val", "test"]:
    clips_file = output_path / split / "clips.json"
    
    if clips_file.exists():
        print(f"\nProcessing {split} split...")
        
        # Load clips
        with open(clips_file, 'r') as f:
            clips = json.load(f)
        
        # Create audio_img directory if it doesn't exist
        audio_img_dir = output_path / split / "audio_img"
        audio_img_dir.mkdir(parents=True, exist_ok=True)
        
        converted_count = 0
        failed_count = 0
        
        # Convert each audio file
        for i, clip in enumerate(clips):
            if clip.get("audio"):
                # Get the audio filename
                audio_filename = clip["audio"].split("/")[-1]
                audio_path = output_path / split / "audio" / audio_filename
                
                # Create spectrogram path
                spectrogram_filename = audio_filename.replace(".wav", ".jpg")
                spectrogram_path = audio_img_dir / spectrogram_filename
                
                # Convert to spectrogram
                if audio_path.exists():
                    success = wav_to_melspectrogram_jpg(str(audio_path), str(spectrogram_path))
                    if success:
                        # Add audio_img field to clip
                        clip["audio_img"] = f"data/data/{split}/audio_img/{spectrogram_filename}"
                        converted_count += 1
                        if (i + 1) % 50 == 0:
                            print(f"  Converted {i + 1}/{len(clips)} clips...")
                    else:
                        failed_count += 1
                else:
                    print(f"  Audio file not found: {audio_path}")
                    failed_count += 1
        
        # Write updated clips.json
        with open(clips_file, 'w') as f:
            json.dump(clips, f, indent=2)
        
        print(f"  ‚úì {split}: {converted_count} converted, {failed_count} failed")

print("\n" + "=" * 60)
print("‚úì Spectrogram conversion complete!")
print("=" * 60)


In [81]:

# Verify spectrogram conversion and updated clips.json
print("=" * 60)
print("VERIFICATION: Spectrogram Conversion Complete")
print("=" * 60)

for split in ["train", "val", "test"]:
    audio_img_dir = output_path / split / "audio_img"
    jpg_files = list(audio_img_dir.glob("*.jpg")) if audio_img_dir.exists() else []
    
    clips_file = output_path / split / "clips.json"
    with open(clips_file, 'r') as f:
        clips = json.load(f)
    
    clips_with_audio_img = sum(1 for clip in clips if "audio_img" in clip)
    
    print(f"\n{split.upper()}:")
    print(f"  JPG files created: {len(jpg_files)}")
    print(f"  Clips with audio_img field: {clips_with_audio_img}/{len(clips)}")

print("\n" + "-" * 60)
print("Sample clip (first clip of train split):")
print("-" * 60)

with open(output_path / "train" / "clips.json", 'r') as f:
    clips = json.load(f)
    if clips:
        sample_clip = clips[0]
        print(json.dumps(sample_clip, indent=2))


VERIFICATION: Spectrogram Conversion Complete

TRAIN:
  JPG files created: 3111
  Clips with audio_img field: 3111/3111

VAL:
  JPG files created: 664
  Clips with audio_img field: 664/664

TEST:
  JPG files created: 777
  Clips with audio_img field: 777/777

------------------------------------------------------------
Sample clip (first clip of train split):
------------------------------------------------------------
{
  "game_id": "20160305_1800_Manchester_City_4_0_Aston_Villa",
  "original_game_name": "2016-03-05 - 18-00 Manchester City 4 - 0 Aston Villa",
  "clip_name": "20160305_1800_Manchester_City_4_0_Aston_Villa_half2_clip_0345",
  "half": 2,
  "start_ms": 8000,
  "end_ms": 16000,
  "highlight": 0,
  "video": "data/data/train/video/20160305_1800_Manchester_City_4_0_Aston_Villa_half2_clip_0345.mkv",
  "audio": "data/data/train/audio/20160305_1800_Manchester_City_4_0_Aston_Villa_half2_clip_0345.wav",
  "text": "data/data/train/text/20160305_1800_Manchester_City_4_0_Aston_Villa_h

In [82]:

# Find which audio files are missing spectrograms
print("=" * 60)
print("Finding missing spectrogram conversions...")
print("=" * 60)

missing_files = []

for split in ["train", "val", "test"]:
    clips_file = output_path / split / "clips.json"
    audio_img_dir = output_path / split / "audio_img"
    
    if clips_file.exists():
        # Load clips
        with open(clips_file, 'r') as f:
            clips = json.load(f)
        
        # Check which clips are missing audio_img
        print(f"\n{split.upper()} split:")
        split_missing = []
        
        for clip in clips:
            if clip.get("audio") and "audio_img" not in clip:
                audio_filename = clip["audio"].split("/")[-1]
                audio_path = output_path / split / "audio" / audio_filename
                
                split_missing.append({
                    "clip_name": clip.get("clip_name"),
                    "audio_file": audio_filename,
                    "audio_exists": audio_path.exists(),
                    "full_path": str(audio_path)
                })
        
        if split_missing:
            print(f"  Found {len(split_missing)} clips without audio_img field:")
            for item in split_missing:
                print(f"    - {item['clip_name']}")
                print(f"      Audio file: {item['audio_file']}")
                print(f"      File exists: {item['audio_exists']}")
                print(f"      Path: {item['full_path']}\n")
            missing_files.extend(split_missing)
        else:
            print(f"  ‚úì All clips have audio_img field")

print("\n" + "=" * 60)
print(f"SUMMARY: {len(missing_files)} missing spectrogram conversions")
print("=" * 60)

if missing_files:
    print("\nDetailed list of missing files:")
    for i, item in enumerate(missing_files, 1):
        print(f"\n{i}. Clip: {item['clip_name']}")
        print(f"   Audio file exists: {item['audio_exists']}")
        if not item['audio_exists']:
            print(f"   ‚ö†Ô∏è  Audio file not found at: {item['full_path']}")


Finding missing spectrogram conversions...

TRAIN split:
  ‚úì All clips have audio_img field

VAL split:
  ‚úì All clips have audio_img field

TEST split:
  ‚úì All clips have audio_img field

SUMMARY: 0 missing spectrogram conversions


In [75]:
# Retry converting missing audio files
print("=" * 60)
print("Retrying conversion of missing audio files...")
print("=" * 60)

retry_count = 0
retry_failed = 0

for split in ["train", "val", "test"]:
    clips_file = output_path / split / "clips.json"
    
    if clips_file.exists():
        # Load clips
        with open(clips_file, 'r') as f:
            clips = json.load(f)
        
        audio_img_dir = output_path / split / "audio_img"
        audio_img_dir.mkdir(parents=True, exist_ok=True)
        
        print(f"\n{split.upper()} split:")
        split_retry_count = 0
        split_retry_failed = 0
        
        # Find and retry missing conversions
        for clip in clips:
            if clip.get("audio") and "audio_img" not in clip:
                audio_filename = clip["audio"].split("/")[-1]
                audio_path = output_path / split / "audio" / audio_filename
                
                if audio_path.exists():
                    print(f"  Retrying: {clip.get('clip_name')}")
                    
                    # Create spectrogram path
                    spectrogram_filename = audio_filename.replace(".wav", ".jpg")
                    spectrogram_path = audio_img_dir / spectrogram_filename
                    
                    # Try conversion
                    try:
                        success = wav_to_melspectrogram_jpg(str(audio_path), str(spectrogram_path))
                        if success:
                            # Add audio_img field to clip
                            clip["audio_img"] = f"data/data/{split}/audio_img/{spectrogram_filename}"
                            split_retry_count += 1
                            print(f"    ‚úì Successfully converted")
                        else:
                            split_retry_failed += 1
                            print(f"    ‚úó Conversion failed")
                    except Exception as e:
                        split_retry_failed += 1
                        print(f"    ‚úó Error: {str(e)}")
                else:
                    split_retry_failed += 1
                    print(f"  ‚úó Audio file not found: {audio_filename}")
        
        # Write updated clips.json
        with open(clips_file, 'w') as f:
            json.dump(clips, f, indent=2)
        
        print(f"  Retry results: {split_retry_count} converted, {split_retry_failed} failed")
        retry_count += split_retry_count
        retry_failed += split_retry_failed

print("\n" + "=" * 60)
print(f"Retry Summary: {retry_count} successfully converted, {retry_failed} still failing")
print("=" * 60)

Retrying conversion of missing audio files...

TRAIN split:
  Retrying: 20151107_2030_Stoke_City_1_0_Chelsea_half1_clip_0024
Error converting c:\Users\golde\Documents\UMD\CMSC498K - Multimodal Deep Learning\FinalProject\data\data\train\audio\20151107_2030_Stoke_City_1_0_Chelsea_half1_clip_0024.wav: cannot reshape tensor of 0 elements into shape [-1, 0] because the unspecified dimension size -1 can be any value and is ambiguous
    ‚úó Conversion failed
  Retrying: 20151107_2030_Stoke_City_1_0_Chelsea_half1_clip_0021
Error converting c:\Users\golde\Documents\UMD\CMSC498K - Multimodal Deep Learning\FinalProject\data\data\train\audio\20151107_2030_Stoke_City_1_0_Chelsea_half1_clip_0021.wav: cannot reshape tensor of 0 elements into shape [-1, 0] because the unspecified dimension size -1 can be any value and is ambiguous
    ‚úó Conversion failed
  Retry results: 0 converted, 2 failed

VAL split:
  Retry results: 0 converted, 0 failed

TEST split:
  Retrying: 20150926_1700_Manchester_United



In [77]:
# Final verification after retry
print("=" * 60)
print("FINAL VERIFICATION: After Retry Conversion")
print("=" * 60)

total_clips = 0
total_with_audio_img = 0

for split in ["train", "val", "test"]:
    clips_file = output_path / split / "clips.json"
    
    if clips_file.exists():
        with open(clips_file, 'r') as f:
            clips = json.load(f)
        
        audio_img_dir = output_path / split / "audio_img"
        jpg_files = list(audio_img_dir.glob("*.jpg")) if audio_img_dir.exists() else []
        
        clips_with_audio_img = sum(1 for clip in clips if "audio_img" in clip)
        total_clips += len(clips)
        total_with_audio_img += clips_with_audio_img
        
        print(f"\n{split.upper()}:")
        print(f"  Total clips: {len(clips)}")
        print(f"  JPG files: {len(jpg_files)}")
        print(f"  Clips with audio_img: {clips_with_audio_img}")
        
        # Still missing?
        still_missing = len(clips) - clips_with_audio_img
        if still_missing > 0:
            print(f"  ‚ö†Ô∏è  Still missing: {still_missing}")
            
            # List the still-missing ones
            for clip in clips:
                if clip.get("audio") and "audio_img" not in clip:
                    print(f"     - {clip.get('clip_name')}: {clip.get('audio')}")
        else:
            print(f"  ‚úì All clips have spectrograms!")

print("\n" + "=" * 60)
print(f"OVERALL: {total_with_audio_img}/{total_clips} clips with spectrograms")
if total_with_audio_img == total_clips:
    print("‚úì ALL CONVERSIONS COMPLETE!")
else:
    print(f"‚ö†Ô∏è  Missing: {total_clips - total_with_audio_img}")
print("=" * 60)

FINAL VERIFICATION: After Retry Conversion

TRAIN:
  Total clips: 3113
  JPG files: 3111
  Clips with audio_img: 3111
  ‚ö†Ô∏è  Still missing: 2
     - 20151107_2030_Stoke_City_1_0_Chelsea_half1_clip_0024: data/data/train/audio/20151107_2030_Stoke_City_1_0_Chelsea_half1_clip_0024.wav
     - 20151107_2030_Stoke_City_1_0_Chelsea_half1_clip_0021: data/data/train/audio/20151107_2030_Stoke_City_1_0_Chelsea_half1_clip_0021.wav

VAL:
  Total clips: 664
  JPG files: 664
  Clips with audio_img: 664
  ‚úì All clips have spectrograms!

TEST:
  Total clips: 778
  JPG files: 777
  Clips with audio_img: 777
  ‚ö†Ô∏è  Still missing: 1
     - 20150926_1700_Manchester_United_3_0_Sunderland_half2_clip_0710: data/data/test/audio/20150926_1700_Manchester_United_3_0_Sunderland_half2_clip_0710.wav

OVERALL: 4552/4555 clips with spectrograms
‚ö†Ô∏è  Missing: 3


In [79]:

# Inspect the 3 corrupted audio files and their neighbors
from IPython.display import Audio, display
import soundfile as sf

corrupted_files = [
    ("train", "20151107_2030_Stoke_City_1_0_Chelsea_half1_clip_0024"),
    ("train", "20151107_2030_Stoke_City_1_0_Chelsea_half1_clip_0021"),
    ("test", "20150926_1700_Manchester_United_3_0_Sunderland_half2_clip_0710")
]

print("=" * 80)
print("ANALYZING CORRUPTED AUDIO FILES AND NEIGHBORS")
print("=" * 80)

for split, clip_name in corrupted_files:
    print(f"\n{'=' * 80}")
    print(f"SPLIT: {split.upper()} | CLIP: {clip_name}")
    print('=' * 80)
    
    # Load clips.json
    clips_file = output_path / split / "clips.json"
    with open(clips_file, 'r') as f:
        clips = json.load(f)
    
    # Find the clip index
    clip_idx = None
    for idx, clip in enumerate(clips):
        if clip.get("clip_name") == clip_name:
            clip_idx = idx
            break
    
    if clip_idx is not None:
        # Get clips before, current, and after
        start_idx = max(0, clip_idx - 1)
        end_idx = min(len(clips), clip_idx + 2)
        
        for idx in range(start_idx, end_idx):
            current_clip = clips[idx]
            is_corrupted = current_clip.get("clip_name") == clip_name
            
            print(f"\n{'-' * 80}")
            print(f"Clip #{idx}: {current_clip.get('clip_name')}")
            if is_corrupted:
                print("‚ö†Ô∏è  CORRUPTED FILE")
            print(f"Audio: {current_clip.get('audio')}")
            print(f"Has audio_img: {'audio_img' in current_clip}")
            print(f"-" * 80)
            
            # Try to load and display audio info
            audio_filename = current_clip.get("audio", "").split("/")[-1]
            audio_path = output_path / split / "audio" / audio_filename
            
            if audio_path.exists():
                try:
                    # Get audio info
                    waveform, sample_rate = torchaudio.load(str(audio_path))
                    duration_ms = (waveform.shape[1] / sample_rate) * 1000
                    
                    print(f"Sample rate: {sample_rate} Hz")
                    print(f"Channels: {waveform.shape[0]}")
                    print(f"Samples: {waveform.shape[1]}")
                    print(f"Duration: {duration_ms:.2f} ms")
                    print(f"Min value: {waveform.min():.4f}")
                    print(f"Max value: {waveform.max():.4f}")
                    
                    if is_corrupted:
                        print("\nüìä Audio Player:")
                        display(Audio(str(audio_path)))
                    
                except Exception as e:
                    print(f"‚ùå Error loading audio: {str(e)}")
            else:
                print(f"‚ùå Audio file not found: {audio_path}")

print(f"\n{'=' * 80}")
print("END OF ANALYSIS")
print('=' * 80)


ANALYZING CORRUPTED AUDIO FILES AND NEIGHBORS

SPLIT: TRAIN | CLIP: 20151107_2030_Stoke_City_1_0_Chelsea_half1_clip_0024

--------------------------------------------------------------------------------
Clip #2597: 20151107_2030_Stoke_City_1_0_Chelsea_half2_clip_0454
Audio: data/data/train/audio/20151107_2030_Stoke_City_1_0_Chelsea_half2_clip_0454.wav
Has audio_img: True
--------------------------------------------------------------------------------
Sample rate: 16000 Hz
Channels: 1
Samples: 128000
Duration: 8000.00 ms
Min value: -0.2256
Max value: 0.2514

--------------------------------------------------------------------------------
Clip #2598: 20151107_2030_Stoke_City_1_0_Chelsea_half1_clip_0024
‚ö†Ô∏è  CORRUPTED FILE
Audio: data/data/train/audio/20151107_2030_Stoke_City_1_0_Chelsea_half1_clip_0024.wav
Has audio_img: False
--------------------------------------------------------------------------------
Sample rate: 16000 Hz
Channels: 1
Samples: 0
Duration: 0.00 ms
‚ùå Error loadin


--------------------------------------------------------------------------------
Clip #383: 20150926_1700_Manchester_United_3_0_Sunderland_half1_clip_0112
Audio: data/data/test/audio/20150926_1700_Manchester_United_3_0_Sunderland_half1_clip_0112.wav
Has audio_img: True
--------------------------------------------------------------------------------
Sample rate: 16000 Hz
Channels: 1
Samples: 128000
Duration: 8000.00 ms
Min value: -0.3920
Max value: 0.3854

END OF ANALYSIS


In [80]:

# Remove the 3 corrupted audio clips from all data and update clips.json
print("=" * 80)
print("Removing corrupted clips from data...")
print("=" * 80)

corrupted_clips_to_remove = [
    ("train", "20151107_2030_Stoke_City_1_0_Chelsea_half1_clip_0024"),
    ("train", "20151107_2030_Stoke_City_1_0_Chelsea_half1_clip_0021"),
    ("test", "20150926_1700_Manchester_United_3_0_Sunderland_half2_clip_0710")
]

total_removed = 0

for split, clip_name_to_remove in corrupted_clips_to_remove:
    print(f"\n{'=' * 80}")
    print(f"Processing {split.upper()}: {clip_name_to_remove}")
    print('=' * 80)
    
    # Load clips.json
    clips_file = output_path / split / "clips.json"
    with open(clips_file, 'r') as f:
        clips = json.load(f)
    
    # Find and remove the clip
    clip_to_remove = None
    for clip in clips:
        if clip.get("clip_name") == clip_name_to_remove:
            clip_to_remove = clip
            clips.remove(clip)
            break
    
    if clip_to_remove:
        print(f"‚úì Found clip in clips.json")
        
        # Delete associated files
        files_to_delete = {
            "audio": clip_to_remove.get("audio"),
            "video": clip_to_remove.get("video"),
            "text": clip_to_remove.get("text"),
            "audio_img": clip_to_remove.get("audio_img")
        }
        
        for file_type, file_path in files_to_delete.items():
            if file_path:
                # Extract filename from path
                filename = file_path.split("/")[-1]
                full_path = output_path / split / file_type / filename
                
                if full_path.exists():
                    full_path.unlink()  # Delete file
                    print(f"  ‚úì Deleted {file_type}: {filename}")
                else:
                    print(f"  ‚ö†Ô∏è  {file_type} file not found: {filename}")
        
        # Write updated clips.json
        with open(clips_file, 'w') as f:
            json.dump(clips, f, indent=2)
        
        print(f"‚úì Updated clips.json ({len(clips)} clips remaining)")
        total_removed += 1
    else:
        print(f"‚ùå Clip not found in clips.json")

print(f"\n{'=' * 80}")
print(f"SUMMARY: {total_removed}/3 corrupted clips removed")
print('=' * 80)


Removing corrupted clips from data...

Processing TRAIN: 20151107_2030_Stoke_City_1_0_Chelsea_half1_clip_0024
‚úì Found clip in clips.json
  ‚úì Deleted audio: 20151107_2030_Stoke_City_1_0_Chelsea_half1_clip_0024.wav
  ‚úì Deleted video: 20151107_2030_Stoke_City_1_0_Chelsea_half1_clip_0024.mkv
  ‚úì Deleted text: 20151107_2030_Stoke_City_1_0_Chelsea_half1_clip_0024.txt
‚úì Updated clips.json (3112 clips remaining)

Processing TRAIN: 20151107_2030_Stoke_City_1_0_Chelsea_half1_clip_0021
‚úì Found clip in clips.json
  ‚úì Deleted audio: 20151107_2030_Stoke_City_1_0_Chelsea_half1_clip_0021.wav
  ‚úì Deleted video: 20151107_2030_Stoke_City_1_0_Chelsea_half1_clip_0021.mkv
  ‚úì Deleted text: 20151107_2030_Stoke_City_1_0_Chelsea_half1_clip_0021.txt
‚úì Updated clips.json (3111 clips remaining)

Processing TEST: 20150926_1700_Manchester_United_3_0_Sunderland_half2_clip_0710
‚úì Found clip in clips.json
  ‚úì Deleted audio: 20150926_1700_Manchester_United_3_0_Sunderland_half2_clip_0710.wav
  ‚ú