# YouTube Video Transcription Pipeline
## Optimized for Kaggle - No OOM Errors

This notebook:
- Downloads YouTube videos
- Extracts audio
- Transcribes using OpenAI Whisper (memory-efficient)
- Creates timestamped utterances
- Exports JSON output

In [None]:
# ====================================================================
# CELL 1: Install Dependencies
# ====================================================================
print("üì¶ Installing dependencies...")
!pip install -q yt-dlp openai-whisper
print("‚úÖ Dependencies installed!")

In [None]:
# ====================================================================
# CELL 2: Import Libraries
# ====================================================================
import os
import json
import subprocess
import gc
from pathlib import Path
import whisper
import torch

# Report which compute device is available. On a GPU machine also show the
# total memory of device 0 and release anything cached by earlier runs.
if not torch.cuda.is_available():
    print("üñ•Ô∏è  Device: CPU")
else:
    print("üñ•Ô∏è  Device: CUDA (GPU)")
    print(f"üíæ GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    torch.cuda.empty_cache()
    gc.collect()

In [None]:
# ====================================================================
# CELL 3: Configuration
# ====================================================================
# IMPORTANT: set this to the URL of the YouTube video you want to transcribe.
VIDEO_URL = "https://youtu.be/dQw4w9WgXcQ"  # Replace with your video

# Whisper model sizes (smaller = less memory):
#   "tiny"   - ~1GB VRAM, fastest, least accurate
#   "base"   - ~1GB VRAM, fast, decent accuracy (recommended for Kaggle)
#   "small"  - ~2GB VRAM, good accuracy
#   "medium" - ~5GB VRAM, better accuracy
#   "large"  - ~10GB VRAM, best accuracy (may OOM on Kaggle)

CONFIG = dict(
    model_size="base",  # Change to "small" or "medium" if you have enough memory
    language="en",      # "en" for English, "es" for Spanish, None for auto-detect
    device="cpu" if not torch.cuda.is_available() else "cuda",
    output_dir="/kaggle/working/output",
)

# Make sure the working directories used by the later cells exist.
os.makedirs("/kaggle/working/videos", exist_ok=True)
os.makedirs("/kaggle/working/audio", exist_ok=True)
os.makedirs(CONFIG["output_dir"], exist_ok=True)

# Echo the effective settings (a language of None is shown as auto-detect).
print(
    "‚úÖ Configuration loaded",
    f"   Model: {CONFIG['model_size']}",
    f"   Language: {CONFIG['language'] or 'auto-detect'}",
    f"   Device: {CONFIG['device']}",
    sep="\n",
)

In [None]:
# ====================================================================
# CELL 4: Download Video
# ====================================================================
def download_youtube_video(url: str):
    """Download a YouTube video with yt-dlp and return its path and metadata.

    Args:
        url: URL of the video to download.

    Returns:
        A ``(video_path, video_id, video_title, duration)`` tuple.
        ``video_path`` is the file name yt-dlp used for the download under
        ``/kaggle/working/videos``; ``duration`` is the length in whole
        seconds, or 0 when yt-dlp reports no duration.
    """
    print(f"üì• Downloading video from: {url}")

    import yt_dlp

    output_dir = "/kaggle/working/videos"

    ydl_opts = {
        # Prefer a single-file MP4, otherwise fall back to the best format
        # available (which may use a different container/extension).
        'format': 'best[ext=mp4]/best',
        'outtmpl': os.path.join(output_dir, '%(id)s.%(ext)s'),
        'quiet': True,
        'no_warnings': True,
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=True)
        # Ask yt-dlp for the name it actually used instead of assuming ".mp4":
        # the "/best" fallback above can yield e.g. a .webm file.
        video_path = ydl.prepare_filename(info)

    video_id = info['id']
    video_title = info.get('title') or 'Unknown'
    # The 'duration' key can be present but None (or a float); normalise it to
    # an int so the formatting below and callers' arithmetic cannot fail.
    duration = int(info.get('duration') or 0)

    print(f"‚úÖ Video downloaded: {video_title}")
    print(f"   Duration: {duration//60}m {duration%60}s")
    return video_path, video_id, video_title, duration

# Fetch the configured video; keep its path and metadata for the later cells
(video_path, video_id, video_title, video_duration) = download_youtube_video(url=VIDEO_URL)

In [None]:
# ====================================================================
# CELL 5: Extract Audio
# ====================================================================
def extract_audio(video_path: str, output_dir: str = "/kaggle/working/audio") -> str:
    """Extract the audio track of a video into a mono 16 kHz MP3 using ffmpeg.

    Args:
        video_path: Path of the source video file.
        output_dir: Directory the MP3 is written to; created if it does not
            exist yet.

    Returns:
        Path of the written file, ``<output_dir>/<video file stem>.mp3``.

    Raises:
        FileNotFoundError: If ``video_path`` is not an existing file.
        RuntimeError: If ffmpeg exits with a non-zero status; the message
            carries ffmpeg's stderr output.
    """
    # Fail early with a clear message rather than a long ffmpeg error dump.
    if not os.path.isfile(video_path):
        raise FileNotFoundError(f"Video file not found: {video_path}")

    print(f"üéµ Extracting audio from: {Path(video_path).name}")

    os.makedirs(output_dir, exist_ok=True)
    audio_path = os.path.join(output_dir, f"{Path(video_path).stem}.mp3")

    # Extract as MP3 (Whisper handles this well)
    cmd = [
        'ffmpeg', '-i', video_path,
        '-vn', '-acodec', 'libmp3lame',
        '-ar', '16000', '-ac', '1',
        '-b:a', '64k',
        '-y', audio_path
    ]

    # stdin is detached so ffmpeg never waits on (or consumes) the notebook's
    # standard input.
    result = subprocess.run(
        cmd,
        stdin=subprocess.DEVNULL,
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        raise RuntimeError(f"FFmpeg error: {result.stderr}")

    file_size = os.path.getsize(audio_path) / (1024*1024)
    print(f"‚úÖ Audio extracted: {file_size:.2f} MB")
    return audio_path

# Pull the audio track out of the downloaded video
audio_path = extract_audio(video_path=video_path)

In [None]:
# ====================================================================
# CELL 6: Transcribe Audio with Whisper
# ====================================================================
def transcribe_with_whisper(audio_path: str, model_size: str, language: str = None, device: str = None):
    """Transcribe an audio file with Whisper and free the model afterwards.

    Args:
        audio_path: Path of the audio file to transcribe.
        model_size: Name of the Whisper model to load (e.g. "base").
        language: Language code passed to Whisper, or None to pass no
            language.
        device: Device to load the model on. Defaults to ``CONFIG['device']``
            when None.

    Returns:
        The result returned by ``model.transcribe``; this function reads its
        ``'language'`` and ``'segments'`` entries.
    """
    if device is None:
        device = CONFIG['device']
    # Request FP16 only when the model really runs on a CUDA device; deriving
    # it from CUDA availability alone would ask for FP16 on a CPU model.
    use_fp16 = str(device).startswith("cuda")

    print(f"üé§ Loading Whisper model: {model_size}")
    model = whisper.load_model(model_size, device=device)

    try:
        print(f"üéØ Transcribing audio...")
        print("   This may take a few minutes depending on video length")

        # Transcribe with options
        result = model.transcribe(
            audio_path,
            language=language,
            fp16=use_fp16,
            verbose=False,
            word_timestamps=True  # Get word-level timestamps
        )
    finally:
        # Release the model even when transcription fails. Garbage is
        # collected first so empty_cache() can hand back the freed blocks.
        del model
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    print(f"‚úÖ Transcription complete!")
    print(f"   Detected language: {result['language']}")
    print(f"   Segments: {len(result['segments'])}")

    return result

# Run the transcription with the configured model size and language
transcription = transcribe_with_whisper(
    audio_path=audio_path,
    model_size=CONFIG['model_size'],
    language=CONFIG['language'],
)

In [None]:
# ====================================================================
# CELL 7: Process Results
# ====================================================================
def create_utterances(transcription_result):
    """Convert Whisper segments to a list of utterance dicts.

    Args:
        transcription_result: Mapping with a ``'segments'`` list. Each segment
            must provide ``'text'``, ``'start'`` and ``'end'`` (seconds) and
            may provide ``'confidence'``, ``'avg_logprob'`` and ``'words'``.

    Returns:
        One dict per segment with the keys ``text``, ``start_ms``, ``end_ms``,
        ``confidence``, ``speaker`` (always ``"default"``) and ``words``.

    Raises:
        KeyError: If ``'segments'`` or a required segment field is missing.
    """
    import math  # local import keeps this block self-contained

    utterances = []

    for segment in transcription_result['segments']:
        confidence = segment.get('confidence')
        if confidence is None:
            # Whisper segments expose 'avg_logprob' rather than 'confidence';
            # exp(avg_logprob) is used here as a heuristic score in [0, 1].
            avg_logprob = segment.get('avg_logprob')
            if avg_logprob is None or math.isnan(avg_logprob):
                confidence = 0.0
            else:
                confidence = min(math.exp(avg_logprob), 1.0)

        utterances.append({
            "text": segment['text'].strip(),
            # Round instead of truncating: 1.005 * 1000 is 1004.9999999999999
            # in floating point, which int() would turn into 1004 ms.
            "start_ms": int(round(float(segment['start']) * 1000)),
            "end_ms": int(round(float(segment['end']) * 1000)),
            "confidence": confidence,
            "speaker": "default",
            "words": segment.get('words', [])  # Word-level timestamps if available
        })

    return utterances

# Build the utterance list and keep the complete transcript text
utterances = create_utterances(transcription_result=transcription)
full_transcript = transcription['text']

print(f"‚úÖ Created {len(utterances)} utterances")
print("\nüìù Transcript preview (first 500 chars):")
print("=" * 70)
# Short transcripts are shown whole; longer ones are cut at 500 characters
# and marked with an ellipsis.
print(full_transcript if len(full_transcript) <= 500 else f"{full_transcript[:500]}...")
print("=" * 70)

In [None]:
# ====================================================================
# CELL 8: Save Results
# ====================================================================
# Gather everything the pipeline produced into one JSON-serialisable record
result = dict(
    video_id=video_id,
    video_title=video_title,
    video_url=VIDEO_URL,
    video_path=video_path,
    audio_path=audio_path,
    duration_ms=video_duration * 1000,
    full_transcript=full_transcript,
    utterances=utterances,
    utterance_count=len(utterances),
    model_used=f"whisper-{CONFIG['model_size']}",
    language=transcription['language'],
)

# Write the full record as pretty-printed UTF-8 JSON
output_file = "{}/transcript_{}.json".format(CONFIG['output_dir'], video_id)
with open(output_file, 'w', encoding='utf-8') as f:
    f.write(json.dumps(result, ensure_ascii=False, indent=2))

# Write the plain transcript text next to it
text_file = "{}/transcript_{}.txt".format(CONFIG['output_dir'], video_id)
with open(text_file, 'w', encoding='utf-8') as f:
    f.write(full_transcript)

# Size of the JSON file in MiB, for the summary below
file_size = os.stat(output_file).st_size / (1024 * 1024)

# Print the run summary in one call, one entry per output line
print(
    "\n" + "=" * 70,
    "‚úÖ TRANSCRIPTION COMPLETE!",
    "=" * 70,
    "üìä Results:",
    f"   - Video: {video_title}",
    f"   - Duration: {video_duration//60}m {video_duration%60}s",
    f"   - Language: {transcription['language']}",
    f"   - Utterances: {len(utterances)}",
    f"   - Transcript length: {len(full_transcript)} characters",
    "\nüíæ Files saved:",
    f"   - JSON: {output_file} ({file_size:.2f} MB)",
    f"   - Text: {text_file}",
    "=" * 70,
    sep="\n",
)

In [None]:
# ====================================================================
# CELL 9: Display Sample Results
# ====================================================================
print("\nüìÑ First 5 utterances:")
print("=" * 70)
for i, utt in enumerate(utterances[:5], start=1):
    # Timestamps are stored in milliseconds; show them in seconds
    print(f"\n[{i}] {utt['start_ms'] / 1000:.1f}s - {utt['end_ms'] / 1000:.1f}s")
    print(f"    {utt['text']}")
    # Skip the confidence line when the value is missing or zero
    if not utt.get('confidence'):
        continue
    print(f"    Confidence: {utt['confidence']:.2f}")

print(
    "\n" + "=" * 70,
    "‚úÖ All done! Download your files from /kaggle/working/output/",
    "=" * 70,
    sep="\n",
)

In [None]:
# ====================================================================
# CELL 10: Optional - Clean Up Large Files
# ====================================================================
# Optional cleanup: uncomment the lines below to delete the downloaded video
# and the extracted audio to save disk space. Only video_path and audio_path
# are removed by this snippet.

# import os
# if os.path.exists(video_path):
#     os.remove(video_path)
#     print(f"üóëÔ∏è  Deleted video: {video_path}")
# if os.path.exists(audio_path):
#     os.remove(audio_path)
#     print(f"üóëÔ∏è  Deleted audio: {audio_path}")

print("üí° Tip: Keep video/audio files if you need them for later processing")