# Install required packages
import subprocess
import sys

def install_package(package):
    try:
        __import__(package.split('[')[0])  # Handle packages with extras like torch[cuda]
    except ImportError:
        print(f"Installing {package}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])

# Install required packages for transcription
packages = ['faster-whisper', 'torch', 'torchaudio']
for package in packages:
    install_package(package)

print("All required packages are installed!")

In [None]:
import os
import glob
from pathlib import Path
from faster_whisper import WhisperModel
import time
from tqdm import tqdm
import torch

# Configuration
EPISODES_DIR = "history_of_rome_episodes"  # Directory with MP3 files
TRANSCRIPTS_DIR = "all_transcripts"    # Directory to save transcripts
MODEL_SIZE = "medium"  # Options: tiny, base, small, medium, large-v2, large-v3

# Create transcripts directory
os.makedirs(TRANSCRIPTS_DIR, exist_ok=True)

# Check CUDA availability and clear cache
def setup_device():
    """Setup the best available device for transcription"""
    # Force CPU mode to avoid GPU crashes
    print("🔧 Using CPU mode to avoid GPU crashes")
    print("This will be slower but more stable")
    return "cpu", "int8"
    
    # Original GPU detection code (commented out for stability)
    # if torch.cuda.is_available():
    #     print(f"CUDA is available. GPU: {torch.cuda.get_device_name(0)}")
    #     print(f"CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
    #     
    #     # Clear CUDA cache to free up memory
    #     torch.cuda.empty_cache()
    #     
    #     try:
    #         # Test CUDA with a simple operation
    #         test_tensor = torch.tensor([1.0]).cuda()
    #         del test_tensor
    #         torch.cuda.empty_cache()
    #         
    #         print("CUDA test successful - using GPU")
    #         return "cuda", "float16"
    #     except Exception as e:
    #         print(f"CUDA test failed: {e}")
    #         print("Falling back to CPU")
    #         return "cpu", "int8"
    # else:
    #     print("CUDA not available - using CPU")
    #     return "cpu", "int8"

# Setup device
DEVICE, COMPUTE_TYPE = setup_device()

# Initialize the Whisper model with error handling
print(f"\nLoading Whisper model: {MODEL_SIZE}")
print(f"Device: {DEVICE}, Compute type: {COMPUTE_TYPE}")

try:
    model = WhisperModel(MODEL_SIZE, device=DEVICE, compute_type=COMPUTE_TYPE)
    print("✓ Model loaded successfully!")
except Exception as e:
    print(f"✗ Failed to load model with {DEVICE}: {e}")
    print("Trying with CPU fallback...")
    
    try:
        model = WhisperModel(MODEL_SIZE, device="cpu", compute_type="int8")
        DEVICE = "cpu"
        COMPUTE_TYPE = "int8"
        print("✓ Model loaded successfully on CPU!")
    except Exception as e2:
        print(f"✗ Failed to load model on CPU: {e2}")
        print("Trying with smaller model...")
        model = WhisperModel("small", device="cpu", compute_type="int8")
        MODEL_SIZE = "small"
        DEVICE = "cpu"
        COMPUTE_TYPE = "int8"
        print("✓ Small model loaded successfully on CPU!")

# Find all MP3 files
mp3_files = glob.glob(os.path.join(EPISODES_DIR, "*.mp3"))
mp3_files.sort()  # Sort to process in order

print(f"\nFound {len(mp3_files)} MP3 files to transcribe")

if len(mp3_files) == 0:
    print(f"No MP3 files found in {EPISODES_DIR} directory!")
    print("Make sure you've downloaded the episodes first using the pull_episodes.ipynb notebook.")
else:
    print("First few files:")
    for i, file in enumerate(mp3_files[:5]):
        print(f"  {i+1}. {os.path.basename(file)}")
    if len(mp3_files) > 5:
        print(f"  ... and {len(mp3_files) - 5} more files")

# Performance estimates
if DEVICE == "cuda":
    print(f"\n🚀 GPU Performance estimate: ~2-4x real-time (30min episode = 7-15min)")
else:
    print(f"\n⏱️ CPU Performance estimate: ~0.5-1x real-time (30min episode = 30-60min)")
    
print(f"Total estimated time for all episodes: {len(mp3_files) * 0.5 if DEVICE == 'cuda' else len(mp3_files) * 2:.1f} hours")


  from .autonotebook import tqdm as notebook_tqdm


🔧 Using CPU mode to avoid GPU crashes
This will be slower but more stable

Loading Whisper model: medium
Device: cpu, Compute type: int8
✓ Model loaded successfully!

Found 192 MP3 files to transcribe
First few files:
  1. 20070728 - 001- In the Beginning.mp3
  2. 20100225 - 002- Youthful Indiscretions.mp3
  3. 20100225 - 003a- The Seven Kings of Rome.mp3
  4. 20100225 - 003b- The Seven Kings of Rome.mp3
  5. 20100225 - 004- The Public Thing.mp3
  ... and 187 more files

⏱️ CPU Performance estimate: ~0.5-1x real-time (30min episode = 30-60min)
Total estimated time for all episodes: 384.0 hours


In [None]:
def transcribe_episode(audio_file_path, output_file_path):
    """Transcribe a single episode and save with timestamps"""
    try:
        print(f"Transcribing: {os.path.basename(audio_file_path)}")
        start_time = time.time()
        
        # Simplified transcription settings to avoid crashes
        segments, info = model.transcribe(
            audio_file_path, 
            beam_size=3,  # Reduced beam size for stability
            word_timestamps=True,
            language="en",  # Force English for History of Rome
            # Removed VAD filter as it might cause issues
        )
        
        # Write results with timestamps
        with open(output_file_path, "w", encoding="utf-8") as f:
            # Write header with episode info
            episode_name = os.path.basename(audio_file_path).replace('.mp3', '')
            f.write(f"# {episode_name}\n")
            f.write(f"# Detected language: {info.language}\n")
            f.write(f"# Duration: {info.duration:.2f} seconds\n")
            f.write(f"# Model: {MODEL_SIZE}, Device: {DEVICE}\n\n")
            
            # Write timestamped segments
            segment_count = 0
            for segment in segments:
                # Format timestamps as [MM:SS --> MM:SS]
                start_min, start_sec = divmod(int(segment.start), 60)
                end_min, end_sec = divmod(int(segment.end), 60)
                
                timestamp = f"[{start_min:02d}:{start_sec:02d} --> {end_min:02d}:{end_sec:02d}]"
                line = f"{timestamp} {segment.text.strip()}"
                f.write(line + "\n")
                segment_count += 1
        
        elapsed_time = time.time() - start_time
        file_size = os.path.getsize(output_file_path) / 1024  # KB
        print(f"✓ Completed in {elapsed_time:.1f}s: {os.path.basename(output_file_path)} ({file_size:.1f} KB, {segment_count} segments)")
        return True
        
    except Exception as e:
        print(f"✗ Failed to transcribe {os.path.basename(audio_file_path)}: {str(e)}")
        # Remove partial file if it exists
        if os.path.exists(output_file_path):
            os.remove(output_file_path)
        return False

# Process all episodes
if len(mp3_files) > 0:
    print(f"\nStarting transcription of {len(mp3_files)} episodes...")
    print("=" * 70)
    
    successful_transcriptions = 0
    failed_transcriptions = 0
    total_start_time = time.time()
    
    for i, mp3_file in enumerate(mp3_files, 1):
        # Create output filename
        episode_name = os.path.basename(mp3_file).replace('.mp3', '')
        output_file = os.path.join(TRANSCRIPTS_DIR, f"{episode_name}.txt")
        
        # Skip if transcript already exists
        if os.path.exists(output_file):
            print(f"[{i}/{len(mp3_files)}] Skipping {episode_name} - transcript already exists")
            successful_transcriptions += 1
            continue
        
        print(f"\n[{i}/{len(mp3_files)}] Processing: {episode_name}")
        
        if transcribe_episode(mp3_file, output_file):
            successful_transcriptions += 1
        else:
            failed_transcriptions += 1
        
        # Small delay between files
        time.sleep(1)
    
    total_elapsed = time.time() - total_start_time
    
    print("\n" + "=" * 70)
    print(f"Transcription Summary:")
    print(f"✓ Successful: {successful_transcriptions}")
    print(f"✗ Failed: {failed_transcriptions}")
    print(f"⏱️ Total time: {total_elapsed/60:.1f} minutes")
    print(f"📁 Transcripts saved to: {os.path.abspath(TRANSCRIPTS_DIR)}")
    
    # List some completed transcripts
    transcript_files = [f for f in os.listdir(TRANSCRIPTS_DIR) if f.endswith('.txt')]
    print(f"\nCompleted transcripts ({len(transcript_files)} files):")
    for file in sorted(transcript_files)[:10]:  # Show first 10
        file_path = os.path.join(TRANSCRIPTS_DIR, file)
        file_size = os.path.getsize(file_path) / 1024  # Size in KB
        print(f"  • {file} ({file_size:.1f} KB)")
    if len(transcript_files) > 10:
        print(f"  ... and {len(transcript_files) - 10} more files")
else:
    print("No MP3 files to process!")



Starting transcription of 192 episodes...
[1/192] Skipping 20070728 - 001- In the Beginning - transcript already exists
[2/192] Skipping 20100225 - 002- Youthful Indiscretions - transcript already exists

[3/192] Processing: 20100225 - 003a- The Seven Kings of Rome
Transcribing: 20100225 - 003a- The Seven Kings of Rome.mp3


In [1]:
# GPU Troubleshooting and Process Management
import subprocess
import psutil

def check_gpu_processes():
    """Check what processes are using the GPU"""
    try:
        result = subprocess.run(['nvidia-smi'], capture_output=True, text=True)
        if result.returncode == 0:
            print("Current GPU status:")
            print(result.stdout)
        else:
            print("nvidia-smi not available or failed")
    except FileNotFoundError:
        print("nvidia-smi not found - NVIDIA drivers may not be installed")

def kill_gpu_processes():
    """Kill processes that might be hogging the GPU"""
    gpu_processes = []
    for proc in psutil.process_iter(['pid', 'name', 'cmdline']):
        try:
            # Look for common GPU-using processes
            if proc.info['name'] in ['python', 'jupyter-lab', 'jupyter', 'code']:
                cmdline = ' '.join(proc.info['cmdline'] or [])
                if any(keyword in cmdline.lower() for keyword in ['torch', 'cuda', 'gpu', 'whisper', 'ml', 'tensorflow']):
                    gpu_processes.append(proc)
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            continue
    
    if gpu_processes:
        print(f"Found {len(gpu_processes)} potential GPU processes:")
        for proc in gpu_processes:
            print(f"  PID {proc.info['pid']}: {proc.info['name']}")
        
        response = input("Kill these processes? (y/N): ").lower()
        if response == 'y':
            for proc in gpu_processes:
                try:
                    proc.terminate()
                    print(f"Terminated PID {proc.info['pid']}")
                except:
                    print(f"Failed to terminate PID {proc.info['pid']}")
    else:
        print("No obvious GPU processes found")

print("🔧 GPU Troubleshooting Tools")
print("=" * 40)
check_gpu_processes()
print("\n" + "=" * 40)
print("If you're getting CUDA errors, you can:")
print("1. Restart your Jupyter kernel (Kernel -> Restart)")
print("2. Run kill_gpu_processes() to free up GPU memory")
print("3. The script will automatically fall back to CPU if needed")

# Uncomment the next line if you want to automatically kill GPU processes
# kill_gpu_processes()


🔧 GPU Troubleshooting Tools
Current GPU status:
Sun Sep 28 11:24:38 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.247.01             Driver Version: 535.247.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3050        Off | 00000000:08:00.0  On |                  N/A |
|  0%   44C    P3              20W / 130W |    281MiB /  8192MiB |     22%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                    

In [None]:
# Test with a single episode first to make sure everything works
print("🧪 Testing with a single episode first...")

if len(mp3_files) > 0:
    test_file = mp3_files[0]  # First episode
    test_output = os.path.join(TRANSCRIPTS_DIR, f"TEST_{os.path.basename(test_file).replace('.mp3', '.txt')}")
    
    print(f"Testing with: {os.path.basename(test_file)}")
    
    if transcribe_episode(test_file, test_output):
        print("✅ Test successful! You can now run the full batch.")
        
        # Show a sample of the transcript
        if os.path.exists(test_output):
            with open(test_output, 'r', encoding='utf-8') as f:
                lines = f.readlines()
                print("\n📄 Sample transcript:")
                for line in lines[:10]:  # Show first 10 lines
                    print(f"  {line.strip()}")
                if len(lines) > 10:
                    print(f"  ... and {len(lines) - 10} more lines")
    else:
        print("❌ Test failed. Check the error messages above.")
else:
    print("No MP3 files found for testing.")
