# Whisper YouTube Transcript Extractor - Step by Step
# Run each cell individually to understand what's happening

In [13]:
# ============================================================================
# CELL 1: Import Required Libraries
# ============================================================================

import json
import os
import shutil
import tempfile
from pathlib import Path
from urllib.parse import parse_qs, urlparse

import pandas as pd
import whisper
import yt_dlp

# Confirm the environment is usable; openai-whisper may not expose __version__,
# in which case we just report that the package is available.
whisper_version = getattr(whisper, '__version__', "Available")
print("✅ All libraries imported successfully!")
print("📦 Whisper version:", whisper_version)

✅ All libraries imported successfully!
📦 Whisper version: 20240930


In [14]:
# ============================================================================
# CELL 2: Test Video Configuration
# ============================================================================

def extract_video_id(url: str) -> str:
    """Return the YouTube video ID contained in ``url``.

    Handles standard ``.../watch?v=<id>`` URLs and ``youtu.be/<id>`` short links.

    Args:
        url: A YouTube video URL.

    Returns:
        The video ID string.

    Raises:
        ValueError: If no video ID can be found in ``url``.
    """
    parsed = urlparse(url)
    host = (parsed.hostname or "").lower()
    if host in ("youtu.be", "www.youtu.be"):
        # Short links carry the ID as the path: https://youtu.be/<id>
        video_id = parsed.path.strip("/")
    else:
        # Regular links carry it in the "v" query parameter
        video_id = parse_qs(parsed.query).get("v", [""])[0]
    if not video_id:
        raise ValueError(f"Could not find a YouTube video ID in: {url}")
    return video_id


# Test with the NetworkChuck video that worked before
TEST_VIDEO_URL = "https://www.youtube.com/watch?v=5-5Mf_L0UKw"  # Python If/Else video
# Derived from the URL (instead of being typed twice) so that changing the URL
# can never leave a stale ID behind and mislabel the saved JSON/CSV files.
VIDEO_ID = extract_video_id(TEST_VIDEO_URL)

print(f"🎬 Test video: {TEST_VIDEO_URL}")
print(f"📍 Video ID: {VIDEO_ID}")

🎬 Test video: https://www.youtube.com/watch?v=5-5Mf_L0UKw
📍 Video ID: 5-5Mf_L0UKw


In [15]:
# ============================================================================
# CELL 3: Setup Directories
# ============================================================================

# Output locations (relative to the notebook's working directory):
# the cache holds the Whisper JSON, the CSV folder holds per-segment tables.
CACHE_DIR = Path("../data/test/transcript_cache")
CSV_DIR = Path("../data/test/transcripts_csv")

# Create both folders, including missing parents; existing folders are left alone
for output_dir in (CACHE_DIR, CSV_DIR):
    output_dir.mkdir(parents=True, exist_ok=True)

for label, output_dir in (("Cache", CACHE_DIR), ("CSV", CSV_DIR)):
    print(f"📁 {label} directory: {output_dir}")
print("✅ Directories ready!")

📁 Cache directory: ..\data\test\transcript_cache
📁 CSV directory: ..\data\test\transcripts_csv
✅ Directories ready!


In [16]:
# ============================================================================
# CELL 4: Initialize Whisper Model
# ============================================================================

# Single source of truth for the model size, so the messages below always
# describe the model that is actually loaded.
MODEL_NAME = "small"

print("🎤 Loading Whisper model...")
print("⏳ This might take a moment the first time...")

# Load the Whisper model (weights are downloaded on first use)
model = whisper.load_model(MODEL_NAME)

print("✅ Whisper model loaded successfully!")
print(f"📊 Model type: {MODEL_NAME}")
print("💡 Change MODEL_NAME to 'tiny' or 'base' for more speed, or 'medium' or 'large' for better quality")

🎤 Loading Whisper model...
⏳ This might take a moment the first time...


100%|███████████████████████████████████████| 461M/461M [00:42<00:00, 11.5MiB/s]


✅ Whisper model loaded successfully!
📊 Model type: tiny (fast but less accurate)
💡 You can change to 'base', 'small', 'medium', or 'large' for better quality


In [17]:
# ============================================================================
# CELL 5: Get Video Information
# ============================================================================

print(f"📱 Getting video information for: {TEST_VIDEO_URL}")

# Metadata lookup only, so keep yt-dlp's own logging silent
ydl_opts = dict(quiet=True, no_warnings=True)

try:
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        # download=False: fetch the metadata without downloading any media
        video_info = ydl.extract_info(TEST_VIDEO_URL, download=False)

    print("✅ Video information retrieved!")
    print(
        f"🎬 Title: {video_info.get('title', 'Unknown')}",
        f"👤 Uploader: {video_info.get('uploader', 'Unknown')}",
        f"⏱️ Duration: {video_info.get('duration', 0)} seconds",
        f"📅 Upload date: {video_info.get('upload_date', 'Unknown')}",
        sep="\n",
    )

except Exception as err:
    # Later cells treat video_info=None as "metadata unavailable"
    print(f"❌ Error getting video info: {err}")
    video_info = None

📱 Getting video information for: https://www.youtube.com/watch?v=5-5Mf_L0UKw
✅ Video information retrieved!
🎬 Title: If Else Statements in Python // Python RIGHT NOW!! // EP 4
👤 Uploader: NetworkChuck
⏱️ Duration: 859 seconds
📅 Upload date: 20220207


In [18]:
# ============================================================================
# CELL 6: Download Audio
# ============================================================================

print("📥 Downloading audio...")
print("⏳ This will take a moment depending on video length...")

# Create temporary directory for audio (removed in Cell 11, or below on failure)
temp_dir = tempfile.mkdtemp()
audio_file = None

try:
    # Configure yt-dlp for audio download
    audio_opts = {
        'format': 'bestaudio[ext=webm]/bestaudio/best',
        'outtmpl': os.path.join(temp_dir, f'{VIDEO_ID}.%(ext)s'),
        'quiet': False,  # Show download progress
        'no_warnings': False,
    }

    with yt_dlp.YoutubeDL(audio_opts) as ydl:
        ydl.download([TEST_VIDEO_URL])

    # Find the downloaded file by name rather than by a fixed extension list:
    # 'bestaudio/best' may yield containers such as .opus or .ogg as well.
    # Matching on the stem also skips partial files like "<id>.webm.part".
    downloaded = sorted(
        path for path in Path(temp_dir).iterdir()
        if path.is_file() and path.stem == VIDEO_ID
    )
    if downloaded:
        # Keep a plain string path, as the later cells expect
        audio_file = str(downloaded[0])

    if audio_file:
        file_size = os.path.getsize(audio_file) / (1024 * 1024)  # Size in MB
        print(f"✅ Audio downloaded successfully!")
        print(f"📁 File: {audio_file}")
        print(f"📊 Size: {file_size:.1f} MB")
    else:
        print("❌ No audio file found after download")

except Exception as e:
    print(f"❌ Error downloading audio: {e}")
    audio_file = None

if audio_file is None:
    # Nothing usable was produced: remove the temp directory now so a failed
    # download does not leave it (and any partial files) behind.
    shutil.rmtree(temp_dir, ignore_errors=True)

📥 Downloading audio...
⏳ This will take a moment depending on video length...
[youtube] Extracting URL: https://www.youtube.com/watch?v=5-5Mf_L0UKw
[youtube] 5-5Mf_L0UKw: Downloading webpage
[youtube] 5-5Mf_L0UKw: Downloading tv client config
[youtube] 5-5Mf_L0UKw: Downloading tv player API JSON
[youtube] 5-5Mf_L0UKw: Downloading ios player API JSON
[youtube] 5-5Mf_L0UKw: Downloading m3u8 information
[info] 5-5Mf_L0UKw: Downloading 1 format(s): 251
[download] Destination: C:\Users\jeand\AppData\Local\Temp\tmp6yjflt3u\5-5Mf_L0UKw.webm
[download] 100% of   11.98MiB in 00:00:01 at 7.95MiB/s     
✅ Audio downloaded successfully!
📁 File: C:\Users\jeand\AppData\Local\Temp\tmp6yjflt3u\5-5Mf_L0UKw.webm
📊 Size: 12.0 MB


In [19]:
# ============================================================================
# CELL 7: Transcribe with Whisper
# ============================================================================

if not (audio_file and os.path.exists(audio_file)):
    # Nothing to transcribe; downstream cells check `result` before using it
    print("❌ No audio file available for transcription")
    result = None
else:
    print("🎤 Starting Whisper transcription...")
    print("⏳ This will take a few minutes...")

    try:
        # Run Whisper on the downloaded audio file
        result = model.transcribe(audio_file)

        print("✅ Transcription completed!")
        print(f"🗣️ Detected language: {result['language']}")
        print(f"📝 Number of segments: {len(result['segments'])}")

        # Preview: the first three segments with their start/end timestamps
        print("\n📄 Preview (first 3 segments):")
        print("-" * 50)
        for segment in result['segments'][:3]:
            print(f"[{segment['start']:06.2f} - {segment['end']:06.2f}] {segment['text'].strip()}")

        remaining = len(result['segments']) - 3
        if remaining > 0:
            print(f"... and {remaining} more segments")

    except Exception as err:
        print(f"❌ Error during transcription: {err}")
        result = None

🎤 Starting Whisper transcription...
⏳ This will take a few minutes...




✅ Transcription completed!
🗣️ Detected language: en
📝 Number of segments: 260

📄 Preview (first 3 segments):
--------------------------------------------------
[000.00 - 003.90] We're about to use one of the most powerful and fun concepts in Python.
[003.94 - 006.94] And we're going to use that power to prevent Ben, evil Ben
[008.54 - 012.10] from entering our coffee shop because we don't want Ben here. Get out of here, Ben.
... and 257 more segments


In [20]:
# ============================================================================
# CELL 8: Save Transcript as JSON
# ============================================================================

if result:
    # Fall back to an empty mapping when the metadata lookup failed, so every
    # field below resolves to its default.
    info = video_info or {}
    info_defaults = (
        ('title', 'Unknown'),
        ('uploader', 'Unknown'),
        ('duration', 0),
        ('upload_date', 'Unknown'),
    )

    # Bundle video metadata with the Whisper output
    transcript_data = {
        'video_id': VIDEO_ID,
        'video_url': TEST_VIDEO_URL,
        'video_info': {field: info.get(field, default) for field, default in info_defaults},
        'whisper_result': result,
        'language': result['language'],
        'segments': result['segments'],
        'full_text': result['text']
    }

    # Destination: one JSON file per video in the cache directory
    json_file = CACHE_DIR / f"{VIDEO_ID}.json"

    try:
        with json_file.open('w', encoding='utf-8') as fh:
            # ensure_ascii=False keeps non-ASCII characters readable in the file
            json.dump(transcript_data, fh, indent=2, ensure_ascii=False)

        size_kb = json_file.stat().st_size / 1024
        print("✅ Transcript saved to JSON!")
        print(f"📁 File: {json_file}")
        print(f"📊 File size: {size_kb:.1f} KB")

    except Exception as err:
        print(f"❌ Error saving JSON: {err}")

✅ Transcript saved to JSON!
📁 File: ..\data\test\transcript_cache\5-5Mf_L0UKw.json
📊 File size: 362.1 KB


In [21]:
# ============================================================================
# CELL 9: Load and Inspect JSON File
# ============================================================================

# Path of the transcript JSON for the current video
json_file = CACHE_DIR / f"{VIDEO_ID}.json"

if not json_file.exists():
    print("❌ JSON file not found!")
else:
    print("🔍 Loading and inspecting saved JSON file...")

    loaded_data = json.loads(json_file.read_text(encoding='utf-8'))

    print("✅ JSON file loaded successfully!")
    print("\n📊 JSON Structure:")
    print("-" * 30)
    # Summarise each top-level key: sizes for the bulky ones, type names otherwise
    for key, value in loaded_data.items():
        if key == 'segments':
            print(f"📝 {key}: {len(value)} items")
        elif key == 'full_text':
            print(f"📄 {key}: {len(value)} characters")
        else:
            print(f"🏷️ {key}: {type(value).__name__}")

    # Video metadata stored alongside the transcript
    stored_info = loaded_data['video_info']
    print(f"\n🎬 Video Information:")
    print(f"   Title: {stored_info['title']}")
    print(f"   Duration: {stored_info['duration']} seconds")
    print(f"   Language: {loaded_data['language']}")

    # Short preview of the full transcript text
    full_text = loaded_data['full_text']
    print(f"\n📄 Full Text Preview (first 200 characters):")
    print(f"'{full_text[:200]}...'")

🔍 Loading and inspecting saved JSON file...
✅ JSON file loaded successfully!

📊 JSON Structure:
------------------------------
🏷️ video_id: str
🏷️ video_url: str
🏷️ video_info: dict
🏷️ whisper_result: dict
🏷️ language: str
📝 segments: 260 items
📄 full_text: 16009 characters

🎬 Video Information:
   Title: If Else Statements in Python // Python RIGHT NOW!! // EP 4
   Duration: 859 seconds
   Language: en

📄 Full Text Preview (first 200 characters):
' We're about to use one of the most powerful and fun concepts in Python. And we're going to use that power to prevent Ben, evil Ben from entering our coffee shop because we don't want Ben here. Get ou...'


In [22]:
# ============================================================================
# CELL 10: Convert to CSV Format
# ============================================================================

if json_file.exists():
    print("📊 Converting to CSV format...")

    # One record per Whisper segment, tagged with the video it came from
    segments_data = [
        dict(
            segment_id=index,
            start_time=segment['start'],
            end_time=segment['end'],
            duration=segment['end'] - segment['start'],
            text=segment['text'].strip(),
            video_id=loaded_data['video_id'],
            video_title=loaded_data['video_info']['title'],
            video_url=loaded_data['video_url'],
        )
        for index, segment in enumerate(loaded_data['segments'])
    ]

    # Build the table in one go from the list of records
    df = pd.DataFrame(segments_data)

    # Write the table next to the other CSV transcripts
    csv_file = CSV_DIR / f"transcript_{VIDEO_ID}.csv"
    df.to_csv(csv_file, index=False, encoding='utf-8')

    print(f"✅ CSV file created!")
    print(f"📁 File: {csv_file}")
    print(f"📊 Rows: {len(df)}")

    # Preview: timing and text columns of the first three rows
    preview_columns = ['start_time', 'end_time', 'text']
    print(f"\n📄 CSV Preview (first 3 rows):")
    print(df.head(3)[preview_columns].to_string())

📊 Converting to CSV format...
✅ CSV file created!
📁 File: ..\data\test\transcripts_csv\transcript_5-5Mf_L0UKw.csv
📊 Rows: 260

📄 CSV Preview (first 3 rows):
   start_time  end_time                                                                                 text
0        0.00      3.90              We're about to use one of the most powerful and fun concepts in Python.
1        3.94      6.94                           And we're going to use that power to prevent Ben, evil Ben
2        8.54     12.10  from entering our coffee shop because we don't want Ben here. Get out of here, Ben.


In [23]:
# ============================================================================
# CELL 11: Cleanup Temporary Files
# ============================================================================

# Remove the whole temporary download directory rather than just the audio
# file: rmtree also clears leftover partial downloads (os.rmdir fails on a
# non-empty directory) and still runs when no audio file was found.
if temp_dir and os.path.isdir(temp_dir):
    try:
        shutil.rmtree(temp_dir)
        print("🗑️ Temporary audio file cleaned up")
    except OSError as e:
        # Best effort: report the reason instead of silently swallowing it
        print(f"⚠️ Could not clean up temporary files: {e}")

# Build the output paths with pathlib so separators are consistent on every OS
expected_outputs = [
    ("📄 JSON transcript", CACHE_DIR / f"{VIDEO_ID}.json"),
    ("📊 CSV transcript", CSV_DIR / f"transcript_{VIDEO_ID}.csv"),
]

# Only claim success when the files are really there
if all(path.exists() for _, path in expected_outputs):
    print("\n🎉 Complete! You now have:")
else:
    print("\n⚠️ Finished, but some outputs are missing:")

for label, path in expected_outputs:
    status = "" if path.exists() else "  (not found)"
    print(f"   {label}: {path}{status}")

🗑️ Temporary audio file cleaned up

🎉 Complete! You now have:
   📄 JSON transcript: ..\data\test\transcript_cache/5-5Mf_L0UKw.json
   📊 CSV transcript: ..\data\test\transcripts_csv/transcript_5-5Mf_L0UKw.csv


In [24]:
# ============================================================================
# CELL 12: Summary and Next Steps
# ============================================================================

print("\n🎯 SUMMARY:")
print("="*50)
print("✅ Whisper successfully extracted transcript")
print("✅ Saved in both JSON and CSV formats")
print("✅ Ready to process more videos!")

print("\n🚀 NEXT STEPS:")
print("1. 🔍 Examine the JSON and CSV files")
print("2. 📹 Add more NetworkChuck video URLs")
print("3. 🔄 Create batch processing script")
print("4. 🤖 Build the AI chatbot with embeddings")

# The video settings live in Cell 2 (TEST_VIDEO_URL / VIDEO_ID), so that cell
# has to be re-run too; there is no variable called VIDEO_URL in this notebook.
print("\n💡 TIP: To process a different video, update the video settings in Cell 2 (TEST_VIDEO_URL / VIDEO_ID), then re-run Cell 2 and Cells 5-11!")


🎯 SUMMARY:
✅ Whisper successfully extracted transcript
✅ Saved in both JSON and CSV formats
✅ Ready to process more videos!

🚀 NEXT STEPS:
1. 🔍 Examine the JSON and CSV files
2. 📹 Add more NetworkChuck video URLs
3. 🔄 Create batch processing script
4. 🤖 Build the AI chatbot with embeddings

💡 TIP: You can now modify the VIDEO_URL and re-run cells 5-11 to process different videos!
