In [1]:
# Mount Google Drive at /content/drive for this Colab session.
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Step 1: Install Required Packages (Run once per session)
!pip install transformers torchaudio pydub ffmpeg-python
!apt-get install -y ffmpeg

# Step 2: Import Libraries
from transformers import AutoModelForSpeechSeq2Seq, AutoTokenizer, pipeline
import torch
from pydub import AudioSegment
from pydub.utils import mediainfo
import os

# Step 3: Extract Audio from Video
def extract_audio_from_video(video_path, audio_output_path, video_format=None):
    """Extract the audio track of a video file and write it out as a WAV file.

    Args:
        video_path: Path of the input video file.
        audio_output_path: Path the WAV file is written to.
        video_format: Optional container format name forwarded to
            ``AudioSegment.from_file`` (e.g. ``"mp4"``). When left as ``None``
            the format is not forced, so containers other than MP4 can be
            decoded as well.

    Returns:
        None. The audio is written to ``audio_output_path`` and a confirmation
        line is printed.
    """
    # Decode the input; pydub delegates the actual decoding to ffmpeg.
    audio = AudioSegment.from_file(video_path, format=video_format)
    # Export as WAV, the format handed to the transcription step.
    audio.export(audio_output_path, format="wav")
    print(f"✅ Audio extracted to {audio_output_path}")

# Step 4: Transcribe Audio using Whisper (with long-form fix)
def transcribe_audio_with_whisper(audio_path, model_name="openai/whisper-base"):
    """Transcribe an audio file with a Whisper checkpoint and return the text.

    The model, tokenizer and feature extractor are all resolved from the same
    ``model_name`` so they cannot drift apart when the checkpoint is changed.
    Note that the model is loaded on every call.

    Args:
        audio_path: Path of the audio file passed to the pipeline.
        model_name: Checkpoint identifier handed to ``from_pretrained`` and to
            the pipeline's ``feature_extractor`` argument. Defaults to
            ``"openai/whisper-base"``.

    Returns:
        The ``"text"`` entry of the pipeline result. The same text is also
        printed.
    """
    # Load pre-trained Whisper model and tokenizer
    model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # The pipeline takes a device index: 0 selects the first GPU, -1 the CPU.
    device_index = 0 if torch.cuda.is_available() else -1
    pipe = pipeline(
        task="automatic-speech-recognition",
        model=model,
        tokenizer=tokenizer,
        feature_extractor=model_name,
        device=device_index,
        generate_kwargs={"return_timestamps": True},  # 🔑 Required for long audio
    )

    # Transcribe
    result = pipe(audio_path)
    transcription = result["text"]
    print("📝 Transcription:")
    print(transcription)
    return transcription

# Step 5: Full Workflow
def video_to_text(video_path):
    """Run the full workflow: extract a video's audio, transcribe it, clean up.

    The audio is written to ``extracted_audio.wav`` in the current working
    directory and removed again before returning, whether or not the
    extraction or transcription succeeded.

    Args:
        video_path: Path of the input video file.

    Returns:
        The transcription returned by ``transcribe_audio_with_whisper``.
    """
    audio_path = "extracted_audio.wav"
    try:
        extract_audio_from_video(video_path, audio_path)
        return transcribe_audio_with_whisper(audio_path)
    finally:
        # Clean up even when a step above raised. The file may not exist if
        # extraction failed, so check first to avoid masking the real error.
        if os.path.exists(audio_path):
            os.remove(audio_path)

# Step 6: Run the Pipeline
if __name__ == "__main__":
    # 📁 Point this at the video to transcribe.
    video_file = "/content/sample_data/videoplayback.mp4"  # Replace with your actual path

    # Report a missing input up front; otherwise run the whole pipeline.
    if not os.path.exists(video_file):
        print("❌ File NOT found. Check the path:", video_file)
    else:
        final_text = video_to_text(video_file)
        print("\n🎉 Final Transcription:\n", final_text)

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting ffmpeg-python
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.98k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/290M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.81k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

Device set to use cuda:0
Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.


📝 Transcription:
 These are the five steps I always use when building AI agents and they've helped me generate over $240,000 in the past six months. In this video, I'm going to be giving you the exact framework that I use so you can start to build some powerful AI systems even if you're not technical. I use this method to build all of the automations you've seen on my channel for the past seven months, but I've never really talked about the actual methodology. So let's get into it. Starting with step one is the foundations. Don't run before you walk. I see way too many people trying to jump into a very complex AI agent system, but they I still don't understand how to set up an API call. So before I started building agents, I wanted to make sure I understood the foundational elements. So I'm sure we all know what large language models are, but it's really important to at least at a high level understand how they work. You also want to understand some data foundations, APIs and HTTP requ

# New Section