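To get past YouTube's bot checks, you need to export your browser cookies manually to a `cookies.txt` file and place it at `<workspace_dir>/cookies/cookies.txt`, which is where the code below looks for it.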

In [0]:
%pip install yt-dlp imageio-ffmpeg openai-whisper torch
%restart_python

In [0]:
import yt_dlp
import imageio_ffmpeg
import os
import subprocess
import shutil
from IPython.display import display, FileLink
import whisper
import whisper.audio
import numpy as np
import tempfile
import torch
import subprocess

# Set up paths
# Path of the ffmpeg executable reported by imageio-ffmpeg; it is passed to
# yt-dlp and used as the command for every ffmpeg subprocess below.
ffmpeg_path = imageio_ffmpeg.get_ffmpeg_exe()

# Constants
# Video whose audio is downloaded and transcribed.
youtube_url = "https://www.youtube.com/watch?v=oG-YaIlIj-4"
# Directory that receives the downloaded audio, the WAV file and text.txt.
workspace_dir = "/Workspace/Users/User"
# NOTE(review): not referenced by any code visible in this notebook — confirm
# it is unused before removing it.
local_tmp_path = "/tmp/text.txt"
# Cookies file handed to yt-dlp through its 'cookiefile' option.
cookies_file = os.path.join(workspace_dir, "cookies", "cookies.txt")
# Whisper "base" model, loaded once here and reused by main().
model = whisper.load_model("base")

In [0]:
def download_youtube_audio_yt_dlp(url, output_dir=workspace_dir, audio_format="m4a"):
    """Download a video's audio track with yt-dlp and return the audio file path.

    Args:
        url: Address of the video to fetch.
        output_dir: Directory the file is written to; the file is named after
            the video title.
        audio_format: Codec requested from the FFmpegExtractAudio
            post-processor; also used as the extension of the returned path.

    Returns:
        The file name yt-dlp prepared for the download, with its extension
        replaced by ``audio_format``.
    """
    extract_audio = {
        'key': 'FFmpegExtractAudio',
        'preferredcodec': audio_format,
        'preferredquality': '192',
    }
    ydl_opts = dict(
        format='bestaudio/best',
        outtmpl=os.path.join(output_dir, '%(title)s.%(ext)s'),
        cookiefile=cookies_file,
        postprocessors=[extract_audio],
        ffmpeg_location=ffmpeg_path,
        quiet=False,
    )

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=True)
        downloaded_name = ydl.prepare_filename(info)

    # The prepared name carries the extension of the downloaded stream; the
    # returned path uses the requested audio format instead.
    stem = downloaded_name.rsplit(".", 1)[0]
    return f"{stem}.{audio_format}"

def convert_to_wav_ffmpeg(input_path):
    """Convert an audio file to WAV with ffmpeg and return the WAV path.

    The WAV file is written next to the input, using the input's name with
    its extension replaced by ``.wav``.

    Args:
        input_path: Path of the audio file to convert.

    Returns:
        Path of the WAV file.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    # splitext only looks at the final path component, so a dot in a directory
    # name (e.g. ".../user@mail.com/clip") is never mistaken for an extension.
    stem, _ext = os.path.splitext(input_path)
    output_path = stem + ".wav"

    if output_path == input_path:
        # ffmpeg cannot read and write the same file; the input already is
        # the WAV target, so there is nothing to convert.
        return output_path

    subprocess.run([ffmpeg_path, "-y", "-i", input_path, output_path], check=True)
    return output_path

def load_audio_with_custom_ffmpeg(file: str, sr: int = 16000):
    """Decode an audio file into a mono float32 waveform using ``ffmpeg_path``.

    ffmpeg writes signed 16-bit little-endian PCM, mono, at ``sr`` Hz into a
    temporary file; the bytes are read back and scaled by 1/32768.

    Args:
        file: Path of the audio file to decode.
        sr: Target sample rate in Hz.

    Returns:
        A 1-D ``numpy.float32`` array of samples.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    decode_cmd = [
        ffmpeg_path,
        "-nostdin",
        "-threads", "0",
        "-i", file,
        "-f", "s16le",
        "-ac", "1",
        "-acodec", "pcm_s16le",
        "-ar", str(sr),
        "-",  # write the decoded stream to stdout, which is the temp file
    ]

    with tempfile.NamedTemporaryFile(suffix=".raw") as pcm_file:
        subprocess.run(decode_cmd, stdout=pcm_file, stderr=subprocess.DEVNULL, check=True)
        pcm_file.seek(0)
        pcm_bytes = pcm_file.read()

    samples = np.frombuffer(pcm_bytes, dtype=np.int16)
    return samples.astype(np.float32) / 32768.0


# Replace whisper.audio.load_audio with the function above
# (presumably so Whisper decodes with the imageio-ffmpeg binary — confirm).
whisper.audio.load_audio = load_audio_with_custom_ffmpeg

def main():
    """Download the audio for ``youtube_url``, transcribe it and save the text.

    Steps: report whether the ffmpeg binary exists and is executable, download
    the audio, convert it to WAV, transcribe the WAV with ``model`` and write
    the transcript to ``{workspace_dir}/text.txt``. The downloaded audio and
    the WAV file are deleted afterwards, whether or not the steps succeeded.

    Raises:
        Whatever the download, conversion, transcription or file write raises;
        cleanup never replaces that original exception.
    """
    print("\n========================================")
    print("Check if ffmpeg exists:", os.path.exists(ffmpeg_path))
    print("Check if ffmpeg is executable:", os.access(ffmpeg_path, os.X_OK))

    # Bind both paths before the try block: if an early step raises, the
    # cleanup below must still be able to read them instead of failing with
    # UnboundLocalError and hiding the real error.
    audio_file_path = None
    wav_path = None

    try:
        audio_file_path = download_youtube_audio_yt_dlp(youtube_url)
        print(f"\n✅ Downloaded audio: {audio_file_path}")

        wav_path = convert_to_wav_ffmpeg(audio_file_path)
        print(f"\n✅ Converted to WAV: {wav_path}")

        transcribed_text = model.transcribe(wav_path)["text"]
        print("\n✅ Transcription:")
        print(transcribed_text)

        with open(f"{workspace_dir}/text.txt", "w", encoding="utf-8") as f:
            f.write(transcribed_text)
        print(f"\n✅ Saved transcription to {workspace_dir}/text.txt")

    finally:
        # Best-effort removal of the intermediate files; a failed delete is
        # reported but does not abort the remaining cleanup.
        for leftover in (audio_file_path, wav_path):
            if leftover and os.path.exists(leftover):
                try:
                    os.remove(leftover)
                    print(f"Deleted: {leftover}")
                except Exception as e:
                    print(f"Failed to delete {leftover}: {e}")


# Run the pipeline only when this code executes as the top-level module.
if __name__ == "__main__":
    main() 



Check if ffmpeg exists: True
Check if ffmpeg is executable: True
[youtube] Extracting URL: https://www.youtube.com/watch?v=j6QxHdQ67YU
[youtube] j6QxHdQ67YU: Downloading webpage
[youtube] j6QxHdQ67YU: Downloading tv client config
[youtube] j6QxHdQ67YU: Downloading tv player API JSON
[info] j6QxHdQ67YU: Downloading 1 format(s): 251
[download] Destination: /Workspace/Users/chongjinjye@gmail.com/What is Databricks？ ｜ Introduction to Databricks ｜ Edureka.webm
[download]   0.0% of    6.72MiB at   65.70KiB/s ETA 01:44[download]   0.0% of    6.72MiB at  155.09KiB/s ETA 00:44[download]   0.1% of    6.72MiB at  302.94KiB/s ETA 00:22[download]   0.2% of    6.72MiB at  609.42KiB/s ETA 00:11[download]   0.5% of    6.72MiB at    1.10MiB/s ETA 00:06[download]   0.9% of    6.72MiB at    1.96MiB/s ETA 00:03[download]   1.8% of    6.72MiB at    2.28MiB/s ETA 00:02[download]   3.7% of    6.72MiB at    3.61MiB/s ETA 00:01[download]   7.4% of    6.72MiB at    5.89MiB/s ETA 00:01[download]  14.9

ffmpeg version 7.0.2-static https://johnvansickle.com/ffmpeg/  Copyright (c) 2000-2024 the FFmpeg developers
  built with gcc 8 (Debian 8.3.0-6)
  configuration: --enable-gpl --enable-version3 --enable-static --disable-debug --disable-ffplay --disable-indev=sndio --disable-outdev=sndio --cc=gcc --enable-fontconfig --enable-frei0r --enable-gnutls --enable-gmp --enable-libgme --enable-gray --enable-libaom --enable-libfribidi --enable-libass --enable-libfreetype --enable-libmp3lame --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-librubberband --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libvorbis --enable-libopus --enable-libtheora --enable-libvidstab --enable-libvo-amrwbenc --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libdav1d --enable-libxvid --enable-libzimg
  libavutil      59.  8.100 / 59.  8.100
  libavcodec     61.  3.100 / 61.  3.100
  libavformat    61.  1.100 / 61.  1.100
  libavde


✅ Converted to WAV: /Workspace/Users/chongjinjye@gmail.com/What is Databricks？ ｜ Introduction to Databricks ｜ Edureka.wav

✅ Transcription:
 Imagine you have puzzled with 1000 of pieces and you need to put them together to create a beautiful picture. That's what data can be like. Lots of little pieces that need to be assembled to make sense. Data breaks is like your puzzle solving buddy that helps you to put all those pieces together. Hello and welcome back to our YouTube channel. If you are joining us for the first time, don't forget to hit the subscribe button and the bell icon so you won't miss out any of our exciting content. And also I will suggest you to take up the Apache Spark training course if you are interested in this topic. The link is present in the description below. Now let's start with the topic of our video. What is data breaks? But wait, before that first we need to move on to data breaks. Data breaks were founded by the creators of Apache Spark which is an open sou

In [0]:
# End the notebook run here, passing "done" to notebook.exit().
# NOTE(review): `dbutils` is not defined in any visible cell — presumably the
# global provided by the Databricks runtime; confirm.
dbutils.notebook.exit("done")