In [1]:
!pip install yt_dlp
!pip install pytubefix
!pip install git+https://github.com/openai/whisper.git
!sudo apt install ffmpeg

Collecting yt_dlp
  Downloading yt_dlp-2024.11.4-py3-none-any.whl.metadata (172 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/172.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.1/172.1 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading yt_dlp-2024.11.4-py3-none-any.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m44.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: yt_dlp
Successfully installed yt_dlp-2024.11.4
Collecting pytubefix
  Downloading pytubefix-8.3.2-py3-none-any.whl.metadata (6.8 kB)
Downloading pytubefix-8.3.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.2/84.2 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pytubefix
Successfully installed pytubefix-8.3.2
Collecting git+https://github.com/openai/whisper.git
  Cloning https://git

In [2]:
import yt_dlp
from pytubefix import Search, YouTube
import whisper
import torch
import os
import re

In [3]:
# Select the compute device: the GPU when PyTorch reports CUDA, else the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Load the Whisper "turbo" checkpoint onto the selected device. The resulting
# `model` is a notebook-level global that later cells read.
model = whisper.load_model("turbo", device=device)

Using device: cuda


100%|█████████████████████████████████████| 1.51G/1.51G [00:24<00:00, 67.2MiB/s]
  checkpoint = torch.load(fp, map_location=device)


In [22]:
def format_timestamp(seconds: float) -> str:
    """Convert a time offset in seconds to an SRT timestamp (HH:MM:SS,mmm).

    The value is rounded to the nearest millisecond once, then split into
    fields with integer arithmetic. Deriving the millisecond field from
    ``seconds - int(seconds)`` and truncating loses a millisecond to float
    representation error (e.g. 1.001 would render as ``00:00:01,000``).

    Args:
        seconds: Offset from the start of the media, in seconds. Negative
            values are clamped to zero because SRT cannot express them.

    Returns:
        The timestamp string, e.g. ``"01:01:01,500"`` for ``3661.5``.
    """
    # Single rounding step; everything after this is exact integer math, so a
    # value like 59.9996 carries cleanly into the next minute.
    total_ms = max(0, int(round(seconds * 1000)))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millisecs = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millisecs:03d}"

In [5]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# google.colab is only importable inside a Colab runtime.
# NOTE(review): no cell visible in this notebook reads from or writes to
# /content/drive (outputs go to relative "audio/", "YT_captions/" and
# "whisper_captions/" directories) -- confirm whether the mount is still needed
# or whether the working directory is meant to be changed to Drive first.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [26]:
def transcribe_with_whisper(video_title, model, language="en"):
    """Transcribe ``audio/<video_title>.wav`` and save the result as an SRT file.

    Args:
        video_title: File stem shared by the input audio
            (``audio/<video_title>.wav``) and the output subtitle file.
        model: Object exposing ``transcribe(path, ...)`` whose result contains
            a ``"segments"`` list of dicts with ``"start"``, ``"end"`` and
            ``"text"`` keys. In this notebook that is the loaded Whisper model.
        language: Language code forwarded to ``model.transcribe``.

    Returns:
        The path of the written file, ``whisper_captions/<video_title>.srt``.
    """
    result = model.transcribe(
        f"audio/{video_title}.wav",
        language=language,
        task="transcribe",
        fp16=torch.cuda.is_available(),  # Use float16 if on GPU
        verbose=False
    )

    # Each segment carries its start/end offsets (seconds) and its text.
    segments = result["segments"]

    # One SRT cue per segment: index line, time range line, text line.
    srt_parts = []
    for i, segment in enumerate(segments, 1):
        start = format_timestamp(segment["start"])
        end = format_timestamp(segment["end"])
        text = segment["text"].strip()
        srt_parts.append(f"{i}\n{start} --> {end}\n{text}\n")

    # Cues are separated by a blank line; strip so the file does not end with
    # an extra blank line after the last cue.
    srt_transcription = "\n".join(srt_parts).strip()

    # Create the output directory here so the function also works when called
    # on its own; otherwise open() fails only after the transcription has
    # already been computed.
    os.makedirs("whisper_captions", exist_ok=True)
    transcription_path = f"whisper_captions/{video_title}.srt"
    with open(transcription_path, "w", encoding="utf-8") as f:
        f.write(srt_transcription)

    print(f"SRT transcription generated with Whisper for {video_title}.")
    return transcription_path

In [16]:
def download_audio_and_transcription(query, num_videos=1, language="en",
                                     min_minutes=5, max_minutes=20):
    """Search YouTube, download audio as WAV and save an SRT transcript per video.

    For each search result whose length is within ``[min_minutes, max_minutes]``
    the audio is downloaded to ``audio/<title>.wav``. If the video has captions
    whose code equals ``language`` they are written to
    ``YT_captions/<title>.srt``; otherwise the audio is transcribed with
    ``transcribe_with_whisper`` into ``whisper_captions/<title>.srt``.

    A video whose audio AND a transcript already exist is skipped. A video
    whose audio exists but has no transcript (for example because an earlier
    run was interrupted during transcription) is not downloaded again but is
    still transcribed, so re-running the cell resumes unfinished work.

    Relies on the notebook-level global ``model`` for the Whisper fallback.

    Args:
        query: Search string passed to ``Search``.
        num_videos: Number of videos to process before stopping.
        language: Caption language code to look for; also passed to Whisper.
        min_minutes: Videos shorter than this many minutes are skipped.
        max_minutes: Videos longer than this many minutes are skipped.

    Returns:
        None. Progress is reported with ``print``.
    """
    # Make sure every output directory exists before any file is written.
    for directory in ("audio", "YT_captions", "whisper_captions"):
        os.makedirs(directory, exist_ok=True)

    # Search for the video
    search = Search(query)
    videos = search.videos  # Use .videos instead of .results
    if not videos:
        print("No videos found for query.")
        return

    processed_count = 0  # number of processed videos
    i = 0  # index into the search results

    # Iterate until we process the required number of videos
    while processed_count < num_videos:
        if i >= len(videos):  # search results exhausted
            print("Not enough videos found, stopping after processing.")
            break

        video = videos[i]
        i += 1  # advance now so every `continue` below moves to the next result

        # NOTE(review): `video` already exposes watch_url/title/captions; a
        # second YouTube object is kept here only because the original code
        # read length/title from it -- confirm whether `video.length` suffices.
        yt = YouTube(video.watch_url)
        video_url = video.watch_url
        # Replace every non-word character so the title is usable as a file name
        video_title = re.sub(r"[^\w]", "_", video.title)

        # Filter on length first, before preparing any download options.
        video_length = yt.length / 60  # seconds -> minutes
        if video_length < min_minutes or video_length > max_minutes:
            print(f"Skipping video '{yt.title}' (Length: {int(video_length)} minutes)")
            continue

        audio_path = f"audio/{video_title}.wav"
        yt_caption_path = f"YT_captions/{video_title}.srt"
        whisper_caption_path = f"whisper_captions/{video_title}.srt"

        if os.path.exists(audio_path):
            if os.path.exists(yt_caption_path) or os.path.exists(whisper_caption_path):
                # Fully processed on an earlier run.
                print(f"Audio for '{video_title}' already exists. Skipping download.")
                continue
            # Audio is present but no transcript was ever written: reuse the
            # audio and fall through to the transcription step.
            print(f"Audio for '{video_title}' already exists without a transcription. "
                  "Skipping download.")
        else:
            ydl_opts = {
                'format': 'bestaudio/best',
                'outtmpl': f"audio/{video_title}.%(ext)s",  # Adjust the template to avoid double extensions
                'postprocessors': [{
                    'key': 'FFmpegExtractAudio',
                    'preferredcodec': 'wav',
                    'preferredquality': '192',
                }],
                'quiet': True,
            }
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                print(f"Downloading audio for {video_title}...")
                ydl.download([video_url])

        # Prefer captions published on YouTube; fall back to Whisper.
        captions = video.captions
        if captions:
            for lang in captions:
                if lang.code == language:
                    transcript = lang.generate_srt_captions()
                    transcription_path = yt_caption_path
                    with open(transcription_path, "w", encoding="utf-8") as f:
                        f.write(transcript)
                    print(f"Downloaded YouTube transcription for {video_title}.")
                    break
            else:
                # Loop finished without `break`: captions exist, but none in
                # the requested language.
                print(f"No language transcription available for {video_title}, generating with Whisper...")
                transcription_path = transcribe_with_whisper(video_title, model, language)
        else:
            print(f"No transcription available for {video_title}, generating with Whisper...")
            transcription_path = transcribe_with_whisper(video_title, model, language)

        print(f"Audio and transcription saved for {video_title}.")
        processed_count += 1

    print(f"Processed {processed_count} videos.")

In [30]:
# Example run: search YouTube, then download and transcribe matching videos.
query = "توك شو"  # YouTube search string
num_videos = 15  # how many videos to download and transcribe
language = "ar"  # caption language code to look for; also passed to the Whisper fallback
download_audio_and_transcription(query, num_videos=num_videos, language=language)

Skipping video 'مشهد كوميدي بين افتكار وإبتكار من توك شو' (Length: 3 minutes)
Skipping video 'كيف تسوي توك شو ناجح...' (Length: 31 minutes)
Downloading audio for توك_شوو_6x02___السوق_والحريم...
No language transcription available for توك_شوو_6x02___السوق_والحريم, generating with Whisper...


100%|██████████| 103306/103306 [03:41<00:00, 465.97frames/s]


SRT transcription generated with Whisper for توك_شوو_6x02___السوق_والحريم.
Audio and transcription saved for توك_شوو_6x02___السوق_والحريم.
Downloading audio for توك_شو_كابتن_خالد_فطوطة_ودكتور_مصطفى_عنده___كامل...
No transcription available for توك_شو_كابتن_خالد_فطوطة_ودكتور_مصطفى_عنده___كامل, generating with Whisper...


 98%|█████████▊| 89276/90655 [03:25<00:03, 435.03frames/s]
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-31-be3673adb257>", line 5, in <cell line: 5>
    download_audio_and_transcription(query, num_videos, language)
  File "<ipython-input-16-c915a9fa8734>", line 80, in download_audio_and_transcription
    transcription_path = transcribe_with_whisper(video_title, model, language)
  File "<ipython-input-26-7e8ad189a7ba>", line 3, in transcribe_with_whisper
    result = model.transcribe(
  File "/usr/local/lib/python3.10/dist-packages/whisper/transcribe.py", line 293, in transcribe
    result: DecodingResult = decode_with_fallback(mel_segment)
  File "/usr/local/lib/python3.10/dist-packages/whisper/transcribe.py", line 201, in decode_with_fallback
    decode_result = model.decode(segment, options)
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line

TypeError: object of type 'NoneType' has no len()