In [2]:
import yt_dlp
from pytubefix import Search, YouTube
import whisper
import torch
import os
import re

In [3]:
# Pick the compute device: use the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Load the Whisper "turbo" checkpoint onto the selected device.
model = whisper.load_model("turbo", device=device)

Using device: cuda


  checkpoint = torch.load(fp, map_location=device)


In [4]:
def format_timestamp(seconds: float) -> str:
    """Convert a time offset in seconds to an SRT timestamp (HH:MM:SS,mmm).

    The value is rounded to the nearest millisecond once and then split into
    fields with integer arithmetic. Truncating the fractional part of a float
    instead loses a millisecond on inputs such as 1.001 (stored as
    1.000999...), and rounding up front lets a carry such as 59.9996 ->
    00:01:00,000 propagate through every field.

    Args:
        seconds: Offset from the start of the media, in seconds. Negative
            values are clamped to zero.

    Returns:
        The timestamp as a zero-padded ``HH:MM:SS,mmm`` string.
    """
    # Work in whole milliseconds so no field is derived from a lossy float.
    total_ms = max(0, int(round(float(seconds) * 1000)))
    total_secs, millisecs = divmod(total_ms, 1000)
    total_mins, secs = divmod(total_secs, 60)
    hours, minutes = divmod(total_mins, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millisecs:03d}"

In [6]:
def transcribe_with_whisper(video_title, model, language="en"):
    """Transcribe ``audio/<video_title>.wav`` and save the result as an SRT file.

    Args:
        video_title: File stem of the audio file; also used as the stem of the
            generated ``.srt`` file.
        model: Object exposing ``transcribe(path, ...)`` that returns a mapping
            with a ``"segments"`` list; each segment provides ``"start"``,
            ``"end"`` and ``"text"``.
        language: Language code passed through to ``model.transcribe``.

    Returns:
        Path of the written file, ``whisper_captions/<video_title>.srt``.
    """
    result = model.transcribe(
        f"audio/{video_title}.wav",
        language=language,  # Language requested by the caller
        task="transcribe",
        fp16=torch.cuda.is_available(),  # Request float16 when CUDA is available
        verbose=False,
    )

    # Build one SRT cue per segment: index, time range, then the text.
    srt_parts = []
    for index, segment in enumerate(result["segments"], 1):
        start = format_timestamp(segment["start"])
        end = format_timestamp(segment["end"])
        text = segment["text"].strip()
        srt_parts.append(f"{index}\n{start} --> {end}\n{text}\n")

    # Cues are separated by a blank line; strip so the file does not end with
    # an extra empty line after the last cue.
    srt_transcription = "\n".join(srt_parts).strip()

    # Make sure the output directory exists so the function also works when it
    # is called on its own, not only after the download step created it.
    os.makedirs("whisper_captions", exist_ok=True)
    transcription_path = f"whisper_captions/{video_title}.srt"
    with open(transcription_path, "w", encoding="utf-8") as f:
        f.write(srt_transcription)

    print(f"SRT transcription generated with Whisper for {video_title}.")
    return transcription_path

In [None]:
def download_audio_and_transcription(query, num_videos=1, language="en",
                                     min_minutes=5, max_minutes=15):
    """Search YouTube, then download audio and a transcript for matching videos.

    Search results are walked in order. A result is skipped when its length is
    outside ``[min_minutes, max_minutes]`` or when ``audio/<title>.wav``
    already exists. For each remaining video the audio is downloaded as WAV
    with yt-dlp; if YouTube offers captions whose code equals ``language``
    they are saved to ``YT_captions/``, otherwise the audio is transcribed with
    the notebook-level Whisper ``model`` into ``whisper_captions/``.

    Args:
        query: Search string sent to YouTube.
        num_videos: Number of videos to download and transcribe. Skipped
            videos do not count towards this number.
        language: Caption language code to look for; also passed to Whisper.
        min_minutes: Shortest accepted video length, in minutes.
        max_minutes: Longest accepted video length, in minutes.

    Returns:
        None. Progress is reported with ``print``.
    """
    # Ensure necessary directories exist
    for folder in ("audio", "YT_captions", "whisper_captions"):
        os.makedirs(folder, exist_ok=True)

    # Search for the video
    search = Search(query)
    pending = list(search.videos or [])

    if not pending:
        print("No videos found for query.")
        return

    processed_count = 0  # number of processed videos
    seen_urls = set()    # results already examined, across all result pages
    i = 0                # index into the current batch of unseen results

    # Iterate until we process the required number of videos
    while processed_count < num_videos:
        if i >= len(pending):
            # Current batch exhausted: ask for more results and keep only the
            # ones not examined yet, so earlier results are never re-walked.
            try:
                search.get_next_results()
            except IndexError:
                # NOTE(review): some pytube-family versions appear to raise
                # IndexError when there is no continuation; treated here as
                # "no more results" -- confirm against the installed version.
                pending = []
            else:
                pending = [v for v in search.videos if v.watch_url not in seen_urls]
            i = 0
            if not pending:
                # Nothing new came back: stop instead of looping forever or
                # indexing past the end of the list.
                print("No more search results available.")
                break

        video = pending[i]
        i += 1  # advance now so every `continue` below moves to the next video
        video_url = video.watch_url
        if video_url in seen_urls:
            continue
        seen_urls.add(video_url)

        # Sanitize video title to use as filename
        video_title = re.sub(r"[^\w]", "_", video.title)
        audio_path = f"audio/{video_title}.wav"

        # Check if the video length is within the desired range
        video_length = video.length / 60  # Convert seconds to minutes
        if video_length < min_minutes or video_length > max_minutes:
            print(f"Skipping video '{video.title}' (Length: {int(video_length)} minutes)")
            continue

        # Check if the audio file already exists
        if os.path.exists(audio_path):
            print(f"Audio for '{video_title}' already exists. Skipping download.")
            continue

        ydl_opts = {
            'format': 'bestaudio/best',
            'outtmpl': f"audio/{video_title}.%(ext)s",  # Adjust the template to avoid double extensions
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'wav',
                'preferredquality': '192',
            }],
            'quiet': True,
        }

        # Download the audio
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            print(f"Downloading audio for {video_title}...")
            ydl.download([video_url])

        # Prefer a YouTube caption track in the requested language; fall back
        # to Whisper when there are no captions or none match.
        captions = video.captions
        if captions:
            for lang in captions:
                if lang.code == language:
                    transcript = lang.generate_srt_captions()
                    transcription_path = f"YT_captions/{video_title}.srt"
                    with open(transcription_path, "w", encoding="utf-8") as f:
                        f.write(transcript)
                    print(f"Downloaded YouTube transcription for {video_title}.")
                    break
            else:
                # for/else: runs only when no caption track matched `language`
                print(f"No language transcription available for {video_title}, generating with Whisper...")
                transcribe_with_whisper(video_title, model, language)
        else:
            print(f"No transcription available for {video_title}, generating with Whisper...")
            transcribe_with_whisper(video_title, model, language)

        print(f"Audio and transcription saved for {video_title}.")
        processed_count += 1

    print(f"Processed {processed_count} videos.")

In [12]:
# Usage: settings for one batch run
query = "دروس اونلاين"  # text searched for on YouTube
num_videos = 15  # how many videos to download and transcribe
language = "ar"  # caption language code to look for on YouTube (if available)
download_audio_and_transcription(
    query=query,
    num_videos=num_videos,
    language=language,
)

Skipping video 'كل طرق الربح من الانترنت تقريباً' (Length: 20 minutes)
Skipping video 'سر العلاقات الناجحة والتحرر من الإباحية والوصول للمتانة النفسية| د.عماد رشاد عثمان' (Length: 222 minutes)
Skipping video 'الإباحية والاستمناء والعادة السرية، كيف تحرر نفسك من إدمانهم | بودكاست دروس مع د.محمد عبدالجواد' (Length: 236 minutes)
Skipping video 'G4 / English (lesson 4.5) (writing) Ms.aml ehab' (Length: 43 minutes)
Skipping video 'كيف يتلاعبون بأفكارك' (Length: 16 minutes)
Skipping video 'الفراعنة ليسوا بناة الأهرامات؟ بودكاست دروس مع أحمد عدلي' (Length: 166 minutes)
Skipping video 'تاني أهم ساعة في اليوم' (Length: 16 minutes)
Skipping video 'كيف تنجز في هذا العالم المختل' (Length: 19 minutes)
Skipping video 'كيف تجعل حياتك روتينية ومملة مثلي؟' (Length: 24 minutes)
Skipping video 'كيف تنجح وأنت متخاذل ضعيف الإرادة؟' (Length: 17 minutes)
Skipping video 'ملخص أهم كورس حضرته في حياتي :  تعلم كيف تتعلم!' (Length: 38 minutes)
Audio for 'لماذا_نؤجل_سعادتنا_باستمرار_' already exists. Skipping down

  7%|▋         | 5150/78637 [00:05<01:17, 942.32frames/s] 


KeyboardInterrupt: 