In [None]:
import yt_dlp
from pytubefix import Search
import whisper
import torch
import os

In [None]:
def format_time_to_srt(seconds):
    """Convert a time offset in seconds to an SRT timestamp (``HH:MM:SS,mmm``).

    The value is rounded to the nearest whole millisecond *before* it is split
    into fields. Truncating the fractional part after a float subtraction
    loses a millisecond for inputs such as ``1.001`` (whose remainder is
    ``0.000999...``), and rounding up front also lets a carry propagate
    correctly (``59.9996`` becomes ``00:01:00,000``).

    Args:
        seconds: Offset from the start of the media, in seconds (int or
            float). Negative values are clamped to zero.

    Returns:
        str: The timestamp with zero-padded hours, minutes and seconds and a
        three-digit millisecond field separated by a comma.
    """
    # Work in integer milliseconds so every later step is exact arithmetic.
    total_ms = max(0, int(round(float(seconds) * 1000)))
    total_seconds, milliseconds = divmod(total_ms, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02}:{minutes:02}:{secs:02},{milliseconds:03}"

In [None]:
def transcribe_with_whisper(audio_path, model, language="en"):
    """Transcribe an audio file with Whisper and save the result as an SRT file.

    Args:
        audio_path: Path of the audio file to transcribe.
        model: A loaded Whisper model; only its ``transcribe`` method is used.
        language: Language code forwarded to ``model.transcribe``. Defaults to
            ``"en"``.

    Returns:
        str: Path of the written subtitle file, which is ``audio_path`` with
        its extension replaced by ``_transcription.srt``.
    """
    result = model.transcribe(
        audio_path,
        language=language,
        task="transcribe",
        fp16=torch.cuda.is_available(),  # Half precision is requested whenever CUDA is available
    )

    # Build one SRT cue per segment: index line, time-range line, text line.
    cues = []
    for index, segment in enumerate(result["segments"], 1):
        start_time_srt = format_time_to_srt(segment["start"])
        end_time_srt = format_time_to_srt(segment["end"])
        # Trim surrounding whitespace so stray spaces or newlines in a segment
        # cannot introduce padding or extra blank lines inside a cue.
        text = segment["text"].strip()
        cues.append(f"{index}\n{start_time_srt} --> {end_time_srt}\n{text}")

    # Cues are separated by exactly one blank line; no trailing blank line.
    transcription_path = f"{os.path.splitext(audio_path)[0]}_transcription.srt"
    # Explicit UTF-8 so non-ASCII text does not depend on the platform default
    # encoding when the file is written.
    with open(transcription_path, "w", encoding="utf-8") as f:
        f.write("\n\n".join(cues))

    print(f"SRT transcription generated with Whisper for {audio_path}.")
    return transcription_path


In [None]:
def _safe_filename(title):
    """Return ``title`` as a file-name stem that is safe for paths and yt-dlp templates."""
    # Path separators and Windows-reserved characters would break the output
    # path; '%' would be read by yt-dlp as the start of a template field.
    unsafe = '<>:"/\\|?*%'
    cleaned = "".join(
        "_" if (ch in unsafe or ch.isspace() or ord(ch) < 32) else ch
        for ch in title
    )
    return cleaned or "untitled"


def download_audio_and_transcription(query, num_videos=1):
    """Search YouTube, then download audio and a transcription for the top results.

    For each of the first ``num_videos`` search results the audio is downloaded
    with yt-dlp and converted to mp3 by its ``FFmpegExtractAudio``
    post-processor. If the video exposes a caption track whose code is ``'en'``
    it is saved as SRT; otherwise the audio is transcribed with Whisper.

    Files are written to the current working directory as
    ``<title>.mp3`` and ``<title>_transcription.srt``.

    Args:
        query: Search string passed to ``pytubefix.Search``.
        num_videos: Maximum number of search results to process. Values below
            zero are treated as zero.

    Returns:
        None. Progress is reported with ``print``.
    """
    # Search for the video
    search = Search(query)
    videos = search.videos  # Use .videos instead of .results
    if not videos:
        print("No videos found for query.")
        return

    # The Whisper model is loaded on first use, so nothing is loaded when
    # every video already has English captions.
    model = None
    processed = 0

    # Clamp so a negative count cannot slice from the end of the list.
    for video in videos[:max(num_videos, 0)]:
        video_url = video.watch_url
        video_title = _safe_filename(video.title)

        # Download the audio; the post-processor yields "<title>.mp3".
        audio_path = f"{video_title}.mp3"
        ydl_opts = {
            'format': 'bestaudio/best',
            'outtmpl': f"{video_title}.%(ext)s",  # Adjust the template to avoid double extensions
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '192',
            }],
            'quiet': True,
        }

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            print(f"Downloading audio for {video_title}...")
            ydl.download([video_url])

        # Check for YouTube transcription
        # NOTE(review): only an exact 'en' code is matched; other English
        # variants or auto-generated tracks may use a different code - confirm
        # against pytubefix before widening this.
        captions = video.captions
        english = None
        if captions:
            english = next((track for track in captions if track.code == 'en'), None)

        if english is not None:
            # The content is SRT, so use the same ".srt" suffix as the Whisper path.
            transcription_path = f"{video_title}_transcription.srt"
            with open(transcription_path, "w", encoding="utf-8") as f:
                f.write(english.generate_srt_captions())
            print(f"Downloaded YouTube transcription for {video_title}.")
        else:
            if captions:
                print(f"No English transcription available for {video_title}, generating with Whisper...")
            else:
                print(f"No transcription available for {video_title}, generating with Whisper...")
            if model is None:
                device = "cuda" if torch.cuda.is_available() else "cpu"
                print(f"Using device: {device}")
                model = whisper.load_model("base", device=device)
            transcribe_with_whisper(audio_path, model)

        print(f"Audio and transcription saved for {video_title}.")
        processed += 1

    # Report what was actually processed, which may be fewer than requested.
    print(f"Processed {processed} videos.")

In [None]:
# Example run: search YouTube and process the top results for the query.
query = "mac mini"
num_videos = 3  # How many search results to download and transcribe
download_audio_and_transcription(query=query, num_videos=num_videos)