In [None]:
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, List
import logging
import yt_dlp
from pytubefix import Search
import whisper
import torch
import re

In [None]:
@dataclass
class VideoInfo:
    """Small record describing one video: its title, URL and a file-safe title."""

    # Title exactly as reported by the source video object.
    title: str
    # Watch URL taken from the source video object.
    url: str
    # Title with every character other than word characters, '-' and '_'
    # replaced by '_'; used as the stem of generated file names.
    safe_title: str

    @classmethod
    def from_video(cls, video) -> 'VideoInfo':
        """Build a VideoInfo from any object exposing ``title`` and ``watch_url``.

        Args:
            video: Source object; only its ``title`` and ``watch_url``
                attributes are read.

        Returns:
            A new VideoInfo whose ``safe_title`` is derived from the title.
        """
        original_title = video.title
        sanitized = re.sub(r'[^\w\-_]', '_', original_title)
        return cls(original_title, video.watch_url, sanitized)

In [None]:
class YouTubeTranscriber:
    """Search YouTube, download audio and write one SRT transcript per video.

    For every search hit the audio track is downloaded as MP3 with yt-dlp.
    The transcript is taken from the video's own caption track when one whose
    code equals ``language`` exists; otherwise it is generated with Whisper.
    All files are written to ``output_dir`` and named after the video's
    ``safe_title``.
    """

    def __init__(
        self,
        output_dir: str = "outputs",
        whisper_model: str = "base",
        language: str = "en",
        device: Optional[str] = None
    ):
        """Prepare the output directory, the Whisper model and yt-dlp options.

        Args:
            output_dir: Directory for downloaded audio and SRT files; created
                (including missing parents) if it does not exist.
            whisper_model: Model name passed to ``whisper.load_model``.
            language: Language code used both to pick a caption track and as
                the Whisper transcription language.
            device: Device for Whisper. When omitted, "cuda" is used if
                ``torch.cuda.is_available()`` and "cpu" otherwise.
        """
        self.output_dir = Path(output_dir)
        # parents=True so a nested output directory does not fail on first use.
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.language = language
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")

        # Initialize whisper model
        self.model = whisper.load_model(whisper_model, device=self.device)

        # Configure logging
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)

        # Configure yt-dlp options. 'outtmpl' here is only a default;
        # _download_audio overrides it per video so the output file name is
        # predictable.
        self.ydl_opts = {
            'format': 'bestaudio/best',
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '192',
            }],
            'quiet': True,
            'outtmpl': str(self.output_dir / '%(title)s.%(ext)s'),
        }

    def _format_timestamp(self, seconds: float) -> str:
        """Convert seconds to SRT timestamp format (HH:MM:SS,mmm).

        The value is rounded to whole milliseconds before being split into
        fields, so binary floating-point error cannot shave a millisecond off
        (2.3 s is ``00:00:02,300``, not ``,299``). Negative input is clamped
        to zero.
        """
        total_ms = max(0, int(round(float(seconds) * 1000)))
        hours, remainder = divmod(total_ms, 3_600_000)
        minutes, remainder = divmod(remainder, 60_000)
        secs, millisecs = divmod(remainder, 1000)
        return f"{hours:02d}:{minutes:02d}:{secs:02d},{millisecs:03d}"

    def _create_srt_content(self, segments: List[dict]) -> str:
        """Generate SRT formatted content from transcript segments.

        Args:
            segments: Dicts with ``"start"`` and ``"end"`` (seconds) and
                ``"text"`` keys.

        Returns:
            Numbered SRT cues (starting at 1) separated by a blank line, or an
            empty string when ``segments`` is empty.
        """
        srt_parts = []
        for i, segment in enumerate(segments, 1):
            start = self._format_timestamp(segment["start"])
            end = self._format_timestamp(segment["end"])
            text = segment["text"].strip()
            srt_parts.append(f"{i}\n{start} --> {end}\n{text}\n")
        return "\n".join(srt_parts)

    def _download_audio(self, video_info: "VideoInfo") -> Path:
        """Download the video's audio as MP3 and return the file path.

        The file is written as ``<output_dir>/<safe_title>.mp3``. Naming the
        download after ``safe_title`` (instead of the raw title) is what makes
        the result locatable: the raw title usually contains spaces or
        punctuation that ``safe_title`` has replaced.

        Raises:
            FileNotFoundError: If the expected MP3 is missing after yt-dlp
                finishes.
        """
        self.logger.info(f"Downloading audio for: {video_info.title}")

        # '%' introduces a field in yt-dlp output templates, so escape any
        # literal ones in the stem.
        stem = video_info.safe_title.replace("%", "%%")
        opts = {
            **self.ydl_opts,
            'outtmpl': str(self.output_dir / f"{stem}.%(ext)s"),
        }
        with yt_dlp.YoutubeDL(opts) as ydl:
            ydl.download([video_info.url])

        # The FFmpegExtractAudio post-processor is configured for mp3 output.
        audio_file = self.output_dir / f"{video_info.safe_title}.mp3"
        if not audio_file.exists():
            raise FileNotFoundError(
                f"Downloaded audio not found at expected path: {audio_file}"
            )
        return audio_file

    def _get_youtube_captions(self, video) -> Optional[str]:
        """Attempt to get YouTube's own captions.

        Returns:
            SRT text of the first caption track whose ``code`` equals
            ``self.language``, or None when the video has no captions or no
            track matches.
        """
        if not video.captions:
            return None

        for caption in video.captions:
            if caption.code == self.language:
                return caption.generate_srt_captions()
        return None

    def _transcribe_with_whisper(self, audio_path: Path) -> str:
        """Generate an SRT transcription of ``audio_path`` using Whisper."""
        self.logger.info(f"Generating transcription with Whisper for: {audio_path.name}")

        result = self.model.transcribe(
            str(audio_path),
            language=self.language,
            task="transcribe",
            # Follow the device the model was actually loaded on, not merely
            # whether CUDA exists, so an explicit device="cpu" is respected.
            fp16=str(self.device).startswith("cuda")
        )

        return self._create_srt_content(result["segments"])

    def process_query(self, query: str, num_videos: int = 1) -> None:
        """Process YouTube search query and generate transcriptions.

        For each of the first ``num_videos`` results: download the audio, use
        the video's own captions when available and Whisper otherwise, then
        write ``<safe_title>.srt`` (UTF-8) to the output directory. A failure
        on one video is logged and does not stop the remaining videos.

        Args:
            query: Search text passed to ``Search``.
            num_videos: Maximum number of search results to process.
        """
        self.logger.info(f"Processing query: {query} for {num_videos} videos")

        # Search for videos
        search = Search(query)
        videos = search.videos
        if not videos:
            self.logger.error("No videos found for query")
            return

        # Process each video
        for video in videos[:num_videos]:
            # Fallback label: the error handler must not rely on video_info,
            # which is unbound (or left over from the previous video) if
            # building it is what failed.
            label = "<unknown video>"
            try:
                video_info = VideoInfo.from_video(video)
                label = video_info.title
                self.logger.info(f"Processing video: {video_info.title}")

                # Download audio.
                # NOTE(review): this happens even when captions exist, so the
                # MP3 is always kept; move below the caption check if the
                # audio file itself is not wanted.
                audio_path = self._download_audio(video_info)

                # Try to get YouTube captions first
                transcription = self._get_youtube_captions(video)
                source = "YouTube"

                # If no YouTube captions, use Whisper
                if not transcription:
                    transcription = self._transcribe_with_whisper(audio_path)
                    source = "Whisper"

                # Save transcription; explicit UTF-8 so non-ASCII text does
                # not depend on the platform's default encoding.
                srt_path = self.output_dir / f"{video_info.safe_title}.srt"
                srt_path.write_text(transcription, encoding="utf-8")
                self.logger.info(f"Saved {source} transcription to: {srt_path}")

            except Exception as e:
                self.logger.error(f"Error processing video {label}: {str(e)}")
                continue


In [None]:
def main():
    """Example run: transcribe the top three hits for a sample search query."""
    # Build the transcriber with the demo settings and run the query in one
    # go; outputs are written to the "youtube_outputs" directory.
    YouTubeTranscriber(
        output_dir="youtube_outputs",
        whisper_model="base",
        language="en",
    ).process_query("machine learning basics", num_videos=3)

In [None]:
# Entry-point guard: run the example only when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    main()