In [1]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import yt_dlp
import os

# Report the installed PyTorch build and the state of its MPS backend so the
# device chosen later in the notebook can be sanity-checked up front.
mps_backend = torch.backends.mps
print(f"PyTorch version: {torch.__version__}")
print(f"MPS available: {mps_backend.is_available()}")
print(f"MPS backend exists: {mps_backend.is_built()}")

PyTorch version: 2.5.1
MPS available: True
MPS backend exists: True


In [2]:
class AudioTranscriber:
    """Download a YouTube video's audio with yt-dlp and transcribe it.

    The transcription is produced by a Hugging Face
    ``automatic-speech-recognition`` pipeline built around the
    ``distil-whisper/distil-large-v2`` checkpoint.

    Attributes set by ``__init__``:
        device: ``"mps"`` when ``torch.backends.mps.is_available()`` is true,
            otherwise ``"cpu"``.
        model: the loaded speech seq2seq model, moved to ``device``.
        processor: the matching processor (tokenizer + feature extractor).
        pipe: the ready-to-call transcription pipeline.
    """

    def __init__(self):
        """Load the model and processor and assemble the transcription pipeline."""
        # Initialize device - use MPS if available
        self.device = "mps" if torch.backends.mps.is_available() else "cpu"
        print(f"Using device: {self.device}")

        # Keep half precision on the accelerator, but fall back to float32 on
        # the CPU, where float16 inference is commonly unsupported or slow.
        torch_dtype = torch.float16 if self.device != "cpu" else torch.float32

        # Load model and processor
        model_id = "distil-whisper/distil-large-v2"

        print("Loading model...")
        self.model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_id,
            torch_dtype=torch_dtype,
            low_cpu_mem_usage=True,
            use_safetensors=True
        )
        self.model.to(self.device)

        print("Loading processor...")
        self.processor = AutoProcessor.from_pretrained(model_id)

        print("Creating pipeline...")
        self.pipe = pipeline(
            "automatic-speech-recognition",
            model=self.model,
            tokenizer=self.processor.tokenizer,
            feature_extractor=self.processor.feature_extractor,
            max_new_tokens=128,
            chunk_length_s=30,
            batch_size=16,
            return_timestamps=True,
            torch_dtype=torch_dtype,
            device=self.device,
        )
        print("Setup complete!")

    @staticmethod
    def _mp3_path_for(downloaded_path):
        """Return ``downloaded_path`` with its extension replaced by ``.mp3``."""
        return os.path.splitext(downloaded_path)[0] + ".mp3"

    def download_audio(self, youtube_url, output_path="audio"):
        """Download audio from YouTube URL

        Parameters:
            youtube_url (str): URL handed to ``yt_dlp.YoutubeDL.extract_info``.
            output_path (str): Directory the audio is written to; created if
                it does not exist.

        Returns:
            str: Path of the extracted ``.mp3`` file.
        """
        # exist_ok avoids the check-then-create race of exists() + makedirs()
        os.makedirs(output_path, exist_ok=True)

        ydl_opts = {
            'format': 'bestaudio/best',
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '192',
            }],
            'outtmpl': f'{output_path}/%(title)s.%(ext)s',
        }

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(youtube_url, download=True)
            # Ask yt-dlp for the filename it actually used: it sanitizes the
            # title when expanding the template, so rebuilding the path from
            # the raw info['title'] can point at a file that does not exist.
            downloaded_path = ydl.prepare_filename(info)

        # The FFmpegExtractAudio postprocessor re-encodes to mp3, so only the
        # extension differs from the name yt-dlp reports.
        return self._mp3_path_for(downloaded_path)

    def transcribe(self, youtube_url):
        """Transcribe audio from YouTube URL

        Downloads the audio, runs the pipeline on it, writes the text to
        ``transcription.txt`` in the current working directory and deletes the
        downloaded audio file.

        Parameters:
            youtube_url (str): URL of the video to transcribe.

        Returns:
            str | None: The transcribed text, or ``None`` if any step failed
            (the error is printed rather than raised).
        """
        audio_path = None
        try:
            # Get notebook's directory
            notebook_dir = os.getcwd()
            print(f"Working directory: {notebook_dir}")

            # Download audio
            print("Downloading audio...")
            audio_path = self.download_audio(youtube_url)

            # Transcribe
            print("Transcribing...")
            result = self.pipe(
                audio_path,
                generate_kwargs={"language": "en", "task": "transcribe"}
            )

            # Save transcription
            output_file = os.path.join(notebook_dir, "transcription.txt")
            with open(output_file, "w") as f:
                f.write(result["text"])
            print(f"\nTranscription saved to: {output_file}")

            return result["text"]

        except Exception as e:
            print(f"Error during transcription: {str(e)}")
            return None

        finally:
            # Clean up audio file even when transcription failed, so aborted
            # runs do not leave downloads behind.
            if audio_path and os.path.exists(audio_path):
                try:
                    os.remove(audio_path)
                except OSError as cleanup_error:
                    print(f"Could not remove {audio_path}: {cleanup_error}")

In [3]:
# Create transcriber instance.
# Constructing AudioTranscriber loads the model and processor and builds the
# pipeline (see the "Loading model..." / "Setup complete!" messages below),
# so it is kept in its own cell and reused by the transcription cell.
transcriber = AudioTranscriber()

Using device: mps
Loading model...
Loading processor...


Device set to use mps


Creating pipeline...
Setup complete!




In [4]:
# Test with a short YouTube video (replace with your URL)
youtube_url = "https://youtu.be/Tn-XvYG9x7w?si=orngdjVQsMOAjJn_"

# transcribe() returns None when any step fails, so the preview is skipped then.
transcription = transcriber.transcribe(youtube_url)

if transcription:
    print("\nTranscription Preview (first 500 characters):")
    preview = transcription[:500]
    # Only mark the preview as truncated when text was actually cut off.
    if len(transcription) > 500:
        preview += "..."
    print(preview)

Working directory: /Users/jonathanlau/Downloads/repos/STT_Local_Mac
Downloading audio...
[youtube] Extracting URL: https://youtu.be/Tn-XvYG9x7w?si=orngdjVQsMOAjJn_
[youtube] Tn-XvYG9x7w: Downloading webpage
[youtube] Tn-XvYG9x7w: Downloading ios player API JSON
[youtube] Tn-XvYG9x7w: Downloading mweb player API JSON
[youtube] Tn-XvYG9x7w: Downloading m3u8 information
[info] Tn-XvYG9x7w: Downloading 1 format(s): 251
[download] audio/Get Abs In 60 Days (Using Science).webm has already been downloaded
[download] 100% of    9.88MiB
[ExtractAudio] Destination: audio/Get Abs In 60 Days (Using Science).mp3
Deleting original file audio/Get Abs In 60 Days (Using Science).webm (pass -k to keep)
Transcribing...


You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, 50259], [2, 50359], [3, 50363]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



Transcription saved to: /Users/jonathanlau/Downloads/repos/STT_Local_Mac/transcription.txt

Transcription Preview (first 500 characters):
 What does your midsection look like right now? For reference, this is what it would look like at 50% body fat. At 40%, your waistline is smaller, but your abs are still buried. At 30%, your stomach is much flatter, but you still don't have visible. Once you get to 20%, that's where your abs become visible. And at 10% body fat, you'll have a well-defined, assuming you've developed your abs through proper training, which we'll get to. At 6% body fat, you'd be truly shredded, lean enough for a pro...


In [5]:
from transformers import pipeline
import os

class TranscriptionSummarizer:
    """Summarize a text file with the ``facebook/bart-large-cnn`` pipeline.

    The text is split into sentence-aligned chunks, each chunk is summarized
    on its own, and the partial summaries are joined into one result that is
    also written next to the input file.
    """

    def __init__(self):
        """Create the summarization pipeline on CUDA, MPS or CPU (in that order)."""
        print("Initializing summarization model...")
        # We'll use facebook/bart-large-cnn as it's optimized for summarization
        self.summarizer = pipeline(
            "summarization",
            model="facebook/bart-large-cnn",
            device=0 if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu")
        )
        print("Summarization model ready!")

    @staticmethod
    def _summary_path_for(file_path):
        """Return the path the summary of ``file_path`` is written to.

        Only the final extension is replaced, so directory names containing
        ``.txt`` are left alone and an input without a ``.txt`` extension is
        never overwritten by its own summary.
        """
        root, _ = os.path.splitext(file_path)
        return root + "_summary.txt"

    def summarize_file(self, file_path, max_length=130, min_length=30):
        """
        Reads a text file and generates a summary.

        Parameters:
            file_path (str): Path to the text file
            max_length (int): Maximum length of the summary in tokens
            min_length (int): Minimum length of the summary in tokens

        Returns:
            str: Generated summary, or None if the file holds no text or an
            error occurred (the error is printed rather than raised)
        """
        try:
            # Read the transcription file
            with open(file_path, 'r') as file:
                text = file.read()

            print(f"Processing text of length: {len(text)} characters")

            # For longer texts, we need to chunk them as BART has a token limit
            # Most models can handle around 1024 tokens at once
            chunks = self._chunk_text(text)
            if not chunks:
                print("No text to summarize.")
                return None

            # Generate summary for each chunk and combine
            summaries = []
            for i, chunk in enumerate(chunks):
                print(f"Summarizing chunk {i+1}/{len(chunks)}...")
                summary = self.summarizer(
                    chunk,
                    max_length=max_length,
                    min_length=min_length,
                    do_sample=False,
                    # Chunks are sized in characters, not tokens; truncate
                    # instead of failing if one still exceeds the model input.
                    truncation=True
                )[0]['summary_text']
                summaries.append(summary)

            # Combine summaries
            final_summary = " ".join(summaries)

            # Save the summary
            summary_path = self._summary_path_for(file_path)
            with open(summary_path, 'w') as f:
                f.write(final_summary)

            print(f"\nSummary saved to: {summary_path}")
            return final_summary

        except Exception as e:
            print(f"Error during summarization: {str(e)}")
            return None

    def _chunk_text(self, text, max_chunk_size=1000):
        """
        Splits text into smaller chunks while trying to maintain sentence integrity.
        This helps handle longer texts that exceed model's token limit.

        Parameters:
            text (str): Text to split
            max_chunk_size (int): Target maximum chunk length in characters;
                a single sentence longer than this becomes its own chunk

        Returns:
            list[str]: Chunks in original order; empty when text contains
            nothing but whitespace
        """
        pieces = text.split('. ')
        last_index = len(pieces) - 1

        sentences = []
        for index, piece in enumerate(pieces):
            piece = piece.strip()
            if not piece:
                continue
            # split() consumed the period of every piece except the last one,
            # which keeps whatever punctuation the text ended with.
            if index < last_index:
                piece += '.'
            sentences.append(piece)

        chunks = []
        current_chunk = []
        current_size = 0

        for sentence in sentences:
            # Count the joining space so chunk lengths match the size tracked.
            added_size = len(sentence) + (1 if current_chunk else 0)
            if current_chunk and current_size + added_size > max_chunk_size:
                chunks.append(' '.join(current_chunk))
                current_chunk = [sentence]
                current_size = len(sentence)
            else:
                current_chunk.append(sentence)
                current_size += added_size

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks

# Build the summarizer (this loads the summarization model)
summarizer = TranscriptionSummarizer()

# The transcription is expected in the current working directory
# (adjust the path if yours lives elsewhere)
transcription_path = os.path.join(os.getcwd(), "transcription.txt")

# Summarize the file; a falsy result means there is nothing to show
summary = summarizer.summarize_file(transcription_path)
if summary:
    print("\nSummary Preview:", summary, sep="\n")

Initializing summarization model...


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use mps


Summarization model ready!
Processing text of length: 9304 characters
Summarizing chunk 1/10...
Summarizing chunk 2/10...
Summarizing chunk 3/10...
Summarizing chunk 4/10...
Summarizing chunk 5/10...
Summarizing chunk 6/10...
Summarizing chunk 7/10...
Summarizing chunk 8/10...
Summarizing chunk 9/10...
Summarizing chunk 10/10...

Summary saved to: /Users/jonathanlau/Downloads/repos/STT_Local_Mac/transcription_summary.txt

Summary Preview:
At 6% body fat, you'd be truly shredded, lean enough for a pro-bodybuilding competition. Once you get to 20%, that's where your abs become visible. Most men want to be between 10 and 20% body tofact. It's popular these days to say that ab training is a waste of time. If you're not lean enough, you won't see the definition, anyway. I'm going to explain how to get your abs to pop regardless of the genetic hand you were dealt by using three science-based tools. If you build up your abdominal muscles through proper hypertrophy training, they will pop more