In [1]:
# install dependencies
!pip install -q --upgrade torch==2.5.1+cu124 torchvision==0.20.1+cu124 torchaudio==2.5.1+cu124 --index-url https://download.pytorch.org/whl/cu124
!pip install -q requests bitsandbytes==0.46.0 transformers==4.48.3 accelerate==1.3.0 openai

In [2]:
# Import libraries
from dotenv import load_dotenv
import os
import requests
from IPython.display import Markdown, display, update_display
from openai import OpenAI
import anthropic
from google import genai
# from google.colab import drive
from huggingface_hub import login
#from google.colab import userdata
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig, TextIteratorStreamer
import torch
import gradio as gr
import time
import threading
from datetime import datetime
import queue

### Environment Setup
I prefer to put my API Keys setups together in the beginning of the notebook, easier to reuse.

In [3]:
# Set up API keys and sign it to services if they exist
# Comment out the ones you're not using.

load_dotenv(override=True)
openai_api_key = os.getenv('OPENAI_API_KEY')
anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')
google_api_key = os.getenv('GOOGLE_API_KEY')
weather_api_key = os.getenv('WEATHER_API_KEY')
hf_api_key = os.getenv('HF_API_KEY')

if openai_api_key:
    print(f"OpenAI API Key exists and begins {openai_api_key[:8]}")
    openai = OpenAI(api_key=openai_api_key)
else:
    print("OpenAI API Key not set")
    
if anthropic_api_key:
    print(f"Anthropic API Key exists and begins {anthropic_api_key[:7]}")
    claude = anthropic.Anthropic()
else:
    print("Anthropic API Key not set")

if google_api_key:
    print(f"Google API Key exists and begins {google_api_key[:8]}")
    gemini =  genai.Client(api_key=google_api_key)
    #ollama_via_openai = OpenAI(base_url='http://localhost:11434/v1', api_key="ollama")
else:
    print("Google API Key not set")

if weather_api_key:
    print(f"Weather API Key exists and begins {weather_api_key[:7]}")
else:
    print("Weather API Key not set")

if hf_api_key:
    print(f"HuggingFace API Key exists and begins {weather_api_key[:7]}")
    login(hf_api_key, add_to_git_credential=True)
else:
    print("HuggingFace API Key not set")

OpenAI API Key exists and begins sk-proj-
Anthropic API Key exists and begins sk-ant-
Google API Key exists and begins AIzaSyB5
Weather API Key exists and begins 51c3669
HuggingFace API Key exists and begins 51c3669


In [4]:
# Constants

AUDIO_MODEL = "whisper-1"
LLAMA = "meta-llama/Meta-Llama-3.1-8B-Instruct"

## Audio Summarizer with Gradio UI and real time status updates
1. **Status Updates with Timers**
- Uses gr.State implicitly through the class instance
- Yielding from the processing function to update status in real-time
- Timer shows elapsed seconds for each processing step

2. **Real-time Status Display**
- Status box shows current operation and elapsed time
- Different status messages for transcription vs summarization
- Error handling with appropriate status messages

3. **File Validation**
- Checks if file exists and isn't too large
- Prevents multiple simultaneous processing attempts
- Shows file information when uploaded

4. **Streaming Summary Output**
- Summary appears in Markdown format after processing
- Can be updated progressively if needed

In [5]:
class StreamingAudioSummarizer:
    def __init__(self):
        self.processing = False
        self.start_time = None
        self.status_queue = queue.Queue()
        self.current_step = ""

    def speech_to_text(self, audio_file_path):
        """ Use an AI speech-to-text model AUDIO_MODEL to transcribe the audio. """
        self.current_step = "transcription"
        audio_file = open(audio_file_path, "rb")
        transcription = openai.audio.transcriptions.create(model=AUDIO_MODEL, file=audio_file, response_format="text")
        return transcription

    def summarize_text_streaming(self, text):
        """ Generator that yields streaming summary results. 
        We use a HuggingFace model here stored in LLAMA"""
        self.current_step = "summarization"
       
        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_quant_type="nf4"
        )
        
        system_message = "You are an assistant that produces minutes of meetings from transcripts, with summary, key discussion points, takeaways and action items with owners, in markdown."
        user_prompt = f"""Below is an extract transcript of a meeting. Please write minutes in markdown, including a summary with attendees, 
        location and date; discussion points; takeaways; and action items with owners.\n{text}"""
        messages = [
           {"role": "system", "content": system_message},
           {"role": "user", "content": user_prompt}
        ]
        # Tokenize the input, pass it to the model and  and stream the model response.
        tokenizer = AutoTokenizer.from_pretrained(LLAMA)
        tokenizer.pad_token = tokenizer.eos_token
        inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to("cuda")
        model = AutoModelForCausalLM.from_pretrained(LLAMA, device_map="auto", quantization_config=quant_config)
        streamer = TextIteratorStreamer(
            tokenizer, 
            skip_prompt=True, 
            skip_special_tokens=True
        )
        # Use threading for the generator
        def generate():
            model.generate(
                inputs, 
                max_new_tokens=2000, 
                streamer=streamer,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )  
        thread = threading.Thread(target=generate)
        thread.start()
        result=""
        for new_text in streamer:
            result += new_text or ""
            yield result
        thread.join()

    def process_audio_with_streaming(self, audio_file):
        """Process audio with streaming status and summary"""
        if audio_file is None:
            yield "❌ No audio file selected", ""
            return
        
        if self.processing:
            yield "⚠️ Already processing a file. Please wait.", ""
            return
        
        # Validate file
        if not os.path.exists(audio_file):
            yield f"❌ File not found: {audio_file}", ""
            return
        
        file_size = os.path.getsize(audio_file)
        if file_size > 100 * 1024 * 1024:  # 100MB limit
            yield f"❌ File too large: {file_size/(1024*1024):.1f}MB (max 100MB)", ""
            return
        
        self.processing = True
        self.start_time = time.time()

        try:
            # Step 1: Speech-to-text
            yield "🎯 Starting transcription...", "*Preparing to transcribe audio...*"
            
            # Define progress callback to update the UI during transcription
            def transcription_progress(step):
                elapsed = int(time.time() - self.start_time)
                yield f"🎙️ Transcribing audio - {elapsed} seconds (step {step}/8)", "*Transcribing audio content...*"
            
            # Use threading to run transcription while updating progress
            transcription_result = None
            transcription_error = None
            
            def run_transcription():
                nonlocal transcription_result, transcription_error
                try:
                    transcription_result = self.speech_to_text(audio_file)
                except Exception as e:
                    transcription_error = e
            
            # Start transcription in a separate thread
            transcription_thread = threading.Thread(target=run_transcription)
            transcription_thread.start()
            
            # Update progress while transcription is running
            while transcription_thread.is_alive():
                elapsed = int(time.time() - self.start_time)
                yield f"🎙️ Transcribing audio - {elapsed} seconds", "*Transcribing audio content...*"
                time.sleep(1)
            
            # Wait for thread to complete
            transcription_thread.join()
            
            # Check for errors
            if transcription_error:
                raise transcription_error

            transcription_time = int(time.time() - self.start_time)
            
            yield f"✅ Transcription complete - {transcription_time} seconds", "*Transcription finished. Starting summarization...*"
            
            # Step 2: Streaming Summarization
            yield "🤖 Starting AI summarization...", "*Connecting to AI model...*"
            
            # Stream the summary as it's generated
            for partial_summary in self.summarize_text_streaming(transcription_result):
                elapsed = int(time.time() - self.start_time)
                status = f"📝 Generating summary - {elapsed} seconds"
                yield status, partial_summary
            
            # Final results
            total_time = int(time.time() - self.start_time)
            final_status = f"✅ Processing complete! Total time: {total_time} seconds"
            
            yield final_status, partial_summary

        except Exception as e:
            error_msg = f"❌ Error processing audio: {str(e)}"
            yield error_msg, f"**Error occurred during processing:**\n\n{str(e)}"
        
        finally:
            self.processing = False
            self.current_step = ""
    
    def cancel_processing(self):
        """Cancel current processing"""
        if self.processing:
            self.processing = False
            return "🛑 Processing cancelled"
        return "No processing to cancel"


# Instantiate the class
summarizer = StreamingAudioSummarizer()


Define CSS Stylesheet for streaming interface

In [6]:
streaming_css = """
.status-box {
    font-family: 'Courier New', monospace;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    border-radius: 10px;
    padding: 15px;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}

.summary-box {
    background: #f8f9fa;
    border-radius: 10px;
    padding: 20px;
    box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
    min-height: 400px;
}

.process-button {
    background: linear-gradient(45deg, #FF6B6B, #4ECDC4);
    border: none;
    color: white;
    font-weight: bold;
    transition: all 0.3s ease;
}

.cancel-button {
    background: linear-gradient(45deg, #ff7675, #fd79a8);
    border: none;
    color: white;
    font-weight: bold;
}

.streaming-indicator {
    animation: pulse 2s infinite;
}

@keyframes pulse {
    0% { opacity: 1; }
    50% { opacity: 0.5; }
    100% { opacity: 1; }
}
"""

In [7]:
# Create the streaming interface
with gr.Blocks(title="🎵 Streaming Audio Summarizer", css=streaming_css) as interface:
    gr.Markdown("""
    # 🎵 Streaming Audio Summarizer
    
    Upload an audio file and watch the AI-generated summary appear in real-time as it's being created!
    
    ✨ **Features:**
    - 🎙️ Real-time transcription progress
    - 📝 Streaming AI summarization
    - ⏱️ Live processing timers
    - 🛑 Cancel processing anytime
    """)
    
    with gr.Row():
        with gr.Column(scale=1):
            # File upload
            audio_input = gr.File(
                label="📁 Upload Audio File",
                file_types=[".mp3", ".wav", ".m4a", ".flac", ".ogg", ".aac"],
                type="filepath"
            )
            
            # Control buttons
            with gr.Row():
                process_btn = gr.Button(
                    "🚀 Start Processing", 
                    variant="primary",
                    size="lg",
                    elem_classes="process-button"
                )
                cancel_btn = gr.Button(
                    "🛑 Cancel",
                    variant="secondary",
                    elem_classes="cancel-button"
                )
            
            # Status display
            status_box = gr.Textbox(
                label="📊 Processing Status",
                value="Ready to process audio file",
                interactive=False,
                lines=4,
                elem_classes="status-box"
            )
            
            # Progress indicator
            progress_bar = gr.HTML(
                value="<div style='text-align: center; color: #666;'>Upload a file to begin</div>"
            )
        
        with gr.Column(scale=2):
            # Streaming summary output
            summary_output = gr.Markdown(
                value="*🤖 AI-generated summary will stream here in real-time...*",
                elem_classes="summary-box"
            )
    
    # File info section
    with gr.Accordion("📋 Processing Details", open=False):
        file_details = gr.Markdown("*Upload a file to see processing details*")
    
    # Event handlers
    def show_file_details(file_path):
        if file_path:
            try:
                size = os.path.getsize(file_path)
                name = os.path.basename(file_path)
                ext = os.path.splitext(file_path)[1].lower()
                
                return f"""
                ### File Information
                - **Name**: {name}
                - **Size**: {size/(1024*1024):.2f} MB
                - **Format**: {ext.upper()}
                - **Path**: `{file_path}`
                
                ### Processing Pipeline
                1. **Speech-to-Text**: Convert audio to text using AI transcription
                2. **AI Summarization**: Generate intelligent summary using language models
                3. **Streaming Output**: Results appear in real-time as they're generated
                """
            except:
                return "❌ Could not read file information"
        return "*Upload a file to see processing details*"
    
    def update_progress(processing_status):
        if "complete" in processing_status.lower():
            return "<div style='color: #27ae60; font-weight: bold;'>✅ Processing Complete!</div>"
        elif "error" in processing_status.lower():
            return "<div style='color: #e74c3c; font-weight: bold;'>❌ Processing Error</div>"
        elif "processing" in processing_status.lower() or "transcribing" in processing_status.lower() or "generating" in processing_status.lower():
            return "<div style='color: #f39c12; font-weight: bold;' class='streaming-indicator'>🔄 Processing in Progress...</div>"
        else:
            return "<div style='color: #666;'>Ready to process</div>"
    
    # Wire up events
    audio_input.change(
        show_file_details,
        inputs=audio_input,
        outputs=file_details
    )
    
    process_btn.click(
        summarizer.process_audio_with_streaming,
        inputs=audio_input, 
        outputs=[status_box, summary_output]
    )
    
    cancel_btn.click(
        summarizer.cancel_processing,
        outputs=status_box
    )
    
    # Update progress indicator based on status
    status_box.change(
        update_progress,
        inputs=status_box,
        outputs=progress_bar
    )

In [None]:
# Launch the interface
interface.launch(
    debug=True,
    share=False
)

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
