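This notebook takes an MP3 audio file uploaded through Gradio, sends it to OpenAI's Whisper API for transcription, summarizes the transcript with a locally loaded, 4-bit quantized Llama 3.1 8B Instruct model, and then displays the summary as Markdown in Gradio.  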

In [8]:
# Install the needed stuff

!pip install -q requests torch gradio bitsandbytes transformers sentencepiece accelerate openai httpx==0.27.2

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.2/57.2 MB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.4/320.4 kB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.8/94.8 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m103.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.2/73.2 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.3/62.3 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [9]:
# imports

import os
import requests
from IPython.display import Markdown, display, update_display
from openai import OpenAI
from google.colab import drive
from huggingface_hub import login
from google.colab import userdata
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig
import torch
import gradio as gr

In [3]:
# Constants

# Model name passed to the OpenAI transcription call in transcribe_audio().
AUDIO_MODEL = "whisper-1"
# Hugging Face Hub repo id used to load the tokenizer and the summarization model.
SUMMARIZE_MODEL = "meta-llama/Meta-Llama-3.1-8B-Instruct"

In [4]:
# Sign in to HuggingFace Hub and OpenAI

# Read the Hugging Face token from the Colab secret store and log in to the Hub.
# NOTE(review): presumably required because the Llama repo is gated - confirm.
hf_token = userdata.get('HF_TOKEN')
login(hf_token, add_to_git_credential=True)

# Build the OpenAI client from the key kept in the Colab secret store.
# The client instance is bound to the global name `openai`, which
# transcribe_audio() reads; renaming it here requires updating that function.
openai_api_key = userdata.get('OPENAI_API_KEY')
openai = OpenAI(api_key=openai_api_key)

In [5]:
# Load the tokenizer and the quantized summarization model

# bitsandbytes settings: 4-bit NF4 weights with double quantization enabled
# and bfloat16 as the compute dtype.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4"
)

tokenizer = AutoTokenizer.from_pretrained(SUMMARIZE_MODEL)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# The model may take a few minutes to download if this is a first run.
# device_map="auto" leaves placement of the weights to the library.
model = AutoModelForCausalLM.from_pretrained(
    SUMMARIZE_MODEL,
    device_map="auto",
    quantization_config=quant_config
)

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

In [10]:
# Transcribe the audio file.

def transcribe_audio(audio_path, progress=gr.Progress()):
    """Convert the audio file at ``audio_path`` to text with the OpenAI API.

    Args:
        audio_path: Path of the audio file to open in binary mode and upload.
        progress: Gradio progress reporter; updated once before the request.

    Returns:
        The value returned by the transcription endpoint (requested with
        ``response_format="text"``), or a string beginning with
        ``"Error during transcription: "`` if anything raised along the way.
    """
    progress(0.3, desc="Converting Audio To Text...")

    try:
        with open(audio_path, "rb") as mp3_stream:
            request_kwargs = dict(
                model=AUDIO_MODEL,
                file=mp3_stream,
                response_format="text",
            )
            text = openai.audio.transcriptions.create(**request_kwargs)
    except Exception as err:
        # Failures are reported in-band as a message so the caller can show
        # them in the UI instead of crashing the request.
        return f"Error during transcription: {str(err)}"
    else:
        return text

In [15]:
# Generate the summary

def generate_summary(transcription, model, tokenizer, progress=gr.Progress(), max_new_tokens=2000):
    """Summarize a transcript in Markdown with a chat-tuned causal language model.

    Args:
        transcription: Transcript text to summarize.
        model: Causal LM exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``; must provide a chat template.
        progress: Gradio progress reporter, updated before generation and
            before decoding.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated summary as a string, without the prompt and without
        special tokens.
    """
    progress(0.6, desc="Generating the Summarized Text...")

    system_message = "You are an assistant that produces summaries from transcripts, with an concise topic of conversation, general tone of the audio, general key discussion points and takeaways in markdown."
    user_prompt = f"Below is an extract transcript of a audio recording of a person or people. Please write a summary in markdown, including topic; tone; discussion points; and takeaways.\n{transcription}"

    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_prompt}
    ]

    # add_generation_prompt=True appends the assistant header so the model
    # starts its own reply instead of possibly continuing the user turn.
    # return_dict=True also yields the attention mask, which generate() needs
    # to be unambiguous because the pad token is the same as the EOS token.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)  # follow the model's placement rather than assuming "cuda"

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)

    progress(0.9, desc="Formatting ...")
    # Decode only the newly generated tokens; this avoids string-splitting on
    # header markers, which breaks if the text itself contains such a marker.
    prompt_length = inputs["input_ids"].shape[-1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    return response.strip()

In [17]:
# This drives the whole process and is what Gradio calls after it gets the input.
# It calls the above functions in the right order and returns the summary that Gradio displays.

def process_upload(audio_file, progress=gr.Progress()):
    """Gradio callback: validate the upload, transcribe it, and summarize it.

    Args:
        audio_file: File path supplied by the Gradio audio input, or ``None``
            when nothing was uploaded.
        progress: Gradio progress reporter; it is forwarded to the helper
            functions so every stage reports through the same tracker.

    Returns:
        The Markdown summary on success, otherwise a human-readable message
        describing what went wrong. Exceptions are never propagated.
    """
    progress(0.1, desc="Initializing...")

    if audio_file is None:
        return "Please upload an audio file."

    try:
        # Could add other options like WAV but limiting to MP3
        if not str(audio_file).lower().endswith('.mp3'):
            return "The file must be an MP3 file. Please try again."

        # Run the audio to text conversion...
        transcription = transcribe_audio(audio_file, progress=progress)

        # transcribe_audio() reports failures in-band with this exact prefix.
        # Matching the full prefix (not just "Error") keeps a transcript that
        # merely begins with the word "Error" from being mistaken for one.
        if transcription.startswith("Error during transcription:"):
            return transcription

        # Nothing to summarize (e.g. silent audio); do not prompt the model
        # with an empty transcript.
        if not transcription.strip():
            return "No speech could be transcribed from the audio file. Please try again."

        # Generate the summary
        summary = generate_summary(transcription, model, tokenizer, progress=progress)
        progress(1.0, desc="Finished!")
        return summary

    except Exception as e:
        return f"Error processing file: {str(e)}"

In [18]:
# Create Gradio interface

# One audio input, handed to process_upload as a file path, and one Markdown
# output that renders whatever string process_upload returns (the summary or
# an error message).
interface = gr.Interface(
    fn=process_upload,
    inputs=gr.Audio(type="filepath", label="Upload MP3 File", format="mp3"),
    outputs=gr.Markdown(label="Summary", min_height=60),
    title="Audio To Text Summarizer",
    description="Upload an MP3 recording with human spoken words to get an AI-generated summary. This process may take a few minutes.",
    flagging_mode="never"
)

In [19]:
# Launch Gradio interface
# NOTE(review): in Colab this is served through a public share link (see the
# cell output), so anyone holding the URL can trigger transcription calls made
# with the configured OpenAI key.

interface.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://0b89778849cdaaf757.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


