## Project Overview

### Problem
- Meetings generate **long audio recordings** that few people re-watch.  
- Manual note-taking is slow and often misses **actions, owners, and deadlines**.  
- Teams lose accountability and spend time asking “Who’s doing what?”

### Goal
Create an automated tool that turns any meeting recording into concise, shareable **minutes** that include:
1. Agenda / summary  
2. Discussion points & take-aways  
3. Action items with clear owners  

### Solution Overview
1. **Audio → Text**  
   - Send the recording to a *frontier* speech-to-text API (e.g., OpenAI Whisper) to get a raw transcript.
2. **Text → Minutes**  
   - Feed the transcript to an *open-source* LLM that:
     - Summarises the meeting  
     - Extracts actions, owners, and deadlines
3. **Streaming Output**  
   - Stream the generated minutes back in real time and display them in Markdown.

Result: fully-formatted meeting minutes delivered within minutes of the call—no human note-taker required.  


---

### Dataset

I will use the **MeetingBank_Audio** dataset, which contains audio recordings of meetings:

> https://huggingface.co/datasets/huuuyeah/MeetingBank_Audio/tree/main

I downloaded the "Boston" audio archive (`Boston.zip`) from this dataset.


In [4]:
# Install the CUDA 12.4 builds of torch / torchvision / torchaudio from the PyTorch wheel index.
!pip install -q --upgrade torch==2.5.1+cu124 torchvision==0.20.1+cu124 torchaudio==2.5.1+cu124 --index-url https://download.pytorch.org/whl/cu124
# Install pinned bitsandbytes / transformers / accelerate, plus requests and the openai client.
!pip install -q requests bitsandbytes==0.46.0 transformers==4.48.3 accelerate==1.3.0 openai

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m908.3/908.3 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.3/7.3 MB[0m [31m56.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m94.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m92.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m49.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m104.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━

In [22]:
!pip install -Uq bitsandbytes>=0.43.0 accelerate>=0.27.0 transformers>=4.41.0

In [23]:
import os
import torch
import requests
import argparse
import zipfile, pathlib
from openai import OpenAI
from google.colab import drive
from google.colab import userdata
from huggingface_hub import login
from typing import Callable, Optional, List
from IPython.display import Markdown, display, update_display
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig, pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor

In [24]:
# -----------------------------------------------------------------------------
# Helper class
# -----------------------------------------------------------------------------

class MeetingMinutesGenerator:
  """End-to-end audio -> meeting-minutes generator.

  Transcribes an audio file (OpenAI speech-to-text API or a local Hugging Face
  Whisper checkpoint) and then asks a causal LLM to turn the transcript into
  Markdown minutes.

  Args:
    asr_backend: "openai" uses the OpenAI transcription API; any other value
      uses the local Hugging Face Whisper pipeline.
    audio_model: Model name passed to the OpenAI transcription endpoint.
    openai_api_key: OpenAI key. When omitted and the OpenAI backend is selected,
      it is read from the Colab secret ``OPENAI_API_KEY``.
    hf_whisper_checkpoint: Hugging Face checkpoint for the local Whisper backend.
    llm_model: Hugging Face model id of the summarising LLM.
    hf_token: Hugging Face token. When omitted it is read from the Colab secret
      ``HF_TOKEN``.
    load_in_4bit: Load the LLM with 4-bit bitsandbytes quantisation.
    streaming: Print generated text to stdout while it is being produced.
    device: ``device_map`` passed to ``from_pretrained`` for the LLM.
    stream_callback: Optional callable invoked once by ``run`` with the complete
      minutes text.

  Raises:
    ValueError: If the OpenAI backend is selected and no API key is available.
  """

  # Prompt sent as the user message; ``{transcript}`` is filled in by makeMessages.
  PROMPT_TEMPLATE = (
        "You are an assistant that produces clear meeting minutes.\n"
        "Return Markdown containing:\n\n"
        "**Date & Location**\n"
        "**Attendees**\n"
        "**Summary**\n"
        "**Discussion Points** (bullet list)\n"
        "**Take‑aways** (bullet list)\n"
        "**Action Items** (task, owner, deadline)\n\n"
        "Transcript:\n{transcript}\n"
    )

  def __init__(self,
               # ASR
               asr_backend: str = "openai",
               audio_model: str = "whisper-1",
               openai_api_key: Optional[str] = None,
               hf_whisper_checkpoint: str = "openai/whisper-medium",
               # Summariser
               llm_model: str = "meta-llama/Meta-Llama-3.1-8B-Instruct",
               hf_token: Optional[str] = None,
               load_in_4bit: bool = True,
               streaming: bool = True,
               device: str = "auto",
               # Callback that receives the finished minutes text
               stream_callback: Optional[Callable[[str], None]] = None
               ):
    self.asr_backend = asr_backend
    self.audio_model = audio_model
    self.hf_whisper_checkpoint = hf_whisper_checkpoint

    self.llm_model = llm_model
    self.hf_token = hf_token or userdata.get("HF_TOKEN")   # not colab: os.getenv("HF_TOKEN")
    self.load_in_4bit = load_in_4bit
    self.streaming = streaming
    self.device = device
    self.stream_callback = stream_callback

    # The local Whisper pipeline is built lazily on first use (see transcribe).
    self.whisper_pipe = None

    # Only resolve the OpenAI key when the OpenAI backend is actually used, so the
    # local backend does not depend on an OPENAI_API_KEY secret being present.
    self.openai_api_key = openai_api_key
    self.openai_client = None
    if self.asr_backend == "openai":
      if not self.openai_api_key:
        self.openai_api_key = userdata.get("OPENAI_API_KEY")   # not colab: os.getenv("OPENAI_API_KEY")
      if not self.openai_api_key:
        raise ValueError("OPENAI_API_KEY not set")
      self.openai_client = OpenAI(api_key=self.openai_api_key)

    # Build a 4-bit (NF4, double-quantised, bf16 compute) config when requested.
    if self.load_in_4bit:
      quant_config = BitsAndBytesConfig(
          load_in_4bit=True,
          bnb_4bit_use_double_quant=True,
          bnb_4bit_compute_dtype=torch.bfloat16,
          bnb_4bit_quant_type="nf4"
      )
    else:
      quant_config = None

    self.tokenizer = AutoTokenizer.from_pretrained(self.llm_model, token=self.hf_token)
    # Reuse EOS as the padding token.
    self.tokenizer.pad_token = self.tokenizer.eos_token
    self.model = AutoModelForCausalLM.from_pretrained(
        self.llm_model,
        device_map=self.device,
        quantization_config=quant_config,
        token=self.hf_token
        )

    # When streaming, print only the newly generated text (not the prompt, which
    # contains the whole transcript) and hide special tokens.
    if self.streaming:
      self.streamer = TextStreamer(self.tokenizer, skip_prompt=True, skip_special_tokens=True)
    else:
      self.streamer = None


  # ASR helpers
  def loadWhisper(self) -> None:
    """Load the Hugging Face Whisper checkpoint into ``self.whisper_pipe``."""
    use_cuda = torch.cuda.is_available()
    device = "cuda" if use_cuda else "cpu"
    # Half precision is only used on GPU; fall back to float32 on CPU.
    dtype = torch.float16 if use_cuda else torch.float32

    speech_model = AutoModelForSpeechSeq2Seq.from_pretrained(
        self.hf_whisper_checkpoint,
        torch_dtype=dtype,
        low_cpu_mem_usage=True,
        use_safetensors=True,
        token=self.hf_token
    ).to(device)

    processor = AutoProcessor.from_pretrained(self.hf_whisper_checkpoint, token=self.hf_token)

    self.whisper_pipe = pipeline(
        "automatic-speech-recognition",
        model=speech_model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        torch_dtype=dtype,
        # Split long recordings into 30-second chunks for transcription.
        chunk_length_s=30,
        device=device
        )

  def transcribe(self, audio_path: str) -> str:
    """Return the text transcription of the audio file at ``audio_path``.

    Uses the OpenAI API when ``asr_backend == "openai"``; otherwise runs the local
    Whisper pipeline, loading it on first use.
    """
    if self.asr_backend == "openai":
      with open(audio_path, "rb") as audio_file:
        # With response_format="text" the transcript is returned directly.
        return self.openai_client.audio.transcriptions.create(
            model=self.audio_model,
            file=audio_file,
            response_format="text"
        )

    # Local backend: build the pipeline lazily so callers need not call loadWhisper.
    if getattr(self, "whisper_pipe", None) is None:
      self.loadWhisper()
    result = self.whisper_pipe(audio_path)
    return result["text"]


  # Summarisation prompt helpers
  def makeMessages(self, transcript: str) -> List[dict]:
    """Return the [system, user] chat messages for summarising ``transcript``."""
    system_msg = {
        "role": "system",
        "content": "You are a helpful assistant that writes structured meeting minutes in Markdown."
        }
    user_msg = {
            "role": "user",
            "content": self.PROMPT_TEMPLATE.format(transcript=transcript),
        }

    return [system_msg, user_msg]


  def summarize(self, transcript: str, max_new_tokens: int = 2048) -> str:
    """Return Markdown minutes generated by the LLM for ``transcript``.

    Only the newly generated tokens are decoded, so the returned text does not
    include the prompt or transcript.
    """
    messages = self.makeMessages(transcript)

    # return_dict=True yields input_ids and attention_mask together.
    encoded = self.tokenizer.apply_chat_template(
        messages,
        return_tensors="pt",
        return_dict=True,
        add_generation_prompt=True
    ).to(self.model.device)
    prompt_len = encoded["input_ids"].shape[-1]

    gen_kwargs = {**encoded, "max_new_tokens": max_new_tokens}
    if self.streamer is not None:
      gen_kwargs["streamer"] = self.streamer

    output_ids = self.model.generate(**gen_kwargs)

    # generate() returns prompt + completion; keep only the completion.
    new_token_ids = output_ids[0][prompt_len:]
    return self.tokenizer.decode(new_token_ids, skip_special_tokens=True)


  def run(self, audio_path: str) -> str:
    """Full pipeline: audio -> transcript -> minutes (Markdown).

    If a callable ``stream_callback`` was supplied, it is invoked once with the
    finished minutes before they are returned.
    """
    transcript = self.transcribe(audio_path)
    minutes_md = self.summarize(transcript)
    if callable(self.stream_callback):
      self.stream_callback(minutes_md)
    return minutes_md

In [15]:
# Mount Google Drive at /content/drive so the audio archive stored there can be read.
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
if __name__ == "__main__":
  # Google Drive must already be mounted at /content/drive before this cell runs.

  # 1- Unpack the audio archive from Drive into local storage
  zip_path = "/content/drive/MyDrive/Boston.zip"
  target_dir = pathlib.Path("/content/audio")
  with zipfile.ZipFile(zip_path) as zf:
      zf.extractall(target_dir)

  # 2- Pick the first MP3 (sorted by path so the choice is deterministic)
  mp3_files = sorted(target_dir.rglob("*.mp3"))
  if not mp3_files:
    # Fail with a clear message instead of an opaque IndexError.
    raise FileNotFoundError(f"No .mp3 files found under {target_dir} after extracting {zip_path}")
  audio_file = mp3_files[0]

  # 3- Pull secrets if running in Colab
  OPENAI_KEY = userdata.get("OPENAI_API_KEY")
  HF_TOKEN   = userdata.get("HF_TOKEN")

  # 4- Instantiate the helper
  mm = MeetingMinutesGenerator(
      asr_backend="openai",
      openai_api_key=OPENAI_KEY,
      hf_token=HF_TOKEN,
      llm_model="meta-llama/Meta-Llama-3.1-8B-Instruct",
      load_in_4bit=True,
  )

  # 5- Run end-to-end
  markdown_minutes = mm.run(str(audio_file))

  print("\n\nGenerated Minutes\n-----------------\n")
  print(markdown_minutes)