<a href="https://colab.research.google.com/github/HossamSaoud/MachineLearning_Notebooks/blob/main/Minutes_Meeting_Generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q requests torch bitsandbytes transformers sentencepiece accelerate openai httpx==0.27.2

In [2]:
# imports

import os
import requests
from IPython.display import Markdown, display, update_display
from openai import OpenAI
from google.colab import drive
from huggingface_hub import login
from google.colab import userdata
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig
import torch

In [3]:
# Read the 'HF_TOKEN' entry from Colab's userdata and pass it to
# huggingface_hub.login so later model downloads are authenticated.
hf_token = userdata.get('HF_TOKEN')
login(hf_token)

In [4]:
# Model identifiers used further down in the notebook:
#   AUDIO_MODEL - passed as `model=` to the OpenAI transcription call in process_audio().
#   GEMMA2      - checkpoint name handed to load_model().
AUDIO_MODEL = "whisper-1"
GEMMA2 = "google/gemma-2-2b-it"

In [5]:
# Read the 'OPENAI_API_KEY' entry from Colab's userdata and build the OpenAI client.
# The client is bound to the module-level name `openai`, which process_audio() uses for transcription.
openai_api_key = userdata.get('OPENAI_API_KEY')
openai = OpenAI(api_key=openai_api_key)

In [6]:
# audio_file = open(audio_filename, "rb")
# transcript = openai.audio.transcriptions.create(model=AUDIO_MODEL, file=audio_file)

In [7]:
#print(transcript.text)

In [8]:
# system_message ="You are an assistant that produces minutes of meetings from transcripts, with summary, key discussion points, takeaways and action items with owners, in markdown."
 #user_prompt = f"Below is an extract transcript of a Denver council meeting. Please write minutes in markdown, including a summary with attendees, location and date; discussion points; takeaways; and action items with owners.\n{transcript}"


In [9]:
# Shared chat history: a list of {"role": ..., "content": ...} dicts that the
# generate_full_response / generate_stream_* helpers append to via `global messages`.
messages =[]

In [10]:
#Helper function to load model and tokenizer
def load_model(model_name):
    """Load the tokenizer and a 4-bit quantized causal LM for ``model_name``.

    Args:
        model_name: Checkpoint identifier passed to both ``from_pretrained`` calls.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    # 4-bit NF4 weights with double quantization; float16 is the compute dtype.
    four_bit_settings = {
        "load_in_4bit": True,
        "bnb_4bit_compute_dtype": torch.float16,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_use_double_quant": True,
    }
    quant_config = BitsAndBytesConfig(**four_bit_settings)

    # NOTE(review): trust_remote_code=True is kept from the original; confirm it is needed for this checkpoint.
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quant_config,
        trust_remote_code=True,
    )
    return tokenizer, model

In [11]:
def generate_full_response(tokenizer, model, user_input, max_tokens1=1000):
  """Generate a complete (non-streamed) reply to ``user_input`` and print it.

  The user message is appended to the module-level ``messages`` history, the
  whole history is rendered through the tokenizer's chat template, and the
  decoded reply is appended back to ``messages`` as the assistant turn.

  Args:
    tokenizer: Object providing ``apply_chat_template`` and ``decode``.
    model: Object providing ``generate`` and a ``device`` attribute.
    user_input: Text of the new user message.
    max_tokens1: Maximum number of new tokens to generate.

  Returns:
    None. The reply is printed and recorded in ``messages``.
  """
  global messages
  messages.append({"role": "user", "content": user_input})

  # Place the prompt on the model's own device instead of assuming "cuda".
  inputs = tokenizer.apply_chat_template(
      messages, return_tensors="pt", add_generation_prompt=True
  ).to(model.device)

  # Honour the caller's budget; the previous code read an unrelated global
  # named `max_tokens` and silently ignored this parameter.
  outputs = model.generate(inputs, max_new_tokens=max_tokens1)

  # The generated sequence starts with the prompt ids; slice them off so only
  # the new reply (without special tokens) is stored as the assistant turn.
  new_token_ids = outputs[0][inputs.shape[-1]:]
  response = tokenizer.decode(new_token_ids, skip_special_tokens=True)

  messages.append({"role": "assistant", "content": response})
  print(response)

  # Drop the tensors created here before asking CUDA to release cached blocks.
  # (The caller still owns `tokenizer` and `model`; deleting the local names
  # would not free them.)
  del inputs, outputs, new_token_ids
  torch.cuda.empty_cache()

In [12]:
def generate_stream_low_level(tokenizer, model, user_input, max_tokens=1000):
  """Greedy-decode a reply one token at a time, printing each token as it appears.

  The user message is appended to the module-level ``messages`` history. Each
  step calls the model on the whole sequence so far and picks the arg-max token
  from the last position. Decoding stops at an end-of-sequence token or after
  ``max_tokens`` steps, and the finished reply is appended to ``messages``.

  Args:
    tokenizer: Object providing ``apply_chat_template``, ``decode`` and ``eos_token_id``.
    model: Callable returning an object with ``.logits``; must expose ``device``.
    user_input: Text of the new user message.
    max_tokens: Maximum number of tokens to generate.

  Returns:
    None. The reply is printed incrementally and recorded in ``messages``.
  """
  global messages
  messages.append({"role": "user", "content": user_input})

  # Open the assistant turn with add_generation_prompt rather than appending an
  # empty assistant message, which would be rendered as an already finished turn.
  input_ids = tokenizer.apply_chat_template(
      messages, return_tensors="pt", add_generation_prompt=True
  ).to(model.device)

  # Stop on the tokenizer's EOS id and on any EOS ids listed in the model's
  # generation config, if it has one (presumably an int or a list of ints).
  stop_ids = {tokenizer.eos_token_id}
  extra_eos = getattr(getattr(model, "generation_config", None), "eos_token_id", None)
  if isinstance(extra_eos, (list, tuple)):
    stop_ids.update(extra_eos)
  elif extra_eos is not None:
    stop_ids.add(extra_eos)

  response = ""
  with torch.no_grad():  # inference only: do not record autograd state per step
    for _ in range(max_tokens):
      outputs = model(input_ids)
      # Shape (1, 1): arg-max over the vocabulary at the last position.
      next_token_id = outputs.logits[:, -1].argmax(dim=-1).unsqueeze(-1)
      if next_token_id.item() in stop_ids:
        break  # do not print or store the end-of-sequence marker
      # Both tensors are 2-D, so they can be concatenated along the sequence axis.
      input_ids = torch.cat([input_ids, next_token_id], dim=-1)
      next_token = tokenizer.decode(next_token_id[0])
      print(next_token, end="", flush=True)
      response += next_token
  print()

  # Record the reply once, after the loop has finished.
  messages.append({"role": "assistant", "content": response})
  del input_ids
  torch.cuda.empty_cache()

In [13]:
from transformers import TextIteratorStreamer
import threading
def generate_stream_high_level(tokenizer, model, user_input, max_tokens=1000):
  """Stream a reply to stdout using ``TextIteratorStreamer`` and a background thread.

  The user message is appended to the module-level ``messages`` history,
  ``model.generate`` runs in a worker thread, and decoded text chunks are
  printed as the streamer yields them. The full reply is appended to
  ``messages`` once generation has finished.

  Args:
    tokenizer: Object providing ``apply_chat_template``; also given to the streamer.
    model: Object providing ``generate`` and a ``device`` attribute.
    user_input: Text of the new user message.
    max_tokens: Maximum number of new tokens to generate.

  Returns:
    None. The reply is printed incrementally and recorded in ``messages``.
  """
  global messages
  messages.append({"role": "user", "content": user_input})

  # add_generation_prompt opens the assistant turn so the model answers the user.
  input_ids = tokenizer.apply_chat_template(
      messages, return_tensors="pt", add_generation_prompt=True
  ).to(model.device)

  # Decode options are plain keyword arguments of the streamer; wrapping them
  # in a `decode_kwargs=` dict means skip_special_tokens is never applied.
  streamer = TextIteratorStreamer(
      tokenizer,
      skip_prompt=True,
      skip_special_tokens=True,
  )

  thread = threading.Thread(
      target=model.generate,
      kwargs={
          "inputs": input_ids,  # was the undefined name `inputs`
          "max_new_tokens": max_tokens,
          "streamer": streamer,
      },
  )
  thread.start()

  response = ""
  for text_chunk in streamer:
    # Kept from the original as a belt-and-braces filter for this marker.
    filtered_chunk = text_chunk.replace("<|eot_id|>", "")
    print(filtered_chunk, end="", flush=True)
    response += filtered_chunk
  print()

  # The streamer is exhausted, so generation is done; reap the worker thread.
  thread.join()

  messages.append({"role": "assistant", "content": response})
  del input_ids
  torch.cuda.empty_cache()



In [14]:
# Load the Gemma 2 tokenizer and quantized model once, bound to module-level names;
# generate_stream_optimized() reads `tokenizer` and `model` as globals.
tokenizer, model = load_model(GEMMA2)

`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [15]:
#generate_full_response(tokenizer,model,user_prompt)



In [16]:
# del tokenizer, model
# torch.cuda.empty_cache()


In [17]:
# tokenizer, model = load_model(GEMMA2)

In [18]:
#generate_stream_low_level(tokenizer,model,user_prompt)

In [19]:
#generate_stream_high_level(tokenizer,model,user_prompt)

In [20]:
# def generate_stream(user_input):
#   global tokenizer,model,messages,max_tokens
#   messages.append({"role": "user", "content" : user_input})
#   input_ids = tokenizer.apply_chat_template(messages,return_tensors="pt").to("cuda")
#   result=""

#   for _ in range(max_tokens):
#     outputs = model(inputs_id)
#     next_token_id = outputs.logits[: ,-1].argmax(dim=-1).unsqueeze(-1)
#     input_ids = torch.cat([input_ids,next_token_id[0]],dim=-1)
#     next_token = tokenizer.decode(next_token_id[0],skip_special_tokens=True)
#     result+=next_token
#     yield result
#     if next_token_id.item() == tokenizer.eos_token_id:
#       break

#     messages.append({"role":"assistant", "content": "result"})

In [21]:
# Generation budget read as a global by generate_stream_optimized(),
# where it is passed to model.generate as `max_new_tokens`.
max_tokens = 1000

In [26]:
def process_audio(audio_filename):
  """Transcribe an audio file and stream meeting minutes generated from it.

  Args:
    audio_filename: Path of the audio file to transcribe. A falsy value
      (``None`` or ``""``) yields a short hint instead of raising.

  Yields:
    The progressively growing reply produced by ``generate_stream_optimized``.
  """
  # NOTE(review): the Gradio Audio input presumably passes None when Send is
  # pressed without an upload; open(None) would raise, so answer with a hint.
  if not audio_filename:
    yield "Please upload an audio file first."
    return

  # The context manager closes the file handle as soon as the upload is done.
  with open(audio_filename, "rb") as audio_file:
    transcript = openai.audio.transcriptions.create(model=AUDIO_MODEL, file=audio_file)

  user_prompt = f"Below is an extract transcript of a Denver council meeting. Please write minutes in markdown, including a summary with attendees, location and date; discussion points; takeaways; and action items with owners.\n{transcript.text}"

  # Every upload starts a fresh conversation.
  history = []
  yield from generate_stream_optimized(user_prompt, history)



In [27]:
# optimize the streaming function for gradio (using TextIteratorStreamer)
def generate_stream_optimized(user_input, history):
  """Stream a model reply for ``user_input``, yielding the text accumulated so far.

  Intended for UI callbacks: every yield is the whole reply up to that point,
  so the caller can simply replace what it displays. Reads the module-level
  ``tokenizer``, ``model`` and ``max_tokens``.

  Args:
    user_input: Text of the new user message.
    history: Earlier messages as a list of ``{"role", "content"}`` dicts. The
      list is not modified; a new list is built from it for this call.

  Yields:
    The accumulated reply text after each streamed chunk.
  """
  global tokenizer, model, max_tokens

  # Build this call's conversation without mutating the caller's list.
  conversation = history + [{"role": "user", "content": user_input}]

  # Render the conversation with the chat template and place it on the model's
  # own device instead of assuming "cuda".
  inputs = tokenizer.apply_chat_template(
      conversation, return_tensors="pt", add_generation_prompt=True
  ).to(model.device)

  # TextIteratorStreamer hands chunks to this generator; TextStreamer would
  # write to stdout, which is not useful in a Gradio app.
  # Decode options are plain keyword arguments of the streamer; wrapping them
  # in a `decode_kwargs=` dict means skip_special_tokens is never applied.
  streamer = TextIteratorStreamer(
      tokenizer,
      skip_prompt=True,          # do not echo the prompt back
      skip_special_tokens=True,  # drop special tokens while decoding
  )

  # Run generation in the background so chunks can be consumed as they arrive.
  thread = threading.Thread(
      target=model.generate,
      kwargs={
          "inputs": inputs,
          "max_new_tokens": max_tokens,
          "streamer": streamer,
      },
  )
  thread.start()

  accumulated_reply = ""
  for text_chunk in streamer:
    # Kept from the original as a belt-and-braces filter for this marker.
    accumulated_reply += text_chunk.replace("<|eot_id|>", "")
    yield accumulated_reply

  # The streamer is exhausted, so generation is done; reap the worker thread.
  thread.join()

In [24]:
 !pip install gradio



In [28]:
import gradio as gr


# # Gradio interface
# interface = gr.Interface(
#     fn=process_audio,  # Function to process the uploaded audio
#     inputs=gr.Audio(type="filepath", label="Upload Audio"),  # Audio input
#     outputs = [gr.Textbox(gr.Markdown(label = "result"))],  # Markdown supports dynamic content updates
#     title="Audio File Uploader",
#     description="Upload an audio file and press Submit to get a streamed response.",
#     flagging_mode= "never"
# )

# # Launch the interface
# interface.launch(debug=True)

# Gradio UI: an audio upload, a Markdown pane for the result, and a Send button
# wired to process_audio (a generator, so the pane is refreshed on every yield).
with gr.Blocks() as demo:
    gr.Markdown("# Chat with AI (Streaming Enabled)")
    with gr.Row(), gr.Column():
        user_input = gr.Audio(type="filepath", label="Upload Audio")
        output_box = gr.Markdown(label="AI Response", min_height=50)
        send_button = gr.Button("Send")

    send_button.click(
        fn=process_audio,
        inputs=user_input,
        outputs=output_box,
    )

# debug=True keeps the cell running so errors and logs stay visible.
demo.launch(debug=True)


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://47feff6fde8f7c884b.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


The 'batch_size' attribute of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'self.max_batch_size' attribute instead.


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://47feff6fde8f7c884b.gradio.live


