In [1]:
import os
import json
from dotenv import load_dotenv
from openai import OpenAI
import gradio as gr
from IPython.display import Image, display
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig
import torch
import gc
from google.colab import userdata
from huggingface_hub import login

In [2]:
# Read the Hugging Face access token from the Colab secret store (`userdata`
# comes from google.colab) and authenticate with the Hub.
# add_to_git_credential=True asks huggingface_hub to also hand the token to
# the git credential helper.
hf_token = userdata.get('HF_TOKEN')
login(hf_token, add_to_git_credential=True)

In [3]:
# Read the OpenAI API key from the Colab secret store.
openai_api_key = userdata.get('OPENAI_API_KEY')

# NOTE: `openai` is bound to a client *instance* here, not to the openai
# module. The functions below call openai.audio.transcriptions.create and
# openai.chat.completions.create on this shared object.
openai = OpenAI(api_key=openai_api_key)

In [None]:
def get_transcription(audio_file_path, model="gpt-4o-mini-transcribe"):
    """Transcribe an audio file to text with the OpenAI transcription API.

    Args:
        audio_file_path: Path of the audio file to upload.
        model: Name of the transcription model to request. Defaults to
            "gpt-4o-mini-transcribe", the model this function previously
            hard-coded.

    Returns:
        The value returned by ``openai.audio.transcriptions.create`` when
        called with ``response_format="text"``.

    Raises:
        OSError: If ``audio_file_path`` cannot be opened for reading.
    """
    print("get_transcription called")
    # The context manager guarantees the file handle is closed even when the
    # API call raises; the previous version never closed it.
    with open(audio_file_path, "rb") as audio_file:
        return openai.audio.transcriptions.create(
            model=model,
            file=audio_file,
            response_format="text",
        )

In [5]:
# Tool (function-calling) schema describing get_transcription. It is a plain
# Python dict, not a file; get_response passes it to the chat completions API
# inside its `tools` list. The single required argument is the audio file path.
function_description = {
    "name": "get_transcription",
    "description": "Transcribes an audio file to text using OpenAI's transcription model.",
    "parameters": {
        "type": "object",
        "properties": {
            "audio_file_path": {
                "type": "string",
                "description": "The path to the audio file to be transcribed."
            }
        },
        "required": ["audio_file_path"]
    }
}

In [6]:
def summarize_transcription(transcription_text):
    """Request a concise summary of a transcript from the chat model.

    Args:
        transcription_text: The transcript to condense.

    Returns:
        The ``content`` of the first choice's message in the chat
        completion response.
    """
    print("summarize_transcription called")

    system_turn = {
        "role": "system",
        "content": "You are a helpful assistant that summarizes transcriptions.",
    }
    user_turn = {
        "role": "user",
        "content": f"Summarize the following transcription in a concise manner:\n\n{transcription_text}",
    }

    completion = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=[system_turn, user_turn],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [7]:
# Tool (function-calling) schema describing summarize_transcription. It is a
# plain Python dict, not a file; get_response passes it to the chat
# completions API inside its `tools` list. The single required argument is
# the transcript text.
summary_function_description = {
    "name": "summarize_transcription",
    "description": "Summarizes a given transcription text using OpenAI's GPT model.",
    "parameters": {
        "type": "object",
        "properties": {
            "transcription_text": {
                "type": "string",
                "description": "The transcription text to be summarized."
            }
        },
        "required": ["transcription_text"]
    }
}

In [8]:
def get_response(user_input, audio_path=None):
    """Answer a user message, optionally enriched with an audio transcript.

    When ``audio_path`` is given, the file is transcribed first and the
    transcript is appended to the user message. The message is then sent to
    the chat model together with the two tool schemas. Every tool call the
    model requests for a known tool is executed and its labelled result is
    collected; the results are returned joined by blank lines. A reply with
    exactly one tool call therefore looks the same as before
    ("Transcription: ..." or "Summary: ...").

    Args:
        user_input: Text typed by the user.
        audio_path: Optional path to an audio file to transcribe up front.

    Returns:
        The joined tool results when at least one known tool was called,
        otherwise the model message's ``content``.

    Raises:
        json.JSONDecodeError: If a handled tool call carries malformed JSON
            arguments.
        KeyError: If a handled tool call omits its expected argument.
    """
    print("get_response called")
    if audio_path:
        transcription = get_transcription(audio_path)
        user_input = f"{user_input}\n\nAudio transcription: {transcription}"

    tools = [
        {"type": "function", "function": function_description},
        {"type": "function", "function": summary_function_description}
    ]

    response = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": user_input}
        ],
        tools=tools,
        tool_choice="auto"
    )

    message = response.choices[0].message

    # Tool name -> (callable, name of its single argument, label in the reply).
    handlers = {
        "get_transcription": (get_transcription, "audio_file_path", "Transcription"),
        "summarize_transcription": (summarize_transcription, "transcription_text", "Summary"),
    }

    # Collect the result of every requested tool call. The previous version
    # returned from inside the loop, so any call after the first was dropped.
    results = []
    for tool_call in message.tool_calls or []:
        handler = handlers.get(tool_call.function.name)
        if handler is None:
            # Unknown tool name: nothing to run for this call.
            continue
        tool_fn, arg_name, label = handler
        arguments = json.loads(tool_call.function.arguments)
        results.append(f"{label}: {tool_fn(arguments[arg_name])}")

    if results:
        return "\n\n".join(results)

    return message.content

In [9]:
# Input/output widgets for the OpenAI-backed chatbot.
message_input = gr.Textbox(label="Enter your message")
# type="filepath": the uploaded audio is handed to the callback as a path,
# which is what get_response forwards to get_transcription.
audio_input = gr.Audio(sources=["upload"], type="filepath", label="Upload an audio file")
output_box = gr.Textbox(label="Response", lines=10)

# Wire the widgets to get_response; the two inputs map positionally onto its
# (user_input, audio_path) parameters.
view = gr.Interface(
    fn=get_response,
    inputs=[message_input, audio_input],
    outputs=[output_box],
    title="Multimodal Chatbot",
    description="A chatbot that can process text and audio inputs using OpenAI's function calling feature.",
    flagging_mode="never"
)
# Start the app; the captured output shows Colab serving it via a public share link.
view.launch()


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://2d720247a72c0a45c1.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [9]:
!pip install -U bitsandbytes accelerate

import gc
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
# Release unreachable Python objects and cached CUDA memory before loading
# the model.
gc.collect()
torch.cuda.empty_cache()

# Tokenizer for the Mistral instruct checkpoint; the EOS token is reused as
# the padding token.
tokenizer_hf = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
tokenizer_hf.pad_token = tokenizer_hf.eos_token

# Load the same checkpoint quantized to 4 bits (NF4 with double quantization,
# bfloat16 compute dtype). device_map="auto" leaves device placement to the
# library.
model_hf = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2",
    device_map="auto",
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4"
    )
)

def get_response_hf(user_input, audio_file=None):
    """Answer a user message with the local Hugging Face model.

    When ``audio_file`` is given, it is transcribed with get_transcription
    and the transcript is appended to the user message before generation.

    Args:
        user_input: Text typed by the user.
        audio_file: Optional path to an audio file to transcribe first.
            Defaults to None, matching get_response's optional audio
            argument.

    Returns:
        The newly generated text only (the prompt is not echoed back), with
        surrounding whitespace stripped.
    """
    print("get_response_hf called")
    if audio_file:
        transcription = get_transcription(audio_file)
        user_input = f"{user_input}\n\nAudio transcription: {transcription}"

    system_message = """
You are a helpful assistant that can process text and audio inputs.
"""
    # NOTE(review): some Mistral chat templates reject a "system" role;
    # confirm the template shipped with this checkpoint accepts it.
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_input}
    ]

    # return_dict=True yields input_ids plus attention_mask, so generate()
    # does not have to guess the mask while pad and EOS share one token.
    # The tensors go to the model's own device instead of a hard-coded "cuda",
    # because the model was loaded with device_map="auto".
    inputs = tokenizer_hf.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model_hf.device)

    outputs = model_hf.generate(
        **inputs,
        max_new_tokens=2000,
        pad_token_id=tokenizer_hf.pad_token_id,
    )

    # generate() returns prompt + continuation; slice off the prompt so the
    # user does not see their own message and the system text in the reply.
    prompt_length = inputs["input_ids"].shape[-1]
    new_tokens = outputs[0][prompt_length:]
    decoded_output = tokenizer_hf.decode(new_tokens, skip_special_tokens=True)

    return decoded_output.strip()



tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [10]:
# Input/output widgets for the Hugging Face-backed chatbot. These rebind the
# same names used by the earlier OpenAI interface cell.
message_input = gr.Textbox(label="Enter your message")
# type="filepath": the uploaded audio is handed to the callback as a path,
# which is what get_response_hf forwards to get_transcription.
audio_input = gr.Audio(sources=["upload"], type="filepath", label="Upload an audio file")
output_box = gr.Textbox(label="Response", lines=10)

# Wire the widgets to get_response_hf; the two inputs map positionally onto
# its (user_input, audio_file) parameters.
view = gr.Interface(
    fn=get_response_hf,
    inputs=[message_input, audio_input],
    outputs=[output_box],
    title="Multimodal Chatbot",
    description="A chatbot that can process text and audio inputs using Hugging Face.",
    flagging_mode="never"
)
# Start the app; the captured output shows Colab serving it via a public share link.
view.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://1d0bd2afab21eea8d9.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


