In [None]:
%pip install git+https://github.com/openai/whisper.git@v20230918
%pip install openai~=0.28.1
%pip install elevenlabs==0.2.21
%pip install gradio~=3.41.0
# Install `llava` to interact with LLaVA 1.5.
%pip install -qq git+https://github.com/haotian-liu/LLaVA.git@v1.1.0

# LLaVA functions

In [None]:
# Define load model and load image functions.
import urllib.request
from functools import lru_cache
from llava.model.builder import load_pretrained_model
from PIL import Image

@lru_cache
def load_llava_model(model_path="radix-ai/llava-v1.5-7b"):
    """Load a LLaVA checkpoint quantized to 8 bit; results are cached per path.

    The default checkpoint is a re-upload of liuhaotian/llava-v1.5-7b with 2GB
    shards: the original has a shard size of 10GB, which causes an out of
    memory error on Colab [1]. Larger versions of this model are also
    available [2].

    [1] https://github.com/haotian-liu/LLaVA/issues/496
    [2] https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md.

    Args:
        model_path: Model identifier handed to `load_pretrained_model`. Its
            last "/"-separated component is used as the model name.

    Returns:
        The tuple `(tokenizer, model, image_processor, context_len)`.
    """
    load_options = {
        "model_path": model_path,
        "model_base": None,
        "model_name": model_path.split("/")[-1],
        "load_8bit": True,  # Quantize to 8 bit to fit on Google Colab's T4 GPU.
        "load_4bit": False,
    }
    tokenizer, model, image_processor, context_len = load_pretrained_model(**load_options)
    return tokenizer, model, image_processor, context_len

@lru_cache
def load_image(image_url):
    """Load an image from a local file path or an http(s) URL as RGB.

    Results are cached per `image_url`, so repeated calls with the same
    argument return the same image object without re-reading the source.

    Args:
        image_url: Path of a local image file, or an `http://` / `https://`
            URL that is first downloaded to a temporary file.

    Returns:
        The image converted to RGB mode.
    """
    image_path = image_url
    if isinstance(image_url, str) and image_url.startswith(("http://", "https://")):
        # Remote image: fetch it to a temporary local file before opening.
        image_path, _ = urllib.request.urlretrieve(image_url)
    # `convert` returns a new, fully loaded image, so the source file can be
    # closed as soon as the conversion is done.
    with Image.open(image_path) as image_file:
        return image_file.convert("RGB")

In [None]:
# Load the model.
# `load_llava_model` is wrapped in `lru_cache`, so re-running this cell reuses
# the already loaded model instead of loading it a second time.
# `llava_model` is the (tokenizer, model, image_processor, context_len) tuple
# that `ask_question` unpacks.
llava_model = load_llava_model()

In [None]:
# Load a local example image and display it as the cell's output.
image = load_image("monopoly.jpeg")
image

In [None]:
# Define model inference function, based on `llava.serve.cli`.
import torch
from llava.conversation import conv_templates, SeparatorStyle
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from llava.mm_utils import tokenizer_image_token, KeywordsStoppingCriteria
from llava.utils import disable_torch_init
from transformers import TextStreamer

def ask_question(model, image, question):
    """Ask LLaVA a single question about an image and return its answer.

    Based on `llava.serve.cli`. The answer is also streamed to stdout while
    it is being generated.

    Args:
        model: The `(tokenizer, model, image_processor, context_len)` tuple
            returned by `load_llava_model`.
        image: The image to ask about; it is passed to
            `image_processor.preprocess` (e.g. the result of `load_image`).
        question: The question text, appended after the image token.

    Returns:
        The generated answer with special tokens and any trailing stop string
        removed, stripped of surrounding whitespace.
    """
    # Unpack model. A distinct local name avoids shadowing the `model` argument.
    tokenizer, llava, image_processor, _context_len = model
    disable_torch_init()
    # Convert image to a half-precision tensor on the GPU.
    image_tensor = image_processor.preprocess(image, return_tensors="pt")["pixel_values"].half().cuda()
    # Generate prompt: one user turn (image token + question) and an empty
    # assistant turn for the model to complete.
    conv = conv_templates["llava_v1"].copy()
    conv.append_message(conv.roles[0], DEFAULT_IMAGE_TOKEN + "\n" + question)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()
    input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).cuda()
    stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
    stopping_criteria = KeywordsStoppingCriteria([stop_str], tokenizer, input_ids)
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    # Inference.
    with torch.inference_mode():
        output_ids = llava.generate(
            input_ids,
            images=image_tensor,
            do_sample=True,
            temperature=0.2,
            max_new_tokens=1024,
            streamer=streamer,
            use_cache=True,
            stopping_criteria=[stopping_criteria]
        )
    # Decode only the newly generated tokens. Special tokens and the stop
    # string are removed so they do not leak into the chat or the speech text.
    answer = tokenizer.decode(output_ids[0, input_ids.shape[1]:], skip_special_tokens=True).strip()
    if stop_str and answer.endswith(stop_str):
        answer = answer[:-len(stop_str)].strip()
    return answer

In [None]:
# Ask for the API keys interactively so they are never stored in the notebook.
import getpass
elevenlabs_pw = getpass.getpass(prompt="ElevenLabs API key: ")
openai_pw = getpass.getpass(prompt="OpenAI API key: ")

In [None]:
# Authenticate with ElevenLabs and OpenAI.
# Both keys come from the `getpass` prompts in the previous cell.
import openai
from elevenlabs import set_api_key
set_api_key(elevenlabs_pw)
openai.api_key = openai_pw

In [None]:
# Load Whisper model.
# `whisper_model` is used by `add_audio` to transcribe microphone recordings.
import whisper
whisper_model = whisper.load_model("small")

# Make chatbot with speech

In [None]:
# Make chatbot + add speech feature
import gradio as gr
from elevenlabs import generate, save

def add_text(chat_history, text_input):
    """Append a user text message to the chat history.

    Args:
        chat_history: List of `[user_message, bot_message]` pairs; it is
            extended in place.
        text_input: The text the user submitted.

    Returns:
        A tuple `(chat_history, "")`; the empty string clears the textbox.
    """
    # The pair is a mutable list (not a tuple) because `add_llm_response`
    # fills in the bot message by assigning to `chat_history[-1][1]`.
    # `None` marks the bot message as not yet available.
    chat_history.append([text_input, None])
    return chat_history, ""

def add_image(chat_history, image_input):
    """Add an uploaded image and LLaVA's description of it to the chat history.

    Args:
        chat_history: List of `[user_message, bot_message]` pairs; it is
            extended in place.
        image_input: The uploaded file object; its `name` attribute is used
            as the path of the image file.

    Yields:
        The updated chat history, once.
    """
    # Use the path exactly as given: splitting it on whitespace would break
    # any upload whose path contains a space.
    image_path = image_input.name
    image = load_image(image_path)
    question = "Please analyse what this image is about. Do not return any personal information contained in the image."
    response = ask_question(llava_model, image, question)
    # A one-element tuple as the user message denotes a file rather than text.
    chat_history.append([(image_path,), response])
    yield chat_history

def add_audio(chat_history, audio_input):
    """Transcribe a recording with Whisper and add it as a user message.

    Args:
        chat_history: List of `[user_message, bot_message]` pairs; it is
            extended in place.
        audio_input: Path of the recorded audio file.

    Returns:
        The updated chat history.
    """
    # Convert audio input file to transcription.
    audio = whisper.load_audio(audio_input)
    # Whisper decodes a fixed-length window, so the audio is padded or cut.
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(whisper_model.device)
    transcription = whisper.decode(whisper_model, mel)
    # The pair is a mutable list (not a tuple) because `add_llm_response`
    # fills in the bot message by assigning to `chat_history[-1][1]`.
    chat_history.append([transcription.text, None])
    return chat_history

def add_llm_response(chat_history):
    """Stream a GPT reply into the last entry of the chat history.

    Every string message in `chat_history` is sent to OpenAI as a "user" or
    "assistant" message, preceded by a fixed system prompt. Non-string
    entries (such as image messages or a missing reply) are skipped.

    Args:
        chat_history: List of `[user_message, bot_message]` pairs; the bot
            message of the last pair is filled in place.

    Yields:
        The chat history after each streamed chunk.
    """
    # Convert chat_history to OpenAI's format.
    messages = [{"role": "system", "content": "You are a helpful assistant"}]
    for turn in chat_history:
        for role, text in (("user", turn[0]), ("assistant", turn[1])):
            if isinstance(text, str):
                messages.append({"role": role, "content": text})
    # Request streaming response from GPT.
    stream = openai.ChatCompletion.create(
        model='gpt-3.5-turbo',
        messages=messages,
        stream=True
    )
    # Stream response to chat history as it arrives.
    for event in stream:
        delta = event["choices"][0]["delta"]
        if "content" in delta:
            latest = chat_history[-1]
            # Start from an empty reply when nothing has been written yet.
            latest[1] = (latest[1] or "") + delta["content"]
        yield chat_history

def play_llm_response(chat_history):
    """Turn the latest bot reply into speech and return the audio file path.

    Args:
        chat_history: List of `[user_message, bot_message]` pairs; the bot
            message of the last pair is spoken.

    Returns:
        Path of the saved audio file. The same path is overwritten on every
        call.
    """
    latest_reply = chat_history[-1][1]
    # Generate speech from the LLM response.
    speech = generate(text=latest_reply, voice="Grace", model="eleven_multilingual_v2")
    output_path = "llm_response.wav"
    save(speech, output_path)
    return output_path

In [None]:
# Create demo app.
# The app has one chat window and three input widgets (text, image upload and
# microphone). Each input has its own event chain that updates the chat
# history and finally plays the latest reply as speech.
with gr.Blocks() as demo:

    # Create widgets.
    # The avatar images are read from files in the working directory.
    chat_history = gr.Chatbot([], avatar_images=("VDAB_logo_donkerblauw_RGB.jpg", "monopoly.jpeg"))
    with gr.Row():
        text_input = gr.Textbox(scale=2, placeholder="✍️ Enter message", container=False)
        image_input = gr.UploadButton("📷 Upload image", file_types=["image"])
        audio_input = gr.Audio(source="microphone", type="filepath", container=False)
        # Hidden player: it receives the file path returned by
        # `play_llm_response` and is set to autoplay.
        audio_output = gr.Audio(type="filepath", autoplay=True, visible=False)

    # Link widget events to Python functions.
    # Text: add the message (and clear the textbox), stream the GPT reply,
    # then speak it.
    text_input.submit(add_text, [chat_history, text_input], [chat_history, text_input], queue=False).then(
        add_llm_response, chat_history, chat_history).then(
        play_llm_response, chat_history, audio_output, queue=False)
    # Image: `add_image` already stores LLaVA's answer as the reply, so GPT
    # is not called here; the answer is spoken directly.
    image_input.upload(add_image, [chat_history, image_input], [chat_history], queue=False).then(
        play_llm_response, chat_history, audio_output, queue=False)
    # Audio: transcribe the recording, stream the GPT reply, write None to the
    # microphone widget (presumably resetting it for the next recording),
    # then speak the reply.
    audio_input.stop_recording(add_audio, [chat_history, audio_input], [chat_history], queue=False).then(
        add_llm_response, chat_history, chat_history).then(
        lambda: None, None, audio_input, queue=False).then(
        play_llm_response, chat_history, audio_output, queue=False)

# NOTE(review): the queue is presumably needed because `add_llm_response` is a
# generator that streams partial replies — confirm against the Gradio docs.
demo.queue()
# NOTE(review): `share=True` presumably exposes the app through a public
# Gradio link — confirm this is intended before sharing the notebook.
demo.launch(share=True)