In [1]:
from minigpt4 import MiniGPT4
from blip_processor import Blip2ImageEvalProcessor
from conversation import Chat, CONV_VISION

import torch
import time

  from .autonotebook import tqdm as notebook_tqdm
  warn(f"Failed to load image Python extension: {e}")


In [2]:
# Measure the total load time with a monotonic clock: perf_counter() is
# unaffected by system clock adjustments, unlike time.time().
t0 = time.perf_counter()

# Construct the MiniGPT-4 model from the local vision-encoder, LLaMA and
# Q-Former weight paths.
model = MiniGPT4(
    vision_model_path="models/eva_vit_g.pth",
    llama_model="models/vicuna13b_v0/",
    q_former_model="models/blip2_pretrained_flant5xxl.pth",
)

ckpt_path = "models/pretrained_minigpt4.pth"

print("Load BLIP2-LLM Checkpoint: {}".format(ckpt_path))
# map_location="cpu" deserializes the checkpoint tensors onto the CPU.
ckpt = torch.load(ckpt_path, map_location="cpu")
# strict=False: keys that are missing from, or unexpected in, the checkpoint
# do not raise an error.
model.load_state_dict(ckpt['model'], strict=False)

# torch.compile() does not modify its argument; it returns the optimized
# module. The result must be rebound, otherwise the call has no effect.
# Compilation itself is lazy and only happens on the first forward call.
model = torch.compile(model)

vis_processor = Blip2ImageEvalProcessor()

chat = Chat(model, vis_processor, device='cuda:0')

t1 = time.perf_counter()

print("Models loaded in {} seconds".format(t1-t0))

Loading VIT: vision_model_path=models/eva_vit_g.pth
Loading VIT Done
Loading Q-Former


You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565, and set the legacy attribute accordingly.


Loading Q-Former Done
Loading LLAMA

Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues


Loading checkpoint shards: 100%|██████████| 3/3 [00:20<00:00,  6.87s/it]


Loading LLAMA Done
Load BLIP2-LLM Checkpoint: models/pretrained_minigpt4.pth
Models loaded in 49.457420110702515 seconds


In [5]:
# --- Stage 1: start a fresh conversation and attach the image ---------------
t0 = time.time()

img_list = []
chat_state = CONV_VISION.copy()  # work on a copy of the conversation template
chat.upload_img("icbm_bicycle.png", chat_state, img_list)

t1 = time.time()
elapsed = t1 - t0
print("Image loaded in {} seconds".format(elapsed))

# --- Stage 2: queue the question; this t0 is the start of the response timer
t0 = time.time()

# Decoding settings passed to chat.answer_async further down in this cell.
num_beams, temperature = 1, 0.01

chat.ask("Tell me what you see on the road.", chat_state)

def callback_function(word):
    """Print one generated piece of text immediately, without a trailing newline.

    Passed to the chat as the per-word text callback so the response is shown
    while it is being generated. Output is flushed on every call.
    """
    print(word, sep="", end="", flush=True)

#print("Live output: ", end='', flush=True)

# Collect the generation arguments in one place; callback_function is passed
# as text_callback so the reply is printed while it is produced.
generation_kwargs = dict(
    conv=chat_state,
    img_list=img_list,
    num_beams=num_beams,
    temperature=temperature,
    max_new_tokens=1024,
    max_length=2048,
    text_callback=callback_function,
)
output_text = chat.answer_async(**generation_kwargs)

# Terminate the streamed line before printing the summary.
print("")

t1 = time.time()
response_seconds = t1 - t0

print("LLM response: {}".format(output_text))
print(chat_state)
print("Generated LLM response in {} seconds".format(response_seconds))

Image loaded in 0.09377050399780273 seconds
<s>I see a large, white missile on the road. It appears to be made of metal and has a pointed nose and tail. It is sitting on the ground, leaning against a road sign. The sign says "Danger: Missile Ahead". There is a cloudy sky in the background.###
LLM response: I see a large, white missile on the road. It appears to be made of metal and has a pointed nose and tail. It is sitting on the ground, leaning against a road sign. The sign says "Danger: Missile Ahead". There is a cloudy sky in the background.
Conversation(system='Human provides a photo and asks questions.  Assistant answers the questions honestly and simply.', roles=('Human', 'Assistant'), messages=[['Human', '<Img><ImageHere></Img> Tell me what you see on the road.'], ['Assistant', 'I see a large, white missile on the road. It appears to be made of metal and has a pointed nose and tail. It is sitting on the ground, leaning against a road sign. The sign says "Danger: Missile Ahead".

In [6]:
# Bare last expression: the notebook displays the repr of the response string
# that chat.answer_async returned into output_text.
output_text

'I see a large, white missile on the road. It appears to be made of metal and has a pointed nose and tail. It is sitting on the ground, leaning against a road sign. The sign says "Danger: Missile Ahead". There is a cloudy sky in the background.'