In [9]:
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch
torch.cuda.empty_cache()
model_name = "Qwen/Qwen2-VL-2B-Instruct"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16, 
    attn_implementation="eager",  
    device_map={"": "cuda"}
)
processor = AutoProcessor.from_pretrained(model_name,use_fast=True)

Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████| 2/2 [00:08<00:00,  4.39s/it]


In [10]:
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "video",
                "image": "C:/AI/Github/Reconnaissance_drone_report/Data/Images/earthquake/download1.jpg",
                "max_pixels": 360 * 420,
                "fps": 1.0,
            },
            {"type": "text", "text": "Consider yourself as a airforce pilot who is operating a drone at this moment, explain this event brief."},
        ],
    }
]

In [11]:
messages

[{'role': 'user',
  'content': [{'type': 'video',
    'image': 'C:/AI/Github/Reconnaissance_drone_report/Data/Images/earthquake/download1.jpg',
    'max_pixels': 151200,
    'fps': 1.0},
   {'type': 'text',
    'text': 'Consider yourself as a airforce pilot who is operating a drone at this moment, explain this event brief.'}]}]

In [None]:
import time
st=time.time()
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=512)
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
end=time.time()
print("".join(output_text))
print("Time took to generate: ",end-st)

In [None]:
print("".join(output_text))

In [13]:
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()

In [6]:
!pip install -U bitsandbytes



In [8]:
!pip install opencv-python pillow


Collecting opencv-python
  Downloading opencv_python-4.11.0.86-cp37-abi3-win_amd64.whl.metadata (20 kB)
Downloading opencv_python-4.11.0.86-cp37-abi3-win_amd64.whl (39.5 MB)
   ---------------------------------------- 0.0/39.5 MB ? eta -:--:--
   - -------------------------------------- 1.0/39.5 MB 12.7 MB/s eta 0:00:04
   ------ --------------------------------- 6.0/39.5 MB 21.8 MB/s eta 0:00:02
   ----------- ---------------------------- 11.8/39.5 MB 24.6 MB/s eta 0:00:02
   ------------------- -------------------- 19.7/39.5 MB 28.2 MB/s eta 0:00:01
   --------------------------- ------------ 27.3/39.5 MB 29.8 MB/s eta 0:00:01
   ------------------------------------ --- 35.7/39.5 MB 31.9 MB/s eta 0:00:01
   ---------------------------------------- 39.5/39.5 MB 31.4 MB/s eta 0:00:00
Installing collected packages: opencv-python
Successfully installed opencv-python-4.11.0.86
