# Install dependencies

In [1]:
!pip install git+https://github.com/huggingface/transformers

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-90vnkfq6
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-90vnkfq6

  Resolved https://github.com/huggingface/transformers to commit 8bd2b1e8c23234cd607ca8d63f53c1edfea27462
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone


In [2]:
!pip install qwen-vl-utils



In [3]:
!pip install torchvision



In [4]:
!pip install accelerate



In [1]:
import torch
import os
import json
import time
import re
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

In [2]:
START = time.time()
LOCAL_RUN = False
IMAGE_PATH = 'vlm-drones/images/'
INFERENCE_RESULTS = 'vlm-drones/output/inference_results.json'

In [3]:
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print(device)

cuda


In [4]:
LOAD_MODEL = time.time()
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
)
print(f"Model loaded in {time.time() - LOAD_MODEL} seconds")

Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}


`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Model loaded in 79.12822079658508 seconds


In [5]:
LOAD_PROCESSOR = time.time()
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
print(f"Processor loaded in {time.time() - LOAD_PROCESSOR} seconds")

Processor loaded in 0.519768238067627 seconds


In [6]:
def extract_coordinates(output_text):
    coord_pattern = r'\(([\d\.\,\s]+)\)'
    match = re.search(coord_pattern, output_text)
    
    if match:

        coordinates = match.group(1).split(',')
        coordinates = list(map(float, coordinates))
        
        if len(coordinates) == 4:
            return coordinates
        else:
            return 0
    else:
        return 0

In [7]:
def process_images(data_directory):
    prompt = "Detect drones in the image. If a drone is detected, return only the bounding box coordinates normalized between 0 and 1, in the format (x, y, w, h), where (x, y) is the top-left corner of the bounding box relative to the image dimensions, and w and h are the width and height relative to the image dimensions. No other text or information; only the coordinates."
    results_dict = {}
    quantity = len(os.listdir(data_directory))

    for filename in os.listdir(data_directory):
        if filename.endswith('.jpg'):

            image_path = os.path.join(data_directory, filename)
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": f"file://{image_path}"},
                        {"type": "text", "text": prompt},
                    ],
                }
            ]

            text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            image_inputs, video_inputs = process_vision_info(messages)
            inputs = processor(
                text=[text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors="pt",
            )
            inputs = inputs.to(device)

            with torch.no_grad():
                generated_ids = model.generate(**inputs, max_new_tokens=64)
                generated_ids_trimmed = [
                    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
                ]
                output_text = processor.batch_decode(
                    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
                )[0]

            quantity -= 1
            print(f"Pending to process {quantity}")
            
            output_text_parse = extract_coordinates(output_text)

            results_dict[filename] = {
                "Inference": output_text_parse
            }

            torch.cuda.empty_cache()

    print(results_dict)
    with open(INFERENCE_RESULTS, 'w') as json_file:
        json.dump(results_dict, json_file, indent=4)

In [8]:
PROCESS_IMAGES = time.time()
process_images(IMAGE_PATH)
print(f"Images processed in {time.time() - PROCESS_IMAGES} seconds")

Pending to process 422
Pending to process 421
Pending to process 420
Pending to process 419
Pending to process 418
Pending to process 417
Pending to process 416
Pending to process 415
Pending to process 414
Pending to process 413
Pending to process 412
Pending to process 411
Pending to process 410
Pending to process 409
Pending to process 408
Pending to process 407
Pending to process 406
Pending to process 405
Pending to process 404
Pending to process 403
Pending to process 402
Pending to process 401
Pending to process 400
Pending to process 399
Pending to process 398
Pending to process 397
Pending to process 396
Pending to process 395
Pending to process 394
Pending to process 393
Pending to process 392
Pending to process 391
Pending to process 390
Pending to process 389
Pending to process 388
Pending to process 387
Pending to process 386
Pending to process 385
Pending to process 384
Pending to process 383
Pending to process 382
Pending to process 381
Pending to process 380
Pending to 

OutOfMemoryError: CUDA out of memory. Tried to allocate 60.27 GiB. GPU 0 has a total capacity of 44.53 GiB of which 21.08 GiB is free. Process 16752 has 23.44 GiB memory in use. Of the allocated memory 22.49 GiB is allocated by PyTorch, and 450.23 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

# Classification

In [None]:
# All images have drones in the current dataset, so in this case the accuracy is accuracy = images_detect_drone * 100 / total_images


# Detection

In [None]:
# Compare both JSON files and see how good the inse