# Install dependencies

In [1]:
!pip install git+https://github.com/huggingface/transformers

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-90vnkfq6
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-90vnkfq6

  Resolved https://github.com/huggingface/transformers to commit 8bd2b1e8c23234cd607ca8d63f53c1edfea27462
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone


In [2]:
!pip install qwen-vl-utils



In [3]:
!pip install torchvision



In [4]:
!pip install accelerate



In [1]:
import torch
import os
import json
import time
import re
from PIL import Image, ImageDraw
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
from collections import defaultdict


In [3]:
START = time.time()
LOCAL_RUN = False

if LOCAL_RUN:
    IMAGE_PATH = 'images/'
    INFERENCE_RESULTS = 'output/inference_results.json'
else:
    IMAGE_PATH = 'vlm-drones/images/'
    INFERENCE_RESULTS = 'vlm-drones/output/inference_results.json'

In [4]:
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print(device)

cuda


# Aux functions

In [5]:
def draw_bounding_box(image_path, bbox):
    image = Image.open(image_path)

    img_width, img_height = image.size

    center_x = bbox[0] * img_width
    center_y = bbox[1] * img_height
    width = bbox[2] * img_width
    height = bbox[3] * img_height

    top_left_x = int(center_x - (width / 2))
    top_left_y = int(center_y - (height / 2))
    bottom_right_x = int(center_x + (width / 2))
    bottom_right_y = int(center_y + (height / 2))

    draw = ImageDraw.Draw(image)
    draw.rectangle([top_left_x, top_left_y, bottom_right_x, bottom_right_y], outline="red", width=3)

    return image

# Preprocess images

Check that all images have the same size, in this case 1024X1024.

In [9]:
image_sizes = []

for filename in os.listdir(IMAGE_PATH):
    filepath = os.path.join(IMAGE_PATH, filename)
    if os.path.isfile(filepath):
        try:
            with Image.open(filepath) as img:
                width, height = img.size
                image_sizes.append((filename, (width, height)))
        except Exception as e:
            print(f"Skipping file {filename}: {e}")

size_to_images = defaultdict(list)

for filename, size in image_sizes:
    size_to_images[size].append(filename)

for size, files in size_to_images.items():
    if len(files) > 1:
        print(f"Size {size} has {len(files)} images: {files}")
    else:
        print(f"Size {size} has 1 image: {files}")

Skipping file coordinates.json: cannot identify image file 'images/coordinates.json'
Size (1024, 1024) has 403 images: ['img1005997.jpg', 'img1008257.jpg', 'img1008243.jpg', 'img1006285.jpg', 'img1005798.jpg', 'img1009834.jpg', 'img1048459.jpg', 'img1049747.jpg', 'img1008684.jpg', 'img1008451.jpg', 'img1008337.jpg', 'img1005834.jpg', 'img1005809.jpg', 'img1008450.jpg', 'img1006455.jpg', 'img1008685.jpg', 'img1008875.jpg', 'img1006131.jpg', 'img1048464.jpg', 'img1049746.jpg', 'img1048458.jpg', 'img1008281.jpg', 'img1005969.jpg', 'img1005766.jpg', 'img1008242.jpg', 'img1008256.jpg', 'img1006247.jpg', 'img1008268.jpg', 'img1006245.jpg', 'img1008240.jpg', 'img1005764.jpg', 'img1005770.jpg', 'img1008254.jpg', 'img1008283.jpg', 'img1049744.jpg', 'img1006047.jpg', 'img1049750.jpg', 'img1006127.jpg', 'img1008678.jpg', 'img1008888.jpg', 'img1008877.jpg', 'img1008687.jpg', 'img1006457.jpg', 'img1008452.jpg', 'img1008446.jpg', 'img1005837.jpg', 'img1008447.jpg', 'img1008453.jpg', 'img1008309.jpg'

Resize all images to 1024x1024.

In [32]:
with open("images/coordinates.json", "r") as file:
    coordinates = json.load(file)

for image_name, bbox in coordinates.items():
    
    image_path = os.path.join(IMAGE_PATH, image_name)
    image = Image.open(image_path)
    
    resized_image = image.resize((1024, 1024), Image.Resampling.LANCZOS)
    
    resized_image_path = os.path.join(IMAGE_PATH, image_name)
    resized_image.save(resized_image_path)

Images resized. No bounding box adjustments were necessary since they are in normalized units.


# Load model

In [6]:
LOAD_MODEL = time.time()
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
)
print(f"Model loaded in {time.time() - LOAD_MODEL} seconds")

Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}
`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Model loaded in 74.54133558273315 seconds


In [7]:
LOAD_PROCESSOR = time.time()
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
print(f"Processor loaded in {time.time() - LOAD_PROCESSOR} seconds")

Processor loaded in 0.5134637355804443 seconds


In [8]:
def extract_coordinates(output_text):
    coord_pattern = r'\(([\d\.\,\s]+)\)'
    match = re.search(coord_pattern, output_text)
    
    if match:

        coordinates = match.group(1).split(',')
        coordinates = list(map(float, coordinates))
        
        if len(coordinates) == 4:
            return coordinates
        else:
            return 0
    else:
        return 0

# Inference

In [9]:
def process_images(data_directory):
    prompt = "Detect drones in the image. If a drone is detected, return only the bounding box coordinates normalized between 0 and 1, in the format (x, y, w, h), where (x, y) is the top-left corner of the bounding box relative to the image dimensions, and w and h are the width and height relative to the image dimensions. No other text or information; only the coordinates."
    results_dict = {}
    quantity = len(os.listdir(data_directory))

    for filename in os.listdir(data_directory):
        if filename.endswith('.jpg'):

            image_path = os.path.join(data_directory, filename)

            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": f"file://{image_path}"},
                        {"type": "text", "text": prompt},
                    ],
                }
            ]

            text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            image_inputs, video_inputs = process_vision_info(messages)
            inputs = processor(
                text=[text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors="pt",
            )
            inputs = inputs.to(device)

            with torch.no_grad():
                generated_ids = model.generate(**inputs, max_new_tokens=64)
                generated_ids_trimmed = [
                    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
                ]
                output_text = processor.batch_decode(
                    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
                )[0]

            quantity -= 1
            print(f"Pending to process {quantity}")
            
            output_text_parse = extract_coordinates(output_text)

            results_dict[filename] = output_text_parse

            torch.cuda.empty_cache()

    print(results_dict)
    with open(INFERENCE_RESULTS, 'w') as json_file:
        json.dump(results_dict, json_file, indent=4)

In [10]:
PROCESS_IMAGES = time.time()
process_images(IMAGE_PATH)
print(f"Images processed in {time.time() - PROCESS_IMAGES} seconds")

Pending to process 403
Pending to process 402
Pending to process 401
Pending to process 400
Pending to process 399
Pending to process 398
Pending to process 397
Pending to process 396
Pending to process 395
Pending to process 394
Pending to process 393
Pending to process 392
Pending to process 391
Pending to process 390
Pending to process 389
Pending to process 388
Pending to process 387
Pending to process 386
Pending to process 385
Pending to process 384
Pending to process 383
Pending to process 382
Pending to process 381
Pending to process 380
Pending to process 379
Pending to process 378
Pending to process 377
Pending to process 376
Pending to process 375
Pending to process 374
Pending to process 373
Pending to process 372
Pending to process 371
Pending to process 370
Pending to process 369
Pending to process 368
Pending to process 367
Pending to process 366
Pending to process 365
Pending to process 364
Pending to process 363
Pending to process 362
Pending to process 361
Pending to 

# Classification

In [14]:
def calculate_accuracy(json_path):

    with open(json_path, 'r') as file:
        data = json.load(file)

    total_images = 0
    images_with_bounding_box = 0


    for img_name, result in data.items():
        total_images += 1

        if isinstance(result, list) and len(result) == 4:
            images_with_bounding_box += 1


    accuracy = images_with_bounding_box / total_images if total_images > 0 else 0


    print(f"Total Images: {total_images}")
    print(f"Images with Detections (Bounding Box): {images_with_bounding_box}")
    print(f"Accuracy: {accuracy:.2%}")

    return accuracy


calculate_accuracy('output/inference_results.json')

Total Images: 403
Images with Detections (Bounding Box): 326
Accuracy: 80.89%


0.8089330024813896

# Detection

In [None]:
# Compare both JSON files and see how good the inse