In [1]:
import pandas as pd
import torch
from tqdm.notebook import tqdm
from diffusers import ZImagePipeline, FluxKontextInpaintPipeline, FluxPipeline
import ultralytics
from nunchaku import NunchakuZImageTransformer2DModel, NunchakuFluxTransformer2DModelV2
from nunchaku.utils import get_precision
import numpy as np
import cv2
import matplotlib.pyplot as plt
from PIL import Image, ImageDraw
import os
import random
import pandas

In [2]:
# Constants
ZIT_RANK = 32
yolo_model_name = 'models/yolo26x-seg.pt'

prompts = [
    # "People walking in a spacious airport terminal",
    "Children playing in a park",
    "Patients sitting in a hospital waiting room",
    "Shoppers walking through wide supermarket aisles",
    # "Travelers standing in a spacious airport check-in queue",
    "People standing on a train station platform",
    "Commuters waiting at a city bus stop",
    "Pedestrians walking on a wide city sidewalk",
    "Guests standing in a spacious hotel lobby",
    "Visitors walking through a museum gallery",
    "Students walking in a university corridor",
    "Customers waiting in line at a bank",
    "Employees walking in a modern office lobby",
    "Passengers walking in a subway station",
    #  "Crowd walking in a conference center hall",
    "Shoppers walking through a wise supermarket aisle"
]

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
print(f"Using device: {device}, dtype: {dtype}")

os.makedirs('data/backgrounds', exist_ok=True)
os.makedirs('data/images', exist_ok=True)

Using device: cuda, dtype: torch.bfloat16


In [21]:
# Utility functions
def display_image(img: Image.Image, title: str = "Image", box=None):
    img = img.copy()
    plt.figure(figsize=(6, 4))
    if box is not None:
        x1, y1, x2, y2 = box
        draw = ImageDraw.Draw(img)
        draw.rectangle([(x1, y1), (x2, y2)], outline='red', width=5)

    plt.imshow(img)
    plt.title(title)
    plt.axis("off")
    plt.show()


def display_masks(img: np.ndarray, mask_rgb: np.ndarray, overlay: np.ndarray):
    plt.figure(figsize=(14, 4))

    # Original image
    plt.subplot(1, 3, 1)
    plt.imshow(img)
    plt.title("Original")
    plt.axis("off")

    # Mask visualization
    plt.subplot(1, 3, 2)
    plt.imshow(mask_rgb)
    plt.title("Person Instance Masks")
    plt.axis("off")

    # Mask overlay
    plt.subplot(1, 3, 3)
    plt.imshow(overlay)
    plt.title("Overlay")
    plt.axis("off")
    plt.tight_layout()
    plt.show()

In [22]:
# Background image generation with Z-Image-Turbo Nunchaku

def load_zit_pipeline():
    rank = ZIT_RANK
    transformer = NunchakuZImageTransformer2DModel.from_pretrained(
        f"nunchaku-tech/nunchaku-z-image-turbo/svdq-{get_precision()}_r{rank}-z-image-turbo.safetensors")

    pipe = ZImagePipeline.from_pretrained("Tongyi-MAI/Z-Image-Turbo", transformer=transformer, torch_dtype=dtype)
    pipe.enable_model_cpu_offload()

    return pipe


def zit_generate(pipe, prompt: str, width: int = 1280, height: int = 720, seed=None) -> Image.Image:
    if not seed:
        seed = random.randint(0, 2 ** 31 - 1)
    generator = torch.Generator(device=device).manual_seed(seed)
    img = pipe(
        prompt=prompt,
        num_inference_steps=8, guidance_scale=0.0,
        generator=generator, width=width, height=height,
    ).images[0]

    return img

In [23]:
def load_flux_pipeline():
    transformer = NunchakuFluxTransformer2DModelV2.from_pretrained(
        f"nunchaku-tech/nunchaku-flux.1-dev/svdq-{get_precision()}_r32-flux.1-dev.safetensors")
    pipe = FluxPipeline.from_pretrained(
        "black-forest-labs/FLUX.1-dev", transformer=transformer, torch_dtype=torch.bfloat16)
    pipe.enable_model_cpu_offload()
    return pipe


def flux_generate(pipe: FluxPipeline, prompt: str, width: int = 1280, height: int = 720) -> Image.Image:
    image = pipe(prompt=prompt, guidance_scale=3.5, width=width, height=height).images[0]
    return image

In [24]:
def load_flux_kontext_pipeline():
    transformer = NunchakuFluxTransformer2DModelV2.from_pretrained(
        f"nunchaku-tech/nunchaku-flux.1-kontext-dev/svdq-{get_precision()}_r32-flux.1-kontext-dev.safetensors")

    pipe = FluxKontextInpaintPipeline.from_pretrained("black-forest-labs/FLUX.1-Kontext-dev", transformer=transformer,
                                                      torch_dtype=dtype)
    pipe.enable_model_cpu_offload()

    return pipe

In [25]:
def segment_persons(model, img: Image.Image, confidence: float = 0.25):
    img_np = np.array(img)
    h, w = img_np.shape[:2]

    res = model(img_np, conf=confidence, iou=0.5, verbose=False)[0]

    mask_rgb = np.zeros((h, w, 3), dtype=np.uint8)
    overlay = img_np.copy()
    box_list = []

    for i, cls in enumerate(res.boxes.cls.cpu().numpy()):
        if int(cls) == 0:  # person
            mask_data = res.masks.data[i].cpu().numpy()  # Get the raw mask data (at YOLO's inference resolution)
            resized_mask = cv2.resize(mask_data.astype(np.uint8), (w, h), interpolation=cv2.INTER_NEAREST)
            mask = resized_mask.astype(bool)

            color = np.random.randint(0, 256, 3)
            mask_rgb[mask] = color
            overlay[mask] = (0.5 * overlay[mask] + 0.5 * color).astype(np.uint8)
            box_list.append(res.boxes.xyxy[i].cpu().numpy().astype(int))

    return box_list, mask_rgb, overlay



In [26]:
def inpaint_image(pipe: FluxKontextInpaintPipeline, img: Image.Image, box: tuple, prompt: str,
                  neg: str = "") -> Image.Image:
    img_np = np.array(img)
    h, w = img_np.shape[:2]

    mask = np.zeros((h, w), dtype=np.uint8)
    x1, y1, x2, y2 = box
    mask[y1:y2, x1:x2] = 255  # White rectangle for inpainting area

    mask_pil = Image.fromarray(mask)

    seed = random.randint(0, 2 ** 31 - 1)
    generator = torch.Generator(device=device).manual_seed(seed)

    inpainted_img = pipe(
        prompt=prompt,
        image=img,
        mask_image=mask_pil,
        guidance_scale=2.5,
        generator=generator,
        strength=1.0).images[0]

    return inpainted_img

In [11]:
torch.cuda.empty_cache()
pipeline = load_zit_pipeline()

quantization_config: {'method': 'svdquant', 'weight': {'dtype': 'int4', 'scale_dtype': None, 'group_size': 64}, 'activation': {'dtype': 'int4', 'scale_dtype': None, 'group_size': 64}, 'rank': 32, 'skip_refiners': False}, rank=32, skip_refiners=False


Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
prompt = random.choice(prompts)
image = zit_generate(pipeline, prompt, width=1280, height=720)
display_image(image, title=prompt)

In [None]:
boxes, masks, mask_overlay = segment_persons(ultralytics.YOLO(yolo_model_name), image, confidence=0.75)
display_masks(np.array(image), masks, mask_overlay)

In [None]:
torch.cuda.empty_cache()

inpaint_pipeline = load_flux_kontext_pipeline()

In [None]:
box = random.choice(boxes)
display_image(image, title=prompt, box=box)

In [None]:
inpaint_prompt = "Remove the person"
inpainted_image = inpaint_image(inpaint_pipeline, image, box, prompt=inpaint_prompt)
display_image(inpainted_image, title="After Removing Person", box=box)

In [None]:
inpaint_prompt2 = "Add a suitcase on the ground"
x1, y1, x2, y2 = box
box2 = (x1, int(y1 + 0.4 * abs(y2 - y1)), x2, y2)
inpainted_image2 = inpaint_image(inpaint_pipeline, inpainted_image, box2, prompt=inpaint_prompt2)
display_image(inpainted_image2, title="Inpainted Image", box=box2)

In [None]:
prompt_samples = {}
num_seeds = 5
seeds = [random.randint(0, 2 ** 31 - 1) for _ in range(num_seeds)]
for prompt in prompts:
    prompt_samples[prompt] = {}
    for seed in seeds:
        prompt_samples[prompt][seed] = zit_generate(pipeline, prompt, width=1280, height=720, seed=seed)

In [None]:
n_prompts = len(prompts)
n_seeds = 5

fig, axes = plt.subplots(n_prompts, n_seeds, figsize=(15, 3 * n_prompts))

for i, (prompt, images) in enumerate(prompt_samples.items()):
    for j, (seed, image) in enumerate(images.items()):
        ax = axes[i, j] if n_prompts > 1 else axes[0, j]
        ax.imshow(image)
        ax.axis('off')
        # Add title to first column
        if j == 0:
            ax.set_ylabel(prompt[:30] + '...', rotation=0, labelpad=50, va='center')

        # Add seed as column title
        if i == 0:
            ax.set_title(f'Seed: {seed}')

plt.tight_layout()
plt.savefig('grid.png')
plt.show()

In [None]:
plt.show()

In [12]:
total = len(prompts) * 15
counter = 0

with tqdm(total=total) as pbar:
    for _ in range(15):
        for p in prompts:
            img = zit_generate(pipeline, p, width=1280, height=720)
            img.save(f'data/backgrounds/{counter}.png')
            counter = counter + 1
            pbar.update(1)

  0%|          | 0/195 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

In [3]:
# Extract people bounding boxes from the backgrounds data
yolo_model = ultralytics.YOLO('models/yolo26x.pt')
box_results = yolo_model.predict(
    source=f'data/backgrounds/',
    project='data',
    name='people-box',
    save=True,
    imgsz=(1280, 720),
    iou=0.5,
    device='cuda',
    classes=[0],
    batch=4,
    save_txt=True,
    conf=0.65, )


image 1/195 C:\Projects\SusDetector2\dataset\backgrounds\0.png: 416x736 3 persons, 14.8ms
image 2/195 C:\Projects\SusDetector2\dataset\backgrounds\1.png: 416x736 5 persons, 14.8ms
image 3/195 C:\Projects\SusDetector2\dataset\backgrounds\10.png: 416x736 2 persons, 14.8ms
image 4/195 C:\Projects\SusDetector2\dataset\backgrounds\100.png: 416x736 7 persons, 14.8ms
image 5/195 C:\Projects\SusDetector2\dataset\backgrounds\101.png: 416x736 2 persons, 8.9ms
image 6/195 C:\Projects\SusDetector2\dataset\backgrounds\102.png: 416x736 13 persons, 8.9ms
image 7/195 C:\Projects\SusDetector2\dataset\backgrounds\103.png: 416x736 2 persons, 8.9ms
image 8/195 C:\Projects\SusDetector2\dataset\backgrounds\104.png: 416x736 4 persons, 8.9ms
image 9/195 C:\Projects\SusDetector2\dataset\backgrounds\105.png: 416x736 7 persons, 8.1ms
image 10/195 C:\Projects\SusDetector2\dataset\backgrounds\106.png: 416x736 3 persons, 8.1ms
image 11/195 C:\Projects\SusDetector2\dataset\backgrounds\107.png: 416x736 6 persons, 8.

In [17]:
# Extract people segmentation from the backgrounds data
yolo_model = ultralytics.YOLO('models/yolo26x-seg.pt')
seg_results = yolo_model.predict(
    source=f'data/backgrounds/',
    project='data',
    name='people-seg',
    save=True,
    imgsz=(1280, 720),
    iou=0.5,
    device='cuda',
    classes=[0],
    batch=4,
    save_txt=True,
    conf=0.65, )


image 1/195 C:\Projects\SusDetector2\dataset\backgrounds\0.png: 416x736 3 persons, 12.7ms
image 2/195 C:\Projects\SusDetector2\dataset\backgrounds\1.png: 416x736 5 persons, 12.7ms
image 3/195 C:\Projects\SusDetector2\dataset\backgrounds\10.png: 416x736 2 persons, 12.7ms
image 4/195 C:\Projects\SusDetector2\dataset\backgrounds\100.png: 416x736 8 persons, 12.7ms
image 5/195 C:\Projects\SusDetector2\dataset\backgrounds\101.png: 416x736 2 persons, 10.8ms
image 6/195 C:\Projects\SusDetector2\dataset\backgrounds\102.png: 416x736 12 persons, 10.8ms
image 7/195 C:\Projects\SusDetector2\dataset\backgrounds\103.png: 416x736 2 persons, 10.8ms
image 8/195 C:\Projects\SusDetector2\dataset\backgrounds\104.png: 416x736 4 persons, 10.8ms
image 9/195 C:\Projects\SusDetector2\dataset\backgrounds\105.png: 416x736 7 persons, 10.7ms
image 10/195 C:\Projects\SusDetector2\dataset\backgrounds\106.png: 416x736 3 persons, 10.7ms
image 11/195 C:\Projects\SusDetector2\dataset\backgrounds\107.png: 416x736 6 perso

[ultralytics.engine.results.Results object with attributes:
 
 boxes: ultralytics.engine.results.Boxes object
 keypoints: None
 masks: ultralytics.engine.results.Masks object
 names: {0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane', 5: 'bus', 6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light', 10: 'fire hydrant', 11: 'stop sign', 12: 'parking meter', 13: 'bench', 14: 'bird', 15: 'cat', 16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow', 20: 'elephant', 21: 'bear', 22: 'zebra', 23: 'giraffe', 24: 'backpack', 25: 'umbrella', 26: 'handbag', 27: 'tie', 28: 'suitcase', 29: 'frisbee', 30: 'skis', 31: 'snowboard', 32: 'sports ball', 33: 'kite', 34: 'baseball bat', 35: 'baseball glove', 36: 'skateboard', 37: 'surfboard', 38: 'tennis racket', 39: 'bottle', 40: 'wine glass', 41: 'cup', 42: 'fork', 43: 'knife', 44: 'spoon', 45: 'bowl', 46: 'banana', 47: 'apple', 48: 'sandwich', 49: 'orange', 50: 'broccoli', 51: 'carrot', 52: 'hot dog', 53: 'pizza', 54: 'donut', 55: 'cake', 56: 

In [6]:
boxes_xywh = [result.boxes.xywh.cpu().numpy() for result in box_results]

In [7]:
rows = []
for i, boxes in enumerate(boxes_xywh):
    rows.append([{'image': i, 'x': box[0], 'y': box[1], 'w': box[2], 'h': box[3]} for box in boxes])
rows

SyntaxError: invalid syntax (5949444.py, line 3)

In [5]:
df = pd.DataFrame(rows, columns=['image','x', 'y', 'w', 'h'])

ValueError: 5 columns passed, passed data had 13 columns