In [7]:
# !pip uninstall -y opencv-python opencv-contrib-python
# !pip install opencv-contrib-python==4.7.0.72

[0mFound existing installation: opencv-contrib-python 4.11.0.86
Uninstalling opencv-contrib-python-4.11.0.86:
  Successfully uninstalled opencv-contrib-python-4.11.0.86
[0mLooking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting opencv-contrib-python==4.7.0.72
  Obtaining dependency information for opencv-contrib-python==4.7.0.72 from https://files.pythonhosted.org/packages/fb/89/8370c6864e518be9ca1b54a19b5daf398f4943041e1283ffa7ba0c66c0bd/opencv_contrib_python-4.7.0.72-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading opencv_contrib_python-4.7.0.72-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Downloading opencv_contrib_python-4.7.0.72-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (67.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.9/67.9 MB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: opencv-contrib-python
Succe

In [24]:
import torch
import cv2
import supervision as sv
from groundingdino.util.inference import Model
from segment_anything import sam_model_registry, SamPredictor
import numpy as np
import os

# 모델 가중치 및 설정 경로 (Dockerfile 기준)
GROUNDING_DINO_CONFIG_PATH = "/app/groundingdino/config/GroundingDINO_SwinT_OGC.py"
GROUNDING_DINO_CHECKPOINT_PATH = "/app/groundingdino/gdino_checkpoints/groundingdino_swint_ogc.pth"
SAM_CHECKPOINT_PATH = "/app/SAM/sam_vit_h_4b8939.pth"
SAM_ENCODER_VERSION = "vit_h"

# run_grounded_sam.py
def grounded_sam(img_path = "/app/images/KakaoTalk_20250520_140652269.jpg", text_prompt = "bus.wheel.window.", BOX_THRESHOLD = 0.35, TEXT_THRESHOLD = 0.25, output_dir = "outputs", save_img=False):
    # --- 설정 값 ---

    # 출력 폴더 생성
    os.makedirs(output_dir, exist_ok=True)
    
    # --- 모델 로딩 ---
    DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {DEVICE}")

    # GroundingDINO 모델 로드
    grounding_dino_model = Model(
        model_config_path=GROUNDING_DINO_CONFIG_PATH, 
        model_checkpoint_path=GROUNDING_DINO_CHECKPOINT_PATH,
        device=str(DEVICE)
    )

    # SAM 모델 로드
    sam = sam_model_registry[SAM_ENCODER_VERSION](checkpoint=SAM_CHECKPOINT_PATH).to(device=DEVICE)
    sam_predictor = SamPredictor(sam)

    # --- 추론 실행 ---
    # 이미지 불러오기
    image_bgr = cv2.imread(img_path)
    if image_bgr is None:
        print(f"Error: 이미지를 읽을 수 없습니다. 경로를 확인하세요: {img_path}")
        return
    image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)

    # 1. Grounding DINO로 Bounding Box 탐지
    detections = grounding_dino_model.predict_with_classes(
        image=image_rgb,
        classes=[c.strip() for c in text_prompt.split('.')],
        box_threshold=BOX_THRESHOLD,
        text_threshold=TEXT_THRESHOLD
    )
    print(f"GroundingDINO found {len(detections)} objects.")

    # 2. 탐지된 BBox를 SAM의 입력으로 사용하여 Segmentation Mask 생성
    sam_predictor.set_image(image_rgb)
    boxes_for_sam = detections.xyxy
    
    if len(boxes_for_sam) > 0:
        masks, _, _ = sam_predictor.predict_torch(
            point_coords=None,
            point_labels=None,
            boxes=torch.tensor(boxes_for_sam, device=sam_predictor.device),
            multimask_output=False,
        )
        detections.mask = masks.cpu().numpy().squeeze(1)
    else:
        # 객체가 탐지되지 않으면 마스크를 빈 배열로 설정
        detections.mask = np.empty((0, *image_bgr.shape[:2]), dtype=bool)

    # --- 결과 시각화 및 저장 ---
    # Annotator 생성
    box_annotator = sv.BoxAnnotator()
    mask_annotator = sv.MaskAnnotator()
    label_annotator = sv.LabelAnnotator()

    # 클래스 레이블 생성
    labels = [
        f"{text_prompt.split('.')[class_id]} {confidence:0.2f}"
        for class_id, confidence in zip(detections.class_id, detections.confidence)
    ]

    annotated_image = None
    if save_img==True:
        # 원본 이미지에 마스크와 BBox 그리기
        annotated_image = mask_annotator.annotate(scene=image_bgr.copy(), detections=detections)
        annotated_image = box_annotator.annotate(scene=annotated_image, detections=detections)
        
        # 결과 이미지 저장
        output_filename = os.path.basename(img_path)
        output_path = os.path.join(output_dir, f"result_{output_filename}")
        cv2.imwrite(output_path, annotated_image)
        
        print(f"결과 이미지가 다음 경로에 저장되었습니다: {output_path}")

    return {
        'boxes': boxes_for_sam,
        'masks': masks,
        'class_ids': detections.class_id,
        'confidences': detections.confidence,
        'labels': labels,
        'annotated_image': annotated_image,
        'detections': detections
    }


In [25]:
result_grounded_sam = grounded_sam(img_path = "/app/images/KakaoTalk_20250520_140652269.jpg", text_prompt = "bus.wheel.window.", BOX_THRESHOLD = 0.35, TEXT_THRESHOLD = 0.25, output_dir = "outputs", save_img=True)

Using device: cuda:0
final text_encoder_type: bert-base-uncased




GroundingDINO found 3 objects.
결과 이미지가 다음 경로에 저장되었습니다: outputs/result_KakaoTalk_20250520_140652269.jpg


In [27]:
len(result_grounded_sam['boxes']), len(result_grounded_sam['masks']), len(result_grounded_sam['class_ids']), len(result_grounded_sam['confidences']), len(result_grounded_sam['labels']), len(result_grounded_sam['annotated_image']), len(result_grounded_sam['detections'])

(3, 3, 3, 3, 3, 3024, 3)

In [30]:
def visualize_individual_detections(detection_results: dict, output_dir: str, original_image_path: str):
    """
    grounded_sam 함수에서 반환된 각 탐지 결과를 개별 이미지로 시각화하여 저장합니다.

    Args:
        detection_results (dict): grounded_sam 함수의 반환 값 (boxes, masks, class_ids, etc.).
        output_dir (str): 개별 결과 이미지를 저장할 디렉토리 경로입니다.
        original_image_path (str): 원본 이미지 파일의 경로입니다.
    """
    os.makedirs(output_dir, exist_ok=True)
    
    # 원본 이미지 로드 (grounded_sam에서 반환된 BGR 이미지를 사용하는 것이 가장 좋습니다)
    image_bgr = detection_results.get('original_image_bgr')
    if image_bgr is None:
        print("Error: 원본 BGR 이미지를 찾을 수 없습니다. 다시 로드합니다.")
        image_bgr = cv2.imread(original_image_path)
        if image_bgr is None:
            print(f"Error: 이미지를 읽을 수 없습니다. 경로를 확인하세요: {original_image_path}")
            return

    detections = detection_results.get('detections_object')
    labels = detection_results.get('labels')

    if detections is None or len(detections) == 0:
        print("탐지된 객체가 없습니다. 개별 이미지를 생성하지 않습니다.")
        return

    box_annotator = sv.BoxAnnotator()
    mask_annotator = sv.MaskAnnotator()

    base_filename = os.path.splitext(os.path.basename(original_image_path))[0]

    for i in range(len(detections)):
        # SuperGradients Detections 객체는 인덱싱을 통해 개별 탐지를 추출할 수 있습니다.
        single_detection = detections[i]
        single_label = [labels[i]] # Label needs to be a list for BoxAnnotator

        # 원본 이미지 복사본에 현재 탐지 결과만 그립니다.
        annotated_single_image = image_bgr.copy()
        annotated_single_image = mask_annotator.annotate(scene=annotated_single_image, detections=single_detection)
        annotated_single_image = box_annotator.annotate(scene=annotated_single_image, detections=single_detection, labels=single_label)

        # 개별 결과 이미지 저장
        output_path = os.path.join(output_dir, f"{base_filename}_detection_{i+1}_{single_label[0].split(' ')[0]}.jpg")
        cv2.imwrite(output_path, annotated_single_image)
        print(f"개별 탐지 결과 이미지가 저장되었습니다: {output_path}")

In [31]:
IMAGE_PATH = "/app/images/KakaoTalk_20250520_140652269.jpg"
OUTPUT_BASE_DIR = "outputs"
INDIVIDUAL_OUTPUT_DIR = os.path.join(OUTPUT_BASE_DIR, "individual_detections")

if result_grounded_sam is not None:
    print("\n--- 모든 탐지 결과 요약 ---")
    print(f"탐지된 객체 수: {len(result_grounded_sam['boxes'])}")
    for i, label in enumerate(result_grounded_sam['labels']):
        print(f"  객체 {i+1}: {label} (Box: {result_grounded_sam['boxes'][i]})")

    # 2. 각 탐지별 이미지 시각화 함수 실행
    visualize_individual_detections(result_grounded_sam, INDIVIDUAL_OUTPUT_DIR, IMAGE_PATH)
else:
    print("이미지 처리 실패 또는 탐지된 객체 없음.")


--- 모든 탐지 결과 요약 ---
탐지된 객체 수: 3
  객체 1: bus 0.94 (Box: [ 185.13367 1005.3669  3975.4155  1982.2109 ])
  객체 2: wheel 0.66 (Box: [ 962.0431 1660.745  1304.6151 1980.7504])
  객체 3: wheel 0.64 (Box: [2937.8667 1666.0944 3274.1938 1982.8295])
Error: 원본 BGR 이미지를 찾을 수 없습니다. 다시 로드합니다.
탐지된 객체가 없습니다. 개별 이미지를 생성하지 않습니다.


In [None]:
import torch
import cv2
import supervision as sv
from groundingdino.util.inference import Model
from segment_anything import sam_model_registry, SamPredictor
import numpy as np
import os

def main():
    # --- 설정 값 ---
    # 입력/출력 경로
    IMAGE_PATH = "images/image1.jpg"  # 분석할 이미지 경로
    OUTPUT_DIR = "outputs"
    
    # 모델 파라미터
    TEXT_PROMPT = "person, dog"  # 탐지할 객체
    BOX_THRESHOLD = 0.35
    TEXT_THRESHOLD = 0.25

    # 모델 가중치 및 설정 경로 (Dockerfile 기준)
    GROUNDING_DINO_CONFIG_PATH = "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py"
    GROUNDING_DINO_CHECKPOINT_PATH = "/app/weights/groundingdino_swint_ogc.pth"
    SAM_CHECKPOINT_PATH = "/app/weights/sam_vit_h_4b8939.pth"
    SAM_ENCODER_VERSION = "vit_h"
    
    # 출력 폴더 생성
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    
    # --- 모델 로딩 ---
    DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {DEVICE}")

    # GroundingDINO 모델 로드
    grounding_dino_model = Model(
        model_config_path=GROUNDING_DINO_CONFIG_PATH, 
        model_checkpoint_path=GROUNDING_DINO_CHECKPOINT_PATH,
        device=str(DEVICE)
    )

    # SAM 모델 로드
    sam = sam_model_registry[SAM_ENCODER_VERSION](checkpoint=SAM_CHECKPOINT_PATH).to(device=DEVICE)
    sam_predictor = SamPredictor(sam)

    # --- 추론 실행 ---
    # 이미지 불러오기
    image_bgr = cv2.imread(IMAGE_PATH)
    if image_bgr is None:
        print(f"Error: 이미지를 읽을 수 없습니다. 경로를 확인하세요: {IMAGE_PATH}")
        return
    image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)

    # 1. Grounding DINO로 Bounding Box 탐지
    detections = grounding_dino_model.predict_with_classes(
        image=image_rgb,
        classes=[c.strip() for c in TEXT_PROMPT.split(',')],
        box_threshold=BOX_THRESHOLD,
        text_threshold=TEXT_THRESHOLD
    )
    print(f"GroundingDINO found {len(detections)} objects.")

    # 2. 탐지된 BBox를 SAM의 입력으로 사용하여 Segmentation Mask 생성
    sam_predictor.set_image(image_rgb)
    boxes_for_sam = detections.xyxy
    
    if len(boxes_for_sam) > 0:
        masks, _, _ = sam_predictor.predict_torch(
            point_coords=None,
            point_labels=None,
            boxes=torch.tensor(boxes_for_sam, device=sam_predictor.device),
            multimask_output=False,
        )
        detections.mask = masks.cpu().numpy().squeeze(1)
    else:
        # 객체가 탐지되지 않으면 마스크를 빈 배열로 설정
        detections.mask = np.empty((0, *image_bgr.shape[:2]), dtype=bool)

    # --- 결과 시각화 및 저장 ---
    # Annotator 생성
    box_annotator = sv.BoxAnnotator()
    mask_annotator = sv.MaskAnnotator()
    label_annotator = sv.LabelAnnotator()

    # 클래스 레이블 생성
    labels = [
        f"{TEXT_PROMPT.split(',')[class_id]} {confidence:0.2f}"
        for class_id, confidence in zip(detections.class_id, detections.confidence)
    ]

    # 원본 이미지에 마스크와 BBox 그리기
    annotated_image = mask_annotator.annotate(scene=image_bgr.copy(), detections=detections)
    annotated_image = box_annotator.annotate(scene=annotated_image, detections=detections, labels=labels)
    
    # 결과 이미지 저장
    output_filename = os.path.basename(IMAGE_PATH)
    output_path = os.path.join(OUTPUT_DIR, f"result_{output_filename}")
    cv2.imwrite(output_path, annotated_image)
    
    print(f"결과 이미지가 다음 경로에 저장되었습니다: {output_path}")

if __name__ == "__main__":
    main()

Using device: cuda:0
final text_encoder_type: bert-base-uncased




GroundingDINO found 3 objects.


TypeError: list indices must be integers or slices, not NoneType

In [17]:
import os
import torch
import cv2
import numpy as np
import supervision as sv
from groundingdino.util.inference import Model
from segment_anything import sam_model_registry, SamPredictor

# Default model paths (modify these if your paths differ)
DEFAULT_GDINO_CONFIG = "/app/groundingdino/config/GroundingDINO_SwinT_OGC.py"
DEFAULT_GDINO_CHECKPOINT = "/app/groundingdino/gdino_checkpoints/groundingdino_swint_ogc.pth"
DEFAULT_SAM_CHECKPOINT = "/app/SAM/sam_vit_h_4b8939.pth"
DEFAULT_SAM_VERSION = "vit_h"


def run_grounded_sam(
    image_path: str,
    text_prompt: str,
    grounding_dino_config_path: str = DEFAULT_GDINO_CONFIG,
    grounding_dino_checkpoint_path: str = DEFAULT_GDINO_CHECKPOINT,
    sam_checkpoint_path: str = DEFAULT_SAM_CHECKPOINT,
    sam_encoder_version: str = DEFAULT_SAM_VERSION,
    box_threshold: float = 0.35,
    text_threshold: float = 0.25,
    device: str = None
) -> dict:
    """
    Run GroundingDINO + SAM segmentation and bounding box detection on an image.

    Args:
        image_path: Path to the input image file.
        text_prompt: Comma-separated class names to detect (e.g., "bus, wheel, window").
        grounding_dino_config_path: Path to GroundingDINO config .py file.
        grounding_dino_checkpoint_path: Path to GroundingDINO .pth checkpoint.
        sam_checkpoint_path: Path to SAM .pth checkpoint.
        sam_encoder_version: SAM encoder version key (e.g., 'vit_h', 'vit_l', 'vit_b').
        box_threshold: Confidence threshold for bounding boxes.
        text_threshold: Text matching threshold in GroundingDINO.
        device: Torch device string ('cuda:0' or 'cpu'). If None, auto-selects.

    Returns:
        dict with the following keys:
            'boxes'           : np.ndarray of shape (N, 4) with [x1, y1, x2, y2]
            'masks'           : np.ndarray of shape (N, H, W) boolean masks
            'class_ids'       : list of int class indices
            'confidences'     : list of float confidences
            'labels'          : list of str labels with confidence
            'annotated_image' : BGR image with drawn boxes & masks
            'detections'      : original Detections object for full access
    """
    # Device selection
    if device is None:
        device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)
    print(f"Using device: {device}")  # GPU or CPU 확인용 출력

    # Load GroundingDINO
    grounding_model = Model(
        model_config_path=grounding_dino_config_path,
        model_checkpoint_path=grounding_dino_checkpoint_path,
        device=str(device)
    )

    # Load SAM
    sam = sam_model_registry[sam_encoder_version](checkpoint=sam_checkpoint_path).to(device=device)
    sam_predictor = SamPredictor(sam)

    # Read image
    image_bgr = cv2.imread(image_path)
    if image_bgr is None:
        raise FileNotFoundError(f"Cannot read image: {image_path}")
    image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)

    # GroundingDINO inference
    classes = [c.strip() for c in text_prompt.split(',')]
    detections = grounding_model.predict_with_classes(
        image=image_rgb,
        classes=classes,
        box_threshold=box_threshold,
        text_threshold=text_threshold
    )

    # SAM segmentation
    sam_predictor.set_image(image_rgb)
    boxes_for_sam = detections.xyxy
    if len(boxes_for_sam) > 0:
        masks, _, _ = sam_predictor.predict_torch(
            point_coords=None,
            point_labels=None,
            boxes=torch.tensor(boxes_for_sam, device=sam_predictor.device),
            multimask_output=False
        )
        masks = masks.cpu().numpy().squeeze(1)
    else:
        masks = np.empty((0, *image_bgr.shape[:2]), dtype=bool)
    detections.mask = masks

    # Visualization
    mask_annotator = sv.MaskAnnotator()
    box_annotator = sv.BoxAnnotator()
    annotated = mask_annotator.annotate(scene=image_bgr.copy(), detections=detections)
    annotated = box_annotator.annotate(scene=annotated, detections=detections)

    # Build labels
    labels = [f"{classes[cid]} {conf:0.2f}" for cid, conf in zip(detections.class_id, detections.confidence)]

    return {
        'boxes': detections.xyxy,
        'masks': masks,
        'class_ids': detections.class_id,
        'confidences': detections.confidence,
        'labels': labels,
        'annotated_image': annotated,
        'detections': detections
    }


In [18]:
#Example usage (in Jupyter or script):
# from grounded_sam_utils import run_grounded_sam

result = run_grounded_sam(
    image_path="/app/images/KakaoTalk_20250520_140652269.jpg",
    text_prompt="bus. wheel. window."
)
cv2.imwrite("app/notebooks/outputs/out.png", result['annotated_image'])

Using device: cuda:0
final text_encoder_type: bert-base-uncased




TypeError: '<' not supported between instances of 'NoneType' and 'int'

In [20]:
import torch
import cv2
import supervision as sv
from groundingdino.util.inference import Model
from segment_anything import sam_model_registry, SamPredictor
import numpy as np
import os

def process_image_with_grounded_sam(image_path: str, text_prompt: str):
    """
    Grounding DINO를 사용하여 객체를 탐지하고 SAM을 사용하여 분할 마스크를 생성합니다.

    Args:
        image_path (str): 입력 이미지 파일의 경로입니다.
        text_prompt (str): 탐지할 객체를 쉼표로 구분한 문자열입니다 (예: "bus, wheel, window").

    Returns:
        tuple: 다음을 포함하는 튜플입니다:
            - boxes (list[list[float]]): 각 탐지된 객체의 바운딩 박스 좌표 목록 (x1, y1, x2, y2).
            - masks (list[np.ndarray]): 각 탐지된 객체의 이진 분할 마스크 목록.
            - labels (list[str]): 각 탐지 결과에 대한 "클래스_이름 신뢰도_점수" 형식의 레이블 목록.
            - class_ids (list[int]): 각 탐지 결과에 대한 클래스 ID 목록.
            - confidences (list[float]): 각 탐지 결과에 대한 신뢰도 점수 목록.
    """
    # --- 설정 값 ---
    # 모델 파라미터
    BOX_THRESHOLD = 0.35
    TEXT_THRESHOLD = 0.25

    # 모델 가중치 및 설정 경로 (Dockerfile 기준)
    GROUNDING_DINO_CONFIG_PATH = "/app/groundingdino/config/GroundingDINO_SwinT_OGC.py"
    GROUNDING_DINO_CHECKPOINT_PATH = "/app/groundingdino/gdino_checkpoints/groundingdino_swint_ogc.pth"
    SAM_CHECKPOINT_PATH = "/app/SAM/sam_vit_h_4b8939.pth"
    SAM_ENCODER_VERSION = "vit_h"
    
    # --- 모델 로딩 ---
    DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print(f"사용 장치: {DEVICE}")

    # GroundingDINO 모델 로드
    grounding_dino_model = Model(
        model_config_path=GROUNDING_DINO_CONFIG_PATH, 
        model_checkpoint_path=GROUNDING_DINO_CHECKPOINT_PATH,
        device=str(DEVICE)
    )

    # SAM 모델 로드
    sam = sam_model_registry[SAM_ENCODER_VERSION](checkpoint=SAM_CHECKPOINT_PATH).to(device=DEVICE)
    sam_predictor = SamPredictor(sam)

    # --- 추론 실행 ---
    # 이미지 불러오기
    image_bgr = cv2.imread(image_path)
    if image_bgr is None:
        print(f"오류: 이미지를 읽을 수 없습니다. 경로를 확인하세요: {image_path}")
        return [], [], [], [], [] # 이미지 로드 실패 시 빈 리스트 반환
    image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)

    # 1. Grounding DINO로 Bounding Box 탐지
    classes_list = [c.strip() for c in text_prompt.split(',')]
    detections = grounding_dino_model.predict_with_classes(
        image=image_rgb,
        classes=classes_list,
        box_threshold=BOX_THRESHOLD,
        text_threshold=TEXT_THRESHOLD
    )
    print(f"GroundingDINO에서 {len(detections)}개의 객체를 찾았습니다.")

    # 2. 탐지된 BBox를 SAM의 입력으로 사용하여 Segmentation Mask 생성
    sam_predictor.set_image(image_rgb)
    boxes_for_sam = detections.xyxy
    
    if len(boxes_for_sam) > 0:
        masks, _, _ = sam_predictor.predict_torch(
            point_coords=None,
            point_labels=None,
            boxes=torch.tensor(boxes_for_sam, device=sam_predictor.device),
            multimask_output=False,
        )
        detections.mask = masks.cpu().numpy().squeeze(1)
    else:
        # 객체가 탐지되지 않으면 마스크를 빈 배열로 설정
        detections.mask = np.empty((0, *image_bgr.shape[:2]), dtype=bool)

    # 반환할 값 준비
    boxes = detections.xyxy.tolist() if detections.xyxy is not None else []
    masks = detections.mask.tolist() if detections.mask is not None else []
    confidences = detections.confidence.tolist() if detections.confidence is not None else []
    class_ids = detections.class_id.tolist() if detections.class_id is not None else []

    # 출력용 레이블 생성
    labels = []
    for class_id, confidence in zip(class_ids, confidences):
        if class_id < len(classes_list): # class_id가 classes_list 범위 내에 있는지 확인
            labels.append(f"{classes_list[class_id]} {confidence:0.2f}")
        else:
            labels.append(f"UNKNOWN {confidence:0.2f}") # 범위를 벗어난 class_id에 대한 대체

    # --- 결과 시각화 및 저장 (선택 사항 - 디버깅/확인용) ---
    # 반환된 데이터만 필요하다면 이 섹션을 주석 처리하거나 제거할 수 있습니다.
    OUTPUT_DIR = "outputs"
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    box_annotator = sv.BoxAnnotator()
    mask_annotator = sv.MaskAnnotator()
    
    annotated_image = mask_annotator.annotate(scene=image_bgr.copy(), detections=detections)
    annotated_image = box_annotator.annotate(scene=annotated_image, detections=detections)
    
    output_filename = os.path.basename(image_path)
    output_path = os.path.join(OUTPUT_DIR, f"result_{output_filename}")
    cv2.imwrite(output_path, annotated_image)
    print(f"결과 이미지가 다음 경로에 저장되었습니다: {output_path}")

    return boxes, masks, labels, class_ids, confidences

if __name__ == "__main__":
    # 사용 예시:
    IMAGE_PATH = "/app/images/KakaoTalk_20250520_140652269.jpg"
    TEXT_PROMPT = "bus. wheel. window."

    detected_boxes, detected_masks, detected_labels, detected_class_ids, detected_confidences = \
        process_image_with_grounded_sam(IMAGE_PATH, TEXT_PROMPT)

    print("\n--- 탐지 결과 ---")
    print(f"{len(detected_boxes)}개의 객체를 탐지했습니다.")
    for i, (box, mask, label, class_id, confidence) in enumerate(zip(detected_boxes, detected_masks, detected_labels, detected_class_ids, detected_confidences)):
        print(f"객체 {i+1}:")
        print(f"  레이블: {label}")
        print(f"  바운딩 박스: {box}")
        print(f"  마스크 형태: {mask.shape if isinstance(mask, np.ndarray) else 'N/A'}")
        print(f"  클래스 ID: {class_id}")
        print(f"  신뢰도: {confidence:.2f}")

사용 장치: cuda:0
final text_encoder_type: bert-base-uncased




GroundingDINO에서 3개의 객체를 찾았습니다.


TypeError: '<' not supported between instances of 'NoneType' and 'int'