In [1]:
import cv2
import numpy as np
import torch

from PIL import Image
from models.gdino import GDINO
from models.llama import Llama
from visualizer import Visualizer

from sam2.build_sam import build_sam2_object_tracker
# Half-precision dtype constant (the tracking loop further down passes
# torch.bfloat16 to autocast directly rather than using this value).
torch_dtype = torch.float16

In [2]:
# --- SAM2 tracker configuration ---------------------------------------------
# Device string handed to the tracker builder.
DEVICE = "cuda:0"

# Passed to the tracker builder as `num_objects`.
NUM_OBJECTS = 1

# Checkpoint and model config used to build the tracker.
SAM_CHECKPOINT_FILEPATH = "./checkpoints/sam2.1_hiera_tiny.pt"
SAM_CONFIG_FILEPATH = "./configs/samurai/sam2.1_hiera_t.yaml"
# Alternative config:
# SAM_CONFIG_FILEPATH = "./configs/sam2.1/sam2.1_hiera_t.yaml"

parameter를 이용해 Hugging Face에서 모델을 불러올 수 있습니다.

In [3]:
# Build the SAM2 object tracker from the configured checkpoint / config file.
sam = build_sam2_object_tracker(
    num_objects=NUM_OBJECTS,
    config_file=SAM_CONFIG_FILEPATH,
    ckpt_path=SAM_CHECKPOINT_FILEPATH,
    device=DEVICE,
    verbose=False,
)

# Detector wrapper: get_bbox() calls gdino.predict() to obtain boxes.
gdino = GDINO()
gdino.build_model()

# Language-model wrapper: llama.get_response() turns the user's query into
# the text prompt given to the detector.
llama = Llama()
llama.build_model()

In [4]:
# Open Video Stream (capture device index 0)
# video_stream = cv2.VideoCapture(VIDEO_STREAM)
video_stream = cv2.VideoCapture(0)

# Fail fast: without this check a capture that could not be opened reports a
# 0x0 frame size and the tracking loop later exits without doing anything.
if not video_stream.isOpened():
    raise RuntimeError("Could not open video stream (capture index 0).")

# Frame dimensions as reported by the capture.
video_height = int(video_stream.get(cv2.CAP_PROP_FRAME_HEIGHT))
video_width = int(video_stream.get(cv2.CAP_PROP_FRAME_WIDTH))

# For real-time visualization
visualizer = Visualizer(video_width=video_width, video_height=video_height)

In [5]:
def get_bbox(images_pil, texts_prompt, box_threshold, text_threshold):
    """Run the detector and return the predicted boxes for each result.

    Args:
        images_pil: Images forwarded unchanged to ``gdino.predict``
            (presumably a list of PIL images -- confirm against GDINO).
        texts_prompt: A single text prompt; it is wrapped in a one-element
            list before being passed to ``gdino.predict``.
        box_threshold: Forwarded unchanged to ``gdino.predict``.
        text_threshold: Forwarded unchanged to ``gdino.predict``.

    Returns:
        A list with one entry per result returned by ``gdino.predict``, in
        the same order. Each entry is that result's ``"boxes"`` value,
        converted with ``.cpu().numpy()`` when it exposes a ``numpy``
        attribute and returned as-is otherwise.

    Raises:
        KeyError: If a result has no ``"boxes"`` key.
    """
    gdino_results = gdino.predict(images_pil, [texts_prompt], box_threshold, text_threshold)

    sam_boxes = []
    for result in gdino_results:
        boxes = result["boxes"]
        # Only the boxes are returned, so only they are converted.
        if hasattr(boxes, "numpy"):
            boxes = boxes.cpu().numpy()
        sam_boxes.append(boxes)

    return sam_boxes

아래는 query의 예시입니다.   
query = "I am dehydrated"   
query = "I am thirsty"   
query = "I want to read"   
query = "I am bored"   
query = "I need a tool for writing"   
query = "I have to write something down"   
query = "I have to call him"   

In [6]:
# Detection thresholds passed to get_bbox() for the initial detection.
BOX_THRESHOLD = 0.3
TEXT_THRESHOLD = 0.25

# Ask the user what to look for; the language model's response is used as the
# detector's text prompt.
text_prompt = llama.get_response(input("What are you looking for? "))

# torch.autocast expects a device *type* such as "cuda", not a device string
# with an index such as "cuda:0", so derive the type from DEVICE.
autocast_device_type = torch.device(DEVICE).type

# True until a box has been detected and handed to the tracker.
first_frame = True
with torch.inference_mode(), torch.autocast(autocast_device_type, dtype=torch.bfloat16):
    while video_stream.isOpened():
        ret, frame = video_stream.read()
        if not ret:
            break

        # OpenCV delivers BGR; the detector / tracker are given RGB.
        img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        if first_frame:
            image = Image.fromarray(img)
            boxes = get_bbox([image], text_prompt, BOX_THRESHOLD, TEXT_THRESHOLD)[0]
            if len(boxes) == 0:
                # Nothing detected in this frame: indexing the first box would
                # raise IndexError, so retry detection on the next frame.
                continue

            # Use the first detected box (presumably [x0, y0, x1, y1] -- confirm
            # against GDINO) as a 2x2 array of corner points.
            xyxy = boxes[0]
            bbox = np.array([[xyxy[0], xyxy[1]], [xyxy[2], xyxy[3]]], dtype=np.float32)
            sam_out = sam.track_new_object(img=img, box=bbox)

            first_frame = False

        else:
            sam_out = sam.track_all_objects(img=img)

        # Draw the mask on the same frame it was computed from. Reading a
        # second frame here would misalign the mask by one frame, drop every
        # other frame from tracking, and could pass frame=None on a failed read.
        visualizer.add_frame(frame=frame, mask=sam_out['pred_masks'])

What are you looking for? bottle


  mask = torch.tensor(mask, device='cpu')


KeyboardInterrupt: 

In [7]:
# Release the capture opened as `video_stream`.
video_stream.release()
# Close any windows that were created through OpenCV.
cv2.destroyAllWindows()