In [27]:
import supervision as sv
import numpy as np
from pathlib import Path
import cv2

In [28]:

mask_annotator = sv.MaskAnnotator(
    color=sv.ColorPalette.from_hex(['#6DE1D2', '#FFD63A', '#FFA955', '#F75A5A']),
    color_lookup= sv.ColorLookup.INDEX,
    opacity=0.5
)


In [29]:
SOURCE_VIDEO = Path("./videos/psm_live.mp4")
SOURCE_FRAMES = Path("./videos/psm_live_frames")
SCALE_FACTOR = 1
TARGET_VIDEO = Path("./videos/psm_live_pred.mp4")

# 入力動画の情報をコピー
video_info = sv.VideoInfo.from_video_path(SOURCE_VIDEO)
video_info.width = int(video_info.width * SCALE_FACTOR)
video_info.height = int(video_info.height * SCALE_FACTOR)

# 順番に処理し、1枚ずつ書き込む
frame_paths = sorted(sv.list_files_with_extensions(SOURCE_FRAMES.as_posix(), extensions=["jpeg"]))

In [4]:
frames_generator = sv.get_video_frames_generator(SOURCE_VIDEO)
images_sink = sv.ImageSink(
    target_dir_path=SOURCE_FRAMES.as_posix(),
    overwrite=True,
    image_name_pattern="{:05d}.jpeg"
)

with images_sink:
    for frame in frames_generator:
        frame = sv.scale_image(frame, SCALE_FACTOR)
        images_sink.save_image(frame)

In [30]:
from sam2.build_sam import build_sam2_video_predictor

sam2_checkpoint = "../checkpoints/sam2.1_hiera_base_plus.pt"
model_cfg = "configs/sam2.1/sam2.1_hiera_b+.yaml"

predictor = build_sam2_video_predictor(model_cfg, sam2_checkpoint, device="cuda")

In [31]:
def ann_frames(prompts,frame_idx, obj_id, point, label):
    ann_frame_idx = frame_idx
    ann_obj_id = obj_id
    points = np.array([point], dtype=np.float32)
    labels = np.array([label], np.int32)
    prompts[ann_obj_id] = points, labels
    
    _, out_obj_ids, out_mask_logits = predictor.add_new_points_or_box(
    inference_state=inference_state,
    frame_idx=ann_frame_idx,
    obj_id=ann_obj_id,
    points=points,
    labels=labels,
    )
    return out_obj_ids, out_mask_logits

In [8]:
inference_state = predictor.init_state(video_path=SOURCE_FRAMES.as_posix())


predictor.reset_state(inference_state)
prompts = {}  # hold all the clicks we add for visualization

ann_frames(prompts, 16, 0, [93, 128], 1)
ann_frames(prompts, 16, 1, [115, 135], 1)
ann_frames(prompts, 16, 2, [143, 166], 1)
ann_frames(prompts, 16, 3, [164, 147], 1)


#追加
ann_frames(prompts, 20, 1, [145, 107], 1)

ann_frames(prompts, 27, 0, [84, 93], 1)
ann_frames(prompts, 28, 1, [116, 83], 1)



frame loading (JPEG): 100%|██████████| 50/50 [00:01<00:00, 40.30it/s]
Falling back to all available kernels for scaled_dot_product_attention (which may have a slower speed).

Skipping the post-processing step due to the error above. You can still use SAM 2 and it's OK to ignore the error above, although some post-processing functionality may be limited (which doesn't affect the results in most cases; see https://github.com/facebookresearch/sam2/blob/main/INSTALL.md).


([0, 1, 2, 3],
 tensor([[[[-1024.0000, -1024.0000, -1024.0000,  ..., -1024.0000,
            -1024.0000, -1024.0000],
           [-1024.0000, -1024.0000, -1024.0000,  ..., -1024.0000,
            -1024.0000, -1024.0000],
           [-1024.0000, -1024.0000, -1024.0000,  ..., -1024.0000,
            -1024.0000, -1024.0000],
           ...,
           [-1024.0000, -1024.0000, -1024.0000,  ..., -1024.0000,
            -1024.0000, -1024.0000],
           [-1024.0000, -1024.0000, -1024.0000,  ..., -1024.0000,
            -1024.0000, -1024.0000],
           [-1024.0000, -1024.0000, -1024.0000,  ..., -1024.0000,
            -1024.0000, -1024.0000]]],
 
 
         [[[   -8.8402,    -9.2123,    -9.4428,  ...,   -10.5031,
              -10.7372,   -11.0384],
           [   -8.3965,   -10.3503,   -10.0079,  ...,   -11.8005,
              -11.2536,   -11.6617],
           [   -9.8099,   -10.0752,    -9.4636,  ...,   -11.0955,
              -11.1119,   -11.0354],
           ...,
           [   -6.00

In [None]:



with sv.VideoSink(TARGET_VIDEO.as_posix(), video_info=video_info) as sink:
    for frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(inference_state):
        frame_path = frame_paths[frame_idx]
        frame = cv2.imread(frame_path)
        masks = (out_mask_logits[:, 0, :, :] > 0.0).cpu().numpy()

        detections = sv.Detections(
            xyxy=sv.mask_to_xyxy(masks=masks),
            mask=masks.astype(bool)
        )
        annotated_frame = mask_annotator.annotate(scene=frame.copy(), detections=detections)
        sink.write_frame(annotated_frame)

propagate in video: 100%|██████████| 34/34 [01:50<00:00,  3.26s/it]


In [11]:
f_frames = []
b_frames = []
# F (→)
for frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(inference_state):
    frame_path = frame_paths[frame_idx]
    frame = cv2.imread(frame_path)
    masks = (out_mask_logits[:, 0, :, :] > 0.0).cpu().numpy()

    detections = sv.Detections(
        xyxy=sv.mask_to_xyxy(masks=masks),
        mask=masks.astype(bool)
    )

    annotated_frame = mask_annotator.annotate(scene=frame.copy(), detections=detections)
    f_frames.append(annotated_frame)


# B (←)
for frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(inference_state, reverse=True):
    frame_path = frame_paths[frame_idx]
    frame = cv2.imread(frame_path)
    masks = (out_mask_logits[:, 0, :, :] > 0.0).cpu().numpy()

    detections = sv.Detections(
        xyxy=sv.mask_to_xyxy(masks=masks),
        mask=masks.astype(bool)
    )

    annotated_frame = mask_annotator.annotate(scene=frame.copy(), detections=detections)
    b_frames.append(annotated_frame)

# 全体動画は、スタートのフレームが重複するのと、B (←) が逆再生になるので直す
frames = b_frames[::-1] + f_frames[1:]

# 保存する
with sv.VideoSink(TARGET_VIDEO.as_posix(), video_info=video_info) as sink:
    for f in frames:
        sink.write_frame(f)

propagate in video: 100%|██████████| 34/34 [00:28<00:00,  1.19it/s]
propagate in video: 100%|██████████| 17/17 [00:15<00:00,  1.07it/s]


### annotationのやり直し

In [32]:
import pandas as pd

In [41]:
TARGET_CSV = Path("./psm_live_Results.csv")
ann_csv = pd.read_csv(TARGET_CSV)
ann_csv["Frame"] = ann_csv["Label"].str.split(":").str[2]
ann_csv["Target"] = [i % 4 for i in range(len(ann_csv))]
ann_csv



Unnamed: 0,Unnamed: 1,Label,Area,Mean,StdDev,X,Y,XM,YM,Perim.,...,Slice,FeretX,FeretY,FeretAngle,MinFeret,AR,Round,Solidity,Frame,Target
0,1,Composite-1.tif:0014-0129-0089:14,0.0,28.0,0.0,5.743,8.351,88.97,129.40,0.0,...,14.0,88.97,129.40,0.0,0.0,0.0,0.0,,14,0
1,2,Composite-1.tif:0014-0129-0119:14,0.0,72.0,0.0,7.696,8.327,119.20,129.00,0.0,...,14.0,119.20,129.00,0.0,0.0,0.0,0.0,,14,1
2,3,Composite-1.tif:0014-0167-0151:14,0.0,15.0,0.0,9.732,10.790,150.80,167.20,0.0,...,14.0,150.80,167.20,0.0,0.0,0.0,0.0,,14,2
3,4,Composite-1.tif:0014-0161-0169:14,0.0,42.0,0.0,10.890,10.410,168.70,161.30,0.0,...,14.0,168.70,161.30,0.0,0.0,0.0,0.0,,14,3
4,5,Composite-1.tif:0015-0127-0090:15,0.0,55.0,0.0,5.803,8.220,89.89,127.30,0.0,...,15.0,89.89,127.30,0.0,0.0,0.0,0.0,,15,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,92,Composite-1.tif:0036-0084-0115:36,0.0,13.0,0.0,7.446,5.410,115.30,83.80,0.0,...,36.0,115.30,83.80,0.0,0.0,0.0,0.0,,36,3
92,93,Composite-1.tif:0037-0079-0079:37,0.0,9.0,0.0,5.076,5.112,78.64,79.19,0.0,...,37.0,78.64,79.19,0.0,0.0,0.0,0.0,,37,0
93,94,Composite-1.tif:0037-0107-0091:37,0.0,9.0,0.0,5.850,6.898,90.63,106.90,0.0,...,37.0,90.63,106.90,0.0,0.0,0.0,0.0,,37,1
94,95,Composite-1.tif:0037-0111-0058:37,0.0,8.0,0.0,3.766,7.136,58.35,110.60,0.0,...,37.0,58.35,110.60,0.0,0.0,0.0,0.0,,37,2


In [42]:
ann_csv["Frame"][0]

'14'

In [44]:
inference_state = predictor.init_state(video_path=SOURCE_FRAMES.as_posix())


predictor.reset_state(inference_state)
prompts = {}  # hold all the clicks we add for visualization

for i in range(len(ann_csv)):
    frame_idx = int(ann_csv["Frame"][i]) - 1
    object_idx = int(ann_csv["Target"][i])
    point = [int(ann_csv["FeretX"][i]), int(ann_csv["FeretY"][i])]
    label = 1
    ann_frames(prompts, frame_idx, object_idx, point, label)

frame loading (JPEG): 100%|██████████| 50/50 [00:01<00:00, 46.48it/s]
Falling back to all available kernels for scaled_dot_product_attention (which may have a slower speed).

Skipping the post-processing step due to the error above. You can still use SAM 2 and it's OK to ignore the error above, although some post-processing functionality may be limited (which doesn't affect the results in most cases; see https://github.com/facebookresearch/sam2/blob/main/INSTALL.md).


In [45]:
f_frames = []
b_frames = []
# F (→)
for frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(inference_state):
    frame_path = frame_paths[frame_idx]
    frame = cv2.imread(frame_path)
    masks = (out_mask_logits[:, 0, :, :] > 0.0).cpu().numpy()

    detections = sv.Detections(
        xyxy=sv.mask_to_xyxy(masks=masks),
        mask=masks.astype(bool)
    )

    annotated_frame = mask_annotator.annotate(scene=frame.copy(), detections=detections)
    f_frames.append(annotated_frame)


# B (←)
for frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(inference_state, reverse=True):
    frame_path = frame_paths[frame_idx]
    frame = cv2.imread(frame_path)
    masks = (out_mask_logits[:, 0, :, :] > 0.0).cpu().numpy()

    detections = sv.Detections(
        xyxy=sv.mask_to_xyxy(masks=masks),
        mask=masks.astype(bool)
    )

    annotated_frame = mask_annotator.annotate(scene=frame.copy(), detections=detections)
    b_frames.append(annotated_frame)

# 全体動画は、スタートのフレームが重複するのと、B (←) が逆再生になるので直す
frames = b_frames[::-1] + f_frames[1:]

# 保存する
with sv.VideoSink(TARGET_VIDEO.as_posix(), video_info=video_info) as sink:
    for f in frames:
        sink.write_frame(f)

propagate in video: 100%|██████████| 37/37 [07:38<00:00, 12.39s/it]
propagate in video: 100%|██████████| 14/14 [09:33<00:00, 41.00s/it]


In [46]:
def visualize_colored_masks(masks):
    h, w = masks.shape[1:]
    canvas = np.zeros((h, w, 3), dtype=np.uint8)
    for m in masks:
        color = np.random.randint(0, 256, size=3).tolist()
        canvas[m.astype(bool)] = color
    return canvas

TARGET_VIDEO = Path("./videos/psm_live_pred_maskonly.mp4")

f_frames = []
b_frames = []

# Forward direction
for frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(inference_state):
    masks = (out_mask_logits[:, 0, :, :] > 0.0).cpu().numpy()
    mask_only_frame = visualize_colored_masks(masks)
    f_frames.append(mask_only_frame)

# Backward direction
for frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(inference_state, reverse=True):
    masks = (out_mask_logits[:, 0, :, :] > 0.0).cpu().numpy()
    mask_only_frame = visualize_colored_masks(masks)
    b_frames.append(mask_only_frame)

# Combine forward and reverse (fix overlap)
frames = b_frames[::-1] + f_frames[1:]

# Save to video
with sv.VideoSink(TARGET_VIDEO.as_posix(), video_info=video_info) as sink:
    for f in frames:
        sink.write_frame(f)


propagate in video: 100%|██████████| 37/37 [13:17<00:00, 21.55s/it] 
propagate in video: 100%|██████████| 14/14 [13:27<00:00, 57.65s/it]
