In [77]:
import torchvision.transforms as transforms
import cv2
import numpy as np
import numpy
import torch
import torchvision
import argparse
import time
from PIL import Image

In [78]:
# Colour palette for the segmentation classes.
# Entry i is the (R, G, B) colour painted over pixels predicted as class index i.

label_color_map = [
    (0, 0, 0),        # 0: background
    (128, 0, 0),      # 1: aeroplane
    (0, 128, 0),      # 2: bicycle
    (128, 128, 0),    # 3: bird
    (0, 0, 128),      # 4: boat
    (128, 0, 128),    # 5: bottle
    (0, 128, 128),    # 6: bus
    (128, 128, 128),  # 7: car
    (64, 0, 0),       # 8: cat
    (192, 0, 0),      # 9: chair
    (64, 128, 0),     # 10: cow
    (192, 128, 0),    # 11: dining table
    (64, 0, 128),     # 12: dog
    (192, 0, 128),    # 13: horse
    (64, 128, 128),   # 14: motorbike
    (192, 128, 128),  # 15: person
    (0, 64, 0),       # 16: potted plant
    (128, 64, 0),     # 17: sheep
    (0, 192, 0),      # 18: sofa
    (128, 192, 0),    # 19: train
    (0, 64, 128),     # 20: tv/monitor
]

In [79]:
# Define the torchvision image transforms.
# The pretrained torchvision weights expect RGB input scaled to [0, 1] and then
# normalized with the ImageNet per-channel mean/std, so the same statistics are
# applied here. (The last std value is 0.225; the previous 0.255 was a typo.)

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

In [80]:
def get_segment_label(image, model, device):
    """Run ``model`` on a single image and return its raw output.

    The image is passed through the module-level ``transform``, moved to
    ``device`` and given a leading batch dimension before the forward pass.

    Args:
        image: Input accepted by the module-level ``transform``.
        model: Callable invoked with the prepared single-image batch.
        device: Target handed to ``.to()`` on the transformed image.

    Returns:
        Whatever ``model`` returns for the batch.
    """
    batch = transform(image).to(device).unsqueeze(0)
    return model(batch)

In [81]:
# Turn the model output into a colour mask.
def draw_segmentation_map(outputs, color_map=None):
    """Convert per-class scores into an RGB colour mask.

    Args:
        outputs: Tensor of class scores. After ``squeeze()`` the class axis
            must be axis 0 (e.g. a single-image batch of shape
            ``(1, classes, H, W)``).
        color_map: Optional sequence of ``(R, G, B)`` tuples indexed by class
            id. Defaults to the module-level ``label_color_map``.

    Returns:
        ``uint8`` NumPy array of shape ``(H, W, 3)`` holding the colour of the
        highest-scoring class at every pixel. Pixels whose class id has no
        entry in the palette are left black.
    """
    if color_map is None:
        color_map = label_color_map

    # Build the palette once instead of re-creating it for every class/channel.
    palette = np.asarray(color_map, dtype=np.uint8).reshape(-1, 3)

    labels = torch.argmax(outputs.squeeze(), dim=0).detach().cpu().numpy()

    # Single vectorized lookup; class ids beyond the palette keep the zero
    # (black) initial value, matching the behaviour of the per-class loop.
    segmented_image = np.zeros(labels.shape + (3,), dtype=np.uint8)
    known = labels < len(palette)
    segmented_image[known] = palette[labels[known]]
    return segmented_image

In [82]:
# Blend the segmentation colour mask on top of the original image.
def image_overlay(image, segmented_image, alpha=0.6):
    """Alpha-blend a colour mask over an image and return the result in BGR.

    Args:
        image: Original image in RGB channel order (anything ``np.array`` can
            convert, e.g. a PIL image or an ndarray). It is not modified.
        segmented_image: RGB colour mask to blend over ``image``.
        alpha: Weight given to the mask; the original image receives
            ``1 - alpha``. Defaults to 0.6.

    Returns:
        The blended image as a new array in BGR channel order, ready for
        ``cv2.imshow`` / ``cv2.VideoWriter``.
    """
    beta = 1 - alpha   # weight of the original image, so alpha + beta == 1
    gamma = 0          # scalar added to every blended pixel

    # Both inputs are RGB; OpenCV display/writing expects BGR.
    image_bgr = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
    mask_bgr = cv2.cvtColor(segmented_image, cv2.COLOR_RGB2BGR)
    return cv2.addWeighted(mask_bgr, alpha, image_bgr, beta, gamma)

In [90]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the model with its pretrained weights.
model = torchvision.models.segmentation.deeplabv3_mobilenet_v3_large(pretrained=True)

# Switch the model to eval() mode and move it to the selected device.
model.eval().to(device)

DeepLabV3(
  (backbone): IntermediateLayerGetter(
    (0): ConvBNActivation(
      (0): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(16, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
      (2): Hardswish()
    )
    (1): InvertedResidual(
      (block): Sequential(
        (0): ConvBNActivation(
          (0): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=16, bias=False)
          (1): BatchNorm2d(16, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
          (2): ReLU(inplace=True)
        )
        (1): ConvBNActivation(
          (0): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(16, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
          (2): Identity()
        )
      )
    )
    (2): InvertedResidual(
      (block): Sequential(
        (0): ConvBNActivation(
          (0): Conv2d(16, 64, kernel_size=(1, 

In [91]:
# Open vtest.avi with a VideoCapture object and report when it cannot be opened.
cap = cv2.VideoCapture("vtest.avi")
if not cap.isOpened():
    print('Video open failed!!')

In [92]:
# Frame width and height as reported by the capture
# (CAP_PROP_FRAME_WIDTH / CAP_PROP_FRAME_HEIGHT are property ids 3 and 4).
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

frame_count = 0  # number of frames processed so far
total_fps = 0    # running sum of the per-frame FPS values

In [93]:
# Apply deep-learning image segmentation to each video frame.

# The writer used below was never created in any cell of this notebook (it only
# existed as leftover kernel state), so build it here to keep the notebook
# runnable from a fresh kernel. Fall back to 30 FPS when the source reports 0.
output_fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
out = cv2.VideoWriter(
    "vtest_segmented.mp4",
    cv2.VideoWriter_fourcc(*"mp4v"),
    output_fps,
    (frame_width, frame_height),
)

try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        start_time = time.time()

        # OpenCV decodes frames as BGR, but the normalization in `transform`
        # and the RGB2BGR conversion in `image_overlay` both expect RGB input.
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        with torch.no_grad():
            outputs = get_segment_label(rgb_frame, model, device)

        segmented_image = draw_segmentation_map(outputs['out'])

        # image_overlay returns a BGR image, which imshow/VideoWriter expect.
        final_image = image_overlay(rgb_frame, segmented_image)

        end_time = time.time()
        fps = 1 / (end_time - start_time)
        total_fps += fps
        frame_count += 1

        # Show the per-frame FPS in the top-left corner of the video.
        cv2.putText(final_image, f"{fps:.3f} FPS", (20,35), cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255,0), 2)

        cv2.imshow('detect', final_image)
        out.write(final_image)

        # Press 'q' to stop early.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always release the capture and writer (flushing the output file) and
    # close the preview window, even if processing is interrupted.
    cap.release()
    out.release()
    cv2.destroyAllWindows()

# Guard against division by zero when the video could not be read at all.
if frame_count > 0:
    avg_fps = total_fps / frame_count
    print(f"Average FPS: {avg_fps:.3f}")
else:
    print("No frames were processed.")

Average FPS: 7.892
