In [None]:
import os
import urllib

import matplotlib
import torch
from mmseg.apis import inference_segmentor
from PIL import Image
from torchvision import transforms

from depth import getDepthHead
from segmentation import getSegmentationHead, getSegmentationModel

In [None]:
# Load the sample frame used by the rest of the notebook and show it inline.
# NOTE(review): this is an absolute, machine-specific path — consider a configurable data directory.
test_image_path = r"S:/CS7643Project/audiosetdl/dataset/image/3-9.#alligators, crocodiles hissing#.train.Alligators Blackwater & Thrasher!  VLOGMAS Day 15.jpg"
test_image = Image.open(test_image_path)
display(test_image)

In [None]:
def make_depth_transform() -> transforms.Compose:
    """Build the preprocessing pipeline applied to an image before depth inference.

    Steps, in order: convert to a tensor, keep the first three channels
    multiplied by 255, normalize each channel with the mean/std listed below,
    and resize to 240x320.
    """
    def keep_rgb_scaled(tensor):
        # Discard alpha component and scale by 255
        return 255.0 * tensor[:3]

    channel_mean = (123.675, 116.28, 103.53)
    channel_std = (58.395, 57.12, 57.375)
    steps = [
        transforms.ToTensor(),
        keep_rgb_scaled,
        transforms.Normalize(mean=channel_mean, std=channel_std),
        transforms.Resize((240, 320)),
    ]
    return transforms.Compose(steps)


def render_depth(values, colormap_name="magma_r") -> Image.Image:
    """Render a 2-D map of depth values as an RGB image using a matplotlib colormap.

    Args:
        values: 2-D (h x w) array-like of depth values; it must provide
            ``min()``/``max()`` and elementwise arithmetic.
        colormap_name: Name looked up in ``matplotlib.colormaps``.

    Returns:
        A PIL image built from the colormap output with the alpha channel
        removed.
    """
    min_value, max_value = values.min(), values.max()
    value_range = max_value - min_value
    if value_range > 0:
        normalized_values = (values - min_value) / value_range
    else:
        # Constant input: dividing by a zero range would yield NaN for every
        # pixel, so map the whole image to the low end of the colormap instead.
        normalized_values = values - min_value

    colormap = matplotlib.colormaps[colormap_name]
    colors = colormap(normalized_values, bytes=True) # h x w x 4 (RGBA bytes)
    colors = colors[:, :, :3] # Discard alpha component
    return Image.fromarray(colors)


# Run depth estimation on the test image and display the colour-mapped result.

# Use the GPU when one is available and fall back to the CPU otherwise,
# instead of failing on an unconditional `.cuda()` call.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

transform = make_depth_transform()

scale_factor = 1
rescaled_image = test_image.resize((scale_factor * test_image.width, scale_factor * test_image.height))
transformed_image = transform(rescaled_image)
batch = transformed_image.unsqueeze(0).to(device) # Make a batch of one image

model = getDepthHead().to(device)

# No gradients are needed for a single forward pass.
with torch.inference_mode():
    result = model.whole_inference(batch, img_meta=None, rescale=True)

# Drop singleton dimensions and move the result to the CPU before rendering.
depth_image = render_depth(result.squeeze().cpu())
display(depth_image)

In [None]:
import numpy as np

import model.dinov2.eval.segmentation.utils.colormaps as colormaps


# Colour palettes available for rendering, keyed by dataset name.
DATASET_COLORMAPS = dict(
    ade20k=colormaps.ADE20K_COLORMAP,
    voc2012=colormaps.VOC2012_COLORMAP,
)

# Key into DATASET_COLORMAPS selecting the palette used for rendering.
HEAD_DATASET = "voc2012"

def make_segmentation_transform() -> transforms.Compose:
    """Build the preprocessing pipeline for segmentation: a resize to 240x320 only.

    NOTE(review): no tensor conversion or normalization is applied here —
    presumably the downstream inference call handles that itself; confirm.
    """
    resize_step = transforms.Resize((240, 320))
    return transforms.Compose([resize_step])

def render_segmentation(segmentation_logits, dataset):
    """Colour a per-pixel class-index map with the palette registered for ``dataset``.

    Args:
        segmentation_logits: Integer array-like of per-pixel class indices
            (despite the name, the values are used directly as indices).
        dataset: Key into ``DATASET_COLORMAPS`` selecting the palette.

    Returns:
        A PIL image in which each pixel holds the palette row of its class.

    Raises:
        KeyError: If ``dataset`` is not a key of ``DATASET_COLORMAPS``.

    The input array is left untouched; clamping is done on a fresh index array.
    """
    colormap = DATASET_COLORMAPS[dataset]
    colormap_array = np.array(colormap, dtype=np.uint8)
    # Class i is drawn with palette row i + 1. Indices that would run past the
    # end of the palette are clamped to its last row.
    last_row = colormap_array.shape[0] - 1
    palette_indices = np.minimum(np.asarray(segmentation_logits) + 1, last_row)
    segmentation_values = colormap_array[palette_indices]
    return Image.fromarray(segmentation_values)

# Segment the resized test image and show the colour-coded class map.
transform = make_segmentation_transform()
transformed_image = transform(test_image)

seg_model = getSegmentationModel()

resized_pixels = np.array(transformed_image)
array = resized_pixels[:, :, ::-1]  # reverse the channel axis (BGR)
segmentation_results = inference_segmentor(seg_model, array)
segmentation_logits = segmentation_results[0]  # first entry of the returned results
segmented_image = render_segmentation(segmentation_logits, HEAD_DATASET)
display(segmented_image)

In [None]:
import numpy as np

In [None]:
def tensor_transform() -> transforms.Compose:
    """Return a pipeline whose only step is ``transforms.ToTensor()``."""
    to_tensor_only = [transforms.ToTensor()]
    return transforms.Compose(to_tensor_only)
tensor_transformer = tensor_transform()

# Convert both rendered images to arrays, then move axis 0 to the end
# (axes reordered as 1, 2, 0) so the channel axis comes last for OpenCV.
segmented_channel_first = np.array(tensor_transformer(segmented_image))
depth_channel_first = np.array(tensor_transformer(depth_image))
transformed_segmented_image = np.transpose(segmented_channel_first, (1, 2, 0))
transformed_depth_image = np.transpose(depth_channel_first, (1, 2, 0))

In [None]:
# Inspect the array layout after the (1, 2, 0) transpose; the last axis should be the channel axis.
transformed_segmented_image.shape

In [None]:
# Inspect the array layout after the (1, 2, 0) transpose; the last axis should be the channel axis.
transformed_depth_image.shape

In [None]:
import cv2

In [None]:
# Collapse the colour-coded segmentation to a single channel and preview it
# in an OpenCV window until a key is pressed.
# NOTE(review): the array comes from a PIL image, so its channel order is
# presumably RGB while the conversion code is BGR2GRAY — confirm this is intended.
gray_segmented_image = cv2.cvtColor(transformed_segmented_image, cv2.COLOR_BGR2GRAY)
window_title = "image"
cv2.imshow(window_title, gray_segmented_image)
cv2.waitKey(0)
cv2.destroyAllWindows()

In [None]:
# Collapse the colour-mapped depth rendering to a single channel and preview
# it in an OpenCV window until a key is pressed.
# NOTE(review): the array comes from a PIL image, so its channel order is
# presumably RGB while the conversion code is BGR2GRAY — confirm this is intended.
gray_depth_image = cv2.cvtColor(transformed_depth_image, cv2.COLOR_BGR2GRAY)
window_title = "image"
cv2.imshow(window_title, gray_depth_image)
cv2.waitKey(0)
cv2.destroyAllWindows()

In [None]:
# Quantize the gray levels to two decimals so pixels can be grouped by level,
# then preview the result.
rounded_segmented_image = np.round(gray_segmented_image, 2)
window_title = "image"
cv2.imshow(window_title, rounded_segmented_image)
cv2.waitKey(0)
cv2.destroyAllWindows()

In [None]:
# Quantize the depth gray levels to two decimals and preview the result.
rounded_depth_image = np.round(gray_depth_image, 2)
window_title = "image"
cv2.imshow(window_title, rounded_depth_image)
cv2.waitKey(0)
cv2.destroyAllWindows()

In [None]:
# Boolean mask of the pixels whose gray level is 0.44 at two-decimal precision.
# Exact `==` against the unrounded floating-point image essentially never
# matches, so compare within half a rounding step instead.
np.isclose(gray_segmented_image, 0.44, rtol=0.0, atol=0.005)

In [None]:
# Binary mask: 1 where the rounded gray level equals 0.47, 0 everywhere else,
# kept in the same dtype as the rounded image. Preview it in an OpenCV window.
matches_level = rounded_segmented_image == 0.47
test_gray_image = matches_level.astype(rounded_segmented_image.dtype)
window_title = "image"
cv2.imshow(window_title, test_gray_image)
cv2.waitKey(0)
cv2.destroyAllWindows()

In [None]:
# Dump the mask values as text for a quick sanity check.
print(test_gray_image)

In [None]:
def original_transform() -> transforms.Compose:
    """Build a pipeline that converts an image to a tensor and resizes it to 240x320."""
    to_tensor = transforms.ToTensor()
    resize = transforms.Resize((240, 320))
    return transforms.Compose([to_tensor, resize])
original_tranformer = original_transform()

# Resize the test image, move axis 0 to the end, then reverse the channel
# order before previewing it with OpenCV.
resized_channel_first = np.array(original_tranformer(test_image))
resized_channel_last = np.transpose(resized_channel_first, (1, 2, 0))
transformed_original_image = resized_channel_last[:, :, ::-1]
window_title = "image"
cv2.imshow(window_title, transformed_original_image)
cv2.waitKey(0)
cv2.destroyAllWindows()

In [None]:
# Load the test frame with OpenCV, resize it to 320x240 (width, height) and preview it.
cv_original_image_path = r"S:/CS7643Project/audiosetdl/dataset/image/3-9.#alligators, crocodiles hissing#.train.Alligators Blackwater & Thrasher!  VLOGMAS Day 15.jpg"
cv_original_image = cv2.imread(cv_original_image_path)
if cv_original_image is None:
    # cv2.imread reports a missing or unreadable file by returning None rather
    # than raising; fail here with a clear message instead of inside cv2.resize.
    raise FileNotFoundError("Could not read image: {}".format(cv_original_image_path))
cv_original_image = cv2.resize(cv_original_image, (320, 240))
cv2.imshow("image", cv_original_image)
cv2.waitKey(0)
cv2.destroyAllWindows()

In [None]:
import math

In [None]:
# For every gray level of the segmentation map, find sufficiently large
# connected regions, save the matching crop of the original image, and derive
# a (left, right) volume factor from the region's mean depth and horizontal
# position.
ROI_number = 0
volume_factor = []

# Take the image geometry from the data instead of hard-coding 240x320.
image_height, image_width = rounded_segmented_image.shape[:2]

# Loop-invariant values, computed once.
# NOTE(review): 5*pi/36 rad is 25 degrees — presumably half of the assumed
# horizontal field of view; confirm.
HALF_ANGLE_TAN = math.tan(5 * math.pi / 36)
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5,5))

# cv2.imwrite does not create missing directories, so make sure it exists.
os.makedirs('test_images', exist_ok=True)

for gray_level in range(101):
    gray_scale = gray_level / 100.0

    # Binary mask of the pixels at this gray level, built in a single step.
    # The previous two-step in-place assignment turned the whole image into
    # the mask when gray_scale was 0, and relied on exact float equality
    # between the rounded pixels and the loop value.
    level_matches = np.isclose(rounded_segmented_image, gray_scale, rtol=0.0, atol=1e-3)
    test_gray_image = level_matches.astype(rounded_segmented_image.dtype)

    # Morph open to remove noise
    opening = cv2.morphologyEx(test_gray_image, cv2.MORPH_OPEN, kernel, iterations=1).astype('uint8')

    # Depth values restricted to this gray level (zero elsewhere).
    masked_depth_image = np.multiply(gray_depth_image, test_gray_image)

    # Find contours, obtain bounding box, extract and save ROI
    cnts = cv2.findContours(opening, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # findContours returns 2 or 3 values depending on the OpenCV version.
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]
    for idx, c in enumerate(cnts):
        x,y,w,h = cv2.boundingRect(c)
        # Skip regions whose bounding box is smaller than 32 pixels on a side.
        if w < 32 or h < 32:
            continue

        # Filled mask of this contour only, used to select its depth values.
        c_mask = np.zeros((image_height, image_width), np.uint8)
        cv2.drawContours(c_mask, cnts, idx, 255, -1)
        ROI_depth = masked_depth_image[c_mask == 255]
        ROI_depth_mean = np.mean(ROI_depth)
        ROI_depth_factor = ROI_depth_mean ** 2

        # Signed offset of the box centre from the image centre, as a
        # fraction of the image width (negative = left half).
        ROI_horizontal_ratio = ((x + 0.5 * w) - 0.5 * image_width) / float(image_width)
        ROI_horizontal_angle = math.atan(2 * abs(ROI_horizontal_ratio) * HALF_ANGLE_TAN)

        # cos/sin of half of (pi/2 -/+ angle): equal at the centre, with the
        # left factor larger for regions on the left and vice versa.
        if ROI_horizontal_ratio < 0:
            signed_angle = -ROI_horizontal_angle
        else:
            signed_angle = ROI_horizontal_angle
        pan_angle = ((0.5 * math.pi) + signed_angle) / 2
        ROI_horizontal_factor_leftC = math.cos(pan_angle)
        ROI_horizontal_factor_rightC = math.sin(pan_angle)

        # Equal-weight blend of the depth factor and the per-channel factor.
        volume_factor.append((0.5 * ROI_depth_factor + 0.5 * ROI_horizontal_factor_leftC, 0.5 * ROI_depth_factor + 0.5 * ROI_horizontal_factor_rightC))
        ROI = cv_original_image[y:y+h, x:x+w]
        cv2.imwrite('test_images/ROI-{}.png'.format(ROI_number), ROI)
        ROI_number += 1

print(volume_factor)

In [None]:
from ultralytics import YOLO

# Build a YOLO detector from the 'yolov8m.pt' weights and run it on the test image.
yolo_weights = 'yolov8m.pt'
detection_model = YOLO(yolo_weights)
results = detection_model(test_image)

In [None]:
# Print the text representation of the detector output for inspection.
print(results)