In [1]:
import cv2
import torch
import mediapipe as mp
import numpy as np
from scipy.interpolate import RectBivariateSpline

In [16]:
# Load the MiDaS variant named by `model_type` from the "intel-isl/MiDaS"
# torch.hub repository and move it to the CPU in one step.
model_type = "DPT_Large"
midas = torch.hub.load("intel-isl/MiDaS", model_type).to("cpu")

# Switch the model to evaluation mode. As the last expression of the cell,
# the returned model is what the notebook displays.
midas.eval()

Using cache found in /home/konstantin/.cache/torch/hub/intel-isl_MiDaS_master
Downloading: "https://github.com/isl-org/MiDaS/releases/download/v3/dpt_large_384.pt" to /home/konstantin/.cache/torch/hub/checkpoints/dpt_large_384.pt
100.0%


DPTDepthModel(
  (pretrained): Module(
    (model): VisionTransformer(
      (patch_embed): PatchEmbed(
        (proj): Conv2d(3, 1024, kernel_size=(16, 16), stride=(16, 16))
        (norm): Identity()
      )
      (pos_drop): Dropout(p=0.0, inplace=False)
      (patch_drop): Identity()
      (norm_pre): Identity()
      (blocks): Sequential(
        (0): Block(
          (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
          (attn): Attention(
            (qkv): Linear(in_features=1024, out_features=3072, bias=True)
            (q_norm): Identity()
            (k_norm): Identity()
            (attn_drop): Dropout(p=0.0, inplace=False)
            (proj): Linear(in_features=1024, out_features=1024, bias=True)
            (proj_drop): Dropout(p=0.0, inplace=False)
          )
          (ls1): Identity()
          (drop_path1): Identity()
          (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
          (mlp): Mlp(
            (fc1): Linear(in_featur

In [17]:
# Fetch the "transforms" entry point of the MiDaS torch.hub repository and
# keep its `dpt_transform`, which is applied to each frame before inference.
transforms = torch.hub.load("intel-isl/MiDaS", "transforms")
transform = transforms.dpt_transform

# List every name exposed by the loaded transforms module.
print(dir(transforms))

['NormalizeImage', 'PrepareForNet', 'Resize', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', 'apply_min_size', 'beit512_transform', 'cv2', 'default_transform', 'dpt_transform', 'levit_transform', 'math', 'np', 'small_transform', 'swin256_transform', 'swin384_transform']


Using cache found in /home/konstantin/.cache/torch/hub/intel-isl_MiDaS_master


In [18]:
# Converting Depth to distance
def depth_to_distance(depth_value, depth_scale):
    """Convert a depth value to a distance.

    The result is the negative reciprocal of the scaled depth:
    ``-1.0 / (depth_value * depth_scale)``.

    Args:
        depth_value: Depth value to convert.
        depth_scale: Factor the depth value is multiplied by before inversion.

    Returns:
        The negative reciprocal of ``depth_value * depth_scale``.
    """
    scaled_depth = depth_value * depth_scale
    return -1.0 / scaled_depth

In [21]:
import os

# Depth-map export: run MiDaS on every frame of the input video, save each
# normalized depth map as a PNG in `folder_path`, and preview it in a window.
folder_path = "saved_frames"
os.makedirs(folder_path, exist_ok=True)

video_path = "Download.mp4"
cap = cv2.VideoCapture(video_path)
frame_counter = 0  # Index of the next frame; also used in the output file name

# try/finally guarantees that the capture handle and the preview window are
# released even if the cell is interrupted (e.g. KeyboardInterrupt) or a frame
# raises part-way through the loop.
try:
    # Fail loudly instead of silently producing no frames when the video
    # cannot be opened.
    if not cap.isOpened():
        raise OSError(f"Could not open video file: {video_path}")

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break  # No frames left (or the read failed)

        # OpenCV decodes frames as BGR; convert to RGB before applying the
        # MiDaS transform.
        img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        imgbatch = transform(img).to("cpu")

        # Predict, then resize the prediction to the frame's height and width.
        with torch.no_grad():
            prediction = midas(imgbatch)
            prediction = torch.nn.functional.interpolate(
                prediction.unsqueeze(1),
                size=img.shape[:2],
                mode="bicubic",
                align_corners=False,
            ).squeeze()

        output = prediction.cpu().numpy()
        # Min-max normalize to [0, 1] as float32 so cv2.imshow can display it.
        output_norm = cv2.normalize(
            output, None, 0, 1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F
        )

        # Save the frame. Convert explicitly to 8-bit (0-255) instead of
        # handing a float32 array to cv2.imwrite and relying on its implicit
        # conversion.
        filename = os.path.join(folder_path, f"frame{frame_counter}.png")
        output_8bit = np.clip(np.rint(output_norm * 255), 0, 255).astype(np.uint8)
        cv2.imwrite(filename, output_8bit)

        frame_counter += 1

        # Live preview; press "q" in the window to stop early.
        cv2.imshow("Walking", output_norm)
        if cv2.waitKey(2) & 0xFF == ord("q"):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()

KeyboardInterrupt: 