In [None]:
%pip install mediapipe
%pip install timm

Collecting mediapipe
  Downloading mediapipe-0.10.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.8/34.8 MB[0m [31m49.0 MB/s[0m eta [36m0:00:00[0m
Collecting sounddevice>=0.4.4 (from mediapipe)
  Downloading sounddevice-0.4.6-py3-none-any.whl (31 kB)
Installing collected packages: sounddevice, mediapipe
Successfully installed mediapipe-0.10.10 sounddevice-0.4.6
Collecting timm
  Downloading timm-0.9.16-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: timm
Successfully installed timm-0.9.16


In [None]:
import cv2
import torch
import mediapipe as mp
import numpy as np
from scipy.interpolate import RectBivariateSpline

In [None]:
model_type = "DPT_Large"
midas = torch.hub.load("intel-isl/MiDaS", model_type)
midas.to("cuda")
midas.eval()

Downloading: "https://github.com/intel-isl/MiDaS/zipball/master" to /root/.cache/torch/hub/master.zip
Downloading: "https://github.com/isl-org/MiDaS/releases/download/v3/dpt_large_384.pt" to /root/.cache/torch/hub/checkpoints/dpt_large_384.pt
100%|██████████| 1.28G/1.28G [00:04<00:00, 291MB/s]


DPTDepthModel(
  (pretrained): Module(
    (model): VisionTransformer(
      (patch_embed): PatchEmbed(
        (proj): Conv2d(3, 1024, kernel_size=(16, 16), stride=(16, 16))
        (norm): Identity()
      )
      (pos_drop): Dropout(p=0.0, inplace=False)
      (patch_drop): Identity()
      (norm_pre): Identity()
      (blocks): Sequential(
        (0): Block(
          (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
          (attn): Attention(
            (qkv): Linear(in_features=1024, out_features=3072, bias=True)
            (q_norm): Identity()
            (k_norm): Identity()
            (attn_drop): Dropout(p=0.0, inplace=False)
            (proj): Linear(in_features=1024, out_features=1024, bias=True)
            (proj_drop): Dropout(p=0.0, inplace=False)
          )
          (ls1): Identity()
          (drop_path1): Identity()
          (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
          (mlp): Mlp(
            (fc1): Linear(in_featur

In [None]:
transforms = torch.hub.load("intel-isl/MiDaS", "transforms")
print(dir(transforms))
transform = transforms.dpt_transform

['NormalizeImage', 'PrepareForNet', 'Resize', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', 'apply_min_size', 'beit512_transform', 'cv2', 'default_transform', 'dpt_transform', 'levit_transform', 'math', 'np', 'small_transform', 'swin256_transform', 'swin384_transform']


Using cache found in /root/.cache/torch/hub/intel-isl_MiDaS_master


In [None]:
# Converting Depth to distance
def depth_to_distance(depth_value, depth_scale):
    return -1.0 / (depth_value * depth_scale)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
from google.colab.patches import cv2_imshow
from time import time

folder_path = "drive/MyDrive/saved_frames_real_round3"
os.makedirs(folder_path, exist_ok=True)

cap = cv2.VideoCapture("drive/MyDrive/vid_720p.mp4")
frame_counter = 0  # Initialize counter

while cap.isOpened():
    start = time()

    ret, frame = cap.read()
    if not ret:
        break  # Break the loop if there are no frames left

    img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    imgbatch = transform(img).to("cuda")

    # Making a prediction
    with torch.no_grad():
        prediction = midas(imgbatch)
        prediction = torch.nn.functional.interpolate(
            prediction.unsqueeze(1),
            size=img.shape[:2],
            mode="bicubic",
            align_corners=False,
        ).squeeze()

    output = prediction.cpu().numpy()
    # Normalizing the output predictions for cv2 to read.
    output_norm = cv2.normalize(
        output, None, 0, 1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F
    )

    # Save the frame
    filename = os.path.join(folder_path, f"frame{frame_counter}.png")
    cv2.imwrite(filename, output_norm * 255)  # Multiply by 255 to scale to 0-255

    print(frame_counter, time() - start)
    frame_counter += 1  # Increment the counter

    if cv2.waitKey(2) & 0xFF == ord("q"):
        break

cap.release()
cv2.destroyAllWindows()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
17241 0.08861088752746582
17242 0.0886986255645752
17243 0.08990120887756348
17244 0.08937263488769531
17245 0.08677959442138672
17246 0.09132695198059082
17247 0.08826279640197754
17248 0.08891582489013672
17249 0.08777904510498047
17250 0.08927655220031738
17251 0.08760499954223633
17252 0.08786463737487793
17253 0.08697295188903809
17254 0.08831954002380371
17255 0.09017586708068848
17256 0.08894872665405273
17257 0.08921456336975098
17258 0.08957219123840332
17259 0.10474920272827148
17260 0.08914947509765625
17261 0.09016537666320801
17262 0.08894705772399902
17263 0.08804655075073242
17264 0.08871579170227051
17265 0.08897852897644043
17266 0.08939003944396973
17267 0.08877182006835938
17268 0.08907961845397949
17269 0.0878152847290039
17270 0.0886545181274414
17271 0.08808493614196777
17272 0.10100603103637695
17273 0.08878254890441895
17274 0.08776402473449707
17275 0.0883018970489502
17276 0.08928322792053223
172