<a href="https://colab.research.google.com/github/Helyousfi/Depth-estimation-and-detection/blob/main/depthExtraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Based on https://pytorch.org/hub/intelisl_midas_v2/ 
and https://colab.research.google.com/github/pytorch/pytorch.github.io/blob/master/assets/hub/intelisl_midas_v2.ipynb

### Reference
[Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-shot Cross-dataset Transfer](https://arxiv.org/abs/1907.01341)

Their code is released under the MIT license (https://github.com/intel-isl/MiDaS/blob/master/LICENSE). Please cite the paper if you use their model:

In [1]:
# BibTeX entry for the MiDaS paper.
# A raw string is used so the LaTeX accent escape (\') keeps its backslash;
# in a regular string literal Python turns \' into a plain quote.
CITATION = r'''@article{Ranftl2020,
	author    = {Ren\'{e} Ranftl and Katrin Lasinger and David Hafner and Konrad Schindler and Vladlen Koltun},
	title     = {Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-shot Cross-dataset Transfer},
	journal   = {IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)},
	year      = {2020},
}'''
# print() shows the entry with real line breaks so it can be copied as-is.
print(CITATION)

"@article{Ranftl2020,\n\tauthor    = {Ren'{e} Ranftl and Katrin Lasinger and David Hafner and Konrad Schindler and Vladlen Koltun},\n\ttitle     = {Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-shot Cross-dataset Transfer},\n\tjournal   = {IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)},\n\tyear      = {2020},\n}"

In [2]:
#Set runtime to GPU

#importing libraries
!pip install timm
import tifffile as ti
import random
import numpy as np
import torch
#load midas model from torch hub
midas = torch.hub.load("intel-isl/MiDaS", "MiDaS")
midas.eval()
import cv2
import urllib.request
import matplotlib.pyplot as plt
from google.colab.patches import cv2_imshow
import numpy as np
import os

use_large_model = True

if use_large_model:
    midas = torch.hub.load("intel-isl/MiDaS", "MiDaS")
else:
    midas = torch.hub.load("intel-isl/MiDaS", "MiDaS_small")
  
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
midas.to(device)
midas.eval()

midas_transforms = torch.hub.load("intel-isl/MiDaS", "transforms")

if use_large_model:
    transform = midas_transforms.default_transform
else:
    transform = midas_transforms.small_transform

Collecting timm
  Downloading timm-0.5.4-py3-none-any.whl (431 kB)
[?25l[K     |▊                               | 10 kB 19.0 MB/s eta 0:00:01[K     |█▌                              | 20 kB 25.8 MB/s eta 0:00:01[K     |██▎                             | 30 kB 30.8 MB/s eta 0:00:01[K     |███                             | 40 kB 22.8 MB/s eta 0:00:01[K     |███▉                            | 51 kB 18.3 MB/s eta 0:00:01[K     |████▋                           | 61 kB 20.7 MB/s eta 0:00:01[K     |█████▎                          | 71 kB 22.2 MB/s eta 0:00:01[K     |██████                          | 81 kB 23.7 MB/s eta 0:00:01[K     |██████▉                         | 92 kB 25.6 MB/s eta 0:00:01[K     |███████▋                        | 102 kB 23.7 MB/s eta 0:00:01[K     |████████▍                       | 112 kB 23.7 MB/s eta 0:00:01[K     |█████████▏                      | 122 kB 23.7 MB/s eta 0:00:01[K     |█████████▉                      | 133 kB 23.7 MB/s eta 0:00:01

Downloading: "https://github.com/intel-isl/MiDaS/archive/master.zip" to /root/.cache/torch/hub/master.zip


Loading weights:  None


Downloading: "https://github.com/facebookresearch/WSL-Images/archive/main.zip" to /root/.cache/torch/hub/main.zip
Downloading: "https://download.pytorch.org/models/ig_resnext101_32x8-c38310e5.pth" to /root/.cache/torch/hub/checkpoints/ig_resnext101_32x8-c38310e5.pth


  0%|          | 0.00/340M [00:00<?, ?B/s]

Downloading: "https://github.com/intel-isl/MiDaS/releases/download/v2_1/model-f6b98070.pt" to /root/.cache/torch/hub/checkpoints/model-f6b98070.pt


  0%|          | 0.00/403M [00:00<?, ?B/s]

Using cache found in /root/.cache/torch/hub/intel-isl_MiDaS_master


Loading weights:  None


Using cache found in /root/.cache/torch/hub/facebookresearch_WSL-Images_main
Using cache found in /root/.cache/torch/hub/intel-isl_MiDaS_master


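Load images from the zipped **in.zip** file, run the neural network, and write the output to **out8tif.zip** (8-bit, 3 channels), **out32tif3d.zip** (32-bit, xyz channels) and **out32tif1d.zip** (32-bit, zzz channels). Note: the cells below currently unzip `folder.zip`, read the images from the `in2` folder, and have the **out32tif1d** export commented out, so adjust the names to match your archive.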

In [3]:
# Unzip the input archive into the working directory.
# NOTE(review): this extracts folder.zip, while the notebook text names the
# input archive in.zip and the processing cell reads images from the 'in2'
# folder -- confirm which archive/folder name is intended.
!unzip folder.zip 

unzip:  cannot find or open folder.zip, folder.zip.zip or folder.zip.ZIP.


In [4]:
# Look for files in the input folder (top level only, sorted by name).
from os import walk

input_dir = 'in2'
if not os.path.isdir(input_dir):
  # Without this check a missing folder surfaces as a bare StopIteration
  # from next(walk(...)), which does not say what is wrong.
  raise FileNotFoundError("Input folder '" + input_dir + "' not found; unzip the input archive first.")
_, _, filenames = next(walk(input_dir))
filenames = sorted(filenames)
print(filenames)

# Create output folders for the 8-bit simple output and the 32-bit 3d vector output.
# exist_ok=True makes re-running the cell harmless, while real failures
# (e.g. missing permissions) still raise instead of being silently ignored.
for out_dir in ('out8tif', 'out32tif3d'):
  os.makedirs(out_dir, exist_ok=True)

def depth_to_uint8(depth):
  """Rescale a 2-D depth map linearly to 0..255 and return it as a 3-channel uint8 image.

  depth: 2-D numpy array with the network output.
  Returns an array of shape (height, width, 3) with three identical channels.
  A constant map (max == min) is returned as all zeros instead of dividing by zero.
  """
  lowest = float(np.min(depth))
  highest = float(np.max(depth))
  span = highest - lowest
  if span > 0:
    scaled = 255.99*(depth-lowest)/span
  else:
    scaled = np.zeros_like(depth)
  # uint8 is the explicit 8-bit type promised by the 'out8tif' output.
  gray = scaled.astype(np.uint8)
  return np.stack((gray,)*3, axis=-1)


def depth_to_displacement(rstar, rmax_factor=100):
  """Turn a 2-D inverse-depth map into a 3-channel float32 displacement image.

  rstar: 2-D numpy array; "star" means inverse units, rstar = a/r + b, where r
    is a real distance and a, b are constants retrieved from rmin and rmax.
  rmax_factor: ratio rmax/rmin. The larger the difference between rmin and
    rmax, the more detailed mesh and number of bits are needed.
  Returns an array of shape (height, width, 3) holding dx, dy, dz scaled so
  that 0.5 means no displacement. Degenerate input (constant map or zero
  displacement) yields an image filled with 0.5.
  """
  (wy, wx) = rstar.shape

  # Distance from the camera to the image frame in arbitrary units.
  # This is the value chosen here, but it can be changed.
  d1hat = wy

  # rmin and rmax are two scaling factors:
  # rmin roughly means the distance of the closest visible point to the camera centre,
  # rmax roughly means the distance of the furthest visible point to the camera centre.
  # rmin should rather be larger than the distance from the camera to the image point.
  rmin = 1*np.sqrt(d1hat**2+max(wy, wx)**2)  # or maybe: 2*d1hat or something else
  rmax = rmax_factor*rmin

  rstarmin = np.max(rstar)  # max deliberately, because far points have low rstar
  rstarmax = np.min(rstar)  # min deliberately, because close points have high rstar
  spread = rstarmin-rstarmax
  if spread == 0:
    # Constant map: the formula below would divide by zero.
    return np.full((wy, wx, 3), 0.5, dtype=np.float32)

  # Extraction of distance based on distance inverse and assumed parameters.
  r = rmin*(rstar/spread - (rstarmax/spread-(rmin/rmax)*rstarmin/spread))**(-1)

  # Pixel coordinates with the origin in the image centre instead of the top
  # left corner; the row/column vectors broadcast to (wy, wx) below.
  xtilde = (np.arange(wx)+0.5-wx/2)[np.newaxis, :]
  ytilde = (-(np.arange(wy)+0.5-wy/2))[:, np.newaxis]

  # Distance of the point on the image plane from the camera.
  r1 = np.sqrt(d1hat**2+xtilde**2+ytilde**2)

  # Distance of the point from the image plane to the final/real position.
  r2 = r-r1

  # Displacement of the points from the image plane to their final positions.
  displacement = np.stack((xtilde*r2/r1, ytilde*r2/r1, -d1hat*r2/r1), axis=-1).astype(np.float32)

  # Max absolute displacement over x, y and z.
  totmax = np.max(np.abs(displacement))
  if totmax == 0:
    return np.full((wy, wx, 3), 0.5, dtype=np.float32)

  # 0.5 is assumed to mean no displacement; from the displacement operator
  # point of view only the midlevel has to be changed if that does not fit.
  return (displacement/totmax/2+0.5).astype(np.float32)


# For each filename do the depth extraction and save it.
for filename in filenames:
  # Read file; cv2.imread returns None for anything it cannot decode.
  img = cv2.imread('in2/'+filename)
  if img is None:
    print('Skipping '+filename+': not a readable image')
    continue
  # Convert color space from BGR to RGB.
  img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

  # Run midas model and resize the prediction back to the input resolution.
  input_batch = transform(img).to(device)
  with torch.no_grad():
    prediction = midas(input_batch)
    # Index [0, 0] instead of squeeze() so a height or width of 1 is not dropped.
    prediction = torch.nn.functional.interpolate(
        prediction.unsqueeze(1),
        size=img.shape[:2],
        mode="bicubic",
        align_corners=False,
    )[0, 0]
  # Convert output to numpy array.
  output = prediction.cpu().numpy()

  # Save simple 8-bit output together with blurred versions.
  output2 = depth_to_uint8(output)
  cv2.imwrite('out8tif/'+filename+'.tif', output2)
  for kernel in (5, 15, 35):
    cv2.imwrite('out8tif/blur'+str(kernel)+filename+'.tif', cv2.blur(output2,(kernel,kernel)))

  # 32 bit version: 3d displacement vectors (x, y, z channels).
  output3d = depth_to_displacement(output)
  # imwrite is the current name of tifffile's writer; imsave is its deprecated alias.
  ti.imwrite('out32tif3d/'+filename+'3d.tif', output3d)

# Zip the output folders (recursively) so they can be downloaded as single files.
!zip out8tif.zip -r out8tif
# The out32tif1d archive is disabled: no visible cell creates an 'out32tif1d' folder.
#!zip out32tif1d.zip -r out32tif1d
!zip out32tif3d.zip -r out32tif3d
