# Use Deep Lake to construct a dataset as fast as possible (write-heavy)

In [22]:
import os
import deeplake as dl
import numpy as np
from termcolor import colored
import psutil
import shutil

# Location of the dataset to build. Alternative targets tried previously:
#   '/mnt/weka/deeplake/deeplake-ds-200k'
#   '/dev/kas_temp/deeplake-ds-25k'
RESULTS_DATASET_PATH = '/fsx/kas_temp/deeplake-ds-200k'

# Start from a clean slate: drop whatever is left at the target path,
# then recreate it as an empty directory.
if os.path.exists(RESULTS_DATASET_PATH):
    shutil.rmtree(RESULTS_DATASET_PATH)
os.makedirs(RESULTS_DATASET_PATH, exist_ok=True)

In [23]:
# Announce the target location, then create a fresh, empty dataset there
# (overwrite=True replaces any dataset already present at the path).
banner = f"👉 Creating output database at {RESULTS_DATASET_PATH}"
print(colored(banner, "cyan", attrs=["reverse", "bold"]))

output_ds = dl.empty(RESULTS_DATASET_PATH, overwrite=True)
with output_ds:
  # One uncompressed float32 tensor holds the vectors.
  # (A bfloat16 dtype via TF's _pywrap_bfloat16 was attempted but its imports
  # could not be made to work; "lz4" sample compression and a text "label"
  # tensor were also considered and left disabled.)
  output_ds.create_tensor(
    "context_vector",
    htype="generic",
    dtype=np.float32,
    sample_compression=None,
  )
  output_ds.flush()

[1m[7m[36m👉 Creating output database at /fsx/kas_temp/deeplake-ds-200k[0m
/fsx/kas_temp/deeplake-ds-200k loaded successfully.


# Test read performance: use the Deep Lake dataset as a PyTorch DataLoader

In [24]:
RESULTS_DATASET_PATH = '/fsx/kas_temp/deeplake-ds-200k'

# A lock file can be left behind in the dataset directory; remove it (if
# present) so the dataset can be opened for writing below.
lock_file = os.path.join(RESULTS_DATASET_PATH, 'dataset_lock.lock')
if os.path.exists(lock_file):
    os.remove(lock_file)

# Deep Lake's `memory_cache_size` is expressed in MB, not bytes.
# The original variable was named `fifty_gb` but held 20e9; the intended
# budget is taken to be 20 GB, i.e. 20_000 MB.
memory_cache_size_mb = 20_000
output_ds = dl.load(
    RESULTS_DATASET_PATH,
    read_only=False,
    memory_cache_size=memory_cache_size_mb,
)

/fsx/kas_temp/deeplake-ds-200k loaded successfully.

 

In [32]:
import torch  # this cell fails with ModuleNotFoundError when torch is not installed

# Expose the `context_vector` tensor through the dataset's PyTorch
# integration: shuffled batches of 16, loaded by 2 workers, no transform.
dataloader = output_ds.pytorch(
    tensors=['context_vector'],
    batch_size=16,
    num_workers=2,
    shuffle=True,
)


ModuleNotFoundError: No module named 'torch'

In [31]:
!pip install torch

Collecting torch
  Using cached torch-2.0.0-cp38-cp38-manylinux1_x86_64.whl (619.9 MB)
Collecting nvidia-cuda-cupti-cu11==11.7.101
  Downloading nvidia_cuda_cupti_cu11-11.7.101-py3-none-manylinux1_x86_64.whl (11.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.8/11.8 MB[0m [31m43.0 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hCollecting nvidia-cudnn-cu11==8.5.0.96
  Downloading nvidia_cudnn_cu11-8.5.0.96-2-py3-none-manylinux1_x86_64.whl (557.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m557.1/557.1 MB[0m [31m588.5 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting nvidia-cusolver-cu11==11.4.0.1
  Using cached nvidia_cusolver_cu11-11.4.0.1-2-py3-none-manylinux1_x86_64.whl (102.6 MB)
Collecting nvidia-cublas-cu11==11.10.3.66
  Downloading nvidia_cublas_cu11-11.10.3.66-py3-none-manylinux1_x86_64.whl (317.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.1/317.1 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00

In [None]:
# Smoke test for the read path: fetch a single batch from the dataloader,
# print it, and stop without iterating over the rest of the dataset.
for data in dataloader:
    print(data)    
    break

# Restart the notebook kernel here to close the connection to `output_ds`

In [None]:
import os
import deeplake as dl
import numpy as np
from termcolor import colored
import psutil

# Target directory for the write benchmark; create it if it is missing.
# NOTE(review): this is a different path from the one used in the cells above,
# and a later cell calls dl.load() on it, which presumably requires a dataset
# (with a `context_vector` tensor) to already exist here -- confirm.
RESULTS_DATASET_PATH = "/mnt/weka/deeplake/deeplake-ds-3"
os.makedirs(name=RESULTS_DATASET_PATH, exist_ok=True)

In [None]:
# Transform used to populate the dataset in parallel (the caller decides the
# worker count; the timing cell uses all but one CPU core).
@dl.compute
def populate_ds_with_zeros(sample_in, sample_out, min_val, max_val, arr_shape, dtype=np.float32):
  """Append one randomly filled array to ``sample_out.context_vector``.

  Despite the name, the array is not zeros: it is drawn from
  ``np.random.uniform(min_val, max_val, arr_shape)`` and converted to ``dtype``.

  Args:
    sample_in: Input element for this sample; not read by this function.
    sample_out: Output sample whose ``context_vector`` tensor is appended to.
    min_val: Lower bound passed to ``np.random.uniform``.
    max_val: Upper bound passed to ``np.random.uniform``.
    arr_shape: Shape of the generated array.
    dtype: dtype of the appended array (default ``np.float32``).

  Returns:
    The same ``sample_out`` object that was passed in.
  """
  random_values = np.random.uniform(min_val, max_val, arr_shape)
  typed_values = np.array(random_values, dtype=dtype)
  sample_out.context_vector.append(typed_values)
  return sample_out

# Experiment settings.
# NOTE: `min` and `max` shadow the Python builtins for the rest of the kernel
# session. The names are kept because the timing cell passes them by name.
float32_info = np.finfo(np.float32)
min = float32_info.min  # smallest finite float32
max = float32_info.max  # largest finite float32
dtype = np.float32
arr_shape = (1024, 1024)  # 1024 * 1024 float32 values = 4 MiB per sample

# One placeholder element per sample to generate (1,000 in total); the
# transform ignores the element itself.
dataset_size = [None] * 1_000

output_ds = dl.load(RESULTS_DATASET_PATH)

In [None]:
%%timeit
populate_ds_with_zeros(min_val=min, max_val=max, arr_shape=arr_shape, dtype=dtype).eval(dataset_size, output_ds, scheduler="ray", num_workers=psutil.cpu_count()-1, skip_ok=True)

In [None]:
# Re-open the dataset from disk and print its summary to inspect what the
# populate step wrote.
output_ds = dl.load(RESULTS_DATASET_PATH)
output_ds.summary()