In [2]:
# Reload imported modules automatically before each cell executes, so edits to
# local source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
!pwd

/home/jovyan/ds_nudge_up/playground


In [4]:
import sys

# Make the parent directory importable (needed for the `src.mosaiks.utils` import).
sys.path.append("../")

In [5]:
import warnings
# Install a process-wide "ignore" filter so warnings do not clutter cell output.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings("ignore")

Note that we are using the local ds_nudge_up repo only for loading the utils.
The rest is done using the pip installed library.

In [6]:
import os
import src.mosaiks.utils as utl
# Load the config file and export every entry as an environment variable for
# this process.
# NOTE(review): "rasterioc_config.yaml" may be a typo for "rasterio_config.yaml" -
# confirm against the actual file name before changing it.
# os.environ only accepts string values - assumes the YAML maps str -> str.
rasterio_config = utl.load_yaml_config("rasterioc_config.yaml")
os.environ.update(rasterio_config)

In [6]:
!pip uninstall mosaiks -y

[0m

🚨🚨 **Make sure you replace `<GITHUB_TOKEN>` in the cell below with your own GitHub token** 🚨🚨 

In [7]:
!pip install "git+https://<GITHUB_TOKEN>@github.com/IDinsight/ds_nudge_up@as-package" --upgrade

Collecting git+https://****@github.com/IDinsight/ds_nudge_up@as-package
  Cloning https://****@github.com/IDinsight/ds_nudge_up (to revision as-package) to /tmp/pip-req-build-i4i316i7
  Running command git clone --filter=blob:none --quiet 'https://****@github.com/IDinsight/ds_nudge_up' /tmp/pip-req-build-i4i316i7
  Running command git checkout -b as-package --track origin/as-package
  Switched to a new branch 'as-package'
  Branch 'as-package' set up to track remote branch 'as-package' from 'origin'.
  Resolved https://****@github.com/IDinsight/ds_nudge_up to commit 52a40e8d1ce3e26af5cb81b8fa817efee77a67d5
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: mosaiks
  Building wheel for mosaiks (pyproject.toml) ... [?25ldone
[?25h  Created wheel for mosaiks: filename=mosaiks-0.0.1

# Setup Dask Cluster and Client

## Local Cluster

4 workers with 4 threads each seem to work best. Much of the time a thread is waiting on data to load, so the CPU is underutilized.

In [1]:
import logging
from dask.distributed import Client, LocalCluster

# Local cluster of 4 worker processes with 4 threads each (16 threads total);
# worker logs below ERROR level are silenced. A client is then attached to it.
cluster = LocalCluster(
    n_workers=4,
    threads_per_worker=4,
    processes=True,
    silence_logs=logging.ERROR,
)
client = Client(cluster)
client  # last expression: rendered as the cluster summary

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: /user/sid.ravi1@gmail.com/proxy/8787/status,

0,1
Dashboard: /user/sid.ravi1@gmail.com/proxy/8787/status,Workers: 4
Total threads: 16,Total memory: 27.00 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:44287,Workers: 4
Dashboard: /user/sid.ravi1@gmail.com/proxy/8787/status,Total threads: 16
Started: Just now,Total memory: 27.00 GiB

0,1
Comm: tcp://127.0.0.1:38045,Total threads: 4
Dashboard: /user/sid.ravi1@gmail.com/proxy/38847/status,Memory: 6.75 GiB
Nanny: tcp://127.0.0.1:36171,
Local directory: /home/jovyan/ds_nudge_up/playground/dask-worker-space/worker-wpnim1kn,Local directory: /home/jovyan/ds_nudge_up/playground/dask-worker-space/worker-wpnim1kn
GPU: Tesla T4,GPU memory: 16.00 GiB

0,1
Comm: tcp://127.0.0.1:33947,Total threads: 4
Dashboard: /user/sid.ravi1@gmail.com/proxy/45531/status,Memory: 6.75 GiB
Nanny: tcp://127.0.0.1:45157,
Local directory: /home/jovyan/ds_nudge_up/playground/dask-worker-space/worker-bdlccz0q,Local directory: /home/jovyan/ds_nudge_up/playground/dask-worker-space/worker-bdlccz0q
GPU: Tesla T4,GPU memory: 16.00 GiB

0,1
Comm: tcp://127.0.0.1:45593,Total threads: 4
Dashboard: /user/sid.ravi1@gmail.com/proxy/46685/status,Memory: 6.75 GiB
Nanny: tcp://127.0.0.1:42605,
Local directory: /home/jovyan/ds_nudge_up/playground/dask-worker-space/worker-78bwt7_2,Local directory: /home/jovyan/ds_nudge_up/playground/dask-worker-space/worker-78bwt7_2
GPU: Tesla T4,GPU memory: 16.00 GiB

0,1
Comm: tcp://127.0.0.1:45805,Total threads: 4
Dashboard: /user/sid.ravi1@gmail.com/proxy/36775/status,Memory: 6.75 GiB
Nanny: tcp://127.0.0.1:38237,
Local directory: /home/jovyan/ds_nudge_up/playground/dask-worker-space/worker-iupsuuzw,Local directory: /home/jovyan/ds_nudge_up/playground/dask-worker-space/worker-iupsuuzw
GPU: Tesla T4,GPU memory: 16.00 GiB


## Gateway cluster

In [None]:
from dask_gateway import Gateway
import dask_gateway
from dask.distributed import PipInstall

# Connect to the Dask Gateway with its default configuration and display the
# available cluster options.
gateway = Gateway()
options = gateway.cluster_options()
options

In [None]:
from dask.distributed import PipInstall

# Start a new Gateway cluster with the chosen options, connect a client to it
# and print the dashboard URL.
cluster = gateway.new_cluster(options)
client = cluster.get_client()
print(cluster.dashboard_link)

# Register a plugin that pip-installs the package branch on the workers.
# Replace <GITHUB_TOKEN> before running; never commit a real token.
# NOTE(review): newer `distributed` releases deprecate `register_worker_plugin`
# in favour of `register_plugin` - confirm against the installed version.
pkg_link = "git+https://<GITHUB_TOKEN>@github.com/IDinsight/ds_nudge_up@as-package"
plugin = PipInstall(packages=[pkg_link], pip_options=["--upgrade"], restart=False)
client.register_worker_plugin(plugin)

# Request 10 workers.
cluster.scale(10)

In [None]:
cluster.shutdown()

# Load params and define delayed objects

In [7]:
from mosaiks.featurize import *

from dask import delayed
from dask.distributed import as_completed
from time import sleep
from datetime import datetime  # used to timestamp the batch progress messages
import pandas as pd
import numpy as np

In [8]:
# Load the featurisation parameters first: they name the satellite whose entry
# is selected from the satellite config file.
featurization_params = utl.load_yaml_config("featurisation.yaml")
satellite_name = featurization_params["satellite_image_params"]["satellite_name"]
satellite_config = utl.load_yaml_config("satellite_config.yaml")[satellite_name]

# Catalogue of data sources (used below to locate the request points).
data_sources = utl.load_yaml_config("data_catalog.yaml")

In [9]:
# Load the points, passing the `request_points` catalogue entry to the loader
# as keyword arguments.
points_gdf = utl.load_points_gdf(**data_sources['request_points'])

`fetch_image_refs` now returns a dask dataframe that has not been computed yet, so the call finishes quite quickly.

In [10]:
%%time
# Pull the two settings out of the featurisation parameters first, then request
# the image references for every point.
n_partitions = featurization_params["dask"]["n_partitions"]
image_params = featurization_params["satellite_image_params"]
points_gdf_with_stac = fetch_image_refs(points_gdf, n_partitions, image_params)

CPU times: user 1.41 s, sys: 288 ms, total: 1.69 s
Wall time: 3.17 s


We use the `delayed` decorator to turn our function into a delayed function. This means it will not run immediately when called; instead, it returns a delayed object that can be run later.

In [11]:
@delayed
def partition_run(df, satellite_config, featurization_params, model, device):
    """Featurise a single partition and return the features as a DataFrame.

    The function is wrapped with ``dask.delayed``: calling it only builds a
    task, and the body runs when that task is computed.

    Parameters
    ----------
    df : DataFrame holding one partition; its index is reused for the result.
    satellite_config : mapping passed to ``create_data_loader``; its
        ``min_image_edge`` entry is also forwarded to ``create_features``.
    featurization_params : mapping providing ``batch_size`` and ``num_features``.
    model : model object forwarded to ``create_features``.
    device : device identifier forwarded to ``create_features``.

    Returns
    -------
    pandas.DataFrame
        Built from the output of ``create_features`` and indexed by a copy of
        ``df.index``.
    """
    loader = create_data_loader(
        df, satellite_config, featurization_params['batch_size']
    )
    features = create_features(
        loader,
        featurization_params['num_features'],
        len(df),
        model,
        device,
        satellite_config['min_image_edge'],
    )
    # Copy the index so the result does not share the index object with the input.
    return pd.DataFrame(features, index=df.index.copy())
    

We want to convert our dask dataframe into "delayed" objects. Each partition then becomes a delayed pandas dataframe that can be passed to our delayed function above.

In [12]:
# One delayed object per partition of the dask dataframe.
partitions = points_gdf_with_stac.to_delayed()

In [13]:
# Instantiate the RCF model; its third argument is the number of bands listed
# for the configured satellite.
n_bands = len(satellite_config["bands"])
model = RCF(
    featurization_params["num_features"],
    featurization_params["kernel_size"],
    n_bands,
)

A batch size of 10 seems to be the optimal balance between making full use of the CPU and not blowing up the memory.

In [14]:
featurization_params['batch_size']

10

# Run in parallel

## Trial run

The cell below will only run it for 8 of the partitions. That seems to be about how many we can do in parallel on a local cluster. We may be able to do more on a Gateway Cluster once that is working.

There are also better schemes. For example, we could kick off another partition whenever one finishes. That might be a better use of resources.

In [17]:
%%time

# Build one delayed task for each of the first 8 partitions; each task gets an
# explicit key (`run_<n>`).
dfs = [
    partition_run(part, satellite_config, featurization_params, model, 'cuda',
                  dask_key_name=f'run_{idx}')
    for idx, part in enumerate(partitions[:8])
]
# Submit everything at once; from here on `dfs` holds futures, not delayed objects.
dfs = client.compute(dfs)

# Gather the results in the order the futures complete.
df_list = [future.result() for future in as_completed(dfs)]


CPU times: user 1min 52s, sys: 23 s, total: 2min 15s
Wall time: 6min 19s


Should take ~7-8 minutes on an MPC GPU instance. So that's <1 minute per partition. If nothing goes wrong, the whole job should finish in <4 hours.

In theory, objects should get garbage collected once there are no references to them. But it seems to take forever (or never!) for python to do that. Possibly since we have a lot of nested things and a model object that we are still holding a reference to.

Restarting the cluster seems to be the sure way of clearing worker memory.

In [18]:
_ = client.restart()

## Full run

This is going to create 200 dataframes - one for each partition. If any fail, we can always just rerun that single component.

In [22]:
N_PARTITIONS = len(partitions)  # total number of delayed partitions
N_PER_RUN = 8  # number of partitions submitted together in one batch
START_IDX = 16  # index of the first partition processed by the full run

In [None]:
# Batch boundaries: START_IDX, START_IDX + N_PER_RUN, ... The stop value is
# padded by N_PER_RUN so that a final, possibly shorter, batch is included.
p_ids = np.arange(START_IDX, N_PARTITIONS + N_PER_RUN, N_PER_RUN)

for p_start_id, p_end_id in zip(p_ids[:-1], p_ids[1:]):
    # The last boundary can overshoot the number of partitions; clamp it so the
    # progress message reports partition ids that actually exist.
    p_end_id = min(p_end_id, N_PARTITIONS)
    now = datetime.now().strftime("%d-%b %H:%M:%S")
    print(f"[{now}] Running batch: {p_start_id} to {p_end_id - 1}")

    # One delayed task per partition in this batch; the key carries the global
    # partition index so it ends up in the output file name.
    delayed_dfs = [
        partition_run(p, satellite_config, featurization_params, model,
                      featurization_params['device'],
                      dask_key_name=f'features_{p_start_id + i}')
        for i, p in enumerate(partitions[p_start_id:p_end_id])
    ]
    futures_dfs = client.compute(delayed_dfs)

    # Write each partition's features to CSV as soon as it completes.
    for f in as_completed(futures_dfs):
        f.result().to_csv(f'df_{f.key}.csv')

    # Restart the workers between batches to clear their memory, then pause
    # briefly before submitting the next batch.
    client.restart()
    sleep(5)

Running batch:  16 to 23
Running batch:  24 to 31


In [None]:
client.shutdown()

## Re-run failed partitions

Use this to just run partitions that failed

In [20]:
%%time

# Indices of the partitions to re-run.
FAILED_IDX = [8]

# Build one delayed task per failed partition, using the same `features_<idx>`
# key naming as the full run so the CSV files get matching names.
delayed_dfs = [
    partition_run(partitions[i], satellite_config, featurization_params, model,
                  featurization_params['device'], dask_key_name=f'features_{i}')
    for i in FAILED_IDX
]

# Submit once, after all tasks have been collected. Submitting inside the loop
# would resubmit (and rewrite the CSVs of) earlier partitions on every iteration
# when more than one index is listed.
futures_dfs = client.compute(delayed_dfs)

# Write each partition's features to CSV as soon as it completes.
for f in as_completed(futures_dfs):
    f.result().to_csv(f'df_{f.key}.csv')

In [21]:
_ = client.restart()

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: /user/sid.ravi1@gmail.com/proxy/8787/status,

0,1
Dashboard: /user/sid.ravi1@gmail.com/proxy/8787/status,Workers: 4
Total threads: 16,Total memory: 27.00 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:44287,Workers: 4
Dashboard: /user/sid.ravi1@gmail.com/proxy/8787/status,Total threads: 16
Started: 3 minutes ago,Total memory: 27.00 GiB

0,1
Comm: tcp://127.0.0.1:34423,Total threads: 4
Dashboard: /user/sid.ravi1@gmail.com/proxy/37187/status,Memory: 6.75 GiB
Nanny: tcp://127.0.0.1:36171,
Local directory: /home/jovyan/ds_nudge_up/playground/dask-worker-space/worker-qb1zuxsh,Local directory: /home/jovyan/ds_nudge_up/playground/dask-worker-space/worker-qb1zuxsh
GPU: Tesla T4,GPU memory: 16.00 GiB

0,1
Comm: tcp://127.0.0.1:44353,Total threads: 4
Dashboard: /user/sid.ravi1@gmail.com/proxy/46041/status,Memory: 6.75 GiB
Nanny: tcp://127.0.0.1:45157,
Local directory: /home/jovyan/ds_nudge_up/playground/dask-worker-space/worker-rfoc134x,Local directory: /home/jovyan/ds_nudge_up/playground/dask-worker-space/worker-rfoc134x
GPU: Tesla T4,GPU memory: 16.00 GiB

0,1
Comm: tcp://127.0.0.1:36825,Total threads: 4
Dashboard: /user/sid.ravi1@gmail.com/proxy/40957/status,Memory: 6.75 GiB
Nanny: tcp://127.0.0.1:42605,
Local directory: /home/jovyan/ds_nudge_up/playground/dask-worker-space/worker-e6__6tv9,Local directory: /home/jovyan/ds_nudge_up/playground/dask-worker-space/worker-e6__6tv9
GPU: Tesla T4,GPU memory: 16.00 GiB

0,1
Comm: tcp://127.0.0.1:36347,Total threads: 4
Dashboard: /user/sid.ravi1@gmail.com/proxy/44833/status,Memory: 6.75 GiB
Nanny: tcp://127.0.0.1:38237,
Local directory: /home/jovyan/ds_nudge_up/playground/dask-worker-space/worker-9k5iv3i5,Local directory: /home/jovyan/ds_nudge_up/playground/dask-worker-space/worker-9k5iv3i5
GPU: Tesla T4,GPU memory: 16.00 GiB
