In [None]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to project source files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Install the mosaiks package

In [None]:
# Locally
# !pip install -e .. --upgrade

In [None]:
# From github
# 🚨 Make sure you update the GitHub token in the secrets file 🚨
# import src.mosaiks.utils as utl
# mosaiks_package_link = utl.get_mosaiks_package_link
# !pip install {mosaiks_package_link} --upgrade

## Import packages

In [None]:
import sys
import os
import warnings

# Make the parent directory importable so the project package resolves
# when the notebook is run from its own folder.
sys.path.append("../")

# Silence every warning for the rest of the session (keeps cell output
# short, but also hides deprecation and runtime warnings).
warnings.filterwarnings("ignore")

In [None]:
import mosaiks.utils as utl
from mosaiks.featurize import *
from mosaiks.dask_run import *

# Setup Rasterio

In [None]:
# Load the rasterio settings from YAML and export them as environment
# variables for this process.
# NOTE(review): the file name is spelled "rasterioc_config.yaml" - confirm
# that this matches the config file on disk.
rasterio_config = utl.load_yaml_config("rasterioc_config.yaml")

# os.environ accepts only str keys and values, while YAML scalars such as
# numbers or booleans are parsed into non-str Python objects. Stringify
# everything so the update cannot fail with TypeError.
# Assumes load_yaml_config returns a flat mapping - TODO confirm.
os.environ.update(
    {str(key): str(value) for key, value in rasterio_config.items()}
)

# Setup Dask Cluster and Client

In [None]:
# Create the Dask client that the featurization runs below submit work to.
# kind="local" presumably selects a cluster on this machine - see
# get_dask_client for the supported kinds.
client = get_dask_client(kind="local")

# Load params

In [None]:
# Read both YAML configs, then keep only the satellite section that the
# featurization params point at.
featurization_params = utl.load_yaml_config("featurisation.yaml")
all_satellite_configs = utl.load_yaml_config("satellite_config.yaml")

satellite_name = featurization_params["satellite_search_params"]["satellite_name"]
satellite_config = all_satellite_configs[satellite_name]

# Name of the coordinate set to featurize; also used to build the output path.
coord_set_name = "request_points_grid_05"

# Load point coords

In [None]:
# Load the points of the chosen coordinate set. Going by the helper's name
# this yields a GeoDataFrame built from lat/lon columns - confirm in utils.
request_points_gdf = utl.load_df_w_latlons_to_gdf(dataset_name=coord_set_name)

In [None]:
# Work on a small random subset while testing; the fixed random_state keeps
# the subset identical across runs.
n_test_points = 200

# Cap the sample size at the number of available points: sampling without
# replacement raises ValueError when asked for more rows than the frame has.
points_gdf = request_points_gdf.sample(
    min(n_test_points, len(request_points_gdf)), random_state=0
)

# Fetch image stac refs

`fetch_image_refs` now returns a lazy Dask dataframe that has not been computed yet, so this cell finishes quickly.

In [None]:
%%time
# Build the (still lazy) Dask dataframe of image references for the
# sampled points; nothing is computed until the partitions are run.
dask_chunksize = featurization_params['dask']['chunksize']
search_params = featurization_params['satellite_search_params']

points_gdf_with_stac = fetch_image_refs(points_gdf, dask_chunksize, search_params)

In [None]:
# Split the lazy Dask dataframe into a list of delayed objects, one per
# partition, so partitions can be featurized (and re-run) individually.
partitions = points_gdf_with_stac.to_delayed()

In [None]:
# Build the featurization model from the configured feature count and
# kernel size, with one input channel per satellite band.
num_features = featurization_params["num_features"]
kernel_size = featurization_params["kernel_size"]
n_bands = len(satellite_config["bands"])

model = RCF(num_features, kernel_size, n_bands)

# Run in parallel

## Trial run

8 simultaneous partitions seems to be about how many we can do in parallel on a local cluster. We may be able to do more on a Gateway Cluster once that is working.

TODO: Switch to a better scheduling scheme. For example, kick off another partition whenever one finishes, which might make better use of resources.

In [None]:
%%time

# Smoke test: featurize only the first partition and keep the result in
# `df` for inspection in the next cell.
first_partition = partitions[0]
df = run_single_partition(
    first_partition,
    satellite_config,
    featurization_params,
    model,
    client,
)

In [None]:
# Quick sanity check on the trial output: the mean over all columns, plus
# a histogram of the values in the first row.
mean_feature_value = df.mean().mean()
print("Average feature value:", mean_feature_value)
df.iloc[0].hist()

# Restart the Dask workers before the next run; binding the result to `_`
# keeps it out of the cell output.
_ = client.restart()

In [None]:
# Trial batch: run only the first `n_per_run` partitions in one go.
# No output folder is given (mosaiks_folder_path=None) and no explicit
# partition ids are passed.
# NOTE(review): presumably nothing is written to disk in this mode -
# confirm in run_partitions.
n_per_run = 8
trial_partitions = partitions[:n_per_run]

run_partitions(
    trial_partitions,
    n_per_run,
    satellite_config,
    featurization_params,
    model,
    client,
    mosaiks_folder_path=None,
    partition_ids=None,
)

## Full run

This is going to create 200 dataframes - one for each partition. If any fail, we can always just rerun that single component.

### Setup saving location

In [None]:
# Derive the output folder from the featurization params and the coordinate
# set name. Later cells use it with the `/` operator, so it is expected to
# be a pathlib-style path.
mosaiks_folder_path = utl.make_features_path_from_dict(featurization_params, coord_set_name)

### Create features and save checkpoints to file

In [None]:
# Full run over every partition, `n_per_run` at a time, with the output
# folder supplied so results can be checkpointed there.
# The return value is kept as the ids to retry in the re-run cell.
n_per_run = 8

failed_partition_ids = run_partitions(
    partitions, n_per_run,
    satellite_config, featurization_params,
    model, client,
    mosaiks_folder_path,
)

## Re-run failed partitions

Use this to re-run only the partitions that failed during the full run.

In [None]:
%%time

# Second pass: same arguments as the full run, restricted to the partition
# ids returned by it. Whatever this pass returns is kept separately in
# `failed_partition_ids_1`.
failed_partition_ids_1 = run_partitions(
    partitions, n_per_run,
    satellite_config, featurization_params,
    model, client,
    mosaiks_folder_path,
    partition_ids=failed_partition_ids,
)

# Load checkpoint files and combine

In [None]:
# Collect the checkpoint files in the output folder, filtered by the "df_"
# prefix (presumably the naming used when partitions are saved - confirm).
checkpoint_filenames = utl.get_filtered_filenames(mosaiks_folder_path, prefix="df_")

In [None]:
# Load every checkpoint into one dataframe, then attach the location and
# id columns of the sampled points.
# The join aligns on the index, so this assumes the checkpoint frames keep
# the index of `points_gdf` - TODO confirm.
checkpoint_features = utl.load_and_combine_dataframes(
    mosaiks_folder_path, checkpoint_filenames
)
point_attributes = points_gdf[["Lat", "Lon", "shrid"]]
combined_df = checkpoint_features.join(point_attributes)

# Report the in-memory footprint in megabytes (bytes / 10^6).
memory_mb = combined_df.memory_usage().sum() / 1_000_000
print("Dataset size in memory (MB):", memory_mb)

In [None]:
%%time
# Write the combined feature table into the output folder under a fixed
# file name.
combined_filename = "features.parquet.gzip"
combined_file_path = mosaiks_folder_path / combined_filename
utl.save_dataframe(combined_df, file_path=combined_file_path)