# Notebook prep

In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up without having to restart the kernel.
%load_ext autoreload
%autoreload 2

## Import packages

In [2]:
# GeoPandas should rely on Shapely only: having both Shapely and PyGEOS in
# play causes a conflict, so PyGEOS is switched off via this env var.
# NOTE: this has to run before geopandas is imported.
import os

os.environ.update({"USE_PYGEOS": "0"})

In [3]:
import logging
import sys
import warnings

In [4]:
# Configure the root logger to emit INFO-level messages and above.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Make the parent directory importable. The membership check keeps this cell
# idempotent: re-running it no longer appends a duplicate "../" entry to
# sys.path each time.
if "../" not in sys.path:
    sys.path.append("../")

# Suppress all warnings for the rest of the session.
warnings.filterwarnings("ignore")

In [5]:
from pathlib import Path
import mosaiks.utils as utl
from mosaiks.featurize import RCF

## Setup Rasterio

In [6]:
# Load the rasterio settings from YAML and copy them into this process's
# environment variables.
# NOTE(review): the file name is spelled "rasterioc_config.yaml" - confirm it
# matches the file on disk.
# Assumes the YAML is a flat mapping of string keys to string values, since
# os.environ only accepts strings - TODO confirm.
rasterio_config = utl.load_yaml_config("rasterioc_config.yaml")
os.environ.update(rasterio_config)

# Load params + defaults

In [8]:
# Featurization settings; these also name the satellite to use.
featurization_config = utl.load_yaml_config("featurisation_config.yaml")

# The satellite config file is keyed by satellite name; keep only the entry
# for the satellite selected in the featurization config.
all_satellite_configs = utl.load_yaml_config("satellite_config.yaml")
satellite_name = featurization_config["satellite_search_params"]["satellite_name"]
satellite_config = all_satellite_configs[satellite_name]

In [9]:
# One column name per model feature: mosaiks_0 ... mosaiks_{n-1}.
num_features = featurization_config["model"]["num_features"]
mosaiks_col_names = [f"mosaiks_{i}" for i in range(num_features)]

# Folder that receives the test outputs. It is created here because the cells
# in the non-Dask section write into it before the Dask section (which also
# creates an output folder) has run; without this a fresh
# Restart-and-Run-All depends on the folder already existing on disk.
test_mosaiks_folder_path = Path("test_outputs")
test_mosaiks_folder_path.mkdir(parents=True, exist_ok=True)

In [10]:
# Instantiate the RCF featurizer from the "model" section of the config:
# feature count, kernel size, and the number of satellite bands.
model_params = featurization_config["model"]
model = RCF(
    model_params["num_features"],
    model_params["kernel_size"],
    len(satellite_config["bands"]),
)

# Load Data

In [13]:
import pandas as pd

# Read the test points from CSV, then convert them with the project helper.
points_df = pd.read_csv("../tests/data/test_points.csv")
points_gdf = utl.df_w_latlons_to_gdf(points_df)

In [None]:
# Select the first 1000 points (for testing)
# points_gdf = points_gdf.iloc[:1000]

# Simple non-Dask run

In [14]:
# Keep only the row at position 1 (as a one-row slice) for the quick test runs.
second_row_only = slice(1, 2)
test_points_gdf = points_gdf.iloc[second_row_only]

## Check pipeline run

In [17]:
from mosaiks.dask import get_features_without_parallelization as full_pipeline

In [18]:
%%time
# Run the non-parallelized pipeline end to end on the test points and time it.
# return_df=True is passed so the result can be displayed below. The output
# folder / file name arguments presumably control where a CSV copy is written
# (TODO confirm against mosaiks.dask).
df_1 = full_pipeline(
    test_points_gdf,
    model,
    featurization_config,
    satellite_config,
    col_names=mosaiks_col_names,
    output_folderpath=test_mosaiks_folder_path,
    save_filename="TEST_df.csv",
    return_df=True,
)
df_1

CPU times: user 1.06 s, sys: 666 ms, total: 1.73 s
Wall time: 8.77 s


Unnamed: 0,mosaiks_0,mosaiks_1,mosaiks_2,mosaiks_3,mosaiks_4,mosaiks_5,mosaiks_6,mosaiks_7,mosaiks_8,mosaiks_9,...,mosaiks_3990,mosaiks_3991,mosaiks_3992,mosaiks_3993,mosaiks_3994,mosaiks_3995,mosaiks_3996,mosaiks_3997,mosaiks_3998,mosaiks_3999
1,0.0,1.590576,0.0,0.0,1.919222,0.003695,1.350078,0.000195,0.216465,2.053044,...,4.904264,8.4e-05,3.280439,0.0,6.595667,1.122086,0.017674,2.454667,4.187478,2.6e-05


## Check pipeline components

In [None]:
from mosaiks.fetch import fetch_image_refs, create_data_loader
from mosaiks.featurize import create_features, make_result_df

In [None]:
# Step 1: look up image references for the test points using the configured
# satellite search parameters.
search_params = featurization_config["satellite_search_params"]
points_gdf_with_stac = fetch_image_refs(test_points_gdf, search_params)

In [None]:
# Sanity-check the crop-fetching function: fetch the crop for the first point
# and display it.
from mosaiks.fetch import fetch_image_crop, display_image

row = points_gdf_with_stac.iloc[0]

# Arguments taken from the selected row and from the satellite config.
crop_kwargs = dict(
    lon=row["Lon"],
    lat=row["Lat"],
    stac_item=row["stac_item"],
    buffer=satellite_config["image_width"],
    bands=satellite_config["bands"],
    resolution=satellite_config["resolution"],
    dtype=satellite_config["dtype"],
    normalise=True,
)
image_crop = fetch_image_crop(**crop_kwargs)

display_image(image_crop)

In [None]:
# Step 2: build the data loader over the points and their image references,
# using the batch size from the "model" section of the config.
batch_size = featurization_config["model"]["batch_size"]
data_loader = create_data_loader(
    points_gdf_with_stac=points_gdf_with_stac,
    satellite_params=satellite_config,
    batch_size=batch_size,
)

In [None]:
# Step 3: run the model over the data loader to produce the features.
model_params = featurization_config["model"]
X_features = create_features(
    dataloader=data_loader,
    n_features=model_params["num_features"],
    model=model,
    device=model_params["device"],
    min_image_edge=satellite_config["min_image_edge"],
)

In [None]:
# Step 4: assemble the result dataframe from the features plus the context
# columns listed in the config, then display it.
context_cols = featurization_config["coord_set"]["context_cols_to_keep"]
result_df = make_result_df(
    features=X_features,
    mosaiks_col_names=mosaiks_col_names,
    context_gdf=points_gdf_with_stac,
    context_cols_to_keep=context_cols,
)
result_df

In [None]:
# Step 5: save the component-run result in the test output folder.
components_csv_path = test_mosaiks_folder_path / "TEST_components_df.csv"
utl.save_dataframe(df=result_df, file_path=components_csv_path)

## Test post-featurization image fetching (for debugging)

In [None]:
# Choose the featurised data to inspect: either the in-memory result of the
# component run, or (commented out) a previously saved checkpoint file.
# data = utl.load_dataframe(test_mosaiks_folder_path / "df_000.parquet.gzip")
data = result_df

In [None]:
# Choose the row to inspect (here the first one) and display it
row = data.iloc[0]
row

In [None]:
# Re-fetch (and plot) the image crop for the chosen row from its STAC id,
# using the parameters in satellite_config (buffer, bands, etc).
from mosaiks.fetch.images import fetch_image_crop_from_stac_id

stac_id, lon, lat = row["stac_id"], row["Lon"], row["Lat"]

image_crop = fetch_image_crop_from_stac_id(
    stac_id=stac_id,
    lon=lon,
    lat=lat,
    satellite_config=satellite_config,
    normalise=True,
    plot=True,
)

In [None]:
# # for multiple images (show individual images as opposed to mosaic) ######
# for stac_id in row["stac_id"]:
#     image_crop = fetch_image_crop_from_stac_id(
#         lon=row["Lon"],
#         lat=row["Lat"],
#         stac_id=stac_id,
#         satellite_config=satellite_config,
#         normalise=True,
#         plot=True,
#     )

# Dask runs

In [None]:
### Connect to LOCAL CLIENT
# Worker and thread counts are read from the "dask" section of the
# featurization config.
# NOTE(review): the helper is named "..._cluster_and_client" - confirm that it
# returns a bare client rather than a (cluster, client) pair, because `client`
# is passed straight to the pipeline runners further down.
from mosaiks.dask import get_local_dask_cluster_and_client
client = get_local_dask_cluster_and_client(
    featurization_config["dask"]["n_workers"],
    featurization_config["dask"]["threads_per_worker"],
)

In [None]:
### or connect to GATEWAY
# from mosaiks.dask import get_gateway_cluster_client
# cluster, client = get_gateway_cluster_client()
# cluster

In [None]:
# Display the connected client
client

In [None]:
# Output folder for the Dask runs; create it if it does not exist yet.
# (Alternative: utl.make_output_folder_path(featurization_config))
mosaiks_folder_path = Path("test_outputs")
mosaiks_folder_path.mkdir(parents=True, exist_ok=True)

## Method 1 (Preferred) - Queued Futures

In [None]:
from mosaiks.dask import run_queued_futures_pipeline

In [None]:
# %%time

# Interrupting this cell does not stop the Dask cluster from working through
# what has already been submitted; call client.restart() to do that.
queued_run_kwargs = dict(
    client=client,
    model=model,
    featurization_config=featurization_config,
    satellite_config=satellite_config,
    col_names=mosaiks_col_names,
    output_folderpath=mosaiks_folder_path,
)
run_queued_futures_pipeline(test_points_gdf, **queued_run_kwargs)

## Method 2 - Batched Delayed

In [None]:
from mosaiks.dask import run_batched_pipeline

In [None]:
%%time

# Run the batched pipeline on the test points via the Dask client, writing
# into mosaiks_folder_path, and time the cell.
# Note that stopping this cell does not stop the Dask cluster from processing
# what is currently submitted. Use client.restart() for that.
run_batched_pipeline(
    test_points_gdf,
    client=client,
    model=model,
    featurization_config=featurization_config,
    satellite_config=satellite_config,
    col_names=mosaiks_col_names,
    output_folderpath=mosaiks_folder_path,
)

## Method 3 - Unbatched Delayed

In [None]:
from mosaiks.dask import delayed_pipeline, run_unbatched_delayed_pipeline

### Single task

In [None]:
# Build a single delayed task for the test points. Presumably nothing runs
# until .compute() is called on it (TODO confirm against mosaiks.dask).
# The last two positional arguments are the output folder and the file name.
delayed_task = delayed_pipeline(
    test_points_gdf,
    model,
    featurization_config,
    satellite_config,
    mosaiks_col_names,
    test_mosaiks_folder_path,
    "TEST_dask_delayed.csv",
)

In [None]:
# Render the task graph to a PNG in the test output folder.
graph_png = f"{test_mosaiks_folder_path}/TEST_dask_graph.png"
delayed_task.visualize(filename=graph_png)

In [None]:
# Execute the single delayed task built above
delayed_task.compute()

### Full run

In [None]:
# Run the unbatched delayed pipeline over the test points and keep what it
# returns.
unbatched_run_kwargs = {
    "points_gdf": test_points_gdf,
    "client": client,
    "model": model,
    "featurization_config": featurization_config,
    "satellite_config": satellite_config,
    "col_names": mosaiks_col_names,
    "output_folderpath": mosaiks_folder_path,
}
delayed_task_list = run_unbatched_delayed_pipeline(**unbatched_run_kwargs)

# Load checkpoint files and combine

In [None]:
# # simple test
# data = utl.load_dataframe(mosaiks_folder_path / "df_000.parquet.gzip")
# data

In [None]:
# Find the checkpoint files (names starting with "df_") in the output folder
# and combine them into a single dataframe.
checkpoint_filenames = utl.get_filtered_filenames(
    folder_path=mosaiks_folder_path,
    prefix="df_",
)
combined_df = utl.load_and_combine_dataframes(
    folder_path=mosaiks_folder_path,
    filenames=checkpoint_filenames,
)

# Report the in-memory size, counting 1 MB as 1,000,000 bytes.
BYTES_PER_MB = 1000000
dataset_size_mb = combined_df.memory_usage().sum() / BYTES_PER_MB
print(f"Dataset size in memory (MB): {dataset_size_mb}")

In [None]:
# Display the combined dataframe
combined_df

In [None]:
# Write the combined features next to the checkpoint files.
combined_filepath = mosaiks_folder_path.joinpath("combined_features.parquet.gzip")
utl.save_dataframe(df=combined_df, file_path=combined_filepath)