In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
from pprint import pprint
from datetime import date, timedelta, datetime
import requests
import xarray as xr
import rioxarray
from shapely.geometry import mapping
import re
from dateutil import parser

import earthaccess

## Load in data

In [9]:
# Sampling data whose date ranges have already been cleaned
df = pd.read_csv('../agg_data_cleaned.csv')

# Keep one row per (province, date range) pair so that each aggregation
# only has to be calculated once
sampling_df = (
    df[['Oceanographic province', 'parsed_date']]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Longhurst province polygons
longhurst = gpd.read_file('../Savoca/Longhurst_world_v4_2010.shp')

# Bounds are printed before and after the latitude filter. The filter keeps
# whole provinces that touch the band rather than cutting them at +/-37,
# which is why the second set of bounds is still wider than the band.
print(longhurst.total_bounds)
longhurst = longhurst.cx[:, -37.0:37.0]
print(longhurst.total_bounds)

# Attach the province polygon to each sampling configuration. The join is a
# left join, so configurations without a matching ProvCode are kept and get
# null polygon columns.
sampling_df = sampling_df.merge(
    longhurst,
    how='left',
    left_on='Oceanographic province',
    right_on='ProvCode',
)

[-180.          -78.50015648  180.           90.00000191]
[-180.   -55.5  180.    66.5]


### TODO:

Load in the satellite data. The cells below expect it in a variable named `mp_sat_data` with `lat` and `lon` coordinates.

## Aggregator function

In [None]:
# Report whether the satellite dataset already carries a CRS
if mp_sat_data.rio.crs is None:
    print("Dataset CRS is not set.")
else:
    print(f"Dataset CRS: {mp_sat_data.rio.crs}")

# Record EPSG:4326 on the dataset; note this runs regardless of the check above
mp_sat_data = mp_sat_data.rio.write_crs("EPSG:4326", inplace=True)

# Print the polygon bounds and the raster coordinate ranges side by side
print(longhurst.total_bounds) # Returns [minx, miny, maxx, maxy]

print(f"Latitude range: {mp_sat_data.lat.min().values} to {mp_sat_data.lat.max().values}")
print(f"Longitude range: {mp_sat_data.lon.min().values} to {mp_sat_data.lon.max().values}")

# Wrap longitudes into [-180, 180) and put them back in ascending order
mp_sat_data = mp_sat_data.assign_coords(
    {'lon': ((mp_sat_data.lon + 180) % 360) - 180}
).sortby('lon')

# Tell rioxarray which dimensions are the x and y axes
mp_sat_data = mp_sat_data.rio.set_spatial_dims(x_dim="lon", y_dim="lat", inplace=True)

In [None]:
def aggregate_metrics_single(ds, row, metrics=None):
    """
    Aggregate dataset variables over one polygonal region and one sampling window.

    The dataset is restricted to the period running from one year before the
    start of the row's timeframe up to its end, clipped to the row's polygon,
    and every requested variable is averaged over 'lat', 'lon' and 'time'.

    Parameters:
        ds (xarray.Dataset): Dataset with 'lat', 'lon' and 'time' dimensions.
        row (pd.Series): Row providing 'parsed_date' (a ``(start, end)`` pair),
            'ProvCode' (the region identifier) and ``geometry`` (the region polygon).
        metrics (list of str, optional): Variables to aggregate
            (default: every data variable in ``ds``).

    Returns:
        dict: ``{'region': <ProvCode>, 'mean_<metric>': <mean>, ...}`` with one
        entry per metric. Scalar means are returned as NumPy scalars. When the
        row has no usable polygon (None, NaN or an empty geometry) every mean
        is NaN and the dataset is not touched.
    """

    # Default to every variable in the dataset when no metrics are requested
    if metrics is None:
        metrics = list(ds.data_vars)

    # Result dictionary for the current region
    region_results = {'region': row['ProvCode']}  # Adjust 'ProvCode' to match actual identifier column name

    # Rows that found no matching province in a left join carry None/NaN instead
    # of a polygon; there is nothing to clip by, so report NaN for each metric
    # rather than failing inside mapping().
    geometry = row.geometry
    if geometry is None or isinstance(geometry, float) or getattr(geometry, 'is_empty', False):
        region_results.update({f'mean_{metric}': float('nan') for metric in metrics})
        return region_results

    # Time window: from one year before the start of the timeframe up to its end
    start, end = row['parsed_date']
    start = pd.Timestamp(start) - pd.DateOffset(years=1)
    ds_subset = ds.sel(time=slice(start, end))

    # Ensure spatial dimensions and CRS are set before clipping
    ds_subset = ds_subset.rio.set_spatial_dims(x_dim="lon", y_dim="lat", inplace=True)
    if ds_subset.rio.crs is None:
        ds_subset = ds_subset.rio.write_crs("EPSG:4326", inplace=True)

    # Clip the dataset to the region polygon
    masked_ds = ds_subset.rio.clip([mapping(geometry)], ds_subset.rio.crs, drop=True)

    for metric in metrics:
        # NOTE(review): 'time' is averaged together with the spatial dimensions,
        # collapsing the whole window into one value; the original author was
        # unsure whether time should be included here.
        mean_value = masked_ds[metric].mean(dim=['lat', 'lon', 'time'], skipna=True).compute()
        # [()] unwraps a 0-d array into a NumPy scalar and leaves n-d arrays as they are
        region_results[f'mean_{metric}'] = mean_value.values[()]

    return region_results