# Ensemble tide model tide gauge validation

This code compares tides modelled using custom ensemble tide modelling against results from various other global ocean tide models at [Global Extreme Sea Level Analysis (GESLA) tide gauges](https://gesla787883612.wordpress.com/) across Australia.

> Haigh, I.D., Marcos, M., Talke, S.A., Woodworth, P.L., Hunter, J.R., Hague, B.S., Arns, A., Bradshaw, E. and Thompson, P., 2023. GESLA version 3: A major update to the global higher‐frequency sea‐level dataset. Geoscience Data Journal, 10(3), pp.293-314.

## Getting started
Set working directory to top level of repo to ensure links work correctly:

In [1]:
cd ../..

/home/jovyan/Robbi/dea-intertidal


Install additional packages:

In [None]:
!pip install -e /home/jovyan/Robbi/dea-notebooks/Tools/

### Load packages

In [2]:
%load_ext autoreload
%autoreload 2

import os
import glob
import warnings
import datetime
import numpy as np
import pandas as pd
import xarray as xr
import geopandas as gpd
import seaborn as sns
import matplotlib.pyplot as plt

from odc.geo.geom import BoundingBox

from dea_tools.validation import eval_metrics
from intertidal.utils import round_date_strings

os.environ["DEA_TOOLS_TIDE_MODELS"] = "/home/jovyan/tide_models_clipped"

In [3]:

def _load_gauge_metadata(metadata_path):
    """
    Read a tide gauge station metadata CSV and return it both as a
    plain table and as a point GeoDataFrame.

    Parameters
    ----------
    metadata_path : str
        Path to the station metadata CSV. After column names are
        normalised, the table must contain "site_code", "longitude"
        and "latitude" columns.

    Returns
    -------
    tuple
        `(metadata_df, metadata_gdf)`: the metadata indexed by
        "site_code", and the same data with point geometries built
        from longitude/latitude in EPSG:4326.
    """
    raw_df = pd.read_csv(metadata_path)

    # Normalise column names: spaces and slashes become underscores,
    # brackets are dropped, and everything is lower-cased
    tidy_columns = raw_df.columns
    for old, new in ((" ", "_"), ("(", ""), (")", ""), ("/", "_")):
        tidy_columns = tidy_columns.str.replace(old, new, regex=False)
    raw_df.columns = tidy_columns.str.lower()

    metadata_df = raw_df.set_index("site_code")

    # Build one point per station from its lon/lat coordinates
    station_points = gpd.points_from_xy(metadata_df.longitude, metadata_df.latitude)
    metadata_gdf = gpd.GeoDataFrame(
        data=metadata_df, geometry=station_points, crs="EPSG:4326"
    )

    return metadata_df, metadata_gdf


def tide_gauge_abslmp(
    x=None,
    y=None,
    site_code=None,
    time=("2020", "2021"),
    ahd=True,
    site_metadata=True,
    data_path="/gdata1/data/sea_level/abslmp/",
    metadata_path="/gdata1/data/sea_level/ABSLMP_station_metadata_v2.csv",
):
    """
    Load and process Australian Baseline Sea Level Monitoring Program
    (ABSLMP) tide gauge data.

    Parameters
    ----------
    x, y : tuple, optional
        Tuples defining the x (longitude) and y (latitude) bounding box
        within which to load tide gauge data, in WGS84 degrees.
        Leave as None if providing a list of site codes using 'site_code'.
        If 'x', 'y' and 'site_code' are all None, every site in the
        metadata file is loaded.
    site_code : str or list of str, optional
        ABSLMP site code(s) for which to load data. If provided, 'x' and
        'y' will be ignored.
    time : tuple or list of str, optional
        Time range to consider, given as a tuple of start and end years.
        If None, will default to all tide observations from 1991 onward.
        Default is ("2020", "2021").
    ahd : bool, optional
        Whether to correct sea level to Australian Height Datum (AHD)
        by subtracting the station's "ahd" metadata value.
        Default is True.
    site_metadata : bool, optional
        Whether to add tide gauge station metadata as additional columns
        in the output DataFrame. Defaults to True.
    data_path : str, optional
        Path to the raw ABSLMP data files. Default is
        "/gdata1/data/sea_level/abslmp/".
    metadata_path : str, optional
        Path to the ABSLMP station metadata file.
        Default is "/gdata1/data/sea_level/ABSLMP_station_metadata_v2.csv".

    Returns
    -------
    pd.DataFrame
        Processed ABSLMP data indexed by ("site_code", "time"), with
        columns including:
        "sea_level": Observed sea level (m),
        "residuals": Residuals data (m),
        and, if `site_metadata` is True, additional columns from the
        station metadata.

    Raises
    ------
    ValueError
        If no ABSLMP data files match the requested sites and years.
    """

    def _load_abslmp_dataset(path, na_value):
        # Read a single site/year CSV, keeping only the time, sea level
        # and residual columns
        abslmp_df = (
            pd.read_csv(
                path,
                parse_dates=[" Date & UTC Time"],
                na_values=na_value,
                usecols=[" Date & UTC Time", "Sea Level", "Residuals"],
            )
            .rename(
                {
                    " Date & UTC Time": "time",
                    "Sea Level": "sea_level",
                    "Residuals": "residuals",
                },
                axis=1,
            )
            # Site code is sliced from the file name.
            # NOTE(review): assumes names of the form
            # "<8-char site code>_<YYYY>.csv" - confirm against the data
            .assign(site_code=path[-17:-9])
            .set_index("time")
        )

        return abslmp_df

    # Load tide gauge metadata
    metadata_df, metadata_gdf = _load_gauge_metadata(metadata_path)

    # Use supplied site codes if available
    if site_code is not None:
        site_code = [site_code] if isinstance(site_code, str) else site_code

    # Otherwise, use xy bounds to identify sites
    elif x is not None:
        bbox = BoundingBox.from_xy(x, y)
        site_code = metadata_gdf.cx[
            bbox.left : bbox.right, bbox.top : bbox.bottom
        ].index

    # Otherwise, return all available site codes
    else:
        site_code = metadata_df.index.to_list()

    # Prepare times
    if time is None:
        time = ["1991", str(datetime.datetime.now().year)]
    time = [time] if isinstance(time, str) else time
    start_time = round_date_strings(time[0], round_type="start")
    end_time = round_date_strings(time[-1], round_type="end")

    # Identify paths to load and nodata values for each site. Loop
    # variables are named so they do not shadow the `y` parameter
    years = range(int(start_time[0:4]), int(end_time[0:4]) + 1)
    paths_na = [
        (
            glob.glob(f"{data_path}/{site}_*{year}.csv"),
            metadata_df.loc[site].null_value,
        )
        for year in years
        for site in site_code
    ]

    # Expand so we have a nodata value for each path
    paths_na = [(path, na) for paths, na in paths_na for path in paths]

    # Fail with a clear message rather than pandas' generic
    # "No objects to concatenate" error
    if not paths_na:
        raise ValueError(
            f"No ABSLMP data files found in '{data_path}' for the "
            f"requested sites between {start_time} and {end_time}."
        )

    # Load and combine into a single dataframe. The index is sorted
    # before slicing because label-based slicing of a non-monotonic
    # DatetimeIndex can fail when the bounds are not exact index values
    data_df = (
        pd.concat([_load_abslmp_dataset(path, na_value=na) for path, na in paths_na])
        .sort_index()
        .loc[slice(start_time, end_time)]
        .reset_index()
        .set_index("site_code")
    )

    # Insert metadata into dataframe (always needed, as the AHD
    # correction below reads the "ahd" metadata column)
    data_df[metadata_df.columns] = metadata_df

    # Add time to index and remove duplicates
    data_df = data_df.set_index("time", append=True)
    duplicates = data_df.index.duplicated()
    if duplicates.sum() > 0:
        warnings.warn("Duplicate timestamps were removed.")
        data_df = data_df.loc[~duplicates]

    # Correct to AHD (i.e. mean sea level)
    if ahd:
        data_df["sea_level"] -= data_df.ahd

    # Return data, dropping metadata columns if not requested
    if not site_metadata:
        return data_df[["sea_level", "residuals"]]
    else:
        return data_df


from tqdm import tqdm
    
    
def tide_gauge_gesla(
    x=None,
    y=None,
    site_code=None,
    time=("2020", "2021"),
    filter_use_flag=True,
    site_metadata=True,
    data_path="/gdata1/data/sea_level/gesla/",
    metadata_path="/gdata1/data/sea_level/GESLA3_ALL 2.csv",
):
    """
    Load and process Global Extreme Sea Level Analysis (GESLA) tide
    gauge data.

    Modified from original code from https://github.com/philiprt/GeslaDataset.

    Parameters
    ----------
    x, y : tuple, optional
        Tuples defining the x (longitude) and y (latitude) bounding box
        within which to load tide gauge data, in WGS84 degrees.
        Leave as None if providing a list of site codes using 'site_code'.
        If 'x', 'y' and 'site_code' are all None, every site in the
        metadata file is loaded.
    site_code : str or list of str, optional
        GESLA site code(s) for which to load data. If provided, 'x' and
        'y' will be ignored.
    time : tuple or list of str, optional
        Time range to consider, given as a tuple of start and end years.
        If None, will default to all tide observations from 1800 onward.
        Default is ("2020", "2021").
    filter_use_flag : bool, optional
        Whether to filter out low quality observations with a "use_flag"
        value of 0 (do not use). Defaults to True.
    site_metadata : bool, optional
        Whether to add tide gauge station metadata as additional columns
        in the output DataFrame. Defaults to True.
    data_path : str, optional
        Path to the raw GESLA data files. This is prepended to each
        metadata "file_name" by plain string concatenation, so it must
        end with a path separator. Default is
        "/gdata1/data/sea_level/gesla/".
    metadata_path : str, optional
        Path to the GESLA station metadata file.
        Default is "/gdata1/data/sea_level/GESLA3_ALL 2.csv".

    Returns
    -------
    pd.DataFrame
        Processed GESLA data indexed by ("site_code", "time"), with
        columns including:
        "sea_level": Observed sea level (m),
        "qc_flag": Observed sea level QC flag,
        "use_flag": Use-in-analysis flag (1 = use, 0 = do not use),
        and, if `site_metadata` is True, additional columns from the
        station metadata.

    Raises
    ------
    ValueError
        If no tide gauge sites match the requested site codes or
        bounding box.
    """

    def _load_gesla_dataset(site, path, na_value):
        # Read a single GESLA station file. The first 41 lines are
        # skipped (presumably the file header - confirm against the
        # GESLA format description) and columns are whitespace-delimited
        gesla_df = (
            pd.read_csv(
                path,
                skiprows=41,
                names=["date_in", "time_in", "sea_level", "qc_flag", "use_flag"],
                # Raw string avoids the invalid "\s" escape warning
                sep=r"\s+",
                na_values=na_value,
            )
            .assign(
                time=lambda df: pd.to_datetime(df["date_in"] + " " + df["time_in"]),
                site_code=site,
            )
            .set_index("time")
            .drop(["date_in", "time_in"], axis=1)
        )

        # Progress output: report each file once it has been loaded
        print(path)

        return gesla_df

    # Load tide gauge metadata
    metadata_df, metadata_gdf = _load_gauge_metadata(metadata_path)

    # Use supplied site codes if available
    if site_code is not None:
        site_code = [site_code] if isinstance(site_code, str) else site_code

    # Otherwise, use xy bounds to identify sites
    elif x is not None:
        bbox = BoundingBox.from_xy(x, y)
        site_code = metadata_gdf.cx[
            bbox.left : bbox.right, bbox.top : bbox.bottom
        ].index

    # Otherwise, return all available site codes
    else:
        site_code = metadata_df.index.to_list()

    # Prepare times
    if time is None:
        time = ["1800", str(datetime.datetime.now().year)]
    time = [time] if isinstance(time, str) else time
    start_time = round_date_strings(time[0], round_type="start")
    end_time = round_date_strings(time[-1], round_type="end")

    # Identify paths to load and nodata values for each site
    metadata_df["file_name"] = data_path + metadata_df["file_name"]
    paths_na = metadata_df.loc[site_code, ["file_name", "null_value"]]

    # Fail with a clear message rather than pandas' generic
    # "No objects to concatenate" error
    if len(paths_na) == 0:
        raise ValueError(
            "No GESLA tide gauge sites matched the requested site codes "
            "or bounding box."
        )

    # Load and combine into a single dataframe. Each row of `paths_na`
    # unpacks to (site code, file path, nodata value)
    data_df = (
        pd.concat(
            [
                _load_gesla_dataset(s, p, na_value=na)
                for s, p, na in paths_na.itertuples()
            ]
        )
        .sort_index()
        .loc[slice(start_time, end_time)]
        .reset_index()
        .set_index("site_code")
    )

    # Optionally filter by use flag column
    if filter_use_flag:
        data_df = data_df.loc[data_df.use_flag == 1]

    # Optionally insert metadata into dataframe
    if site_metadata:
        data_df[metadata_df.columns] = metadata_df.loc[site_code]

    # Add time to index and remove duplicates
    data_df = data_df.set_index("time", append=True)
    duplicates = data_df.index.duplicated()
    if duplicates.sum() > 0:
        warnings.warn("Duplicate timestamps were removed.")
        data_df = data_df.loc[~duplicates]

    # Return data
    return data_df


# tide_gauge_abslmp(x=(140, 160), y=(-30, -35))
# tide_gauge_gesla(x=(140, 160), y=(-30, -35))

### GESLA tide gauge data


In [None]:
# # Load tide gauge metadata
# metadata_df, metadata_gdf = _load_gauge_metadata(metadata_path="/gdata1/data/sea_level/GESLA3_ALL 2.csv") 
# metadata_gdf.to_file("gesla_stations.geojson")

In [None]:
# Load Collection 3 summary grid
c3_path = "https://data.dea.ga.gov.au/derivative/ga_summary_grid_c3.geojson"
c3_grid = gpd.read_file(c3_path)
# Overall extent of the grid, used below as the bounding box for
# selecting tide gauges
xmin, ymin, xmax, ymax = c3_grid.total_bounds

In [None]:
# Extract tide data for all sites within the summary grid extent,
# restricted to the 2017-2019 time range
gauge_df = tide_gauge_gesla(x=(xmin, xmax), y=(ymax, ymin), time=("2017", "2019"))

In [None]:
# Clean to restrict to Australia and remove duplicate sites/contributors
bad_sites = ("H033007A",  # Cape Ferguson, duplicate
             "H057022B",  # Thursday Island, duplicate
             "PLPEE01",  # Peel Inlet, inland
             "DVHAR01", # Harvey, inland
             "H060010A") # Half Tide Tug, duplicate 
bad_contributers = ("UHSLC",)

# Reference the exclusion tuples with `@` rather than interpolating
# their repr into the query string, and take an explicit copy so the
# in-place normalisation below does not write to a view of the
# unfiltered data (avoids pandas' SettingWithCopyWarning)
gauge_df = gauge_df.query(
    "(country == 'AUS') & "
    "(site_code not in @bad_sites) & "
    "(contributor_abbreviated not in @bad_contributers)"
).copy()

# Normalise to mean sea level by subtracting each site's mean
gauge_df["sea_level"] -= gauge_df.groupby(["site_code"])["sea_level"].transform("mean")

# Select 3-hourly subset
# gauge_df = gauge_df.iloc[::3]

In [None]:
# Export stations being processed to file. One row is kept per site,
# and the first three columns are dropped (presumably the per-observation
# sea_level/qc_flag/use_flag columns - confirm) so only station metadata
# remains
sites_df = gauge_df.groupby("site_code").first().iloc[:, 3:]

# Set the CRS explicitly so the exported file carries its coordinate
# reference system, consistent with `_load_gauge_metadata`
sites_gdf = gpd.GeoDataFrame(
    data=sites_df,
    geometry=gpd.points_from_xy(sites_df.longitude, sites_df.latitude),
    crs="EPSG:4326",
)
sites_gdf.to_file("gesla_stations_aus.geojson")

## Model tides at each gauge

In [None]:
from dea_tools.coastal import model_tides

# Candidate ensemble definitions, keyed by output column name. Each
# function receives a table with a "rank" column and returns either a
# boolean selection or a numeric weight per model.
# NOTE(review): presumably rank 1 is the best-ranked model and 7 the
# worst of the seven models - confirm against `model_tides`
ensemble_funcs = {
    "ensemble-top": lambda x: x["rank"] == 1,
    "ensemble-bottom": lambda x: x["rank"] == 7,
    "ensemble-mean-top3": lambda x: x["rank"] <= 3,
    "ensemble-mean-top5": lambda x: x["rank"] <= 5,
    "ensemble-mean-weighted": lambda x: 8 - x["rank"],
    "ensemble-mean": lambda x: x["rank"] <= 7,
}

# Model tides at the location and time of every gauge observation
# ("one-to-one" mode), then shorten the TPXO column names
tide_df = model_tides(
    x=gauge_df.longitude,
    y=gauge_df.latitude,
    time=gauge_df.index.get_level_values("time"),
    model=[
        "FES2014",
        "FES2012",
        "TPXO8-atlas-v1",
        "TPXO9-atlas-v5",
        "EOT20",
        "HAMTIDE11",
        "GOT4.10",
        "ensemble",
    ],
    mode="one-to-one",
    parallel_splits=100,
    output_format="wide",
    ensemble_func=ensemble_funcs,
).rename(
    {
        "TPXO9-atlas-v5": "TPXO9",
        "TPXO8-atlas": "TPXO8",
        "TPXO8-atlas-v1": "TPXO8",
    },
    axis=1,
)
tide_df

Pre-process into long format:

In [None]:
# Add tide gauge data to dataframe. Values are assigned positionally
# (via `.values`), so this relies on `tide_df` rows being in the same
# order as `gauge_df` rows
tide_df["site_code"] = gauge_df.index.get_level_values("site_code").values
tide_df["site_name"] = gauge_df.site_name.values
tide_df["tide_gauge"] = gauge_df.sea_level.values

# Reshape to long format: one row per observation per model, with the
# modelled tide height in "tide_m"
tide_df_long = tide_df.melt(
    ignore_index=False,
    id_vars=["tide_gauge", "site_code", "site_name"],
    value_vars=[
        "EOT20",
        "FES2012",
        "FES2014",
        "GOT4.10",
        "HAMTIDE11",
        "TPXO9",
        "TPXO8",
        "ensemble-top",
        "ensemble-bottom",
        "ensemble-mean-top3",
        # "ensemble-mean-top5",
        # "ensemble-mean-weighted",
        "ensemble-mean",
    ],
    value_name="tide_m",
)

tide_df_long

## Analysis


### Overall accuracy stats for each model

In [None]:
# Compare modelled tides against gauge observations for each model
# across all sites combined
accuracy_df = tide_df_long.groupby(["tide_model"])[["tide_gauge", "tide_m"]].apply(
    lambda x: eval_metrics(x=x.tide_gauge, y=x.tide_m, round=4)
)
# Display models ordered by RMSE, with the RMSE column colour-shaded
accuracy_df.sort_values("RMSE").style.background_gradient(
    cmap="RdBu_r",
    subset=["RMSE"],
)

### Model stats per site

In [None]:
# Same accuracy metrics as above, but calculated separately for every
# site/model combination
accuracy_sites_df = tide_df_long.groupby(["site_name", "tide_model"])[
    ["tide_gauge", "tide_m"]
].apply(lambda x: eval_metrics(x=x.tide_gauge, y=x.tide_m, round=4))

#### Performance at sites with most model disagreement

In [None]:
# Standard models (i.e. excluding the ensemble outputs)
default_models = [
    "FES2014",
    "FES2012",
    "TPXO8",
    "TPXO9",
    "EOT20",
    "HAMTIDE11",
    "GOT4.10",
]

# Identify sites with highest standard deviation / disagreement:
# the 10 sites where RMSE varies most between the standard models
most_disagrement = (
    accuracy_sites_df.query("tide_model in @default_models")
    .groupby("site_name")["RMSE"]
    .std()
    .nlargest(10)
    .index
)
most_disagrement

In [None]:
# Recalculate per-model accuracy using only the sites where the
# standard models disagree most
accuracy_df = (
    tide_df_long.query("site_name in @most_disagrement")
    .groupby(["tide_model"])[["tide_gauge", "tide_m"]]
    .apply(lambda x: eval_metrics(x=x.tide_gauge, y=x.tide_m, round=4))
)
# Display models ordered by RMSE (a single sort is sufficient)
accuracy_df.sort_values("RMSE").style.background_gradient(
    cmap="RdBu_r",
    subset=["RMSE"],
)

#### Top results per site per model

In [None]:
# For each site, find the model with the lowest RMSE, then count how
# many sites each model "wins"
accuracy_sites_df.loc[
    accuracy_sites_df
    .groupby("site_name")
    .RMSE.idxmin()
].reset_index().tide_model.value_counts()

In [None]:
# Excluding EOT20: repeat the lowest-RMSE-per-site count with EOT20
# removed from the candidates
accuracy_sites_df.loc[
    accuracy_sites_df.query(
        "tide_model not in ['EOT20']"
    )
    .groupby("site_name")
    .RMSE.idxmin()
].reset_index().tide_model.value_counts()

#### Bottom results per site per model

In [None]:
# For each site, find the model with the highest RMSE, then count how
# many sites each model performs worst at
accuracy_sites_df.loc[
    accuracy_sites_df
    .groupby("site_name")
    .RMSE.idxmax()
].reset_index().tide_model.value_counts()

#### Plot rankings as histogram

In [None]:
# Rank models by RMSE within each site (1 = lowest RMSE), then count
# how often each model achieves each rank
rank_counts = (
    accuracy_sites_df.groupby("site_name")
    .RMSE.rank(method="min")
    .groupby("tide_model")
    .value_counts()
    .rename_axis(["tide_model", "Rank (RMSE)"])
)

# One bar chart panel per model, in the listed column order.
# NOTE(review): the bar plot reads a "count" column, which relies on
# the name pandas gives to the `value_counts` result - confirm for the
# pandas version in use
g = sns.FacetGrid(
    rank_counts.to_frame().reset_index(),
    col_order=[
        "EOT20",
        "ensemble-mean-top3",
        "TPXO9",
        "FES2014",
        "TPXO8",
        "FES2012",
        "HAMTIDE11",
        "GOT4.10",
    ],
    col="tide_model",
    aspect=0.7,
    height=4,
)
g.set_titles(col_template="{col_name}")
g.map(sns.barplot, "Rank (RMSE)", "count")

### Export tide validation plots

In [None]:
# # Calculate tide range per site
# tide_range = (
#     tide_df_long.groupby("site_name")["tide_gauge"]
#     .apply(lambda x: np.abs(x).max())
#     .sort_values()
# )

# Select subset
# sites, limits, title = tide_range.loc[tide_range >= 2].index, 5, "Macrotidal sites"
# sites, limits, title = tide_range.loc[(tide_range >= 1) & (tide_range < 2)].index, 2, "Mesotidal sites"
# sites, limits, title = tide_range.loc[tide_range < 1].index, 1, "Microtidal sites"

# Optionally restrict to subset
site_filter = [
    # "DYDBY01",
    # "63090",
    # "59511",
    # "59510",
    # "61800",
    # "59690",
    # "61840",
    # "61600",
    # "58170",
    # "60780",
    # "60739",
    # "60590",
    # "63511",
    # "59980",
    # "60710",
    # "60730",
    "59850",
]
# NOTE(review): this immediately replaces the hand-picked list above.
# The entries above look like site codes, whereas the query below
# filters on site_name - confirm which is intended before re-enabling
site_filter = most_disagrement.tolist()
# site_filter = ["Milner_Bay_Groote_Eylandt"]
sites, limits, title = site_filter, 3, "Problematic sites" 
models_to_plot = ['EOT20', 'FES2012', 'FES2014', 'GOT4.10', 'HAMTIDE11', 
                  'TPXO9', 'TPXO8', 'ensemble-mean-top3']

# Plot facetted: one row per site and one column per model, showing
# modelled tide height against observed tide gauge height
g = sns.FacetGrid(
     tide_df_long.query(f"(site_name in {sites}) & (tide_model in @models_to_plot)").reset_index(),
    col="tide_model",
    row="site_name",
    margin_titles=True,
    # xlim=(-limits, limits),
    # ylim=(-limits, limits),
)
g.fig.suptitle(title, size=20)
g.set_titles(row_template="{row_name}", col_template="{col_name}")
g.map(sns.scatterplot, "tide_gauge", "tide_m", alpha=0.1, linewidth=0, s=3)
# Add a dashed 1:1 reference line to every panel
for a in g.axes.flat:
    a.plot([-limits, limits], [-limits, limits], "--", c="black")
# Output file name is the title with spaces removed
g.savefig(f"{title.replace(' ', '')}.jpg")

In [4]:
# Load the "hotspots_zoom_2" point layer from the DEA Coastlines v2.1.0 GeoPackage
points_gdf = gpd.read_file("https://data.dea.ga.gov.au/derivative/dea_coastlines/2-1-0/coastlines_v2.1.0.gpkg", layer="hotspots_zoom_2")

In [5]:
# Load the New South Wales coastal boundary shapefile and reproject it to match the points layer
states_gdf = gpd.read_file("/gdata1/data/boundaries/GEODATA_COAST_100K/new_south_wales/cstnswcd_r.shp").to_crs(points_gdf.crs)

In [6]:
# Keep only the points that fall within the NSW boundary
points_nsw_gdf = points_gdf.clip(states_gdf)
points_nsw_gdf

Unnamed: 0,uid,dist_1988,dist_1989,dist_1990,dist_1991,dist_1992,dist_1993,dist_1994,dist_1995,dist_1996,...,dist_2021,dist_2022,rate_time,sig_time,se_time,outl_time,radius_m,certainty,n,geometry
21062,r3due5t3kvh,9.05,7.46,6.52,10.42,6.34,9.64,11.35,7.74,7.85,...,4.04,0.0,-0.131,0.002,0.038,,5000,good,224,POINT (1644391.729 -4023395.740)
21057,r3dud4vqgz9,5.29,3.05,0.94,3.72,6.24,5.31,7.75,6.00,3.89,...,2.89,0.0,-0.051,0.126,0.032,,5000,good,235,POINT (1640406.260 -4023177.782)
21058,r3dudjmd2jg,8.32,6.37,5.32,8.49,4.51,8.63,9.50,7.66,7.08,...,3.91,0.0,-0.121,0.001,0.032,,5000,good,525,POINT (1640627.167 -4021756.684)
21059,r3du9zpx7bf,7.62,5.94,4.38,7.24,3.58,7.80,8.75,7.32,6.88,...,3.73,0.0,-0.107,0.002,0.032,,5000,good,544,POINT (1640068.843 -4020504.987)
21041,r3dug3msvg7,11.89,9.93,11.62,16.12,3.17,10.74,11.56,9.94,10.57,...,5.61,0.0,-0.154,0.014,0.059,,5000,good,228,POINT (1645918.102 -4020005.756)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21052,r3du63p9kxs,6.69,5.90,4.65,9.18,6.86,8.57,8.62,8.64,6.08,...,4.41,0.0,-0.130,0.000,0.032,,5000,good,325,POINT (1640808.347 -4029348.805)
21053,r3du6drq9ns,8.70,7.50,6.48,10.58,9.72,11.90,11.24,10.71,8.87,...,4.78,0.0,-0.152,0.001,0.042,,5000,good,216,POINT (1641869.059 -4028630.665)
21054,r3du6sqj6rv,8.22,6.96,5.66,9.56,10.61,12.75,14.08,10.84,9.21,...,5.18,0.0,-0.131,0.008,0.046,,5000,good,216,POINT (1641890.805 -4027424.812)
21055,r3du6wwkh7z,7.94,6.49,4.73,8.45,11.27,12.86,14.95,10.57,8.82,...,4.21,0.0,-0.125,0.016,0.049,,5000,good,183,POINT (1642120.913 -4026097.963)


In [7]:
from dea_tools.coastal import model_tides

# Model hourly tides from 2006 to 2023 at every NSW point. Point
# coordinates are passed in the CRS of the points layer. "one-to-one"
# mode is disabled here, so `model_tides` runs in its default mode
tide_df = model_tides(
    x=points_nsw_gdf.geometry.x,
    y=points_nsw_gdf.geometry.y,
    time=pd.date_range("2006-01-01", "2023-12-31", freq="1h"),
    crs=points_gdf.crs,
    model=[
        "FES2014",
        "FES2012",
        "TPXO8-atlas-v1",
        "TPXO9-atlas-v5",
        "EOT20",
        "HAMTIDE11",
        "GOT4.10",
        "ensemble",
    ],
    # mode="one-to-one",
    parallel_splits=100,
    output_format="wide",
).rename(
    {
        "TPXO9-atlas-v5": "TPXO9",
        "TPXO8-atlas": "TPXO8",
        "TPXO8-atlas-v1": "TPXO8",
    },
    axis=1,
)
tide_df

Running ensemble tide modelling
Modelling tides using FES2014, TPXO9-atlas-v5, EOT20, HAMTIDE11, GOT4.10, FES2012, TPXO8-atlas-v1 in parallel


100%|██████████| 700/700 [20:09<00:00,  1.73s/it]  


Interpolating model rankings using IDW interpolation 
Combining models into single ensemble model
Converting to a wide format dataframe


Unnamed: 0_level_0,Unnamed: 1_level_0,tide_model,EOT20,FES2012,FES2014,GOT4.10,HAMTIDE11,TPXO8,TPXO9,ensemble
time,x,y,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2006-01-01,1.589465e+06,-4.164731e+06,0.834172,0.859840,0.804386,0.781466,0.805511,0.796934,0.797103,0.830372
2006-01-01,1.589743e+06,-4.166606e+06,0.834172,0.862422,0.804391,0.781466,0.805511,0.796934,0.797103,0.831233
2006-01-01,1.590986e+06,-4.164078e+06,0.834172,0.859840,0.804393,0.781466,0.805511,0.795420,0.798352,0.830788
2006-01-01,1.591582e+06,-4.167364e+06,0.834172,0.862422,0.804424,0.781466,0.805511,0.796934,0.797103,0.831233
2006-01-01,1.591642e+06,-4.201466e+06,0.805139,0.858243,0.793298,0.747214,0.798356,0.773553,0.775793,0.813058
...,...,...,...,...,...,...,...,...,...,...
2023-12-31,2.081501e+06,-3.239350e+06,0.742824,0.773398,0.740712,0.764520,0.753914,0.775481,0.782922,0.753006
2023-12-31,2.081612e+06,-3.246330e+06,0.742824,0.770552,0.738057,0.764520,0.753914,0.773362,0.776337,0.751414
2023-12-31,2.081933e+06,-3.237272e+06,0.742824,0.773398,0.741339,0.764520,0.753914,0.775481,0.783023,0.753215
2023-12-31,2.082103e+06,-3.244006e+06,0.742824,0.770552,0.739010,0.764520,0.753914,0.773362,0.776337,0.751732


In [None]:
# Export tide heights as integer millimetres. Values are rounded to the
# nearest millimetre first, because a bare integer cast truncates
# towards zero
(tide_df * 1000).round().astype("int16").to_csv("nsw_tides_experimental_mm.csv")

In [10]:
# Group modelled tides by point location
tide_df_grouped = tide_df.groupby(["x", "y"])

In [17]:
# Tide range per point and model: maximum minus minimum modelled height, rounded to 3 decimals
tide_df_range = (tide_df_grouped.max() - tide_df_grouped.min()).round(3)

In [38]:
# Rebuild point geometries from the x/y index levels (in the CRS of the
# points layer), reproject to EPSG:4326 and export the tide ranges
gpd.GeoDataFrame(
    data=tide_df_range,
    geometry=gpd.points_from_xy(
        x=tide_df_range.index.get_level_values("x"),
        y=tide_df_range.index.get_level_values("y"),
    ),
    crs=points_gdf.crs,
).to_crs("EPSG:4326").to_file("nsw_tide_range_experimental.geojson")

In [24]:
points_nsw_gdf.geometry.reset_index(drop=True)


0      POINT (1644391.729 -4023395.740)
1      POINT (1640406.260 -4023177.782)
2      POINT (1640627.167 -4021756.684)
3      POINT (1640068.843 -4020504.987)
4      POINT (1645918.102 -4020005.756)
                     ...               
937    POINT (1640808.347 -4029348.805)
938    POINT (1641869.059 -4028630.665)
939    POINT (1641890.805 -4027424.812)
940    POINT (1642120.913 -4026097.963)
941    POINT (1641120.901 -4024787.355)
Name: geometry, Length: 942, dtype: geometry

### Tide gauge weighting

In [None]:
models = ["EOT20", "FES2012", "FES2014", "GOT4.10", "HAMTIDE11", "TPXO8", "TPXO9"]

# Convert to wide: one row per site and one RMSE column per model
wide_df = accuracy_sites_df["RMSE"].unstack("tide_model").reset_index()
wide_df = wide_df.set_index("site_name")[models]
wide_df

#### Correlation weighting (alternative)

In [None]:
models = ["EOT20", "FES2012", "FES2014", "GOT4.10", "HAMTIDE11", "TPXO8", "TPXO9"]
corr_results_gesla = gpd.read_file("corr_results_gesla_v3.geojson")

# Rename columns to match the model names used elsewhere in this
# notebook, index by site and drop sites with any missing model value
wide_df = (
    corr_results_gesla
    .rename(
        {"point_id": "site_name", "TPXO8-atlas-v1": "TPXO8", "TPXO9-atlas-v5": "TPXO9"},
        axis=1,
    )
    .set_index("site_name")[models]
    .dropna(axis=0)
)

# Invert so that lower values are better, like the RMSE table that this
# cell replaces (presumably the file holds correlations - confirm)
wide_df = 1 - wide_df

wide_df

### Weight application

In [None]:
def weighted_mean_top_n(gauge_df, altimetry_df, top_n=5, power=10):
    """
    Combine per-model values into a weighted mean of the `top_n` models,
    with weights derived from each model's per-site score.

    Parameters
    ----------
    gauge_df : pd.DataFrame
        Values to combine, with one column per model. Rows are aligned
        against `altimetry_df` by index when weights are applied.
    altimetry_df : pd.DataFrame
        Per-site score for each model (one column per model), where
        lower values indicate a better model.
    top_n : int, optional
        Number of best models per site to give non-zero weight.
        Defaults to 5.
    power : int or float, optional
        Exponent applied to the rank-based weights before normalising;
        larger values concentrate weight on the best models.
        Defaults to 10.

    Returns
    -------
    pd.Series
        Weighted mean across models for each row of `gauge_df`.
    """

    # Rank models within each site in descending order, so the model
    # with the lowest score receives the highest rank number
    ranks_df = altimetry_df.rank(axis=1, ascending=False, method="max")

    # Shift ranks so the top N models get weights 1..top_n and all
    # others get 0. The model count comes from `altimetry_df` itself
    # rather than from a global variable
    n_models = len(altimetry_df.columns)
    altimetry_weights = (ranks_df - (n_models - top_n)).clip(0, top_n) ** power

    # Normalise to sum to 1
    altimetry_weights = altimetry_weights.divide(altimetry_weights.sum(axis=1), axis=0)

    return (gauge_df * altimetry_weights).sum(axis=1)
    

In [None]:
# Prepare data: add site name to the index and attach the gauge
# observations positionally (relies on matching row order)
test_df = tide_df.set_index("site_name", append=True).copy()
test_df["site_code"] = gauge_df.index.get_level_values("site_code").values
test_df["tide_gauge"] = gauge_df.sea_level.values

# Apply different weighting schemes
# NOTE(review): `median_top_n` and `mean_top_n` are not defined in the
# visible part of this notebook - confirm they exist before running
test_df["median"] = test_df[models].median(axis=1)
# test_df["median_top5"] = median_top_n(test_df[models], wide_df, top_n=5)
test_df["median_top3"] = median_top_n(test_df[models], wide_df, top_n=3)
test_df["mean"] = test_df[models].mean(axis=1)
# test_df["mean_top5"] = mean_top_n(test_df[models], wide_df, top_n=5)
test_df["mean_top3"] = mean_top_n(test_df[models], wide_df, top_n=3)

# test_df["mean_top2"] = mean_top_n(test_df[models], wide_df, top_n=2)
# test_df["top_model"] = mean_top_n(test_df[models], wide_df, top_n=1)
# test_df["weighted_mean_power2"] = weighted_mean_power(test_df[models], wide_df, power=2)
# test_df["weighted_mean_power5"] = weighted_mean_power(test_df[models], wide_df, power=5)
# test_df["weighted_mean_power10"] = weighted_mean_power(test_df[models], wide_df, power=10)
# test_df["weighted_mean_top3"] = weighted_mean_top_n(test_df[models], wide_df, top_n=3)
# test_df["weighted_mean_top5"] = weighted_mean_top_n(test_df[models], wide_df, top_n=5)

# Reshape to long format
tide_df_long = test_df.melt(
    ignore_index=False,
    id_vars=["tide_gauge", "site_code"],
    value_name="tide_m",
)

# Calculate statistics against tide gauge data
accuracy_df = tide_df_long.groupby(["tide_model"])[["tide_gauge", "tide_m"]].apply(
    lambda x: eval_metrics(x=x.tide_gauge, y=x.tide_m)
)
# Display ordered by RMSE, with a fixed colour scale of 0.15-0.5
accuracy_df.sort_values("RMSE").style.background_gradient(
    cmap="RdBu_r", subset=["RMSE"], vmin=0.15, vmax=0.5
)

In [None]:
# NOTE(review): this cell repeats the preceding cell verbatim

# Prepare data: add site name to the index and attach the gauge
# observations positionally (relies on matching row order)
test_df = tide_df.set_index("site_name", append=True).copy()
test_df["site_code"] = gauge_df.index.get_level_values("site_code").values
test_df["tide_gauge"] = gauge_df.sea_level.values

# Apply different weighting schemes
# NOTE(review): `median_top_n` and `mean_top_n` are not defined in the
# visible part of this notebook - confirm they exist before running
test_df["median"] = test_df[models].median(axis=1)
# test_df["median_top5"] = median_top_n(test_df[models], wide_df, top_n=5)
test_df["median_top3"] = median_top_n(test_df[models], wide_df, top_n=3)
test_df["mean"] = test_df[models].mean(axis=1)
# test_df["mean_top5"] = mean_top_n(test_df[models], wide_df, top_n=5)
test_df["mean_top3"] = mean_top_n(test_df[models], wide_df, top_n=3)

# test_df["mean_top2"] = mean_top_n(test_df[models], wide_df, top_n=2)
# test_df["top_model"] = mean_top_n(test_df[models], wide_df, top_n=1)
# test_df["weighted_mean_power2"] = weighted_mean_power(test_df[models], wide_df, power=2)
# test_df["weighted_mean_power5"] = weighted_mean_power(test_df[models], wide_df, power=5)
# test_df["weighted_mean_power10"] = weighted_mean_power(test_df[models], wide_df, power=10)
# test_df["weighted_mean_top3"] = weighted_mean_top_n(test_df[models], wide_df, top_n=3)
# test_df["weighted_mean_top5"] = weighted_mean_top_n(test_df[models], wide_df, top_n=5)

# Reshape to long format
tide_df_long = test_df.melt(
    ignore_index=False,
    id_vars=["tide_gauge", "site_code"],
    value_name="tide_m",
)

# Calculate statistics against tide gauge data
accuracy_df = tide_df_long.groupby(["tide_model"])[["tide_gauge", "tide_m"]].apply(
    lambda x: eval_metrics(x=x.tide_gauge, y=x.tide_m)
)
# Display ordered by RMSE, with a fixed colour scale of 0.15-0.5
accuracy_df.sort_values("RMSE").style.background_gradient(
    cmap="RdBu_r", subset=["RMSE"], vmin=0.15, vmax=0.5
)

In [None]:
# Compare performance at worst sites
accuracy_sites_df = tide_df_long.dropna(axis=0).groupby(["tide_model", "site_code"])[
    ["tide_gauge", "tide_m"]
].apply(lambda x: eval_metrics(x=x.tide_gauge, y=x.tide_m))

accuracy_sites_df.groupby(["tide_model"]).quantile(0.9).sort_values(
    "RMSE"
).style.background_gradient(cmap="RdBu_r", subset=["RMSE"], vmin=0.15, vmax=0.5)


In [None]:
# Compare performance at worst sites
accuracy_sites_df = tide_df_long.dropna(axis=0).groupby(["tide_model", "site_code"])[
    ["tide_gauge", "tide_m"]
].apply(lambda x: eval_metrics(x=x.tide_gauge, y=x.tide_m))

accuracy_sites_df.groupby(["tide_model"]).quantile(0.9).sort_values(
    "RMSE"
).style.background_gradient(cmap="RdBu_r", subset=["RMSE"], vmin=0.15, vmax=0.5)



In [None]:
# Inspect per-model accuracy at a single site (site code "60780")
accuracy_sites_df.query('site_code == "60780"').sort_values(
    "RMSE"
).style.background_gradient(cmap="RdBu_r", subset=["RMSE"], vmin=0.15, vmax=0.5)

## Altimetry comparisons

In [None]:
# Convert to GeoDataFrame
wide_df = accuracy_df["RMSE"].unstack("tide_model").reset_index()
wide_gdf = gpd.GeoDataFrame(
    data=wide_df, geometry=gpd.points_from_xy(x=wide_df.x, y=wide_df.y), crs="EPSG:4326"
)

In [None]:
# Load in altimetry data
xtrack_rms_gdf = gpd.read_file("/home/jovyan/altimetry/X-TRACK/xtrack_rms_all.geojson")
xtrack_rms_gdf

In [None]:
# gpd.sjoin_nearest(
#         gpd.GeoDataFrame(geometry=gpd.points_from_xy(x=[149.88344], y=[-22.33805], crs="EPSG:4326")),
#         xtrack_rms_gdf,
#         lsuffix="gauge",
#         rsuffix="",
#         how="left",
#         distance_col="dist",
#     )

In [None]:
model_names = ["EOT20", "FES2012", "FES2014", "GOT4.10", "HAMTIDE11", "TPXO8", "TPXO9"]

# Gauge-based values per site, one column per model
wide_df = wide_gdf.set_index("site_name")[model_names]

# Match every gauge to its nearest altimetry point, recording the
# distance between the two in a "dist" column
xtrack_df = (
    gpd.sjoin_nearest(
        wide_gdf[["site_name", "geometry"]],
        xtrack_rms_gdf,
        lsuffix="gauge",
        rsuffix="",
        how="left",
        distance_col="dist",
    )
    .set_index("site_name")
)

# Keep distances between gauge and altimetry points
# NOTE(review): both layers are in a geographic CRS here, so distances
# are presumably in degrees rather than metres - confirm
distances = xtrack_df.dist
xtrack_df = xtrack_df[model_names] 

### Difference between altimetry and gauge RMS

In [None]:
# Show the altimetry-minus-gauge difference for every site and model,
# temporarily raising the row display limit so all sites are visible
with pd.option_context('display.max_rows', 500):
    display((xtrack_df - wide_df).reset_index().style.background_gradient(cmap='RdBu', axis=None, vmin=-0.3, vmax=0.3))

### RMS scatterplot

In [None]:
# Reshape gauge values to long format (one row per site and model),
# carrying the gauge-to-altimetry distance along with each row
gauge_df_long = wide_df.assign(distance=distances).melt(
    id_vars=["distance"], value_name="Tide gauge (metres RMS)", ignore_index=False
)
# Add the matching altimetry values.
# NOTE(review): this assignment aligns on the site_name index, which
# repeats once per model in both tables - confirm rows pair up by model
gauge_df_long["Altimetry (metres RMS)"] = xtrack_df.melt(ignore_index=False).value

# Scatter gauge against altimetry values, coloured by site and with a
# different marker per model
fig, ax = plt.subplots(figsize=(7, 7))
sns.scatterplot(
    ax=ax,
    data=gauge_df_long,
    x="Tide gauge (metres RMS)",
    y="Altimetry (metres RMS)",
    hue="site_name",
    s=50,
    palette=sns.color_palette("tab20"),
    edgecolor="black",
    style="tide_model",
)
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
# Dashed 1:1 reference line
ax.plot([0.05, 2], [0.05, 2], "--", c="black")
ax.set_title(
    "Comparison of tide model performance at tide\ngauge and nearest altimetry observation"
);

In [None]:
import matplotlib 

# Plot the tide gauge value of every site/model pair against the distance
# between the gauge and its nearest altimetry point, to check whether the
# results depend on how far away the matched altimetry observation is
fig, ax = plt.subplots(figsize=(7, 7))
sns.scatterplot(
    ax=ax,
    data=gauge_df_long,
    x="distance",
    y="Tide gauge (metres RMS)",
    hue="distance",
    # hue_norm=matplotlib.colors.LogNorm(),
    s=50,
    palette="magma",
    edgecolor="black",
    style="tide_model",
)
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
# ax.plot([0.05, 2], [0.05, 2], "--", c="black")

# Title describes this plot's axes (it was previously copied unchanged from
# the gauge-vs-altimetry scatterplot)
ax.set_title(
    "Tide model performance at tide gauge vs.\ndistance to nearest altimetry observation"
);

### Overall stats

In [None]:
eval_metrics(x=gauge_df_long["Tide gauge (metres RMS)"], 
             y=gauge_df_long["Altimetry (metres RMS)"])

### Agreement between top/bottom models

In [None]:
# For every site, keep the four rows (models) with the largest gauge value and,
# separately, the four with the largest altimetry value. "tide_model" is
# appended to the index so the two selections can be joined on it.
gauge_worst_n = gauge_df_long.groupby("site_name").apply(lambda x: x.nlargest(4, columns="Tide gauge (metres RMS)", keep='first')).set_index("tide_model", append=True)
xtrack_worst_n = gauge_df_long.groupby("site_name").apply(lambda x: x.nlargest(4, columns="Altimetry (metres RMS)", keep="first")).set_index("tide_model", append=True)

In [None]:
# Left-join the altimetry selection onto the gauge selection and count the
# non-null entries per site: the altimetry column's count is the number of
# models that appear in both "worst four" lists
gauge_worst_n[["Tide gauge (metres RMS)"]].join(xtrack_worst_n[["Altimetry (metres RMS)"]], rsuffix="_altimetry").groupby("site_name").count()

### Spearman's correlation between gauge and altimetry per site
High values indicate that both data sources rank the models in the same RMS order.

In [None]:
xtrack_df.corrwith(wide_df, axis=1, method="spearman").to_frame().style.background_gradient(cmap='RdBu', axis=None, vmin=-1, vmax=1)

## Weighting test

In [None]:
def weighted_median(values, weights):
    """
    Return the weighted median of `values`.

    The values are sorted in ascending order and the first value whose
    cumulative weight reaches at least half of the total weight is
    returned (i.e. the lower weighted median; no interpolation).

    Parameters
    ----------
    values : array-like
        Values to take the median of.
    weights : array-like
        Weight of each entry in `values`, in the same order.

    Returns
    -------
    An element of `values`.

    Raises
    ------
    IndexError
        If no cumulative weight reaches the half-way mark (e.g. empty
        input).
    """
    vals = np.array(values)
    wts = np.array(weights)

    # Put both arrays in ascending order of value
    order = np.argsort(vals)
    vals, wts = vals[order], wts[order]

    # Running total of weight, and the half-way mark to reach
    running_weight = np.cumsum(wts)
    half_weight = np.sum(wts) / 2.0

    # First value at which the running weight reaches the half-way mark
    reached = running_weight >= half_weight
    return vals[reached][0]


# def weighted_quantiles(values, weights, quantiles=0.5):
#     i = np.argsort(values)
#     c = np.cumsum(weights[i])
#     return values[i[np.searchsorted(c, np.array(quantiles) * c[-1])]]

# def weighted_median(df, val, weight):
#     df_sorted = df.sort_values(val)
#     cumsum = df_sorted[weight].cumsum()
#     cutoff = df_sorted[weight].sum() / 2.
#     return df_sorted[cumsum >= cutoff][val].iloc[0]

In [None]:
# weighted_median = test_df[model_names].apply(lambda x: weighted_median(x, xtrack_weights.loc[x.name[3]]), axis=1)

In [None]:
def weighted_mean_power(gauge_df, altimetry_df, power=5):
    """
    Combine the columns of `gauge_df` into a single weighted mean per row,
    with weights derived from `altimetry_df`.

    For each row of `altimetry_df`, every column is scored by how much
    larger its value is than the row minimum. That excess is clipped to
    the range 0-1, inverted so the minimum scores 1, and raised to
    `power`. The scores are then normalised to sum to 1 per row and used
    to weight the matching columns of `gauge_df`.

    Parameters
    ----------
    gauge_df : pandas.DataFrame
        Values to be averaged, one column per model.
    altimetry_df : pandas.DataFrame
        Per-model values used to derive weights (lower = higher weight).
    power : numeric, optional
        Exponent applied to the 0-1 scores; larger values concentrate
        weight on the columns closest to the row minimum.

    Returns
    -------
    pandas.Series
        Row-wise weighted sum of `gauge_df`.
    """
    # Excess of each column over the lowest value in the same row
    best_per_row = altimetry_df.min(axis=1)
    excess = altimetry_df.sub(best_per_row, axis=0)

    # Turn the excess into a 0-1 score (1 = row minimum) and sharpen it
    raw_weights = (1 - excess.clip(0, 1)) ** power

    # Rescale so that every row of weights sums to 1
    row_totals = raw_weights.sum(axis=1)
    normalised = raw_weights.div(row_totals, axis=0)

    weighted = gauge_df * normalised
    return weighted.sum(axis=1)

def weighted_mean_power_top_n(gauge_df, altimetry_df, power=5, top_n=3):
    """
    Power-weighted row mean of `gauge_df`, restricted to the `top_n`
    lowest-valued columns of `altimetry_df` in each row.

    Weights are computed as in a plain power weighting: the excess of
    each value over the row minimum is clipped to 0-1, inverted and
    raised to `power`. Columns whose within-row rank in `altimetry_df`
    is greater than `top_n` then get a weight of 0, before the weights
    are normalised to sum to 1 per row.

    Parameters
    ----------
    gauge_df : pandas.DataFrame
        Values to be averaged, one column per model.
    altimetry_df : pandas.DataFrame
        Per-model values used to rank models and derive weights
        (lower = better rank and higher weight).
    power : numeric, optional
        Exponent applied to the 0-1 scores.
    top_n : numeric, optional
        Highest rank (1 = lowest value) that still receives weight.

    Returns
    -------
    pandas.Series
        Row-wise weighted sum of `gauge_df`.
    """
    # Flag the columns ranked within the top_n lowest values of each row
    in_top_n = altimetry_df.rank(axis=1) <= top_n

    # Score each column by its excess over the row minimum, then sharpen
    excess = altimetry_df.sub(altimetry_df.min(axis=1), axis=0)
    raw_weights = (1 - excess.clip(0, 1)) ** power

    # Discard the weight of anything outside the top_n
    kept_weights = raw_weights.where(in_top_n, 0)

    # Rescale so that every row of weights sums to 1
    normalised = kept_weights.div(kept_weights.sum(axis=1), axis=0)

    return (gauge_df * normalised).sum(axis=1)


def mean_top_n(gauge_df, altimetry_df, top_n=5):
    """
    Row-wise mean of `gauge_df` over the `top_n` best-ranked columns.

    Columns are ranked within each row of `altimetry_df` (rank 1 = lowest
    value). Entries of `gauge_df` whose rank exceeds `top_n` are masked
    out before the mean is taken.

    Parameters
    ----------
    gauge_df : pandas.DataFrame
        Values to be averaged, one column per model.
    altimetry_df : pandas.DataFrame
        Per-model values used to rank the models (lower = better).
    top_n : numeric, optional
        Highest rank that is still included in the mean.

    Returns
    -------
    pandas.Series
        Mean of the retained `gauge_df` values in each row.
    """
    # Rank models within each row, then flag those inside the top_n
    model_ranks = altimetry_df.rank(axis=1)
    is_top_ranked = model_ranks <= top_n

    # Blank out everything else and average what is left
    top_only = gauge_df.where(is_top_ranked)
    return top_only.mean(axis=1)


def median_top_n(gauge_df, altimetry_df, top_n=5):
    """
    Row-wise median of `gauge_df` over the `top_n` best-ranked columns.

    Columns are ranked within each row of `altimetry_df` (rank 1 = lowest
    value). Entries of `gauge_df` whose rank exceeds `top_n` are masked
    out before the median is taken.

    Parameters
    ----------
    gauge_df : pandas.DataFrame
        Values to take the median of, one column per model.
    altimetry_df : pandas.DataFrame
        Per-model values used to rank the models (lower = better).
    top_n : numeric, optional
        Highest rank that is still included in the median.

    Returns
    -------
    pandas.Series
        Median of the retained `gauge_df` values in each row.
    """
    # Rank models within each row, then flag those inside the top_n
    model_ranks = altimetry_df.rank(axis=1)
    is_top_ranked = model_ranks <= top_n

    # Blank out everything else and take the median of what is left
    top_only = gauge_df.where(is_top_ranked)
    return top_only.median(axis=1)
    

In [None]:
# Prepare data: start from the modelled tides and add "site_name" as an extra
# index level
test_df = tide_df.set_index("site_name", append=True).copy()

# NOTE(review): the two assignments below copy raw `.values`, so they are
# positional - they assume `gauge_df` has exactly the same rows in the same
# order as `tide_df`. Confirm against where both are built.
test_df["site_code"] = gauge_df.index.get_level_values("site_code").values
test_df["tide_gauge"] = gauge_df.sea_level.values

# Apply different weighting schemes. Each new column combines the individual
# model columns into one ensemble estimate; the "top n" and "power" variants
# use the altimetry table `xtrack_df` to rank or weight the models.
test_df["median"] = test_df[model_names].median(axis=1)
test_df["median_top5"] = median_top_n(test_df[model_names], xtrack_df, top_n=5)
test_df["median_top3"] = median_top_n(test_df[model_names], xtrack_df, top_n=3)
test_df["mean"] = test_df[model_names].mean(axis=1)
test_df["mean_top5"] = mean_top_n(test_df[model_names], xtrack_df, top_n=5)
test_df["mean_top3"] = mean_top_n(test_df[model_names], xtrack_df, top_n=3)
test_df["weighted_mean_power2"] = weighted_mean_power(
    test_df[model_names], xtrack_df, power=2
)
test_df["weighted_mean_power5"] = weighted_mean_power(
    test_df[model_names], xtrack_df, power=5
)
test_df["weighted_mean_power10"] = weighted_mean_power(
    test_df[model_names], xtrack_df, power=10
)
test_df["weighted_mean_power5_top3"] = weighted_mean_power_top_n(test_df[model_names], xtrack_df, power=5, top_n=3)
test_df["weighted_mean_power5_top5"] = weighted_mean_power_top_n(test_df[model_names], xtrack_df, power=5, top_n=5)
test_df["weighted_mean_power10_top3"] = weighted_mean_power_top_n(test_df[model_names], xtrack_df, power=10, top_n=3)
test_df["weighted_mean_power10_top5"] = weighted_mean_power_top_n(test_df[model_names], xtrack_df, power=10, top_n=5)

# Reshape to long format: every model/ensemble column becomes rows of
# "tide_m", with the gauge observation and site code repeated alongside
tide_df_long = test_df.melt(
    ignore_index=False,
    id_vars=["tide_gauge", "site_code"],
    value_name="tide_m",
)

# Calculate statistics against tide gauge data (x = gauge observation,
# y = modelled tide), one set of metrics per model/ensemble
accuracy_df = tide_df_long.groupby(["tide_model"])[["tide_gauge", "tide_m"]].apply(
    lambda x: eval_metrics(x=x.tide_gauge, y=x.tide_m)
)

# Show the models ordered from lowest to highest RMSE
accuracy_df.sort_values("RMSE").style.background_gradient(
    cmap="RdBu_r", subset=["RMSE"], vmin=0.15, vmax=0.5
)

In [None]:
# Compare performance at worst sites: first compute the metrics separately for
# every model/site combination (result is indexed by tide_model, site_code)
accuracy_sites_df = tide_df_long.groupby(["tide_model", "site_code"])[
    ["tide_gauge", "tide_m"]
].apply(lambda x: eval_metrics(x=x.tide_gauge, y=x.tide_m))

# Then take the 0.9 quantile of each metric across sites for every model, and
# order the models by that quantile of RMSE
accuracy_sites_df.groupby(["tide_model"]).quantile(0.9).sort_values(
    "RMSE"
).style.background_gradient(cmap="RdBu_r", subset=["RMSE"], vmin=0.15, vmax=0.5)


In [None]:
accuracy_sites_df.query('site_code == "60710"').sort_values(
    "RMSE"
).style.background_gradient(cmap="RdBu_r", subset=["RMSE"], vmin=0.15, vmax=0.5)

In [None]:
accuracy_sites_df.groupby(["tide_model"]).quantile(0.95).sort_values(
    "RMSE"
).style.background_gradient(cmap="RdBu_r", subset=["RMSE"], vmin=0.15, vmax=0.5)

In [None]:
# Rank the models by RMSE within each site (rank 1 = lowest RMSE at that site).
# After `reset_index` the "RMSE" column therefore holds ranks, not RMSE values.
# NOTE(review): `.subtract(0)` leaves the ranks unchanged - it looks like a
# leftover offset and could be removed.
ranked_sites_df = accuracy_sites_df.groupby("site_code")["RMSE"].transform(func='rank').subtract(0).reset_index()

# Histogram of per-site ranks, one panel per model
sns.displot(
    ranked_sites_df, x="RMSE", col="tide_model", col_wrap=4,
    binwidth=1, height=3, facet_kws=dict(margin_titles=True),
)

In [None]:
sns.displot(
    accuracy_sites_df.clip(0,1), x="RMSE", col="tide_model", col_wrap=4,
    binwidth=0.05, height=3, facet_kws=dict(margin_titles=True),
)

In [None]:
# Best results overall: excluding EOT20, find the row with the lowest RMSE in
# each group and count how often each model is that best row.
# NOTE(review): `accuracy_sites_df` is built by grouping on
# ["tide_model", "site_code"], so "site_name" does not look like one of its
# index levels - this probably needs to be "site_code". Confirm.
accuracy_sites_df.loc[accuracy_sites_df.query("tide_model != 'EOT20'").groupby("site_name").RMSE.idxmin()
].reset_index().tide_model.value_counts()
    

In [None]:

# Step 1: Group by the first level of the multi-index (level 0; "tide_model"
# when `accuracy_sites_df` is indexed by tide_model, site_code)
grouped = accuracy_sites_df.groupby(level=0)

# Step 2: Sort within each group by "RMSE" in descending order
sorted_groups = grouped.apply(lambda x: x.sort_values("RMSE", ascending=False))

# sorted_groups.reset_index(inplace=True)

# Step 3: Retrieve the second row of each sorted group, i.e. the entry with
# the second highest RMSE
second_highest_per_group = sorted_groups.groupby(level=0).nth(1).reset_index()

# Display the result
print(second_highest_per_group)

In [None]:
# Step 1: Group by the "site_name" index level.
# NOTE(review): `accuracy_sites_df` is built by grouping on
# ["tide_model", "site_code"], so a "site_name" level may not exist here -
# confirm which level was intended.
grouped = accuracy_sites_df.groupby(level='site_name')

# Step 2: Keep the two rows with the largest "RMSE" in each group, then take
# the second of them (the second highest RMSE per group)
second_highest_per_group = grouped.apply(lambda x: x.nlargest(2, 'RMSE')).groupby('site_name').nth(1).reset_index()

# Display the result
print(second_highest_per_group)

In [None]:
# Best results overall
accuracy_sites_df.loc[
    accuracy_sites_df.sort_values("RMSE").groupby("site_name").RMSE.nth(0).index
].reset_index().tide_model.value_counts()

In [None]:
accuracy_sites_df.sort_values("RMSE").groupby("site_name").RMSE.apply(lambda t: t.iloc[1])

In [None]:
with pd.option_context("display.max_rows", 500):
    display(xtrack_df)

In [None]:
with pd.option_context("display.max_rows", 500):
    display(
        xtrack_weights.style.background_gradient(
            cmap="RdBu", axis=None, vmin=0, vmax=0.2
        )
    )

In [None]:
xtrack_weights.mean(axis=0).to_frame("Average weighting").sort_values("Average weighting", ascending=False)

### Test aggregation

In [None]:
xtrack_df

In [None]:
# Per site, percentage difference between the mean over all models and the
# mean over the five models with the smallest altimetry value:
# (mean_all / mean_best5 - 1) * 100, shown for the gauge column only.
# NOTE(review): the column names used here ("X-TRACK altimetry (metres RMS)",
# "ABSLMP tide gauge (metres RMS)") differ from the ones `gauge_df_long` is
# created with in the RMS scatterplot cell ("Altimetry (metres RMS)",
# "Tide gauge (metres RMS)") - this cell looks stale; confirm.
((gauge_df_long.groupby(
    "site_name"
).mean() / gauge_df_long.groupby("site_name").apply(
    lambda x: x.nsmallest(5, columns="X-TRACK altimetry (metres RMS)", keep="first")
).droplevel(level=1).groupby("site_name").mean() - 1.0)[["ABSLMP tide gauge (metres RMS)"]] * 100).style.background_gradient(cmap='RdBu', axis=None, vmin=-50, vmax=50)

In [None]:
gauge_df_long.groupby("site_name")[["X-TRACK altimetry (metres RMS)"]].rank()

In [None]:
gauge_df_long.groupby("site_name")[["X-TRACK altimetry (metres RMS)"]].rank()

In [None]:
gauge_df_long.groupby("site_name")[["X-TRACK altimetry (metres RMS)"]].rank()

In [None]:
# Derive per-site weights from the altimetry ranking; with ascending=False the
# largest altimetry value gets rank 1, so lower values receive larger weights.
# NOTE(review): the first assignment (rank shifted by 2 and clipped to 0-5) is
# immediately overwritten by the second (plain rank), so only the second has
# any effect - keep whichever variant is intended.
gauge_df_long["weights"] = (gauge_df_long.groupby("site_name")[["X-TRACK altimetry (metres RMS)"]].rank(ascending=False) - 2).clip(0, 5)
gauge_df_long["weights"] = (gauge_df_long.groupby("site_name")[["X-TRACK altimetry (metres RMS)"]].rank(ascending=False))

# Inspect the result for a single site
gauge_df_long.loc["Broome"]

In [None]:
def weighted_mean(df, values, weights, groupby):
    """
    Weighted mean of one column within groups.

    Each row contributes `values / (sum of weights in its group) * weights`,
    and the contributions are summed per group.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is copied, so the caller's frame is not modified.
    values : column label
        Column holding the values to average.
    weights : column label
        Column holding the weight of each row.
    groupby : label, list of labels or grouper
        Passed straight to `DataFrame.groupby` to define the groups.

    Returns
    -------
    pandas.Series
        Series named "weighted_average" with one entry per group.
    """
    frame = df.copy()
    by_group = frame.groupby(groupby)

    # Total weight of the group each row belongs to, aligned row-by-row
    weight_totals = by_group[weights].transform("sum")

    # Each row's share of its group's weighted mean
    frame["weighted_average"] = frame[values] / weight_totals * frame[weights]

    # min_count=1 makes a group with no valid contributions come out as NaN
    # instead of 0 (original note: min_count is required for Grouper objects)
    return by_group["weighted_average"].sum(min_count=1)

def weighted_median(df, val, weight):
    """
    Weighted median of column `val` of `df`, weighted by column `weight`.

    Rows are sorted by `val` and the value of the first row whose
    cumulative weight reaches at least half of the total weight is
    returned (the lower weighted median; no interpolation).

    Parameters
    ----------
    df : pandas.DataFrame
        Input table.
    val : column label
        Column holding the values.
    weight : column label
        Column holding the weight of each row.

    Returns
    -------
    A single element of `df[val]`.

    Raises
    ------
    IndexError
        If no row reaches the half-way mark (e.g. an empty frame).
    """
    ordered = df.sort_values(by=val)

    # Running total of weight in value order, and the half-way mark
    running_weight = ordered[weight].cumsum()
    half_weight = ordered[weight].sum() / 2.0

    # Value of the first row at or beyond the half-way mark
    reached = running_weight >= half_weight
    return ordered.loc[reached, val].iloc[0]

weighted_mean(gauge_df_long, values="ABSLMP tide gauge (metres RMS)", weights="weights", groupby="site_name") 
# gauge_df_long.groupby("site_name").apply(lambda x: weighted_median(x, val="ABSLMP tide gauge (metres RMS)", weight="weights"))


In [None]:
gauge_df_long.groupby("site_name").mean()["ABSLMP tide gauge (metres RMS)"]

In [None]:
weighted_df = gauge_df_long.groupby("site_name").first()
weighted_df["weights"] = gauge_df_long.groupby("site_name")[["X-TRACK altimetry (metres RMS)"]].rank()


#     [["tide_model", "X-TRACK altimetry (metres RMS)"]].rank()   #axis=0, numeric_only=True).loc["Stony Point"]

In [None]:
df1 = pd.DataFrame(index=("a", "a", "b", "b"), data={"test": [1, 2, 3, 4]})
df1

In [None]:
df2 = pd.DataFrame(index=("a", "b"), data={"test": [0, 2]})
df2

In [None]:
df1 * df2

In [None]:
gauge_df_long.groupby("site_name").mean()

### Pytides

In [None]:
from pytides.tide import Tide

# NOTE(review): `water_level` is not defined in this part of the notebook;
# presumably it is a time-indexed pandas Series of observed water levels -
# confirm where it comes from.

# Remove the mean so the series is centred on zero
demeaned = water_level - water_level.mean()

# Fit a pytides tidal model to the de-meaned series at its timestamps
tide = Tide.decompose(demeaned, water_level.index)

# Plot the residual: observation minus the pytides prediction at the same times
(demeaned - tide.at(water_level.index)).plot(figsize=(13, 10))