# Subsampling Comparisons

In [79]:
import xarray as xr
import rioxarray as rxr
import gval
from gval import CatStats
import pandas as pd
import numpy as np
import dask
import geopandas as gpd
from gval.utils.schemas import SubsamplingDf, Sample_identifiers, Subsample_identifiers
from geocube.api.core import make_geocube
import flox
from flox.xarray import xarray_reduce
from shapely.geometry import Point
from typing import Union
from gval.comparison.pairing_functions import difference
import dask as da
import time
# Print the local time at the start of the run; the last cell prints it again.
print(time.localtime())

time.struct_time(tm_year=2023, tm_mon=9, tm_mday=20, tm_hour=12, tm_min=5, tm_sec=8, tm_wday=2, tm_yday=263, tm_isdst=1)


## Create Subsampling DataFrame

Let's open up a geopackage with polygons to use for subsampling:

In [80]:
# Relative path of the folder that holds the example datasets.
data_path = '../data/data'

# Load the polygons that will delineate the subsamples.
continuous_polygons_file = f'{data_path}/subsample_continuous_polygons.gpkg'
polygons_continuous = gpd.read_file(continuous_polygons_file)
polygons_continuous

Unnamed: 0,geometry
0,"POLYGON ((-97.72375 29.56328, -97.72304 29.558..."
1,"POLYGON ((-97.71604 29.55635, -97.71587 29.551..."


To use this DataFrame as a subsampling DataFrame let's use `create_subsampling_df`:

In [81]:
# One subsampling type per polygon. With inplace=True the new columns are
# added to polygons_continuous itself (see the table displayed below).
polygons_continuous.gval.create_subsampling_df(
    subsampling_type=["include", "include"],
    inplace=True,
)
polygons_continuous

Unnamed: 0,geometry,subsample_type,subsample_id
0,"POLYGON ((-97.72375 29.56328, -97.72304 29.558...",include,1
1,"POLYGON ((-97.71604 29.55635, -97.71587 29.551...",include,2


The DataFrame above has a `geometry` column, a `subsample_type` column whose value is either "include" (use only the data within the geometry) or "exclude" (remove all data contained within the geometry), and a `subsample_id` column.

There is also the ability to add subsampling_weights:

In [82]:
# Without inplace the result is returned, so it is assigned back to the same name.
# Each polygon gets a type and a weight (2 for the first polygon, 1 for the second).
polygons_continuous = polygons_continuous.gval.create_subsampling_df(
    subsampling_type=["exclude", "exclude"],
    subsampling_weights=[2, 1],
)
polygons_continuous

Unnamed: 0,geometry,subsample_type,weights,subsample_id
0,"POLYGON ((-97.72375 29.56328, -97.72304 29.558...",exclude,2,1
1,"POLYGON ((-97.71604 29.55635, -97.71587 29.551...",exclude,1,2


## Continuous Compare Subsampling

In [5]:
# Candidate and benchmark rasters for the continuous comparison; both are
# opened with the same options.
continuous_open_kwargs = {"band_as_variable": True, "mask_and_scale": True}
cds = rxr.open_rasterio(f'{data_path}/candidate_continuous_1.tif', **continuous_open_kwargs)
bds = rxr.open_rasterio(f'{data_path}/benchmark_continuous_1.tif', **continuous_open_kwargs)

Let's use this newly created subsampling df on a continuous comparison.  For each subsample an agreement map is created and then used to calculate continuous statistics. There are four subsampling-average types:

1. <b>full-detail</b>: reports all metrics calculated on separate bands and subsamples.
2. <b>band</b>: reports all metrics on subsamples with band values averaged.
3. <b>subsample</b>: reports all metrics on bands with subsample values averaged.
4. <b>weighted</b>: reports all metrics on bands with subsample values averaged and scaled by weights.

#### Full-Detail

In [83]:
# "full-detail": one metric row per (subsample, band) pair.
ag, met = cds.gval.continuous_compare(
    benchmark_map=bds,
    metrics=["mean_percentage_error"],
    subsampling_df=polygons_continuous,
    subsampling_average="full-detail",
)
met

Unnamed: 0,subsample,band,mean_percentage_error
0,1,1,0.125928
1,1,2,-0.111844
2,2,1,0.167116
3,2,2,-0.143187


#### Band

In [84]:
# "band": band values are averaged, leaving one metric row per subsample.
ag, met = cds.gval.continuous_compare(
    benchmark_map=bds,
    metrics=["mean_percentage_error"],
    subsampling_df=polygons_continuous,
    subsampling_average="band",
)
met

Unnamed: 0,subsample,band,mean_percentage_error
0,1,averaged,0.007042
1,2,averaged,0.011964


#### Subsample

In [85]:
# "subsample": subsample values are averaged, leaving one metric row per band.
ag, met = cds.gval.continuous_compare(
    benchmark_map=bds,
    metrics=["mean_percentage_error"],
    subsampling_df=polygons_continuous,
    subsampling_average="subsample",
)
met

Unnamed: 0,subsample,band,mean_percentage_error
0,averaged,1,0.146522
1,averaged,2,-0.127515


#### Weighted

In [86]:
# "weighted": subsample values are averaged using the weights column of
# polygons_continuous, leaving one metric row per band.
# NOTE(review): with the weights [2, 1] set earlier, the values displayed below
# equal the Full-Detail results for subsample 1 scaled by 2/3 (band 1) and
# 1/3 (band 2); subsample 2 does not appear to contribute. A weighted mean over
# the two subsamples would be roughly 0.1397 and -0.1223 -- confirm that the
# displayed result is the intended behavior.
ag, met = cds.gval.continuous_compare(
    benchmark_map=bds,
    metrics=["mean_percentage_error"],
    subsampling_df=polygons_continuous,
    subsampling_average="weighted",
)
met

Unnamed: 0,subsample,band,mean_percentage_error
0,averaged,1,0.083952
1,averaged,2,-0.037281


## Categorical

In [87]:
# Subsampling DF: both polygons are marked "exclude"; inplace=True modifies
# polygons_categorical directly.
categorical_polygons_file = f'{data_path}/subsample_two-class_polygons.gpkg'
polygons_categorical = gpd.read_file(categorical_polygons_file)
polygons_categorical.gval.create_subsampling_df(
    subsampling_type=["exclude", "exclude"],
    inplace=True,
)

# Candidate and Benchmark multiband categorical rasters
cda = rxr.open_rasterio(
    f'{data_path}/candidate_map_multiband_two_class_categorical.tif',
    mask_and_scale=True,
)
bda = rxr.open_rasterio(
    f'{data_path}/benchmark_map_multiband_two_class_categorical.tif',
    mask_and_scale=True,
)

Just as in the continuous comparison above, the following cells perform subsampling on categorical comparisons. For each subsample, an agreement map, a cross-tabulation table, and a metric table are created. There are three subsampling-average types:

1. <b>full-detail</b>: reports all metrics calculated on separate bands and subsamples.
2. <b>band</b>: reports all metrics on subsamples with band values averaged.
3. <b>subsample</b>: reports all metrics on bands with subsample values averaged.

#### Full-Detail

In [88]:
# "full-detail": one metric row per (band, subsample) pair.
# Category 2 is treated as positive; categories 0 and 1 as negative.
ag, ctab, met = cda.gval.categorical_compare(
    benchmark_map=bda,
    metrics="all",
    positive_categories=[2],
    negative_categories=[0, 1],
    subsampling_df=polygons_categorical,
    subsampling_average="full-detail",
)
met

Unnamed: 0,band,subsample,fn,fp,tn,tp,accuracy,balanced_accuracy,critical_success_index,equitable_threat_score,f_score,false_discovery_rate,false_negative_rate,false_omission_rate,false_positive_rate,fowlkes_mallows_index,matthews_correlation_coefficient,negative_likelihood_ratio,negative_predictive_value,overall_bias,positive_likelihood_ratio,positive_predictive_value,prevalence,prevalence_threshold,true_negative_rate,true_positive_rate
0,1,1,1353554.0,4665316.0,4669177.0,1355189.0,0.500228,0.500254,0.183778,0.000177,0.310494,0.774904,0.499698,0.224741,0.499793,0.335583,0.000425,0.998983,0.775259,2.222619,1.001018,0.225096,0.224918,0.499873,0.500207,0.500302
1,1,2,1373118.0,5029027.0,5033101.0,1375824.0,0.500265,0.500347,0.176887,0.000234,0.300602,0.78519,0.499508,0.214341,0.499798,0.327888,0.00057,0.998611,0.785659,2.329933,1.00139,0.21481,0.214576,0.499826,0.500202,0.500492
2,2,1,570988.0,468631.0,8865862.0,2137755.0,0.913676,0.869501,0.672806,0.598781,0.804404,0.179801,0.210794,0.060506,0.050204,0.804553,0.749276,0.221937,0.939494,0.962212,15.719903,0.820199,0.224918,0.201417,0.949796,0.789206
3,2,2,580589.0,446310.0,9615818.0,2168353.0,0.919843,0.87222,0.678617,0.610168,0.808543,0.170695,0.211205,0.056941,0.044355,0.808797,0.758274,0.221007,0.943059,0.951152,17.783516,0.829305,0.214576,0.191679,0.955645,0.788795


#### Band

In [89]:
# "band": bands are combined, leaving one metric row per subsample.
# Category 2 is treated as positive; categories 0 and 1 as negative.
ag, ctab, met = cda.gval.categorical_compare(
    benchmark_map=bda,
    metrics="all",
    positive_categories=[2],
    negative_categories=[0, 1],
    subsampling_df=polygons_categorical,
    subsampling_average="band",
)
met

Unnamed: 0,subsample,band,fn,fp,tn,tp,accuracy,balanced_accuracy,critical_success_index,equitable_threat_score,f_score,false_discovery_rate,false_negative_rate,false_omission_rate,false_positive_rate,fowlkes_mallows_index,matthews_correlation_coefficient,negative_likelihood_ratio,negative_predictive_value,overall_bias,positive_likelihood_ratio,positive_predictive_value,prevalence,prevalence_threshold,true_negative_rate,true_positive_rate
0,1,averaged,1924542.0,5133947.0,13535039.0,3492944.0,0.706952,0.684877,0.33104,0.180302,0.497415,0.59511,0.355246,0.124489,0.274999,0.510935,0.321994,0.489994,0.875511,1.592416,2.34457,0.40489,0.224918,0.39507,0.725001,0.644754
1,2,averaged,1953707.0,5475337.0,14648919.0,3544177.0,0.710054,0.686284,0.322984,0.178008,0.488266,0.607055,0.355356,0.117675,0.272076,0.503299,0.320245,0.488178,0.882325,1.640543,2.369348,0.392945,0.214576,0.393814,0.727924,0.644644


#### Subsample

In [90]:
# "subsample": subsamples are combined, leaving one metric row per band.
# Category 2 is treated as positive; categories 0 and 1 as negative.
ag, ctab, met = cda.gval.categorical_compare(
    benchmark_map=bda,
    metrics="all",
    positive_categories=[2],
    negative_categories=[0, 1],
    subsampling_df=polygons_categorical,
    subsampling_average="subsample",
)
met

Unnamed: 0,subsample,band,fn,fp,tn,tp,accuracy,balanced_accuracy,critical_success_index,equitable_threat_score,f_score,false_discovery_rate,false_negative_rate,false_omission_rate,false_positive_rate,fowlkes_mallows_index,matthews_correlation_coefficient,negative_likelihood_ratio,negative_predictive_value,overall_bias,positive_likelihood_ratio,positive_predictive_value,prevalence,prevalence_threshold,true_negative_rate,true_positive_rate
0,averaged,1,2726672.0,9694343.0,9702278.0,2731013.0,0.500247,0.500301,0.180241,0.000206,0.30543,0.780206,0.499602,0.219381,0.499795,0.331639,0.000499,0.998796,0.780619,2.276672,1.001205,0.219794,0.219587,0.499849,0.500205,0.500398
1,averaged,2,1151577.0,914941.0,18481680.0,4306108.0,0.916855,0.870914,0.67572,0.604581,0.806483,0.175241,0.211001,0.058654,0.04717,0.806681,0.753869,0.221447,0.941346,0.956642,16.726668,0.824759,0.219587,0.19647,0.95283,0.788999


In [91]:
# Print the local time at the end of the run (the first cell printed it at the start).
print(time.localtime())

time.struct_time(tm_year=2023, tm_mon=9, tm_mday=20, tm_hour=12, tm_min=8, tm_sec=7, tm_wday=2, tm_yday=263, tm_isdst=1)
