# User Input

In [None]:
# Glob patterns (not directories) matching the zonal_stats text files produced by
# `submit_zonal_stats.sh`. Every file that matches a pattern is read by the loading cells.
directory_canada = "/home/khb47/ABoVE/scripts/zonal_stats/zonal_stats_Canada_*.txt"
directory_epa2_above = "/home/khb47/ABoVE/scripts/zonal_stats/zonal_stats_EPA2_*ABoVE*.txt"
directory_epa2_canada = "/home/khb47/ABoVE/scripts/zonal_stats/zonal_stats_EPA2_*Canada*.txt"

# Which set of zonal statistics the plotting cells should use:
visualize = 'EPA2' # Must be exactly 'Canada' or 'EPA2' (compared as a string further down)

# Code used to get correct ecoregions

# Set up
Import the libraries, load the shapefiles, and create tables for the zonal statistics.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
import matplotlib.pyplot as plt
from matplotlib import colors
import glob
import os

In [None]:
# Paths to the two boundary layers, then read both into GeoDataFrames
# (EPA layer first, Canada/Alaska layer second).
epa_shapefile = "/projects/arctic/share/ABoVE_Biomass/OtherSpatialDatasets/EPA_ecoregion_lvl2_102001.shp"
canada_shapefile = "/projects/arctic/share/ABoVE_Biomass/OtherSpatialDatasets/CanadaAlaska_Boundaries_102001.shp"
epa_gdf, canada_gdf = map(gpd.read_file, (epa_shapefile, canada_shapefile))

In [None]:
# Make the Canada zonal stats data: read every file matching `directory_canada`
# and tag its rows with the dataset name taken from the file name.
canada_frames = []
for file in sorted(glob.glob(directory_canada)):  # sorted -> reproducible row order
    base_name = os.path.basename(file)
    # Strip the "zonal_stats_Canada_" prefix and the ".txt" suffix from the file name
    dataset_name = base_name[len("zonal_stats_Canada_"):-len(".txt")]
    # ', ' is a multi-character separator, which requires the python parsing engine
    dataset_canada_alaska = pd.read_csv(file, delimiter=', ', engine='python')
    # The dataset label is the part of the name before the first underscore
    dataset_canada_alaska['Dataset'] = dataset_name.split('_')[0]
    canada_frames.append(dataset_canada_alaska)

# Concatenate once instead of growing a DataFrame inside the loop.
# If nothing matched, keep the original behaviour of an empty DataFrame.
canada_alaska = pd.concat(canada_frames, ignore_index=True) if canada_frames else pd.DataFrame()
canada_alaska.head()

In [None]:
def _load_epa2_zonal_stats(pattern):
    """Read every EPA2 zonal stats file matching ``pattern`` into one DataFrame.

    Each file is parsed with ', ' as the separator and gets a 'Dataset' column
    holding the part of the file name between the "zonal_stats_EPA2_" prefix
    and the first following underscore. Rows without a 'Mean' value are dropped.

    Parameters
    ----------
    pattern : str
        Glob pattern for the zonal stats text files.

    Returns
    -------
    pandas.DataFrame
        All matching files stacked with a fresh integer index.

    Raises
    ------
    FileNotFoundError
        If no file matches ``pattern`` (otherwise the failure would surface
        later as an opaque KeyError on the missing 'Mean' column).
    """
    frames = []
    for file in sorted(glob.glob(pattern)):  # sorted -> reproducible row order
        base_name = os.path.basename(file)
        dataset_name = base_name[len("zonal_stats_EPA2_"):-len(".txt")]
        # ', ' is a multi-character separator, which requires the python parsing engine
        stats = pd.read_csv(file, delimiter=', ', engine='python')
        stats['Dataset'] = dataset_name.split('_')[0]
        frames.append(stats)
    if not frames:
        raise FileNotFoundError(f"No zonal stats files match {pattern!r}")
    # Concatenate once instead of growing a DataFrame inside the loop
    return pd.concat(frames, ignore_index=True).dropna(subset=['Mean'])


# Make the Above EPA2 zonal stats data
above_epa2 = _load_epa2_zonal_stats(directory_epa2_above)

# Make the Canada EPA2 zonal stats data
canada_epa2 = _load_epa2_zonal_stats(directory_epa2_canada)

# Make the combined epa2 dataframe: for each (Zone, Dataset) pair keep the single
# row with the largest 'Coverage'. idxmax keeps the first row on ties, so the
# Canada rows (listed first) win a tie against the ABoVE rows.
combined_epa2 = pd.concat([canada_epa2, above_epa2], ignore_index=True)
epa2 = combined_epa2.loc[combined_epa2.groupby(['Zone', 'Dataset'])['Coverage'].idxmax()]
epa2.head()

# Visualization of Zonal Stats
The cell below selects which zonal statistics are plotted, based on the `visualize` value set in the User Input cell at the top: either `Canada` or `EPA2`.

In [None]:
# Pick the statistics table, the boundary layer and the join column for the
# region requested through `visualize`.
if visualize == 'Canada':
    df = canada_alaska.copy()
    # These zones are left out of the Canada plots
    excluded_zones = ['AK', 'NT', 'NU']
    df = df[~df['Zone'].isin(excluded_zones)]
    shapefile = canada_gdf.copy()
    key = 'postal'
elif visualize == 'EPA2':
    df = epa2.copy()
    shapefile = epa_gdf.copy()
    key = 'NA_L2KEY'
else:
    # Fail loudly: otherwise df/shapefile/key stay undefined (or stale from an
    # earlier run of this cell) and later cells plot the wrong data.
    raise ValueError(f"visualize must be 'Canada' or 'EPA2', got {visualize!r}")

## Bar chart of mean values across a dataset

In [None]:
# Set a consistent order and color palette
dataset_order = ['Duncanson2023', 'Guindon2023', 'Soto-Navarro2020', 'SpawnGibbs2020', 'Xu2021']  # Adjust to match actual dataset names
palette = sns.color_palette('mako', n_colors=len(dataset_order))

# Dataset names that are not listed in dataset_order become NaN here
df['Dataset'] = pd.Categorical(df['Dataset'], categories=dataset_order, ordered=True)

# Coerce 'Mean' to numeric BEFORE aggregating, so non-numeric entries become NaN
# instead of breaking (or silently skewing) the group means below.
df['Mean'] = pd.to_numeric(df['Mean'], errors='coerce')

# Mean per (Zone, Dataset), reshaped to one row per Zone and one column per Dataset.
# observed=False keeps a column for every dataset in dataset_order.
grouped_mean = df.groupby(['Zone', 'Dataset'], observed=False)['Mean'].mean().unstack()

In [None]:
# Make sure 'Mean' is numeric in df. grouped_mean was already computed in the
# previous cell, so this only affects the cells that use df afterwards.
df['Mean'] = pd.to_numeric(df['Mean'], errors='coerce')

# Grouped bar chart: one group of bars per Zone, one bar per Dataset
ax = grouped_mean.plot.bar(figsize=(12, 6), color=palette)
ax.set_title('Mean Biomass Across Datasets by Zone')
ax.set_xlabel('Zone')
ax.set_ylabel('Mean Biomass')
# Park the legend outside the axes, to the right of the plot area
ax.legend(title='Dataset', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## Boxplot of Distribution by dataset

In [None]:
# Distribution of the per-zone means, one box per dataset in the shared order/colors
sns.boxplot(x='Dataset', y='Mean', data=df, order=dataset_order, palette=palette)
# Name the region actually being shown instead of always saying "Canada"
plt.title(f"Distribution of Mean Biomass by Dataset - {visualize}")
plt.ylabel("Mean Biomass")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## Choropleth Maps

In [None]:
# (matplotlib, geopandas and pandas are imported once in the set-up cell.)

# Merge DataFrame with GeoDataFrame: attach the statistics to the zone geometries
df_geo = shapefile.merge(df, left_on=key, right_on='Zone')
stat_columns = ['Mean', 'Median', 'Sum', 'Std']
df_geo[stat_columns] = df_geo[stat_columns].round(2)

# One panel per dataset. Skip NaN dataset labels: they have no rows to draw and
# would otherwise take up an empty panel.
datasets = list(df['Dataset'].dropna().unique())
if not datasets:
    raise ValueError("No datasets to map: df['Dataset'] has no non-null values")

# Create a grid of plots (2 columns, as many rows as needed)
n_cols = 2
n_rows = (len(datasets) + n_cols - 1) // n_cols

# squeeze=False always returns a 2-D array of axes, so flatten() is always valid
fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 10), constrained_layout=True, squeeze=False)
axes = axes.flatten()

for i, dataset in enumerate(datasets):
    df_subset = df_geo[df_geo['Dataset'] == dataset]

    # Individual color scale for each plot
    vmin = df_subset['Mean'].min()
    vmax = df_subset['Mean'].max()

    ax = df_subset.plot(
        column='Mean',
        cmap='viridis',
        legend=True,
        ax=axes[i],
        vmin=vmin,
        vmax=vmax
    )

    ax.set_title(f'Mean Biomass Values ({dataset})', fontsize=12)
    ax.set_axis_off()

    # Label each zone with its mean value at the zone centroid.
    # 'Mean' was already rounded to 2 decimals above; centroids are computed once.
    centroids = df_subset.geometry.centroid
    for x, y, mean in zip(centroids.x, centroids.y, df_subset['Mean']):
        if not pd.isnull(mean):
            label = f'{mean}'
            ax.annotate(
                text=label,
                xy=(x, y),
                ha='center',
                fontsize=8,
                color='black',
                bbox=dict(boxstyle="round,pad=0.3", edgecolor='none', facecolor='white', alpha=0.6)
            )

# Remove any empty subplots (if the number of datasets < grid size)
for unused_ax in axes[len(datasets):]:
    fig.delaxes(unused_ax)

plt.show()


## Summary Table using a Heatmap

In [None]:
# Aggregate the statistics per (Zone, Dataset).
# observed=False is passed explicitly: it keeps every category of a categorical
# 'Dataset' column and avoids pandas' warning about the changing default.
summary_table = df.groupby(['Zone', 'Dataset'], observed=False).agg(
    Mean=('Mean', 'mean'),
    Median=('Median', 'mean'),  # mean of the rows' 'Median' values, not a true median
    Sum=('Sum', 'sum'),
    Std=('Std', 'mean')  # mean of the rows' 'Std' values
)

# To inspect every aggregated statistic, display `summary_table` in its own cell.

# Visualize using a heatmap: zones as rows, datasets as columns, mean biomass as the cell value
summary_pivot = summary_table.reset_index().pivot(index='Zone', columns='Dataset', values='Mean')
sns.heatmap(summary_pivot, cmap='coolwarm', annot=True, fmt=".2f", cbar_kws={'label': 'Mean Biomass Value'})
plt.title('Heatmap of Mean Biomass Values by Zone and Dataset')
plt.show()
