# Collect sup3r Skill Outputs and Aggregate Per Region

In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell executes, so edits to local helper modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
from glob import glob
import json
import geopandas as gpd
import pandas as pd
import numpy as np
import logging
import matplotlib.pyplot as plt
import xarray as xr

from rex import Resource, init_logger
from region_classifier import RegionClassifier

from sup3r.preprocessing.data_handling.base import DataHandler
from sup3r.preprocessing.data_handling import DataHandlerNCforCC
from sup3r.preprocessing.data_handling import DataHandlerNCforCCwithPowerLaw
from sup3r.bias.bias_calc import SkillAssessment

from make_projection_summaries_cmip import (get_countries_shape, get_states_shape, get_eez_shape, get_fps, get_targets_shapes, 
                                            make_summary_files, FEATURES, MODELS, TAGS, REGIONS)

# Override the class-level CHUNKS setting so that no explicit chunk size is
# given for the time / lat / lon dimensions.
# NOTE(review): presumably this controls how sup3r chunks NetCDF data when it
# is opened -- confirm against the sup3r DataHandlerNCforCC implementation.
DataHandlerNCforCC.CHUNKS = {'time': None, 'lat': None, 'lon': None}

In [None]:
# Logger for messages emitted by this notebook itself.
logger = logging.getLogger(__name__)
init_logger(__name__)

# Kept for backward compatibility with the original notebook.
init_logger('make_projection_summaries')
# The helper functions used here are imported from
# ``make_projection_summaries_cmip``. Python logger names only inherit across
# dots, so initialising 'make_projection_summaries' does not configure a
# logger named 'make_projection_summaries_cmip'.
# NOTE(review): assumes that module logs under its own ``__name__`` -- confirm.
init_logger('make_projection_summaries_cmip')

# Verbose output from the sup3r package.
init_logger('sup3r', log_level='DEBUG')

In [None]:
# Load the region geometries used for classification. Each helper returns a
# pair: the shapes object and the name of the column that holds the region
# label. The shapes and column names are later handed to RegionClassifier, and
# the column names are used to look up labels in the classified meta table.
countries, countries_col = get_countries_shape()
states, states_col = get_states_shape()
eez, eez_col = get_eez_shape()

In [None]:
# Load the file-path configuration that sits next to this notebook.
# JSON is UTF-8 by specification, so the encoding is set explicitly rather
# than relying on the platform's locale default.
with open('./file_paths.json', 'r', encoding='utf-8') as f:
    file_paths = json.load(f)

In [None]:
# Glob pattern for the per-model .h5 output files. The ``{}`` placeholder is
# filled with a model tag (see ``fp_base.format(tag)`` in the collection loop).
fp_base = '/projects/alcaps/gcm_eval/{}_historical*/*.h5'

In [None]:
def add_data_to_df(i, df, data, mask, base_feature, bias_feature, model, tag, rname):
    """Write one row of region-averaged skill statistics into ``df``.

    Parameters
    ----------
    i : int
        Row label in ``df`` to write to; the row is created if missing.
    df : pd.DataFrame
        Summary table, modified in place.
    data : dict
        Mapping of dataset name to array. Must contain
        ``base_{base_feature}_{stat}`` and ``bias_{bias_feature}_{stat}`` for
        ``stat`` in mean / percentile_1 / 5 / 50 / 95 / 99, plus
        ``{bias_feature}_ks_stat``.
    mask : np.ndarray
        Boolean mask applied to every array to select the region's points.
    base_feature : str
        Feature name used in the ``base_*`` (historical) dataset names.
    bias_feature : str
        Feature name used in the ``bias_*`` (GCM) dataset names; also stored
        in the ``feature`` column.
    model : str
        Value stored in the ``gcm`` column.
    tag : str
        Value stored in the ``tag`` column.
    rname : str
        Value stored in the ``region`` column.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object that was passed in.
    """
    # Identifier columns come first so they lead the table.
    for column, value in (('gcm', model), ('tag', tag),
                          ('region', rname), ('feature', bias_feature)):
        df.at[i, column] = value

    # Short column suffix -> statistic suffix used in the dataset names.
    # Insertion order defines the resulting column order.
    stat_names = {
        'mean': 'mean',
        'p1': 'percentile_1',
        'p5': 'percentile_5',
        'p50': 'percentile_50',
        'p95': 'percentile_95',
        'p99': 'percentile_99',
    }

    # Regional means of the historical (base) and GCM (bias) statistics.
    sources = (('hist', 'base', base_feature), ('gcm', 'bias', bias_feature))
    for column_prefix, dset_prefix, feature in sources:
        for short, long in stat_names.items():
            values = data[f'{dset_prefix}_{feature}_{long}'][mask]
            df.at[i, f'{column_prefix}_{short}'] = values.mean()

    df.at[i, 'ks_stat'] = data[f'{bias_feature}_ks_stat'][mask].mean()

    # Bias is the regional mean of the point-wise (GCM - historical) difference.
    for short, long in stat_names.items():
        gcm_values = data[f'bias_{bias_feature}_{long}'][mask]
        hist_values = data[f'base_{base_feature}_{long}'][mask]
        df.at[i, f'bias_{short}'] = (gcm_values - hist_values).mean()

    # Every percent bias, including the percentile ones, is expressed relative
    # to the regional historical *mean*, not the matching historical percentile.
    for short in stat_names:
        df.at[i, f'percent_bias_{short}'] = (
            100 * df.at[i, f'bias_{short}'] / df.at[i, 'hist_mean'])

    return df

In [None]:
# Collect one summary row per (output file, region) pair into ``df``.
# ``i`` is the running row label handed to add_data_to_df.
i = 0
df = pd.DataFrame()

# NOTE(review): not referenced anywhere in this cell; kept in case a later
# cell relies on it.
dset_tags = ['ks_stat', '']

for model, tag in zip(MODELS, TAGS):
    print(model)
    fps = glob(fp_base.format(tag))

    for fp in fps:
        with Resource(fp) as res:
            dsets = sorted(res.dsets)
            # Recover the feature names from the first ``base_*`` / ``bias_*``
            # dataset in sorted order by stripping the prefix and the
            # ``_kurtosis`` suffix.
            # NOTE(review): assumes the first match is the ``*_kurtosis``
            # dataset -- confirm against the datasets in the .h5 files.
            base_feature = [d for d in dsets if d.startswith('base_')][0]
            bias_feature = [d for d in dsets if d.startswith('bias_')][0]
            base_feature = base_feature.replace('base_', '').replace('_kurtosis', '')
            bias_feature = bias_feature.replace('bias_', '').replace('_kurtosis', '')

            # Pull every dataset while the file is still open.
            data = {dset: res[dset] for dset in dsets}
            lat = data['latitude']
            lon = data['longitude']
            # Flat per-point coordinate table for the region classifier; masks
            # are reshaped back to ``lat.shape`` below.
            meta = pd.DataFrame({'latitude': lat.flatten(), 'longitude': lon.flatten()})

        meta = RegionClassifier(meta, countries, countries_col).classify()
        meta = RegionClassifier(meta, states, states_col).classify()
        meta = RegionClassifier(meta, eez, eez_col).classify()

        # Offshore points: no country label but inside an EEZ.
        # NOTE(review): '-999' is presumably RegionClassifier's label for
        # "no matching region" -- confirm.
        offshore = (meta[countries_col] == '-999') & (meta[eez_col] != '-999')
        lon_flat = meta['longitude']
        # Split offshore points by longitude (degrees east, negative = west).
        # East of 81W is the Atlantic coast, west of 105W is the Pacific coast,
        # and the band in between is the Gulf. The original cell had the
        # Atlantic and Pacific conditions swapped. Points exactly on -105 or
        # -81 fall in none of the three masks (strict inequalities, as before).
        meta['atlantic'] = offshore & (lon_flat > -81)
        meta['gulf'] = offshore & (lon_flat > -105) & (lon_flat < -81)
        meta['pacific'] = offshore & (lon_flat < -105)

        # Onshore regions: each region is a collection of state labels.
        for rname, rstates in REGIONS.items():
            mask = np.isin(meta[states_col].values.reshape(lat.shape), rstates)
            df = add_data_to_df(i, df, data, mask, base_feature, bias_feature, model, tag, rname)
            i += 1

        # Offshore regions use the boolean columns built above.
        for rname in ['atlantic', 'pacific', 'gulf']:
            mask = meta[rname].values.reshape(lat.shape)
            df = add_data_to_df(i, df, data, mask, base_feature, bias_feature, model, tag, rname)
            i += 1

In [None]:
# Display the distinct feature names that were collected.
df['feature'].unique()

In [None]:
# Spot-check the rows collected for the temperature_max_2m feature.
df[df['feature'] == 'temperature_max_2m']

In [None]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing the full summary table.
os.makedirs('./skill_summaries', exist_ok=True)
df.to_csv('./skill_summaries/skill_summary_all.csv')

In [None]:
# Features to report, in the row order used for every per-region table.
index = [
    'pr',
    'relativehumidity_2m',
    'relativehumidity_max_2m',
    'relativehumidity_min_2m',
    'rsds',
    'temperature_2m',
    'temperature_max_2m',
    'temperature_min_2m',
    'windspeed_10m',
    'windspeed_100m',
    'windspeed_200m',
]

# DataFrame.to_csv does not create missing parent directories.
os.makedirs('./skill_summaries', exist_ok=True)

# Every column other than the identifier columns is a metric to tabulate.
# Both this list and the feature filter are the same for every region, so
# they are computed once up front.
metrics = [col for col in df.columns if col not in ('gcm', 'tag', 'region', 'feature')]
in_index = df['feature'].isin(index)

# Write one CSV per (region, metric): rows are features, columns are GCMs.
for region in df['region'].unique():
    rstr = region.lower().replace(' ', '_')
    mask = (df['region'] == region) & in_index
    region_df = df[mask]

    for metric in metrics:
        # ``reindex`` keeps the fixed feature order and leaves a NaN row for
        # any listed feature with no data in this region, where ``.loc[index]``
        # would raise KeyError.
        dfp = region_df.pivot(index='gcm', columns='feature', values=metric).T.reindex(index)
        dfp.to_csv(f'./skill_summaries/skill_summary_{rstr}_{metric}.csv')