### Generate the Official cb-WoFS Explainability Graphics 

In [1]:
# Box-and-whisker plots of the training distribution for the top 5 predictors,
# with a pink line marking the value for a given example.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib
from display_names import to_display_name, to_units
import sys
sys.path.insert(0, '/home/monte.flora/python_packages/frdd-wofs-post')
sys.path.insert(0, '/home/monte.flora/python_packages/wofs_ml_severe')

from wofs_ml_severe import MLTrainer
from wofs_ml_severe.io.io import HailSizeLoader
from wofs.post.utils import load_yaml
import matplotlib.ticker as ticker

from sklearn.impute import SimpleImputer

  @jit(fastmath=True, parallel=False)
  @jit(fastmath=True,parallel=False)
Failed to add gpm alias to meters.
Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)
Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


interpret-ml not installed
scikeras is not installed


2024-03-05 11:17:37.074687: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-05 11:17:37.100477: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-05 11:17:37.100507: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-05 11:17:37.101459: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-05 11:17:37.106819: I tensorflow/core/platform/cpu_feature_guar

In [2]:
import math
import json 
import matplotlib.ticker as mtick

class cbWoFSExplainabilityGraphics:
    """Draw box-and-whisker background panels of training-set feature values.

    Only training rows whose target is positive are kept. For each feature,
    a horizontal box plot is drawn and the chosen axis limits / rounding
    precision are recorded in ``self.max_min_val_dict`` so they can be dumped
    to JSON with :meth:`save_json`.

    Notes
    -----
    ``create_local``, ``save_local`` and ``save_json`` read the notebook-level
    ``TARGET_DICT`` at call time, so that mapping must be defined before any
    of them is called.
    """

    def __init__(self, X_train, y_train):
        """Keep the positive-target rows of ``X_train``.

        Parameters
        ----------
        X_train : pandas.DataFrame
            Feature matrix; rows are selected positionally.
        y_train : array-like
            Target values, compared against zero element-wise.
        """
        # Only get where the examples are matched to reports or hail size
        # is greater than zero.
        inds = np.where(y_train > 0)[0]

        # Chaining reset_index returns a fresh frame. The previous in-place
        # reset on an iloc slice triggered pandas' SettingWithCopyWarning.
        X_train_subset = X_train.iloc[inds, :].reset_index(drop=True)

        # Disabled experiments kept from the original author:
        #X_train_subset['mid_level_lapse_rate_ens_mean_spatial_mean'] /= -2.7
        #X_train_subset['low_level_lapse_rate_ens_mean_spatial_mean'] /= -3.0
        # Convert mid-level temps?

        self.X_train = X_train_subset
        self.features = X_train_subset.columns

        # Filled by create_local: per-feature rounding precision and
        # per-feature/target axis limits.
        self.max_min_val_dict = {}

    def get_order_of_magnitude(self, value):
        """Return ``int(log10(|value|))``.

        ``int`` truncates toward zero, so any magnitude in (0.1, 10) yields 0.
        Raises ``ValueError`` for ``value == 0`` (log10 domain error).
        """
        # NOTE(review): math.floor would give -1 for 0.1 < |value| < 1;
        # left as-is because the rounding table in _round is tuned to it.
        return int(math.log10(abs(value)))

    def get_fontsize(self, value):
        """Get the fontsize based on order of magnitude.

        Returns 7 for very small (order <= -1) or large (order > 2) values so
        that long tick labels fit, otherwise 10. Values whose order of
        magnitude cannot be computed (e.g. 0) fall back to 10.
        """
        try:
            oom = self.get_order_of_magnitude(value)
        except ValueError:
            return 10

        if oom <= -1 or oom > 2:
            return 7
        return 10

    def is_negatively_oriented(self, values):
        """Check if the values are negatively oriented.

        True when the absolute value of the minimum exceeds the maximum,
        i.e. the large negative values are the meaningful ones.
        """
        return bool(abs(np.min(values)) > np.max(values))

    def _round(self, value, mode):
        """Round ``value`` to a magnitude-dependent precision.

        Parameters
        ----------
        value : float
            The number to round (an axis bound).
        mode : str
            ``'upper'`` rounds magnitudes >= 10 up to a multiple of 5; any
            other value rounds them down.

        Returns
        -------
        (rounded_value, round_int) : tuple
            ``round_int`` is the number of decimals passed to ``round``.
        """

        def round_to_nearest_fifth(x):
            # NOTE(review): negatives are mirrored through zero, so for x < 0
            # 'lower' moves toward zero and 'upper' away from it, the
            # opposite of the positive case. Confirm this is intended.
            if x < 0:
                return -round_to_nearest_fifth(-x)
            if x < 10:
                return x
            if mode == 'upper':
                return math.ceil(x / 5) * 5
            return math.floor(x / 5) * 5

        if value == 0.0:
            return 0.0, 0

        # Find the order of magnitude (oom)
        oom = self.get_order_of_magnitude(value)

        # Precision table: more decimals for smaller magnitudes; values with
        # two or more integer digits are additionally snapped to multiples of 5.
        round_to_fifth = False
        if oom == 0:
            round_int = 2
        elif oom == 1:
            round_int = 1
            round_to_fifth = True
        elif oom > 1:
            round_int = 0
            round_to_fifth = True
        elif oom == -1:
            round_int = 3
        else:
            round_int = 4

        rounded = round(value, round_int)
        if round_to_fifth:
            return round_to_nearest_fifth(rounded), round_int
        return rounded, round_int

    def create_global(self, features, target):
        """Create the global explainability graphic.

        Draws one panel per feature on a five-row figure and adds the
        figure-level titles.

        Parameters
        ----------
        features : sequence of str
            Feature (column) names; paired with the five axes via ``zip``.
        target : str
            Hazard name, used in the title and forwarded to ``create_local``.

        Returns
        -------
        (f, axes)
        """
        f, axes = plt.subplots(dpi=192, nrows=5,
                               figsize=(800/192, 800/192))
        for ax, feature in zip(axes, features):
            # `target` is a required argument of create_local; omitting it
            # raised TypeError.
            self.create_local(feature, target, ax=ax, f=f)

        title = f"""
        Training Set Distribution\n(All {target}-Producing Storms)\nfor the Top 5 Predictors (out of 113)"""

        f.suptitle(title,
                   fontsize=8, y=1.10)

        axes[0].set_title('Red numbers and vertical bars\nshow current values for this object',
                          fontsize=6, pad=12, color='red')

        plt.subplots_adjust(hspace=1.4)

        return f, axes

    def create_local(self, feature, target, f=None, ax=None):
        """Create box-and-whisker graphic for a single feature.

        Parameters
        ----------
        feature : str
            Column of ``self.X_train`` to plot.
        target : str
            Hazard name; mapped through ``TARGET_DICT`` (unknown names are
            used unchanged) to build the key stored in ``max_min_val_dict``.
        f, ax : optional
            Existing figure/axes to draw on. A new single-panel figure is
            created when ``ax`` is None.

        Returns
        -------
        (f, ax)
            ``f`` is whatever was passed in (possibly None) when ``ax`` was
            supplied by the caller.
        """
        units = to_units(feature)
        pretty_name = to_display_name(feature)

        if ax is None:
            f, ax = plt.subplots(dpi=192, nrows=1,
                                 figsize=(800/192, 100/192))

        # Despine and only leave the bottom side.
        for side in ['top', 'right', 'left']:
            ax.spines[side].set_visible(False)

        # Whisker percentiles, also used for the axis bounds below.
        # NOTE(review): the pair is asymmetric (0.01 vs 99.9); confirm that
        # this is deliberate and not a typo for 0.1 / 99.9.
        whis = [0.01, 99.9]
        ax.tick_params(axis='x', labelsize=9, size=8)

        values = self.X_train[feature]

        # Rounded percentile bounds and the precision each was rounded to.
        min_val, round_int_min = self._round(np.nanpercentile(values, whis[0]), 'lower')
        max_val, round_int_max = self._round(np.nanpercentile(values, whis[-1]), 'upper')

        # Label precision follows whichever end dominates in magnitude.
        round_int = round_int_min if self.is_negatively_oriented(values) else round_int_max

        self.max_min_val_dict[f'{feature}'] = {'round_int' : round_int}

        # Create the box-and-whiskers
        box_plot = ax.boxplot(x=values, vert=False,
                              whis=whis, patch_artist=True,
                              widths=0.3, showfliers=False)
        # Remove y ticks and their labels.
        ax.set_yticks([])

        # Create a title for the feature name.
        ax.annotate(fr'{pretty_name} ({units})', xy=(0, 1.15),
                    xycoords='axes fraction', fontsize=6, ha='left', color = 'k', fontweight='bold')

        # Let matplotlib pick tick locations for the rounded bounds.
        ax.set_xlim(min_val, max_val)

        # set the tick locator for the x-axis
        #ax.xaxis.set_major_locator(ticker.MaxNLocator(nbins=7))

        rng = list(ax.get_xticks())

        target_ = TARGET_DICT.get(target, target)

        self.max_min_val_dict[f'{feature}_{target_}'] = {'max_val' : rng[-1],
                                                         'min_val' : rng[0], }

        # Pin the ticks before widening the limits and assigning labels.
        # Without this the auto-locator may choose different ticks for the
        # new limits, and the fixed labels would no longer match them.
        ax.set_xticks(rng)
        ax.set_xlim(rng[0], rng[-1])

        if round_int in [0, 1]:
            labels = [f"{int(round(v, round_int))}" for v in rng]
        else:
            labels = [f"{round(v, round_int)}" for v in rng]

        # Blank out the first and last tick labels.
        labels[0] = ''
        labels[-1] = ''

        high_val = np.max((abs(max_val), abs(min_val)))

        ax.set_xticklabels(labels=labels, fontsize=self.get_fontsize(high_val))

        # fill with colors
        color = 'xkcd:medium blue'
        for patch in box_plot['boxes']:
            patch.set_facecolor(color)
        for line in box_plot['medians']:
            line.set_color('k')

        return f, ax

    def save_local(self, fig, feature, target):
        """Save a single-feature panel as a PNG under ``new_graphics_2024/`` and close it.

        Parameters
        ----------
        fig : matplotlib figure or None
            Figure to save; the current pyplot figure is used when None.
        feature, target : str
            Used to build the output file name (``target`` is mapped through
            ``TARGET_DICT``; unknown names are used unchanged).
        """
        target_ = TARGET_DICT.get(target, target)

        # Save the figure that was passed in, not whichever one pyplot
        # currently considers active.
        if fig is None:
            fig = plt.gcf()

        fig.savefig(
            f"new_graphics_2024/{feature}_{target_}_explainability_background.png",
            format="png", dpi=192, bbox_inches="tight", pad_inches=0.0)
        plt.close(fig)

    #def save_global(self, fig, target):
    #    plt.savefig(
    #        f"new_graphics/{target}_global_explainability_background.png",
    #        format="png", dpi=192, bbox_inches="tight", pad_inches=0.0)
    #    plt.close(fig)

    def save_json(self, target):
        """Write ``self.max_min_val_dict`` to ``../json/min_max_vals_<target>.json``."""
        target_ = TARGET_DICT.get(target, target)

        with open(f"../json/min_max_vals_{target_}.json", "w") as outfile:
            json.dump(self.max_min_val_dict, outfile)

### Test a Single Individual Panel

### Create Individual Panels for All Features Per Hazard

In [4]:
# Hazards to generate panels for in this run. Entries disabled in the
# original cell are kept below for reference:
#   'severe_mesh'
#   'severe_wind'
#   'severe_torn'
#   ['severe_mesh', 'severe_wind', 'severe_torn']
#   'any_sig_severe'
targets = ['hail_size']

# Maps a training-target name to the short hazard name used in output
# file names and JSON keys.
TARGET_DICT = dict(
    severe_mesh='hail',
    severe_wind='wind',
    severe_torn='tornado',
    any_sig_severe='all_sig_severe',
    hail_size='hail_size',
)

def get_target_str(target):
    """Return the short hazard name for ``target``.

    A list of targets maps to ``'all_severe'``; a single target is looked up
    in ``TARGET_DICT`` (``KeyError`` if it is not listed there).
    """
    return 'all_severe' if isinstance(target, list) else TARGET_DICT[target]

# For every requested hazard: load its training data, impute gaps, then
# render and save one background panel per feature plus the axis-limit JSON.
for target in targets:
    if target != 'hail_size':
        # Every other target is loaded through MLTrainer over all four lead times.
        # NOTE(review): 'mode' is 'testing' although the result is used as
        # training data; confirm against the loader's semantics.
        loader_kws = dict(
            data_path='/work/mflora/ML_DATA/DATA/',
            return_full_dataframe=False,
            random_state=123,
            mode='testing',
            exclude_missing_mesh=True,
        )
        time = ['first_hour', 'second_hour', 'third_hour', 'fourth_hour']

        loader = MLTrainer(loader_kws=loader_kws)
        X_train, y_train, metadata_train = loader.load_data('Model', target, time)
    else:
        # Hail size has its own loader and only uses the first hour.
        time = 'first_hour'
        loader = HailSizeLoader(time)
        X_train, _, y_train, _ = loader.load('training')

    # Impute missing values (SimpleImputer defaults), keeping the column names.
    imputed = SimpleImputer().fit_transform(X_train)
    X_train = pd.DataFrame(imputed, columns=X_train.columns)

    explainer = cbWoFSExplainabilityGraphics(X_train.astype(float), y_train)

    target_str = get_target_str(target)

    # One PNG per feature; save_local also closes each figure.
    for feature in X_train.columns:
        fig, _ = explainer.create_local(feature, target_str)
        explainer.save_local(fig, feature, target_str)

    # Persist the limits/precision collected while drawing.
    explainer.save_json(target_str)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Num of Train Dates: 63...Num. of Val. Dates: 22
4.406582575594905 4.5752950402121835
np.any(np.isnan(X_val))=False
np.any(np.isnan(y_val))=False
np.any(np.isinf(X_val))=False
np.any(np.isinf(y_val))=False


In [None]:
# Preview one of the per-target min/max JSON files written by save_json.
# TODO: actually combine all of the JSON files into one; this cell only
# reads the first match.
from glob import glob

# glob returns matches in arbitrary order; sort so files[0] is reproducible.
files = sorted(glob("../json/min_max_vals*"))
if not files:
    raise FileNotFoundError("No files matching ../json/min_max_vals* were found")
pd.read_json(files[0])

In [None]:
# Top five predictors per hazard model, in the order they were listed
# in the original cell.
top_preds = dict(
    tornado_severe_0km=[
        "wz_0to2_instant__time_max__amp_ens_mean_spatial_perc_90",
        "shear_v_0to6__ens_mean__spatial_mean",
        "buoyancy__time_min__amp_ens_mean_spatial_perc_10",
        "10-500m_bulkshear__time_max__amp_ens_mean_spatial_perc_90",
        "v_10__ens_mean__spatial_mean",
    ],
    hail_severe_0km=[
        "dbz_3to5__time_max__ens_mean__spatial_mean",
        "comp_dz__time_max__amp_ens_mean_spatial_perc_90",
        "td_850__ens_mean__spatial_mean",
        "10-500m_bulkshear__time_max__ens_mean__spatial_mean",
        "w_down__time_min__amp_ens_mean_spatial_perc_10",
    ],
    wind_severe_0km=[
        "v_10__ens_mean__spatial_mean",
        "ws_80__time_max__amp_ens_mean_spatial_perc_90",
        "comp_dz__time_max__amp_ens_mean_spatial_perc_90",
        "div_10m__time_min__ens_mean__spatial_mean",
        "buoyancy__time_min__amp_ens_mean_spatial_perc_10",
    ],
    all_severe=[
        "comp_dz__time_max__amp_ens_mean_spatial_perc_90",
        "div_10m__time_min__ens_std__spatial_mean",
        "ctt__time_min__amp_ens_mean_spatial_perc_10",
        "hailcast__time_max__ens_mean__spatial_mean",
        "10-500m_bulkshear__time_max__ens_mean__spatial_mean",
    ],
    all_sig_severe=[
        "low_level_lapse_rate__ens_mean__spatial_mean",
        "ctt__time_min__amp_ens_mean_spatial_perc_10",
        "10-500m_bulkshear__time_max__ens_mean__spatial_mean",
        "hailcast__time_max__ens_mean__spatial_mean",
        "comp_dz__time_max__amp_ens_mean_spatial_perc_90",
    ],
)

In [None]:
# Load the first-hour data for the 'wind_severe_0km' model.
# NOTE(review): `load_ml_data` is not imported or defined in any cell shown
# in this notebook, so this cell raises NameError on a fresh kernel. Add the
# import once the providing module is confirmed.
# NOTE(review): this rebinds X_train / y_train from the generation loop, and
# base_path is an absolute, machine-specific location.
X_train, y_train, metadata = load_ml_data('wind_severe_0km', 
                 lead_time = 'first_hour', 
                 mode = None, 
                 baseline=False,
                 return_only_df=False, 
                 load_reduced=True, 
                 base_path = '/work/mflora/ML_DATA/DATA',
                )

In [None]:
# Build a preview table that pairs, for each example, the values of the top
# predictors with the predictor names.
top_features = top_preds['wind_severe_0km']

# Only the first 10 examples are used for the preview.
dataframe = X_train.iloc[:10, :]

n_examples = len(dataframe)
# Derive the column count from the list itself so a hazard with a different
# number of top predictors does not break the DataFrame construction.
n_top = len(top_features)

top_values = dataframe[top_features].values
# The same feature names are repeated on every row.
top_features_list = [list(top_features) for _ in range(n_examples)]

val_df = pd.DataFrame(top_values, columns=[f'Feature Val {i+1}' for i in range(n_top)])
feature_df = pd.DataFrame(top_features_list, columns=[f'Feature Name {i+1}' for i in range(n_top)])

# Value columns first, then name columns, side by side.
total_df = pd.concat([val_df, feature_df], axis=1)


In [None]:
# Sanity check: top_values should have n_examples rows and one column per top feature.
top_values.shape, n_examples

In [None]:
# Display the combined feature-value / feature-name table.
total_df

### Create the top 5 predictor global panel 