In [1]:
import sys
sys.path.append('/home/monte.flora/python_packages/scikit-explain/')
sys.path.append('/home/monte.flora/python_packages/master/ml_workflow')
sys.path.append('/work/mflora/ROAD_SURFACE')
import skexplain
from os.path import join
import pickle
import seaborn as sns
import numpy as np
import pandas as pd
from glob import glob
import joblib
import matplotlib.pyplot as plt
from display_names import to_readable_names, get_units, to_color

from probsr_config import PREDICTOR_COLUMNS, FIGURE_MAPPINGS, COLOR_DICT
from skexplain.common.importance_utils import to_skexplain_importance

In [2]:
import xarray as xr
def to_xarray(shap_data, estimator_name, feature_names=None):
    """Pack a SHAP result dictionary into an ``xarray.Dataset``.

    Parameters
    ----------
    shap_data : dict
        Must provide the keys ``'shap_values'``, ``'bias'``, ``'X'`` and
        ``'targets'``. ``'bias'`` must support ``.astype`` (it is cast to
        float64).
    estimator_name : str
        Suffix used in the SHAP-specific variable names
        (``shap_values__<estimator_name>`` and ``bias__<estimator_name>``).
    feature_names : optional
        Accepted for API compatibility but currently unused; nothing is
        attached to the returned dataset.

    Returns
    -------
    xarray.Dataset
        Variables ``shap_values__<estimator_name>`` and ``X`` on
        ``(n_examples, n_features)``; ``bias__<estimator_name>`` and ``y`` on
        ``(n_examples,)``.
    """
    per_example = ['n_examples']
    per_example_and_feature = ['n_examples', 'n_features']

    variables = {
        f'shap_values__{estimator_name}': (per_example_and_feature, shap_data['shap_values']),
        f'bias__{estimator_name}': (per_example, shap_data['bias'].astype(np.float64)),
        'X': (per_example_and_feature, shap_data['X']),
        'y': (per_example, shap_data['targets']),
    }

    return xr.Dataset(variables)

In [3]:
hazard = 'tornado'

# --- WoFS training data ------------------------------------------------------
base_path = '/work/mflora/ML_DATA/DATA'
data_path = join(
    base_path,
    f'original_first_hour_training_matched_to_{hazard}_0km_data.feather',
)
df = pd.read_feather(data_path)

# --- WoFS-ML-Severe model ----------------------------------------------------
# NOTE: `base_path` is intentionally re-pointed at the model directory here.
base_path = '/work/mflora/ML_DATA/MODEL_SAVES'
model_name = 'LogisticRegression'
model_paths = glob(join(base_path, f'{model_name}_first_hour_{hazard}*'))
# Use the first match whose path does not contain 'manual'.
model_path = list(filter(lambda p: 'manual' not in p, model_paths))[0]
model_data = joblib.load(model_path)

model = model_data['model']
feature_names = model_data['features']

# Predictors (restricted to the features stored with the model) and target.
X = df[feature_names].astype(float)
y = df[f'matched_to_{hazard}_0km'].astype(float)

# --- Plot labels and colours for every column in the dataframe ----------------
all_features = df.columns
display_feature_names = dict(zip(all_features, map(to_readable_names, all_features)))
feature_colors = dict(zip(all_features, map(to_color, all_features)))

# Entries from the project config override the auto-generated ones.
feature_colors.update(COLOR_DICT)
display_feature_names.update(FIGURE_MAPPINGS)


In [4]:
# Toolkit built on the tornado predictors/targets loaded above; the cells below
# use it to load saved permutation-importance files and to draw the figure.
explainer = skexplain.ExplainToolkit(X=X,y=y)

In [5]:
# Collect four feature-importance rankings per hazard, in this order:
# backward singlepass, forward multipass, coefs/gini, SHAP
base_path = '/work/mflora/ML_DATA/'

methods = []
results = []
names = []


def _load_hazard_model_data(model_dir, estimator_name, hazard):
    """Load the saved model bundle for one hazard, skipping any path containing 'manual'."""
    pattern = join(model_dir, f'{estimator_name}_first_hour_{hazard}*')
    candidates = [p for p in glob(pattern) if 'manual' not in p]
    if not candidates:
        raise FileNotFoundError(f'No saved model matches {pattern!r}')
    return joblib.load(candidates[0])


def _to_score_drop(ds, pass_kind, estimator_name):
    """Overwrite `<pass_kind>_scores` in `ds` with (original score - permuted score)."""
    key = f'{pass_kind}_scores__{estimator_name}'
    original_score = ds[f'original_score__{estimator_name}'].values
    ds[key] = ([f'n_vars_{pass_kind}', 'n_permute'], original_score - ds[key].values)
    return ds


# NOTE: the loop variable is still called `hazard`, so the notebook-level
# `hazard` equals 'road_surface' once this cell has run (unchanged behaviour).
for hazard in ['tornado', 'severe_hail', 'severe_wind', 'road_surface']:
    is_road_surface = hazard == 'road_surface'
    name = 'Random Forest' if is_road_surface else 'LogisticRegression'

    # --- permutation results ---------------------------------------------
    if is_road_surface:
        basePath = '/work/mflora/ROAD_SURFACE'
        bsp_fname = join(basePath, 'permutation_importance', 'perm_imp_original_backward.nc')
        fmp_fname = join(basePath, 'permutation_importance', 'perm_imp_original_forward.nc')
    else:
        path = join(base_path, 'permutation_importance')
        bsp_fname = join(path, f'permutation_importance_{hazard}_first_hour_training_norm_aupdcbackward.nc')
        fmp_fname = join(path, f'permutation_importance_{hazard}_first_hour_training_norm_aupdcforward.nc')

    bsp = explainer.load(bsp_fname)
    fmp = explainer.load(fmp_fname)

    # Backward singlepass and forward multipass: original_score - permuted score
    bsp = _to_score_drop(bsp, 'singlepass', name)
    fmp = _to_score_drop(fmp, 'multipass', name)

    # --- model-native importance (gini / coefficients) ---------------------
    if is_road_surface:
        native_method = 'gini'
        hazard_features = PREDICTOR_COLUMNS
        # load the random forest
        rf = joblib.load(join(basePath, 'JTTI_ProbSR_RandomForest.pkl'))
        gini_values = rf.feature_importances_
        gini_rank = to_skexplain_importance(gini_values,
                                            estimator_name='Random Forest',
                                            feature_names=hazard_features,
                                            method='gini')
        native_rank = gini_rank
    else:
        native_method = 'coefs'
        # Load THIS hazard's model. Reading the coefficients from the
        # notebook-level `model` would reuse the tornado model for the
        # severe_hail and severe_wind panels as well.
        hazard_model_data = _load_hazard_model_data(join(base_path, 'MODEL_SAVES'), name, hazard)
        hazard_model = hazard_model_data['model']
        hazard_features = hazard_model_data['features']
        coefs = hazard_model.base_estimator.named_steps['model'].coef_[0]
        coef_rank = to_skexplain_importance(coefs,
                                            estimator_name=name,
                                            feature_names=hazard_features,
                                            method='coefs')
        native_rank = coef_rank

    # --- shap results ------------------------------------------------------
    if is_road_surface:
        # NOTE(review): this file has a '.nc' extension but is read with pickle
        # below -- confirm it really is a pickle and not a netCDF file.
        fname = join(basePath, 'shap_results', 'shap_rf_original.nc')
    else:
        fname = join(base_path, 'SHAP_VALUES', f'shap_values_LogisticRegression_{hazard}_first_hour.pkl')

    # pickle.load can execute arbitrary code; only use it on trusted files.
    with open(fname, 'rb') as f:
        shap_data = pickle.load(f)
    shap_vals = shap_data['shap_values']

    # Label SHAP values with the feature set of the model they explain
    # (PREDICTOR_COLUMNS for the road-surface random forest), not with the
    # tornado dataframe's columns.
    shap_rank = to_skexplain_importance(shap_vals,
                                        estimator_name=name,
                                        feature_names=hazard_features,
                                        method='shap_sum')

    # Append to all three lists together, only after everything loaded, so
    # they stay aligned even if a load above raises part-way through.
    methods.extend(['singlepass', 'multipass', native_method, 'shap_sum'])
    results.extend([bsp, fmp, native_rank, shap_rank])
    names.extend([name] * 4)
        

In [6]:
# Rescale each importance array by its own 1st-99th percentile spread so the
# different methods are plotted on a comparable scale. Each dataset in
# `results` is modified in place and the same objects are collected in `data`.
data = []
for ds, m, name in zip(results, methods, names):
    key = f'{m}_scores__{name}'
    imp = ds[key].values
    # Replace exact zeros with a tiny positive value.
    imp[imp == 0] = 0.000001
    p99 = np.percentile(imp, 99)
    p01 = np.percentile(imp, 1)
    imp_norm = imp / (p99 - p01)
    ds[key] = ([f'n_vars_{m}', 'n_permute'], imp_norm)
    data.append(ds)

In [7]:
%%script false --no-raise-error
# This cell is handed to the `false` program by the magic above, so the code
# below is not run by the Python kernel.

# One x-label per panel: three logistic-regression rows, then the
# random-forest row (Gini instead of coefficients).
logistic_row_labels = ['Backward\nSinglepass','Forward\nMultipass','Coefs.','SHAP']
xlabels = logistic_row_labels*3 + ['Backward\nSinglepass','Forward\nMultipass','Gini','SHAP']

# (method, estimator name) pair for every panel.
panels = list(zip(methods, names))

plot_kwargs = dict(
    num_vars_to_plot=10,
    display_feature_names=display_feature_names,
    feature_colors=feature_colors,
    plot_correlated_features=False,
    n_columns=4,
    xlabels=xlabels,
    ylabels=['Tornado', 'Hail', 'Wind', "Road Surface"],
    figsize=(12, 12),
    base_font_size=12,
    wspace=1.0,
)
fig = explainer.plot_importance(data=data, panels=panels, **plot_kwargs)