In [1]:
# Box and Whisker plot for the top 5 predictors 
# pink line for a given example. 
import numpy as np
import pandas as pd
import joblib
import sys, os
sys.path.insert(0, '/home/monte.flora/python_packages/WoF_post')
sys.path.insert(0, '/home/monte.flora/python_packages/ml_workflow')
sys.path.insert(0, '/home/monte.flora/python_packages/wofs_ml_severe')

from wofs_ml_severe.io.load_ml_models import load_ml_model
from wofs.post.utils import load_yaml

lookup_file: /home/monte.flora/python_packages/WoF_post/wofs/data/psadilookup.dat
lookup_file: /home/monte.flora/python_packages/WoF_post/wofs/data/psadilookup.dat


In [2]:
df = pd.read_feather('/work/mflora/ML_DATA/DATA/wofs_ml_severe__first_hour__reduced_data.feather')

In [3]:
def fix_data(X): 
    X = X.astype({'Initialization Time' : str})
    X.replace([np.inf, -np.inf], np.nan, inplace=True)
    X.reset_index(inplace=True, drop=True)
    
    return X 

In [20]:
# Load the old dataset. 
time = 'first_hour'
target = 'wind_severe_0km'

ml_config = load_yaml(
    '/home/monte.flora/python_packages/wofs_ml_severe/wofs_ml_severe/conf/ml_config_realtime.yml')
parameters = {
                'target' : target,
                'time' : time, 
                'drop_opt' : '',
                'model_name' : 'Average',
                'ml_config' : ml_config,
            }

model_dict = load_ml_model(**parameters)
features = model_dict['features']
model = model_dict['model']

X_test = df[features]
X_test = fix_data(X_test)
df = fix_data(df)

In [5]:
def just_transforms(model, X):
    """Applies all transforms to the data, without applying last 
       estimator.

    Parameters
    ----------
    X : iterable
        Data to predict on. Must fulfill input requirements of first step of
        the pipeline.
    """
    Xt = X
    for name, transform in model.steps[:-1]:
        Xt = transform.transform(Xt)
    return Xt

In [6]:
preds = model.predict_proba(X_test)[:,1]
top = np.argsort(preds)[::-1][::5]
print(top)

[808212 808110 808374 ... 803359 868062 803089]


In [31]:
import json 
X_rlt = X_test.iloc[[36426,36481, 36721,5599,61986,56836],:]

def lr_inputs(model, X):
    """Compute the product of the model coefficients and processed inputs (e.g., scaling)."""
    # Scale the inputs. 
    base_est = model.estimators[0].calibrated_classifiers_[0].base_estimator
    Xt = just_transforms(base_est, X)
    # Get the model coefficients. 
    coef = base_est.named_steps['model'].coef_[0,:]
    
    inputs = coef*Xt
    
    return inputs

def get_top_features(inputs, X, ind, features):
    """Using the LR coefficients, determine the top 5 predictors and their values."""
    # Get the absolute values. The len(features)
    abs_inputs = np.absolute(inputs[len(features):])
    
    # Sort the values and get the highest values. 
    sorted_indices = np.argsort(abs_inputs)[::-1]

    top_features = np.array(features)[sorted_indices][:5]
    top_values = X[top_features].values[ind]
    
    return top_features, top_values 
    

def generate_explainability_json(model, target, features, dataframe, ml_config, ensemble_track_file, 
                                ): 
    # Save subset of data for the explainability graphics. 
    subset_fname = ensemble_track_file.replace('ENSEMBLETRACKS', 'LOCALEXPLAIN').replace('.nc', '.json') 
    
    # Load the round_dict 
    json_file = os.path.join(
        '/home/monte.flora/python_packages/wofs_ml_severe/wofs_ml_severe', 
        'json', f'min_max_vals_{target}.json' )
    
    with open(json_file) as f:
        results = json.load(f)
    
    round_dict = {f : results[f]['round_int'] for f in features}
    
    #if self.TEMP:
    #    dataframe['0-3km_lapse_rate']/=-3.0
    #    dataframe['500-700mb_lapse_rate']/=-2.67765
    
    metadata = dataframe[['label', 'obj_centroid_x', 'obj_centroid_y']]

    inputs = lr_inputs(model, dataframe[features])
    # Round the data. 
    dataframe = dataframe.round(round_dict)
    
    results = [get_top_features(inputs[i,:],dataframe,i, features) for i in range(inputs.shape[0])]
    
    top_features = np.array([r[0] for r in results])
    top_values = np.array([r[1] for r in results])
    
    val_df = pd.DataFrame(top_values, columns=[f'Feature Val {i+1}' for i in range(5)])
    feature_df = pd.DataFrame(top_features, columns=[f'Feature Name {i+1}' for i in range(5)])
    
    total_df = pd.concat([val_df, feature_df, metadata], axis=1)

    print(f'Saving {subset_fname}...')
    #total_df.to_json(subset_fname)
    
    return total_df, val_df

In [32]:
dataframe = df.iloc[[36426,36481, 36721,5599, 61986,56836],:]

dataframe.reset_index(inplace=True, drop=True)

#dataframe['labels'] = np.arange(len(dataframe))
dataframe['obj_centroid_x'] = np.arange(len(dataframe))+100
dataframe['obj_centroid_y'] = np.arange(len(dataframe))+50

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['obj_centroid_x'] = np.arange(len(dataframe))+100
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['obj_centroid_y'] = np.arange(len(dataframe))+50


In [33]:
ensemble_track_file = '/work/mflora/SummaryFiles/20210504/2200/wofs_ENSEMBLETRACKS_12_20210504_2230_2300.nc'
df, val_df = generate_explainability_json(model, target, features, dataframe, ml_config, ensemble_track_file)

Saving /work/mflora/SummaryFiles/20210504/2200/wofs_LOCALEXPLAIN_12_20210504_2230_2300.json...
