This notebook shows how to fit an `ExplainableBoostingRegressor` (EBM) and save the key attributes of the fitted model. It then loads those attributes back and checks whether the predictions can be reproduced without using the InterpretML library. High-level comments on the saved parameters and on how the prediction algorithm works are included throughout, so be sure to read the comments.

In [1]:
import json
import logging
import warnings

import pandas as pd
from sklearn.model_selection import GroupShuffleSplit

## Prepare the data

In [2]:
# Input data (one JSON record per line) and the file that receives the exported model parameters.
input_file = 'Related_files_L1_input.json'
prediction_parameter_file = 'prediction_parameter_file.json'

In [3]:
# lines=True: the input file is read as one JSON object per line.
df = pd.read_json(input_file, lines=True)

In [4]:
# Single 70/30 split grouped by topic_id, so rows that share a topic never end up
# on both sides of the split. The fixed random_state makes the split repeatable.
splitter = GroupShuffleSplit(n_splits=1, test_size=.30, random_state=7)
split = splitter.split(df, groups=df['topic_id'])
train_inds, test_inds = next(split)

train_df, test_df = df.iloc[train_inds], df.iloc[test_inds]

In [5]:
# Row counts of the two partitions.
print("Length of train data", len(train_df))
print("Length of test data", len(test_df))

Length of train data 7351
Length of test data 3088


In [7]:
# Keep the example small: seven input features and a single target column.
train_features = [
    'TitleQuery_TokenMatchScore',
    'TitleQuery_FullMatchScore',
    'EditorMatchScore',
    'ViewsLifeTimeScore',
    'Title_UnmatchedTokenCountScore',
    'FileTypeScore',
    'Title_GoodTitleKeywordsScore',
]
label_columns = ['RelatedDocumentLabel']

X_train, X_test = train_df[train_features], test_df[train_features]
y_train, y_test = train_df[label_columns], test_df[label_columns]

# Sanity check: features and labels must have the same number of rows.
(len(X_train), len(y_train))

(7351, 7351)

## Fit EBM

In [8]:
from interpret.glassbox import ExplainableBoostingRegressor
# Fit the regressor with the library's default constructor arguments.
ebm = ExplainableBoostingRegressor()
ebm.fit(X_train, y_train)

ExplainableBoostingRegressor(feature_names=['TitleQuery_TokenMatchScore',
                                            'TitleQuery_FullMatchScore',
                                            'EditorMatchScore',
                                            'ViewsLifeTimeScore',
                                            'Title_UnmatchedTokenCountScore',
                                            'FileTypeScore',
                                            'Title_GoodTitleKeywordsScore',
                                            'TitleQuery_FullMatchScore x '
                                            'ViewsLifeTimeScore',
                                            'TitleQuery_TokenMatchScore x '
                                            'ViewsLifeTimeScore',
                                            'TitleQuery_FullMatchScore x '
                                            'T...
                                            'FileTypeScore',
                                       

## Predict scores on a small test set

In [42]:
# Reference predictions from the fitted model on the first three test rows;
# the stand-alone implementation further down is compared against these.
X_test_small = X_test[0:3]
scores  = ebm.predict(X_test_small)

In [49]:
# Same rows again, this time also asking the library for the second return value,
# which is compared against the stand-alone per-term scores at the end of the notebook.
scores_ex,explanation_ex = ebm.predict_and_contrib(X_test_small)

In [15]:
# Element-wise exact comparison of the two library prediction paths.
scores == scores_ex

array([ True,  True,  True])

## Save required parameters for independent predict

In [16]:
# Convert every column's bin edges with .tolist() so the values are plain Python lists
# that json can serialise. One dict for the single-feature preprocessor, one for the
# pair (interaction) preprocessor.
preprocessor_col_bin_edges_ = {
    col: edges.tolist()
    for col, edges in ebm.preprocessor_.col_bin_edges_.items()
}

pair_preprocessor_col_bin_edges_ = {
    col: edges.tolist()
    for col, edges in ebm.pair_preprocessor_.col_bin_edges_.items()
}

In [17]:
# Everything the stand-alone predictor needs, gathered into one dict that is written to JSON below.
required_data_for_prediction = {
    # Term definitions: 7 single features plus 10 pairwise interactions.
    'feature_groups_': ebm.feature_groups_,
    # One array per term (single feature or feature pair) holding the score of every
    # bin (continuous features) or category value (categorical features).
    'additive_terms_': [x.tolist() for x in ebm.additive_terms_],
    'intercept_': ebm.intercept_,
    # Whether each column is continuous, ordinal or categorical.
    'preprocessor_col_types_': ebm.preprocessor_.col_types_,
    # Bin edges of each continuous column, keyed by column; a value below an edge but
    # above the previous edge belongs to that bin.
    'preprocessor_col_bin_edges_': preprocessor_col_bin_edges_,
    # Value -> bin index for categorical columns (features 1 & 2 here). E.g. with
    # {'0.0': 1, '1.0': 2} a column value of 0 / 0.0 becomes 1. dtype: float.
    'preprocessor_col_mapping_': ebm.preprocessor_.col_mapping_,
    # At prediction time this mainly decides whether the differential-privacy ("private") branch is taken.
    'preprocessor_binning': ebm.preprocessor_.binning,
    # How a missing value is represented in categorical columns; preprocessing maps it to 0.
    'preprocessor_missing_str': ebm.preprocessor_.missing_str,
    # Type of each feature (e.g. continuous or categorical).
    'feature_types': ebm.feature_types,
    # Feature names; includes single features and pairwise interactions.
    'feature_names': ebm.feature_names,
    # Number of pairwise interactions in the model.
    'interactions': ebm.interactions,
    # Same preprocessing attributes again, taken from the pair (interaction) preprocessor.
    'pair_preprocessor_col_types_': ebm.pair_preprocessor_.col_types_,
    'pair_preprocessor_col_bin_edges_': pair_preprocessor_col_bin_edges_,
    'pair_preprocessor_col_mapping_': ebm.pair_preprocessor_.col_mapping_,
    'pair_preprocessor_binning': ebm.pair_preprocessor_.binning,
    'pair_preprocessor_missing_str': ebm.pair_preprocessor_.missing_str,
}

In [18]:
# Persist the parameters as a single JSON document.
with open(prediction_parameter_file, 'w', encoding='utf-8') as paf:
    json.dump(required_data_for_prediction, paf)

## Predict without `ebm` or the InterpretML library

- First, create a folder called `lib` in the parent folder and copy the DLL `lib_ebm_native_win_x64.dll` into it. The DLL provides the function that assigns a feature value to the matching bin for that feature.
- Create a folder `glassbox/ebm` and add the file `internal.py` to it. I think this path and structure are hard-coded somewhere.

In [38]:
from glassbox.ebm.internal import Native 
#from "C:\InstalledPrograms\Anaconda3\Lib\site-packages\interpret_core-0.2.7-py3.8.egg\interpret\glassbox\ebm\internal.py"
# create folder lib and add dlls from C:\InstalledPrograms\Anaconda3\Lib\site-packages\interpret_core-0.2.7-py3.8.egg\interpret\lib
import numpy as np
import numbers

### Function to arrange & clean features like train set

In [21]:
from pandas.core.generic import NDFrame
# TODO: Docs for unify_data.
def _is_sparse_matrix(data):
    """Return True when ``data`` is a SciPy sparse matrix (False if SciPy is unavailable)."""
    try:
        from scipy import sparse  # local import: SciPy is only needed for this check
    except ImportError:  # pragma: no cover
        return False
    return sparse.issparse(data)


def _default_feature_names(n_features):
    """Generate placeholder names ``feature_0001``, ``feature_0002``, ... for unnamed columns."""
    return ["feature_{0:04d}".format(i + 1) for i in range(n_features)]


def _infer_feature_types(frame):
    """Guess one feature type per column of the DataFrame ``frame``.

    A column is "categorical" when it only holds NaN/0/1 or has a string dtype,
    "continuous" when it is otherwise numeric, and "unknown" in every other case.
    NOTE(review): this is meant to stand in for InterpretML's own inference helpers,
    which are not part of this notebook -- confirm against the installed version
    before relying on inferred types.
    """
    from pandas.api.types import is_numeric_dtype, is_string_dtype

    inferred = []
    for position in range(frame.shape[1]):
        column = frame.iloc[:, position]
        if column.isin([np.nan, 0, 1]).all() or is_string_dtype(column.dtype):
            inferred.append("categorical")
        elif is_numeric_dtype(column.dtype):
            inferred.append("continuous")
        else:  # pragma: no cover
            inferred.append("unknown")
    return inferred


def unify_data(data, labels=None, feature_names=None, feature_types=None, missing_data_allowed=False):
    """Unify data into a numpy array plus feature names and feature types.

    Args:
        data: A pandas object, a list, a numpy array or a SciPy sparse matrix.
            Sparse input is converted to a dense array (a RuntimeWarning is emitted).
        labels: Optional target values; converted with ``unify_vector``.
        feature_names: Names to return as-is. When None, DataFrame column names are
            used, or placeholder names ``feature_0001``... for other inputs.
        feature_types: Types to return as-is. When None they are inferred per column
            (see ``_infer_feature_types``). For prediction, pass the types saved at
            training time rather than inferring them from the prediction batch.
        missing_data_allowed: When False, missing values in ``data`` raise.

    Returns:
        Tuple ``(new_data, new_labels, new_feature_names, new_feature_types)``.

    Raises:
        ValueError: If ``data`` has an unsupported type, if ``data`` contains missing
            values while ``missing_data_allowed`` is False, or if the labels contain
            missing values.
    """
    if isinstance(data, NDFrame):
        # NOTE: Workaround for older versions of pandas.
        try:
            new_data = data.to_numpy()
        except AttributeError:  # pragma: no cover
            new_data = data.values

        new_feature_names = list(data.columns) if feature_names is None else feature_names
        new_feature_types = _infer_feature_types(data) if feature_types is None else feature_types
    else:
        if isinstance(data, list):
            new_data = np.array(data)
        elif isinstance(data, np.ndarray):
            new_data = data
        elif _is_sparse_matrix(data):
            # Warn before converting the data to dense format.
            warn_msg = (
                "Sparse data not fully supported, will be densified for now, may cause OOM"
            )
            warnings.warn(warn_msg, RuntimeWarning)
            new_data = data.toarray()
        else:  # pragma: no cover
            msg = "Could not unify data of type: {0}".format(type(data))
            logging.getLogger(__name__).error(msg)
            raise ValueError(msg)

        # Names/types are only derived from the array when the caller did not supply them.
        new_feature_names = (
            _default_feature_names(new_data.shape[1]) if feature_names is None else feature_names
        )
        new_feature_types = (
            _infer_feature_types(pd.DataFrame(new_data)) if feature_types is None else feature_types
        )

    # unify_vector(None) is None, so the helper is only needed when labels are given.
    new_labels = None if labels is None else unify_vector(labels)

    # NOTE: Until missing handling is introduced, all methods will fail at data unification stage if present.
    new_data_has_na = bool(new_data is not None and pd.isnull(new_data).any())
    new_labels_has_na = bool(new_labels is not None and pd.isnull(new_labels).any())

    if (new_data_has_na and not missing_data_allowed) or new_labels_has_na:
        msg = "Missing values are currently not supported."
        logging.getLogger(__name__).error(msg)
        raise ValueError(msg)

    return new_data, new_labels, new_feature_names, new_feature_types

def unify_vector(data):
    """Convert a label container into a one-dimensional numpy array.

    Args:
        data: None, a pandas Series, a numpy array (flattened when it has more than
            one dimension), a list, or a pandas object with exactly one column.

    Returns:
        None when ``data`` is None, otherwise the values as a numpy array.

    Raises:
        Exception: If ``data`` is of any other type or shape.
    """
    if data is None:
        return None

    if isinstance(data, pd.Series):
        new_data = data.values
    elif isinstance(data, np.ndarray):
        new_data = data.ravel() if data.ndim > 1 else data
    elif isinstance(data, list):
        new_data = np.array(data)
    elif isinstance(data, NDFrame) and data.shape[1] == 1:
        # Single-column frame: use that column's values.
        new_data = data.iloc[:, 0].values
    else:  # pragma: no cover
        msg = "Could not unify data of type: {0}".format(type(data))
        logging.getLogger(__name__).warning(msg)
        raise Exception(msg)

    return new_data

### Function to transform the input feature value

It takes in feature values and converts them to bin indices

In [19]:
def _column_entry(table, col_idx):
    """Fetch a per-column entry keyed either by ``"3"`` (after a JSON round-trip) or by ``3``."""
    text_key = str(col_idx)
    if text_key in table:
        return table[text_key]
    return table[col_idx]


def preprocessor_transform(X_preprocessor_transform_input, col_bin_edges_, col_types_, col_mapping_, binning, missing_str):
    """Convert raw feature values into integer bin indices, column by column.

    Args:
        X_preprocessor_transform_input: 2-D numpy array of raw values, one row per record.
        col_bin_edges_: Cut points of each continuous column, keyed by column index.
        col_types_: Sequence giving "continuous", "ordinal" or "categorical" per column;
            columns of any other type are copied through unchanged.
        col_mapping_: Value -> bin index mapping of each ordinal/categorical column,
            keyed by column index.
        binning: Binning mode; only checked for the substring "private".
        missing_str: String (or list of strings) that marks a missing categorical
            value; those values map to the missing bin.

    Per-column tables may be keyed by ``str(col_idx)`` (as produced by a JSON
    round-trip, where object keys are always strings) or by the integer index.

    Encoding: missing categorical values map to 0 and unknown values to -1. When
    ``binning`` contains "private" and a categorical mapping has a "DPOther" key,
    both missing and unknown values of that column map to the "DPOther" bin instead.

    Returns:
        ``np.int64`` array of bin indices with the same shape as the input.
    """
    X_new = np.copy(X_preprocessor_transform_input)
    if issubclass(X_preprocessor_transform_input.dtype.type, np.unsignedinteger):
        # Unsigned storage cannot hold the -1 used for unknown values.
        X_new = X_new.astype(np.int64)

    n_rows = X_preprocessor_transform_input.shape[0]
    use_dp_bins = binning is not None and "private" in binning
    native = None  # loaded on first use, so inputs without continuous columns need no native library

    for col_idx in range(X_preprocessor_transform_input.shape[1]):
        col_type = col_types_[col_idx]
        col_data = X_preprocessor_transform_input[:, col_idx]

        if col_type == "continuous":
            if native is None:
                native = Native.get_native_singleton()
            cuts = np.array(_column_entry(col_bin_edges_, col_idx), dtype=np.float64)
            # The native routine returns, for every value, the index of the bin it falls in.
            X_new[:, col_idx] = native.discretize(col_data.astype(float), cuts)

        elif col_type == "ordinal":
            mapping = _column_entry(col_mapping_, col_idx)

            def ordinal_code(value, mapping=mapping):
                # Try the raw value first; fall back to its string form because
                # JSON-loaded mappings only have string keys.
                if value in mapping:
                    return mapping[value]
                return mapping.get(str(value), -1)

            X_new[:, col_idx] = np.fromiter(
                (ordinal_code(value) for value in col_data), dtype=np.int64, count=n_rows
            )

        elif col_type == "categorical":
            mapping = dict(_column_entry(col_mapping_, col_idx))

            # The constants are decided per column, so a "DPOther" bin found in one
            # column never affects the encoding of another column.
            missing_constant, unknown_constant = 0, -1
            if use_dp_bins and "DPOther" in mapping:
                missing_constant = unknown_constant = mapping["DPOther"]

            missing_markers = missing_str if isinstance(missing_str, list) else [missing_str]
            for marker in missing_markers:
                mapping[marker] = missing_constant

            # Categorical lookups are done on the string form of each value.
            as_text = col_data.astype('U')
            X_new[:, col_idx] = np.fromiter(
                (mapping.get(value, unknown_constant) for value in as_text), dtype=np.int64, count=n_rows
            )

    return X_new.astype(np.int64)

### Preprocess individual and pair features separately

In [20]:
# Load the exported parameters. The file holds one JSON document, so parse it as a
# whole (robust to pretty-printed / multi-line JSON) and read it with the same
# encoding that was used when writing it.
with open(prediction_parameter_file, 'r', encoding='utf-8') as f:
    pred_para = json.load(f)

In [22]:
# Only the unified data array is kept; labels, names and types returned by unify_data are discarded.
X_orig, _, _, _ = unify_data(X_test_small, None, pred_para['feature_names'], pred_para['feature_types'], missing_data_allowed=False)

In [23]:
# Keys shared by the single-feature and pairwise parts of the model.
feature_names = pred_para['feature_names']  # single features and pairwise interactions
interactions = pred_para['interactions']  # number of pairwise interactions in the model
feature_types = pred_para['feature_types']  # type of each feature (e.g. continuous or categorical)

In [24]:
# Keys specific to the single-feature terms; every `preprocessor_*` key applies to single features only.

# Bin edges of each continuous column: a value below an edge but above the previous edge belongs to that bin.
preprocessor_col_bin_edges_ = pred_para['preprocessor_col_bin_edges_']
# Whether each column is continuous, ordinal or categorical.
preprocessor_col_types_ = pred_para['preprocessor_col_types_']
# Value -> bin index for categorical columns (features 1 & 2 here). E.g. with
# {'0.0': 1, '1.0': 2} a column value of 0 / 0.0 becomes 1. dtype: float.
preprocessor_col_mapping_ = pred_para['preprocessor_col_mapping_']
# At prediction time this mainly decides whether the differential-privacy ("private") branch is taken.
preprocessor_binning = pred_para['preprocessor_binning']
# How a missing value is represented in categorical columns; preprocessing maps it to 0.
preprocessor_missing_str = pred_para['preprocessor_missing_str']

In [25]:
# Keys specific to the pairwise (interaction) terms; same meaning as the single-feature keys.
pair_preprocessor_col_bin_edges_ = pred_para['pair_preprocessor_col_bin_edges_']
pair_preprocessor_col_types_ = pred_para['pair_preprocessor_col_types_']
pair_preprocessor_col_mapping_ = pred_para['pair_preprocessor_col_mapping_']
pair_preprocessor_binning = pred_para['pair_preprocessor_binning']
pair_preprocessor_missing_str = pred_para['pair_preprocessor_missing_str']

In [30]:
# Individual features: bin every column, then transpose so that each column is a
# record, i.e. shape (# of features, # of records).
X = np.ascontiguousarray(
    preprocessor_transform(
        X_orig,
        preprocessor_col_bin_edges_,
        preprocessor_col_types_,
        preprocessor_col_mapping_,
        preprocessor_binning,
        preprocessor_missing_str,
    ).T
)

In [33]:
# Pair features get their own binning, but only when the model has interaction terms.
if interactions > 0:
    X_pair = np.ascontiguousarray(
        preprocessor_transform(
            X_orig,
            pair_preprocessor_col_bin_edges_,
            pair_preprocessor_col_types_,
            pair_preprocessor_col_mapping_,
            pair_preprocessor_binning,
            pair_preprocessor_missing_str,
        ).T
    )  # each column is a record, shape (# of features, # of records)
else:
    X_pair = None

## Predict & score Function

The algorithm at a high level is:
- Using the information from the preprocessor and pair_preprocessor above, each feature value is discretized into a bin.
- `additive_terms_` contains the score value of each bin for each feature and feature pair.
- The prediction is simply the intercept plus the sum of these per-term score values.


To Do:<br>
- Find out how pair features are discretized

In [34]:
def scores_by_feature_group(X, X_pair, feature_groups, model):
    """Yield the per-record score of every term, one term at a time.

    Args:
        X: Bin indices of the single features, shape (# of features, # of records).
        X_pair: Bin indices used for multi-feature terms, same layout as ``X``;
            when None, ``X`` is used for every term.
        feature_groups: For each term, the list of feature row indices it uses.
        model: For each term, the array of scores indexed by bin index.

    Yields:
        ``(term index, feature group, scores)`` where ``scores`` has one entry per record.
    """
    for set_idx, feature_group in enumerate(feature_groups):
        # Single-feature terms always read from X; other terms read from X_pair when it exists.
        is_single_feature = len(feature_group) == 1
        bin_source = X if (is_single_feature or X_pair is None) else X_pair
        bin_indices = bin_source[feature_group, :]

        # One index array per feature of the term -> one looked-up score per record.
        term_scores = model[set_idx][tuple(bin_indices)]

        # Records with an unknown (negative, not missing!) bin in any feature of the
        # term contribute 0. This assumes all logits are zero weighted centered, and
        # ideally tensors are purified.
        has_unknown = np.any(bin_indices < 0, axis=0)
        term_scores[has_unknown] = 0

        yield set_idx, feature_group, term_scores
            
def decision_function(X, X_pair, feature_groups, model, intercept):
    """Compute the raw model output: intercept plus the sum of all term scores.

    Args:
        X: Bin indices of the single features, shape (# of features, # of records);
            a 1-D array is treated as a single record.
        X_pair: Bin indices for the pairwise terms (or None).
        feature_groups: For each term, the list of feature row indices it uses.
        model: For each term, the array of scores indexed by bin index.
        intercept: A number, or a sequence of numbers.

    Returns:
        Array with one score per record (one row per record and one column per
        intercept entry when ``intercept`` has more than one element).

    Raises:
        Exception: If any resulting score is NaN or infinite.
    """
    if X.ndim == 1:
        X = X.reshape(X.shape[0], 1)

    # Start every record at the intercept.
    n_records = X.shape[1]
    if isinstance(intercept, numbers.Number) or len(intercept) == 1:
        score_vector = np.empty(n_records)
    else:
        score_vector = np.empty((n_records, len(intercept)))
    np.copyto(score_vector, intercept)

    # Add the score of each feature / feature pair according to the bin the record fell in.
    for _, _, scores in scores_by_feature_group(X, X_pair, feature_groups, model):
        score_vector += scores

    if not np.all(np.isfinite(score_vector)):  # pragma: no cover
        msg = "Non-finite values present in log odds vector."
        logging.getLogger(__name__).error(msg)
        raise Exception(msg)

    return score_vector



def regressor_predict(X, X_pair, feature_groups, model, intercept):
    """Predict regression targets; for a regressor the prediction is the raw decision-function score."""
    return decision_function(X, X_pair, feature_groups, model, intercept)

In [36]:
# Term definitions: 7 single features plus 10 interactions.
feature_groups_ = pred_para['feature_groups_']
# One score array per term (single feature or feature pair), indexed by bin / category value.
# JSON stored them as nested lists, so convert back to numpy arrays for indexing.
additive_terms_ = [np.array(term) for term in pred_para['additive_terms_']]
intercept_ = pred_para['intercept_']

In [43]:
# Prediction computed purely from the parameters loaded from JSON.
scores_wihtout_interpret = regressor_predict(X, X_pair, feature_groups_, additive_terms_, intercept_)

In [44]:
# Element-wise exact comparison against the library's predictions.
scores == scores_wihtout_interpret

array([ True,  True,  True])

## Mostly similar function which also keeps record of explanation

In [45]:
def decision_function_and_explain(X, X_pair, feature_groups, model, intercept):
    """Compute the raw model output together with the contribution of every term.

    Args:
        X: Bin indices of the single features, shape (# of features, # of records);
            a 1-D array is treated as a single record.
        X_pair: Bin indices for the pairwise terms (or None).
        feature_groups: For each term, the list of feature row indices it uses.
        model: For each term, the array of scores indexed by bin index.
        intercept: A number, or a sequence of numbers.

    Returns:
        ``(score_vector, explanations)``: the scores as returned by
        ``decision_function`` and an array of shape (# of records, # of terms)
        whose column ``i`` holds the score contributed by term ``i``.

    Raises:
        Exception: If any resulting score is NaN or infinite.
    """
    if X.ndim == 1:
        X = X.reshape(X.shape[0], 1)

    # Start every record at the intercept.
    n_records = X.shape[1]
    if isinstance(intercept, numbers.Number) or len(intercept) == 1:
        score_vector = np.empty(n_records)
    else:
        score_vector = np.empty((n_records, len(intercept)))
    np.copyto(score_vector, intercept)

    # One column per term. Sizing by the number of terms (rather than
    # "# of features + # of interactions") guarantees every column is written,
    # even when not every feature has its own single-feature term.
    explanations = np.empty((n_records, len(feature_groups)))

    for set_idx, _, scores in scores_by_feature_group(X, X_pair, feature_groups, model):
        score_vector += scores
        explanations[:, set_idx] = scores

    if not np.all(np.isfinite(score_vector)):  # pragma: no cover
        msg = "Non-finite values present in log odds vector."
        logging.getLogger(__name__).error(msg)
        raise Exception(msg)

    return score_vector, explanations
    
def regressor_predict_and_contrib(X, X_pair, feature_groups, model, intercept):
    """Predict regression targets and return them with the per-term contributions."""
    return decision_function_and_explain(X, X_pair, feature_groups, model, intercept)

In [47]:
# Stand-alone prediction again, this time also collecting the per-term scores.
scores_wihtout_interpret_e, explanations_wihtout_interpret = regressor_predict_and_contrib(X, X_pair, feature_groups_, additive_terms_, intercept_)

In [48]:
# Element-wise exact comparison against the library's predictions.
scores == scores_wihtout_interpret_e

array([ True,  True,  True])

In [50]:
# Element-wise exact comparison of the library's second return value with the stand-alone per-term scores.
explanation_ex == explanations_wihtout_interpret

array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True]])