In [1]:
import os, sys
sys.path.append(os.path.abspath('../utils'))

import pandas as pd
import numpy as np
import random
import math
import scipy
from scipy import stats

from sklearn.preprocessing import StandardScaler  # for feature scaling
from sklearn.model_selection import train_test_split  # for train/test split
import matplotlib.pyplot as plt  #For representation
import matplotlib.gridspec as gridspec
from sklearn.linear_model import LogisticRegression
from sklearn.svm import l1_min_c  # for L1 regluarization path

# Example code for calculating accuracy, precision, recall, and F1-score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score
#from statkit.decision import net_benefit
from statkit.decision import net_benefit
from scipy import integrate


import dca_fs_tools as dcat
import dca_fs as dcafs

import torch
import copy

import pickle

# Backward stepwise selection based on mean net benefit

We implement a backward stepwise selection procedure based on net benefit.

Features are iteratively removed from the full model, one per step, until a single feature remains. The selected model is the one with the maximum net benefit (here, the mean net benefit across all threshold probabilities).

This example is based on the scikit-learn `make_classification` synthetic data set described in [00_synthetic_data_description.ipynb](./00_synthetic_data_description.ipynb).

Generate the synthetic data set:

In [2]:
# Sample count handed to the synthetic data generator below.
n_sample = 1000

# Two-class problem with five features, four of them informative and none
# redundant. random_state is pinned (presumably so the data is reproducible).
df_train, df_test, ind_var_names = dcat.make_class_dataset(
    n_sample=n_sample,
    n_features=5,
    n_informative=4,
    n_redundant=0,
    n_classes=2,
    n_clusters_per_class=1,
    random_state=1001,
)

Define the logistic regression model (this could be any type of model).

In [3]:
# In scikit-learn, C is the inverse of the regularisation strength, so this
# very large value makes the penalty negligible (an effectively unpenalised
# logistic regression).
logreg = LogisticRegression(C=10**18)

Define a function for performing the backward stepwise selection procedure. The steps are described in code comments:

In [4]:
def _harm_adjusted_mnb(train_data, test_data, model, features, harms_dict, dependent, n_thresh):
    """Fit `model` on `features` and return its test-set mean net benefit minus the features' harms."""
    model.fit(train_data[features], train_data[dependent])
    pred = model.predict_proba(test_data[features])

    # Column 1 of predict_proba is passed on as the predicted probability.
    mnb = dcat.mean_net_benefit(test_data[dependent], pred[:, 1], n_thresh=n_thresh)['mnb']

    # Every feature the model uses contributes its test harm as a penalty.
    return mnb - sum(harms_dict[feature] for feature in features)


def backward_stepwise_dcafs(train_data, test_data, model, harms_dict, dependent="y", n_thresh=100):
    """Backward stepwise feature selection on harm-adjusted mean net benefit.

    Starting from the model that uses every feature named in `harms_dict`,
    each step refits the model once per remaining feature with that feature
    left out, scores each candidate on the test data, and permanently drops
    the feature whose removal gives the highest score. The procedure runs
    until a single feature remains; picking the best row is left to the
    caller.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Data used to fit the model; must contain the feature columns and
        the `dependent` column.
    test_data : pandas.DataFrame
        Data used to score each candidate model; same columns as
        `train_data`.
    model : object
        Estimator exposing `fit(X, y)` and `predict_proba(X)`. It is refit
        in place for every candidate feature set.
    harms_dict : dict
        Maps each candidate feature name to its test harm. The keys define
        the full feature set (in iteration order); the values are summed
        over the features a model uses and subtracted from its mean net
        benefit.
    dependent : str, default "y"
        Name of the outcome column.
    n_thresh : int, default 100
        Forwarded to `dcat.mean_net_benefit` as its `n_thresh` argument.

    Returns
    -------
    pandas.DataFrame
        Columns "features" (list of feature names) and "mnb" (harm-adjusted
        mean net benefit). Row 0 is the full model; row k is the best model
        after dropping k features.

    Raises
    ------
    ValueError
        If `harms_dict` is empty.
    """
    # Use the harms passed in by the caller (not a notebook-level global) and
    # materialise the keys as a list so they can index DataFrame columns.
    features_left = list(harms_dict)
    if not features_left:
        raise ValueError("harms_dict must name at least one feature")

    def score(features):
        return _harm_adjusted_mnb(train_data, test_data, model, features,
                                  harms_dict, dependent, n_thresh)

    # Initialise with the full model.
    steps = [{"features": list(features_left), "mnb": score(features_left)}]

    while len(features_left) > 1:
        # One candidate feature set per feature that could be dropped next.
        candidates = [
            [kept for kept in features_left if kept != dropped]
            for dropped in features_left
        ]
        candidate_mnbs = [score(candidate) for candidate in candidates]

        # idxmax returns the first maximum, so exactly one feature is removed
        # per step even when several candidates tie on the best score.
        best = int(pd.Series(candidate_mnbs, dtype=float).idxmax())

        features_left = candidates[best]
        steps.append({"features": features_left, "mnb": candidate_mnbs[best]})

    # Build the result once instead of growing a DataFrame row by row.
    return pd.DataFrame(steps, columns=["features", "mnb"])

Run the selection procedure on the synthetic data:

In [5]:
# Per-feature test harms: the harms of the features a model uses are summed
# and subtracted from that model's mean net benefit.
test_harms = {
    "x0": 0.0,
    "x1": 0.015,
    "x2": 0.03,
    "x3": 0.045,
    "x4": 0.06,
}

backward_selection = backward_stepwise_dcafs(
    train_data=df_train,
    test_data=df_test,
    model=logreg,
    harms_dict=test_harms,
)

# Save for later comparison to other methods
with open('../data/backward_selection.pkl', 'wb') as f:
    pickle.dump(backward_selection, f)

# Display the selection path as the cell output.
backward_selection

Unnamed: 0,features,mnb
0,"[x0, x1, x2, x3, x4]",0.142756
1,"[x0, x1, x2, x4]",0.180113
2,"[x0, x1, x4]",0.212192
3,"[x1, x4]",0.219324
4,[x1],0.209659


### Conclusion

We see that the maximum harm-adjusted mean net benefit (approximately 0.219) occurs when features $\{ x1, x4\}$ are selected.