In [2]:
import os, sys
sys.path.append(os.path.abspath('../utils'))
import pandas as pd
from sklearn.linear_model import LogisticRegression
import dca_fs_tools as dcat
import pickle
import numpy as np

# An iterative method based on full model feature importance

We implement an iterative method of feature selection based on net benefit. It is similar to forward-stepwise selection, except that the order in which features are included is fixed by their importance in a full model.

Features are included iteratively, and the model is re-fit at each step. At each step the net benefit (or the mean net benefit over all threshold probabilities) is calculated. The optimum set of features is the one that maximizes net benefit.

This example is based on the scikit-learn `make_classification` synthetic data set described in [00_synthetic_data_description.ipynb](./00_synthetic_data_description.ipynb).

Generate the synthetic dataset:

In [3]:
# Build the synthetic two-class data set with the project helper.
# The three returned objects are unpacked as training frame, test frame and
# the list of independent-variable names (going by the variable names here).
# random_state is fixed, presumably so the data set is reproducible.
n_sample = 1000

df_train, df_test, ind_var_names = dcat.make_class_dataset(
    n_sample=n_sample,
    n_features=5,
    n_redundant=0,
    random_state=1001,
    n_informative=4,
    n_clusters_per_class=1,
    n_classes=2,
)

Define the logistic regression model (this could be any type of model):

In [4]:
# C is scikit-learn's inverse regularization strength, so a value this large
# is presumably meant to make the penalty negligible (an effectively
# unpenalized logistic regression) -- TODO confirm intent.
LOGREG_C = 10**18
logreg = LogisticRegression(C=LOGREG_C)

Define a function for performing the iterative approach. The steps are described in code comments:

In [5]:
def feature_importance_dcafs(train_data, test_data, model, harms_dict, dependent="y", nb_thresholds = [0.9, 0.6]):
    """Select features by net benefit, adding them in a fixed importance order.

    The features named in ``harms_dict`` are ranked once using the output of
    ``dcat.skl_reg_path``; nested feature subsets (top 1, top 2, ...) are then
    fitted with ``model`` and scored on ``test_data``. The summed harm of the
    features in each subset is subtracted from every net-benefit figure.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Frames containing the feature columns and the ``dependent`` column.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict_proba(X)``; it is re-fitted
        in place for every subset. Column 1 of ``predict_proba`` is used as
        the predicted probability.
    harms_dict : dict
        Maps each candidate feature name to its test harm. The keys define the
        set of candidate features.
    dependent : str, default "y"
        Name of the outcome column.
    nb_thresholds : list of float
        Threshold probabilities at which net benefit is reported. The default
        list is never mutated by this function.

    Returns
    -------
    pandas.DataFrame
        One row per subset (index 0..n-1) with columns ``features`` (list of
        feature names), ``mnb`` (mean net benefit minus harm) and one
        ``net_benefit_pt_<k>`` column per entry of ``nb_thresholds`` (net
        benefit minus harm). The frame is assembled in one step from
        per-subset records, so numeric columns have a numeric dtype.
    """
    # Materialise the keys: a list is a safe pandas column indexer and has a
    # stable length/order for the rest of the function.
    independent = list(harms_dict)

    # Identify feature importances with the regularization path
    # (other methods are possible).
    skl_path = dcat.skl_reg_path(train_data,
                                 test_data,
                                 log_space_min = 0,
                                 log_space_max = 3.5,
                                 log_space_steps = 64,
                                 nb_thresholds = nb_thresholds)

    # Order features by L1 regularization: count, per feature column of the
    # path, the entries equal to zero; features with the fewest zeros come first.
    zero_counts = (skl_path.loc[:, independent] == 0).astype(int).sum(axis=0)
    feature_imp = zero_counts.sort_values().index.to_list()

    thresholds = np.asarray(nb_thresholds)
    nb_threshs_cols = ["net_benefit_pt_" + str(k) for k in range(len(nb_thresholds))]

    # Iterate through the nested feature subsets, fitting a model for each and
    # recording its harm-adjusted net benefit. The caller picks the subset
    # that maximizes the metric of interest.
    rows = []
    for n_selected in range(1, len(feature_imp) + 1):
        model_features = feature_imp[:n_selected]

        # Fit the model restricted to the current subset of features.
        model.fit(train_data[model_features], train_data[dependent])

        # Make predictions on the test set.
        pred = model.predict_proba(test_data[model_features])

        # Mean net benefit over the threshold grid.
        mnb = dcat.mean_net_benefit(test_data[dependent], pred[:, 1], n_thresh=100)['mnb']

        # Net benefit at the specific thresholds requested.
        nb_thresh = dcat.net_benefit(test_data[dependent], pred[:, 1], thresholds = thresholds)[1]

        # Total test harm of the included features, taken from the argument
        # (not from any notebook-level global).
        harm = sum([harms_dict[feature] for feature in model_features])

        row = {"features": model_features, "mnb": mnb - harm}
        row.update(zip(nb_threshs_cols, [nb - harm for nb in nb_thresh]))
        rows.append(row)

    # Passing `columns` fixes the column order and also yields a correctly
    # shaped empty frame when there are no candidate features.
    return pd.DataFrame(rows, columns = ["features", "mnb"] + nb_threshs_cols)
        


Apply the method:

In [12]:
# Test harm assigned to each candidate feature.
test_harms = {
    "x0": 0.0,
    "x1": 0.015,
    "x2": 0.03,
    "x3": 0.045,
    "x4": 0.06,
}

forward_selection = feature_importance_dcafs(
    df_train,
    df_test,
    logreg,
    test_harms,
    nb_thresholds=[0.8, 0.2],
)

# Save for later comparison to other methods
with open('../data/forward_selection.pkl', 'wb') as pkl_file:
    pickle.dump(forward_selection, pkl_file)

# Display only: relabel the positional threshold columns with the threshold
# values passed above. rename() returns a copy, so forward_selection itself
# keeps its original column names.
threshold_labels = {
    "net_benefit_pt_0": "net_benefit p_t=0.8",
    "net_benefit_pt_1": "net_benefit p_t=0.2",
}
forward_selection.rename(columns=threshold_labels)


Unnamed: 0,features,mnb,net_benefit p_t=0.8,net_benefit p_t=0.2
0,[x1],0.209659,0.03,0.3475
1,"[x1, x4]",0.219324,0.105,0.32375
2,"[x1, x4, x3]",0.1786,0.09,0.285
3,"[x1, x4, x3, x0]",0.172733,0.08,0.28875
4,"[x1, x4, x3, x0, x2]",0.142756,0.055,0.2575


### Conclusion

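We see that both the mean net benefit and the net benefit at $p_t = 0.8$ are maximized when features $\{x1, x4\}$ are selected. At $p_t = 0.2$, however, the single-feature model $\{x1\}$ has the highest net benefit (0.3475 versus 0.32375 for $\{x1, x4\}$), so in this example the selected feature set depends on the threshold probability considered.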