In [2]:
import os, sys
sys.path.append(os.path.abspath('../utils'))

import pandas as pd
import numpy as np
import random
import math
import scipy
from scipy import stats

from sklearn.preprocessing import StandardScaler  # for feature scaling
from sklearn.model_selection import train_test_split  # for train/test split
import matplotlib.pyplot as plt  #For representation
import matplotlib.gridspec as gridspec
from sklearn.linear_model import LogisticRegression
from sklearn.svm import l1_min_c  # for L1 regluarization path

# Example code for calculating accuracy, precision, recall, and F1-score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score
#from statkit.decision import net_benefit
from statkit.decision import net_benefit
from scipy import integrate


import dca_fs_tools as dcat
import dca_fs as dcafs

import torch
import copy

import pickle

In [3]:
n_sample = 1000

df_train, df_test, ind_var_names = dcat.make_class_dataset(n_sample = n_sample,
                                       n_features = 5,
                                       n_redundant = 0,
                                       random_state = 1001,
                                       n_informative = 4,
                                       n_clusters_per_class = 1,
                                       n_classes = 2)

In [4]:
logreg = LogisticRegression(C=10**18)

In [None]:

# fit full model
#logreg.fit(df_train[ind_var_names], df_train["y"])

# define feature harms

test_harms = {"x0": 0.0, "x1": 0.015, "x2": 0.03, "x3": 0.045, "x4": 0.06 }
 
dependent = "y"

#identify feature importances
#skl_path = dcat.skl_reg_path(df_train,
#                            log_space_min = 0,
#                            log_space_max = 3.5,
#                            log_space_steps = 64)

# order featuures by l1 regularization
#feature_imp = (skl_path.loc[:, ind_var_names]==0).astype(int).sum(axis=0).sort_values().index.to_list()

# itterate through the features fitting a model for each subset
# Calculate the mean net benefit for each model select the modle
# that maximizes mean net benefit

features_left = test_harms.keys()

# Initialize with full model
out = pd.DataFrame(columns = ["features", "mnb"])
logreg.fit(df_test[features_left], df_test["y"])
pred = logreg.predict_proba(df_test[features_left])
mnb = dcat.mean_net_benefit(df_test[dependent], pred[:, 1], n_thresh=100)['mnb']
harm = sum([test_harms[i] for i in features_left]) 
mnb = mnb - harm
out.loc[0] = pd.Series({"features": list(features_left), "mnb": mnb})

for n_dropped in range(len(test_harms.keys())-1):
    print(n_dropped)

    mnb_per_drop = pd.DataFrame(columns = ["droped_feature", "model_features","mnb"])
    for i, droped_feature in enumerate(features_left):
        #build a model with  the looped feature removed
    
        model_features = [i for i in features_left if not(i == droped_feature)]

        # build a model with the selected features
        logreg.fit(df_test[model_features], df_test["y"])

        ## Make predictions on the test set:
        pred = logreg.predict_proba(df_test[model_features])

        # auc
        #auc = roc_auc_score(df_test[dependent],pred[:, 1])

        # mnb
        mnb = dcat.mean_net_benefit(df_test[dependent], pred[:, 1], n_thresh=100)['mnb']

        #Include test harms
        harm = sum([test_harms[i] for i in model_features]) 

        mnb = mnb - harm

        mnb_per_drop.loc[i] = pd.Series({"droped_feature": droped_feature, "model_features": model_features, "mnb": mnb})
        #print(mnb_per_drop)
        #print(model_features, mnb)

    #print("=====================")
    #print(mnb_per_drop)
    #print("=====================")
    # drop the feature that results in the largest mean net benefit (i.e. has least impact on net benefit)
    #print(mnb_per_drop)
    excluded_feature = mnb_per_drop[mnb_per_drop['mnb']==mnb_per_drop['mnb'].max()]
    #print(excluded_feature["droped_feature"].to_list())
    ef = excluded_feature["droped_feature"].to_list()
    features_left = [f for f in features_left if not f in ef]
    mnb = excluded_feature["mnb"].to_list()[0]
    #update
    out.loc[n_dropped+1] = pd.Series({"features": features_left, "mnb": mnb})



out


0
1
2
3


Unnamed: 0,features,mnb
0,"[x0, x1, x2, x3, x4]",0.15341
1,"[x0, x1, x2, x4]",0.195265
2,"[x0, x1, x4]",0.22011
3,"[x1, x4]",0.224445
4,[x1],0.213858


In [19]:
def backward_stepwise_dcafs(train_data, test_data, model, harms_dict, dependent="y"):

    features_left = test_harms.keys()

    # Initialize with full model
    out = pd.DataFrame(columns = ["features", "mnb"])
    model.fit(train_data[features_left], train_data[dependent])
    pred = model.predict_proba(test_data[features_left])
    mnb = dcat.mean_net_benefit(test_data[dependent], pred[:, 1], n_thresh=100)['mnb']
    harm = sum([test_harms[i] for i in features_left]) 
    mnb = mnb - harm
    out.loc[0] = pd.Series({"features": list(features_left), "mnb": mnb})

    for n_dropped in range(len(test_harms.keys())-1):
        #print(n_dropped)

        mnb_per_drop = pd.DataFrame(columns = ["droped_feature", "model_features","mnb"])
        for i, droped_feature in enumerate(features_left):
            #build a model with  the looped feature removed
    
            model_features = [i for i in features_left if not(i == droped_feature)]

            # build a model with the selected features
            model.fit(train_data[model_features], train_data[dependent])

            ## Make predictions on the test set:
            pred = model.predict_proba(test_data[model_features])

            # auc
            #auc = roc_auc_score(df_test[dependent],pred[:, 1])

            # mnb
            mnb = dcat.mean_net_benefit(test_data[dependent], pred[:, 1], n_thresh=100)['mnb']

            #Include test harms
            harm = sum([test_harms[i] for i in model_features]) 

            mnb = mnb - harm

            mnb_per_drop.loc[i] = pd.Series({"droped_feature": droped_feature, "model_features": model_features, "mnb": mnb})


        excluded_feature = mnb_per_drop[mnb_per_drop['mnb']==mnb_per_drop['mnb'].max()]
    
        ef = excluded_feature["droped_feature"].to_list()
        features_left = [f for f in features_left if not f in ef]
        mnb = excluded_feature["mnb"].to_list()[0]
        out.loc[n_dropped+1] = pd.Series({"features": features_left, "mnb": mnb})



    return out

In [20]:
test_harms = {"x0": 0.0, "x1": 0.015, "x2": 0.03, "x3": 0.045, "x4": 0.06 }

test = backward_stepwise_dcafs(df_train, df_test, logreg, test_harms)

test

Unnamed: 0,features,mnb
0,"[x0, x1, x2, x3, x4]",0.142756
1,"[x0, x1, x2, x4]",0.180113
2,"[x0, x1, x4]",0.212192
3,"[x1, x4]",0.219324
4,[x1],0.209659
