In [1]:
import os, sys
sys.path.append(os.path.abspath('../utils'))

import pandas as pd
import numpy as np
import random
import math
import scipy
from scipy import stats

from sklearn.preprocessing import StandardScaler  # for feature scaling
from sklearn.model_selection import train_test_split  # for train/test split
import matplotlib.pyplot as plt  #For representation
import matplotlib.gridspec as gridspec
from sklearn.linear_model import LogisticRegression
from sklearn.svm import l1_min_c  # for L1 regluarization path

# Example code for calculating accuracy, precision, recall, and F1-score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score
#from statkit.decision import net_benefit
from statkit.decision import net_benefit
from scipy import integrate


import dca_fs_tools as dcat
import dca_fs as dcafs

import torch
import copy

import pickle

In [2]:
# Number of samples requested from the synthetic dataset helper.
n_sample = 1000

# Settings for the synthetic two-class problem: five features, four of them
# informative and none redundant, one cluster per class, fixed seed.
dataset_options = dict(
    n_sample=n_sample,
    n_features=5,
    n_redundant=0,
    random_state=1001,
    n_informative=4,
    n_clusters_per_class=1,
    n_classes=2,
)

# The helper returns the train frame, the test frame and the feature names.
df_train, df_test, ind_var_names = dcat.make_class_dataset(**dataset_options)

In [3]:
# scikit-learn's C is the inverse regularization strength, so a huge value
# makes the penalty negligible (effectively unregularized logistic regression).
EFFECTIVELY_UNREGULARIZED_C = 10**18
logreg = LogisticRegression(C=EFFECTIVELY_UNREGULARIZED_C)

In [4]:

# Harm attributed to each feature; the harms of the features a model uses are
# summed and subtracted from that model's mean net benefit.
test_harms = {"x0": 0.0, "x1": 0.015, "x2": 0.03, "x3": 0.045, "x4": 0.06}

# Name of the outcome column.
dependent = "y"

# Regularization path computed on the training data (used to rank features).
skl_path = dcat.skl_reg_path(
    df_train,
    log_space_min=0,
    log_space_max=3.5,
    log_space_steps=64,
)

# Rank features: count, per feature, how many rows of the path hold a zero
# entry, then put the features with the fewest zeros first.
zero_counts = (skl_path.loc[:, ind_var_names] == 0).astype(int).sum(axis=0)
feature_imp = zero_counts.sort_values().index.to_list()

# Walk through nested subsets (top 1, top 2, ... features), fit a model on
# each and report its mean net benefit after subtracting the feature harms.
# NOTE(review): the model is both fitted and scored on df_test, so the printed
# values are in-sample. Fitting on df_train instead would change the recorded
# output of this cell -- confirm which was intended.
for i in range(len(ind_var_names)):
    model_features = feature_imp[: i + 1]
    print(model_features)

    # Fit the model restricted to the current subset of features.
    logreg.fit(df_test[model_features], df_test[dependent])

    # Predicted class probabilities; column 1 is passed on as the score.
    pred = logreg.predict_proba(df_test[model_features])

    # Mean net benefit over 100 thresholds.
    mnb = dcat.mean_net_benefit(df_test[dependent], pred[:, 1], n_thresh=100)['mnb']

    # Total harm of the features used by this model.
    harm = sum(test_harms[feature] for feature in model_features)

    mnb = mnb - harm
    print(mnb)


['x1']
0.21385750755249888
['x1', 'x4']
0.2244450144712049
['x1', 'x4', 'x3']
0.18334457454593073
['x1', 'x4', 'x3', 'x0']
0.1782937341943237
['x1', 'x4', 'x3', 'x0', 'x2']
0.15340989796623714


In [5]:
def feature_importance_dcafs(train_data, test_data, model, harms_dict, dependent="y"):
    """Score nested feature subsets by mean net benefit minus feature harms.

    Features are ranked with ``dcat.skl_reg_path`` computed on ``train_data``:
    the features with the fewest zero entries along the path come first. For
    every prefix of that ranking (top 1, top 2, ...), ``model`` is fitted on
    ``train_data``, scored on ``test_data`` with ``dcat.mean_net_benefit`` and
    the summed harms of the features used are subtracted.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Data used for the regularization path and for fitting each model.
    test_data : pandas.DataFrame
        Data the fitted models are scored on.
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict_proba(X)``; it is
        refitted in place once per subset.
    harms_dict : dict
        Maps feature name to its harm. The keys define the candidate features.
    dependent : str, default "y"
        Name of the outcome column in both frames.

    Returns
    -------
    pandas.DataFrame
        One row per subset, in order of increasing size, with columns
        ``features`` (list of names), ``mnb`` (mean net benefit before harms),
        ``harm`` (summed harms) and ``net_mnb`` (``mnb - harm``). Empty when
        ``harms_dict`` is empty.
    """
    # A concrete list (not a dict_keys view) is a safe pandas column indexer.
    independent = list(harms_dict)

    # Identify feature importances with the regularization path
    # (other methods are possible).
    skl_path = dcat.skl_reg_path(train_data,
                                log_space_min = 0,
                                log_space_max = 3.5,
                                log_space_steps = 64)

    # Order features by how rarely their path entry is exactly zero.
    zero_counts = (skl_path.loc[:, independent] == 0).astype(int).sum(axis=0)
    feature_imp = zero_counts.sort_values().index.to_list()

    records = []
    for n_used in range(1, len(feature_imp) + 1):
        model_features = feature_imp[:n_used]

        # Fit on the training data so the test-set score is out-of-sample.
        model.fit(train_data[model_features], train_data[dependent])

        # Predicted probabilities on the test set; column 1 is used as score.
        pred = model.predict_proba(test_data[model_features])

        mnb = dcat.mean_net_benefit(test_data[dependent], pred[:, 1], n_thresh=100)['mnb']

        # Use the harms passed by the caller, not a notebook-level global.
        harm = sum(harms_dict[name] for name in model_features)

        records.append({
            "features": model_features,
            "mnb": mnb,
            "harm": harm,
            "net_mnb": mnb - harm,
        })

    return pd.DataFrame(records, columns=["features", "mnb", "harm", "net_mnb"])
        


In [5]:

# Scratch check: dict.keys() yields a dict_keys view (see the repr below), not a list.
{"a": [1,2,3], "b": 6}.keys()


dict_keys(['a', 'b'])