In [1]:
import datetime
import pandas as pd

from src.functions import lmb_opt, lmb_opt_group, find_best_features_subset, find_best_group_subset
from src.plot_functions import score, plot_scores

# Metadata

In [3]:
# Cost budget passed as `budget=` to the feature-selection calls further down.
BUDGET = 100
# Passed as `total_cost=` to plot_scores; presumably the summed cost of all
# feature groups — TODO confirm against the cost table.
TOTAL_COST = 680

# Load data

### Replace with your own dataset

In [5]:
# Single place to repoint the notebook at another dataset: the base data
# directory, the dataset sub-directory and the name of the label column.
DATA_DIR = './../first_experiment/data'
HYPERTENSION_DIR = f'{DATA_DIR}/hypertension'
TARGET_COLUMN = 'hypertension'

# Feature matrices are kept as DataFrames; labels are reduced to 1-D integer arrays.
X_train = pd.read_csv(f'{HYPERTENSION_DIR}/X_train.csv')
y_train = pd.read_csv(f'{HYPERTENSION_DIR}/y_train.csv')[TARGET_COLUMN].values.astype(int)
X_test = pd.read_csv(f'{HYPERTENSION_DIR}/X_test.csv')
y_test = pd.read_csv(f'{HYPERTENSION_DIR}/y_test.csv')[TARGET_COLUMN].values.astype(int)

# Semicolon-separated table; the columns read later in this notebook are
# `feature`, `costs`, `group_id` and `feature_id`.
groups_df = pd.read_csv(f'{DATA_DIR}/costs_with_group_id.csv', header=0, sep=';')

In [8]:
# Display the shapes of the train/test feature frames and label arrays as a quick sanity check.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((17795, 305), (1978, 305), (17795,), (1978,))

In [17]:
# Peek at a few random rows of the group/cost table. The fixed seed keeps the
# displayed rows identical across "Restart Kernel -> Run All".
groups_df.sample(5, random_state=0)

Unnamed: 0,feature,costs,group_id,feature_id
85,Leukocytes [#/volume] in Blood median,2.0,22,85
117,Specific gravity of Urine by Test strip median,2.0,33,117
69,Platelets [#/volume] in Blood median,2.0,18,69
166,LLL Lung Sounds freq NOT CLEAR,9.0,49,166
303,Renal SOFA Score freq 1-2,12.0,87,303


In [9]:
# Cost per feature group: the first `costs` value seen for each group_id is kept
# (assumes the cost is constant within a group — TODO confirm).
group_costs = {}
for group_id, cost in zip(groups_df['group_id'], groups_df['costs']):
    group_costs.setdefault(group_id, cost)

# Min-max scale the group costs into [cost_floor, 1].
cost_floor = 0.1
min_cost = min(group_costs.values())
max_cost = max(group_costs.values())
cost_range = max_cost - min_cost
if cost_range == 0:
    # Every group costs the same: the scaling formula would divide by zero, so
    # fall back to the lower bound (the numerator `cost - min_cost` is 0 for all groups).
    normalized_costs = {group_id: cost_floor for group_id in group_costs}
else:
    normalized_costs = {
        group_id: (1 - cost_floor) * (cost - min_cost) / cost_range + cost_floor
        for group_id, cost in group_costs.items()
    }

# group_id -> list of feature_ids belonging to that group, in table order.
groups = {}
for group_id, feature_id in zip(groups_df['group_id'], groups_df['feature_id']):
    groups.setdefault(group_id, []).append(feature_id)

# feature_id -> human-readable feature name (a later duplicate id overwrites an earlier one).
features_dict = dict(zip(groups_df['feature_id'], groups_df['feature']))


# Hard-coded penalty weights, presumably cached from an earlier run of the
# optimisers below; set either one to None to recompute it.
l_opt = 0.21224175566366077
l_opt_group = 4.198612604606511

# NOTE(review): both optimisers are called with budget=20, not BUDGET — confirm this is intended.
if l_opt is None:
    l_opt = lmb_opt(X=X_train, y=y_train, groups=groups, group_costs=group_costs, normalized_costs=normalized_costs, budget=20, n=10, m=10000)
if l_opt_group is None:
    l_opt_group = lmb_opt_group(X=X_train, y=y_train, groups=groups, group_costs=group_costs, normalized_costs=normalized_costs, budget=20, n=10, m=5000)

print(l_opt)
print(l_opt_group)

0.21224175566366077
4.198612604606511


# Feature selection

### Traditional

In [None]:
# Traditional run: the selector is called without an `lmb` argument.
features_trad_a1, costs_trad_a1 = find_best_features_subset(
    X_train, y_train, groups, group_costs, budget=BUDGET
)

# Keep the returned feature order and the matching cost sequence side by side.
df_traditional = pd.DataFrame(
    dict(feature_order=features_trad_a1, total_cost=costs_trad_a1)
)

### Cost-constrained (CC-SFS)

In [None]:
# CC-SFS run: same selector function, this time with `lmb=l_opt` supplied.
features_cs_sfs, costs_cs_sfs = find_best_features_subset(
    X_train, y_train, groups, group_costs, budget=BUDGET, lmb=l_opt
)

# Keep the returned feature order and the matching cost sequence side by side.
df_cc_sfs = pd.DataFrame(
    dict(feature_order=features_cs_sfs, total_cost=costs_cs_sfs)
)

### Cost-constrained (CC-GFS)

In [None]:
# Value forwarded as `tau=` to the group-level selector.
TAU = 0.8

# CC-GFS run: group-level selector called with `tau=TAU` and `lmb=l_opt_group`.
features_cs_gfs, costs_cs_gfs = find_best_group_subset(
    X_train, y_train, groups, group_costs, budget=BUDGET, tau=TAU, lmb=l_opt_group
)

# Keep the returned feature order and the matching cost sequence side by side.
df_cc_gfs = pd.DataFrame(
    dict(feature_order=features_cs_gfs, total_cost=costs_cs_gfs)
)


# Results

### Plot settings

In [None]:
# Display properties for each selection method, keyed by the method id that is
# also used for the `scores` and `results_df` dictionaries.
dict_desc_names = dict(traditional='Traditional', cc_sfs='CC-SFS', cc_gfs='CC-GFS')
dict_colors = dict(traditional='red', cc_sfs='blue', cc_gfs='green')
dict_linestyle = dict(traditional='solid', cc_sfs='dashed', cc_gfs='dashed')

# Bundle the three lookups into the single settings dict handed to plot_scores.
plot_settings = {
    'names': dict_desc_names,
    'colors': dict_colors,
    'linestyles': dict_linestyle,
}
plot_settings

### Scoring

In [None]:
# Score each method's feature ordering with the same train/test split.
# Only the `feature_order` column differs between the three calls.
scores = {}
for method, selection in (
    ('traditional', df_traditional),
    ('cc_sfs', df_cc_sfs),
    ('cc_gfs', df_cc_gfs),
):
    scores[method] = score(
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
        feature_order=selection['feature_order'],
    )

In [None]:
# For every method, pair each selection step with its score and keep the best
# score observed at each distinct total cost.
results_df = {}
for method, selection in (
    ('traditional', df_traditional),
    ('cc_sfs', df_cc_sfs),
    ('cc_gfs', df_cc_gfs),
):
    per_step = pd.DataFrame({
        'feature_order': selection['feature_order'],
        'total_cost': selection['total_cost'],
        'score': scores[method],
    })
    # The string 'max' selects the groupby max reduction explicitly; passing the
    # builtin `max` is deprecated in recent pandas and emits a FutureWarning.
    results_df[method] = (
        per_step
        .groupby('total_cost', as_index=False)
        .agg({'score': 'max'})
    )

### Plots

In [None]:
# Budget value handed to the plot. 68 is 10% of TOTAL_COST (680) but differs from
# BUDGET (100) used for the selection runs.
# NOTE(review): confirm which budget the plot is meant to show.
PLOT_BUDGET = 68

plot_scores(
    results_df=results_df,
    budget=PLOT_BUDGET,
    total_cost=TOTAL_COST,
    plot_settings=plot_settings
)