In [1]:
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from pulp import *

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
### Read in dataset
df_features = pd.read_csv('./data/all_uti_features.csv')
df_features.head()

Unnamed: 0,example_id,demographics - age,demographics - is_white,demographics - is_veteran,micro - prev resistance LVX 14,micro - prev resistance AMP 14,micro - prev resistance CFZ 14,micro - prev resistance NIT 14,micro - prev resistance GEN 14,micro - prev resistance SXT 14,...,selected micro - colonization pressure NIT 90 - overall,selected micro - colonization pressure OXA 90 - overall,selected micro - colonization pressure PEN 90 - overall,selected micro - colonization pressure SAM 90 - overall,selected micro - colonization pressure SXT 90 - overall,selected micro - colonization pressure TET 90 - overall,selected micro - colonization pressure TZP 90 - overall,selected micro - colonization pressure VAN 90 - overall,is_train,uncomplicated
0,0,54,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.24,0.56,0.49,0.38,0.26,0.37,0.06,0.13,0,0
1,1,27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.19,0.78,0.83,0.33,0.2,0.35,0.18,0.13,1,0
2,2,89,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.21,0.66,0.51,0.5,0.25,0.37,0.05,0.12,1,0
3,3,49,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.23,0.47,0.5,0.25,0.26,0.35,0.06,0.12,1,0
4,4,21,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.24,0.5,0.49,0.33,0.25,0.37,0.05,0.1,1,1


In [7]:
df_features.shape

(116902, 791)

In [6]:
### Playground
for c in df_features.columns:
    print(c)

example_id
demographics - age
demographics - is_white
demographics - is_veteran
micro - prev resistance LVX 14
micro - prev resistance AMP 14
micro - prev resistance CFZ 14
micro - prev resistance NIT 14
micro - prev resistance GEN 14
micro - prev resistance SXT 14
micro - prev resistance TET 14
micro - prev resistance AMC 14
micro - prev resistance CIP 14
micro - prev resistance SAM 14
micro - prev resistance ERY 30
micro - prev resistance OXA 30
micro - prev resistance QUD 30
micro - prev resistance RIF 30
micro - prev resistance AMP 30
micro - prev resistance CFZ 30
micro - prev resistance PEN 30
micro - prev resistance LVX 30
micro - prev resistance CLI 30
micro - prev resistance DOX 30
micro - prev resistance NIT 30
micro - prev resistance TET 30
micro - prev resistance CIP 30
micro - prev resistance CRO 30
micro - prev resistance GEN 30
micro - prev resistance SXT 30
micro - prev resistance AMC 30
micro - prev resistance ATM 30
micro - prev resistance PIP 30
micro - prev resistan

In [40]:
df_features.shape

(116902, 791)

In [41]:
df_labels = pd.read_csv('./data/all_uti_resist_labels.csv')
print(df_labels.shape)
df_labels.head()

(116902, 7)


Unnamed: 0,example_id,NIT,SXT,CIP,LVX,is_train,uncomplicated
0,0,0.0,0.0,0.0,0.0,0,0
1,1,0.0,0.0,0.0,0.0,1,0
2,2,0.0,0.0,1.0,1.0,1,0
3,3,0.0,0.0,0.0,0.0,1,0
4,4,0.0,0.0,0.0,0.0,1,1


In [61]:
feature_cols = [col for col in df_features.columns if col not in ('example_id', 'is_train', 'uncomplicated')]

In [62]:
df = (df_labels
 .merge(df_features, how='inner', on=['example_id', 'is_train', 'uncomplicated'])
 .query("uncomplicated == 1", engine='python')
)
df_train = df.query("is_train == 1", engine='python')
df_test = df.query("is_train == 0", engine='python')
print(f"Training set: {df_train.shape}")
print(f"Test set: {df_test.shape}")


Training set: (11865, 795)
Test set: (3941, 795)


In [63]:
def gbm(X_train, y_train, X_test, y_test):
    """
    Trains a gbm with standard hyperparamters and returns AUROC on test set.
    """
    # Instantiate GBM
    gbm = lgb.LGBMClassifier(objective='binary',
                             n_estimators=1000,
                             learning_rate=0.1,
                             num_leaves=2)

    
    # Fit model with early stopping
    gbm.fit(X_train,
            y_train,
            eval_set= [(X_test, y_test)],
            eval_metric = 'binary',
            early_stopping_rounds = 20,
            verbose=False)
    
    boosting_rounds = gbm.best_iteration_
    preds = gbm.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, preds)
    print(f"Num Boosting Rounds: {boosting_rounds} AUC: {round(auc, 2)}")
    return gbm, preds


for label in ['NIT', 'SXT', 'CIP', 'LVX']:
    print(label)
    X_train, y_train = df_train[feature_cols], df_train[label]
    X_test, y_test = df_test[feature_cols], df_test[label]
    clf, preds = gbm(X_train, y_train, X_test, y_test)
    df_test[f"{label}_predictions"] = [p for p in preds]

NIT
Num Boosting Rounds: 92 AUC: 0.57
SXT


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Num Boosting Rounds: 168 AUC: 0.6
CIP


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Num Boosting Rounds: 56 AUC: 0.62
LVX


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Num Boosting Rounds: 50 AUC: 0.61


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [64]:
df_prescriptions = pd.read_csv('./data/all_prescriptions.csv')
print(df_prescriptions.shape)
df_prescriptions.head()

(15806, 3)


Unnamed: 0,example_id,prescription,is_train
0,4,CIP,1
1,31,NIT,0
2,34,SXT,1
3,36,NIT,0
4,48,LVX,1


In [65]:
df_test = (df_test
    .merge(df_prescriptions, how='left', on=['example_id', 'is_train'])
)
df_test.head()

Unnamed: 0,example_id,NIT,SXT,CIP,LVX,is_train,uncomplicated,demographics - age,demographics - is_white,demographics - is_veteran,...,selected micro - colonization pressure SAM 90 - overall,selected micro - colonization pressure SXT 90 - overall,selected micro - colonization pressure TET 90 - overall,selected micro - colonization pressure TZP 90 - overall,selected micro - colonization pressure VAN 90 - overall,NIT_predictions,SXT_predictions,CIP_predictions,LVX_predictions,prescription
0,31,0.0,0.0,0.0,0.0,0,1,19,1.0,0.0,...,0.38,0.25,0.35,0.05,0.1,0.103699,0.149995,0.038274,0.037358,NIT
1,36,0.0,0.0,0.0,0.0,0,1,26,0.0,0.0,...,0.37,0.26,0.35,0.06,0.11,0.109622,0.22833,0.038274,0.037358,NIT
2,55,0.0,1.0,1.0,1.0,0,1,20,0.0,0.0,...,0.37,0.26,0.35,0.06,0.1,0.172013,0.297219,0.205737,0.192679,NIT
3,71,0.0,0.0,0.0,0.0,0,1,36,0.0,0.0,...,0.38,0.26,0.34,0.06,0.08,0.117121,0.221369,0.035283,0.035168,SXT
4,86,0.0,0.0,0.0,0.0,0,1,50,0.0,0.0,...,0.37,0.24,0.34,0.06,0.1,0.124062,0.198344,0.036835,0.036721,SXT


In [74]:
class AbxDecisionMaker():

    def __init__(self, df, abx_settings):
        self.df = df
        self.abx_settings = abx_settings
        self.abx_options = ['NIT', 'SXT', 'CIP', 'LVX']
        self.n = len(df)

    def compute_was_not_covered(self, x, decision_column='prescription'):
        """
        Given administered antibiotic, return whether antibiotic covered pateint
        """
        if decision_column == 'prescription':
            med_description = x.prescription
        elif decision_column == 'ip_prescription':
            med_description = x.ip_prescription 
        else:
            raise
        return x[med_description]

    def get_coverage_rates(self, df=None):
        """
        Create flag for whether clinicians covered the patient during the csn, whether
        a random assignemnt covered patient CSN, and whether optimized assignment covered
        the patient CSN
        """
        if df is None:
            df = self.df

        df = (df
            .assign(was_covered_dr=df.apply(lambda x: self.compute_was_not_covered(x), axis=1))
            .assign(was_covered_ip=df.apply(lambda x: self.compute_was_not_covered(x, 
                                            decision_column='ip_prescription'),
                                            axis=1))
        )

        clin_miss_rate = df['was_covered_dr'].sum() / len(df)
        ip_miss_rate = df['was_covered_ip'].sum() / len(df)
        
        return clin_miss_rate, ip_miss_rate

    def solve_and_assign(self):

        # Predictions string
        predictions_string = '%s_predictions'
        abx_model = LpProblem("Antibiotics", LpMinimize)

        # Create binary indicators for whether treatment is used
        drug_inds = {}
        for abx in self.abx_options:
            drug_inds[abx] = [LpVariable('%s_%d' % (abx, i), lowBound=0, upBound=1, cat='Binary')
                              for i in range(len(self.df))]

        # Add objective function to model
        per_csn_sum = []
        for i in range(len(self.df)):
            _sum = 0
            for abx in self.abx_options:
                _sum += drug_inds[abx][i] * self.df[predictions_string % abx].values[i]
            per_csn_sum.append(_sum)
            
        abx_model += lpSum(per_csn_sum)

        # Add one selection constraint
        for i in range(len(self.df)):
            selections = []
            for abx in self.abx_options:
                selections.append(drug_inds[abx][i])
            abx_model += lpSum(selections) == 1

        for drug in drug_inds:
            abx_model += lpSum([drug_inds[drug][i] for i in range(len(self.df))]) == self.abx_settings[drug]

        # Solve model
        abx_model.solve()

        # Save selected antibiotic to df
        abx_decisions = []
        for i in range(len(self.df)):
            abx_decision = None
            for abx in self.abx_options:
                if drug_inds[abx][i].value() == 1:
                    abx_decision = abx
            assert abx_decision is not None
            abx_decisions.append(abx_decision)
        self.df['ip_prescription'] = abx_decisions


In [75]:
(df_test
.groupby('prescription')
.agg(num_allocations=('example_id', 'nunique'))
)

Unnamed: 0_level_0,num_allocations
prescription,Unnamed: 1_level_1
CIP,1282
LVX,41
NIT,1358
SXT,1260


In [83]:
abx_settings = {
    'CIP' : 1282, # 1282
    'LVX' : 41, 
    'NIT' : 1358,
    'SXT' : 1260
}
opt = AbxDecisionMaker(df_test, abx_settings)
opt.solve_and_assign()

In [84]:
clin_miss_rate, ip_miss_rate = opt.get_coverage_rates()
print(f"Clinician miss rate: {clin_miss_rate}")
print(f"Optimized miss rate: {ip_miss_rate}")

Clinician miss rate: 0.11925907130170008
Optimized miss rate: 0.09642222786094899


In [85]:
round((clin_miss_rate - ip_miss_rate) / clin_miss_rate * 100, 2)

19.15

In [86]:
df_test[['CIP', 'LVX', 'NIT', 'SXT']].mean()

CIP    0.064197
LVX    0.064704
NIT    0.109617
SXT    0.195636
dtype: float64