In [1]:
# upgrade tensorflow
! pip install -q grpcio==1.24.3
! pip install -q tensorflow==2.0.0
! pip install -q adanet==0.8.0

In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals
SEED=123

import pandas as pd
import numpy as np
np.random.seed(SEED)
from sklearn.metrics import brier_score_loss, roc_curve, auc
import tensorflow as tf
tf.random.set_seed(SEED)
tf.get_logger().setLevel('ERROR')
import adanet

In [3]:
def make_input_fn(data_df, label_df, num_epochs=10, shuffle=True, batch_size=64):
    def input_function():
        ds = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
        if shuffle:
            ds = ds.shuffle(1000, seed=SEED)
        ds = ds.batch(batch_size).repeat(num_epochs)
        return ds
    return input_function

In [4]:
def get_metrics(dat):
    pred_dicts = list(dat)
    probs = pd.Series([pred['probabilities'][1] for pred in pred_dicts])
    
    # calculate roc auc metric
    fpr, tpr, thresholds = roc_curve(test_y, probs)
    roc_auc = auc(fpr, tpr)
    
    # calculate brier loss for probability accuracy
    brier_loss = brier_score_loss(test_y, probs)
            
    print("ROC AUC: {}\nBrier loss: {}".format(np.round(roc_auc, 3), np.round(brier_loss, 3)))

In [5]:
for data in ["antenatal", "antenatal_growth", "antenatal_intrapartum"]:    
    for outcome in ['_hie', '_lapgar', '_perinataldeath', '_resus']:
        
        print("Running adanet on {} for {}".format(data, outcome))
        
        # read in data
        train = pd.read_csv("data/{}{}_train.csv".format(data, outcome), index_col=0).astype('float32')
        test = pd.read_csv("data/{}{}_test.csv".format(data, outcome), index_col=0).astype('float32')
        train_y = train.pop(outcome)
        test_y = test.pop(outcome)
        
        # record feature columns
        feature_columns = []
        
        for feature_name in train.columns:
            feature_columns.append(tf.feature_column.numeric_column(feature_name, dtype=tf.float32))
        
        train_input_fn = make_input_fn(train, train_y)
        eval_input_fn = make_input_fn(test, test_y, num_epochs=1, shuffle=False)

        # linear classifier
        print("Linear classifier")
        clf = tf.estimator.LinearClassifier(
            feature_columns=feature_columns
        )
        clf.train(train_input_fn)
        get_metrics(clf.predict(eval_input_fn))
        
        # NN
        print("NN")
        clf = tf.estimator.DNNClassifier(
            feature_columns=feature_columns, 
            hidden_units=[100, 20, 10]
        )
        clf.train(train_input_fn)
        get_metrics(clf.predict(eval_input_fn))

        # BT
        print("BT")
        clf = tf.estimator.BoostedTreesClassifier(
            feature_columns=feature_columns,
            n_batches_per_layer=150
        )
        clf.train(train_input_fn)
        get_metrics(clf.predict(eval_input_fn))
        
        # ensemble
        print("ensemble")
        head = tf.estimator.BinaryClassHead()
        clf = adanet.AutoEnsembleEstimator(
            head=head,
            candidate_pool=lambda config: {
                "linear": tf.estimator.LinearClassifier(
                    feature_columns=feature_columns,
                    config=config
                ),
                "nn_20": tf.estimator.DNNClassifier(
                    feature_columns=feature_columns,
                    config=config,
                    hidden_units=['20']
                ),
                "nn_50": tf.estimator.DNNClassifier(
                    feature_columns=feature_columns,
                    config=config,
                    hidden_units=['50']
                ),
                "nn_100": tf.estimator.DNNClassifier(
                    feature_columns=feature_columns,
                    config=config,
                    hidden_units=['100']
                ),
                "nn_20_10": tf.estimator.DNNClassifier(
                    feature_columns=feature_columns,
                    config=config,
                    hidden_units=['20', '10']
                ),
                "nn_50_20": tf.estimator.DNNClassifier(
                    feature_columns=feature_columns,
                    config=config,
                    hidden_units=['50', '20']
                ),
                "nn_100_40": tf.estimator.DNNClassifier(
                    feature_columns=feature_columns,
                    config=config,
                    hidden_units=['100', '40']
                ),
                "nn_100_40_20": tf.estimator.DNNClassifier(
                    feature_columns=feature_columns,
                    config=config,
                    hidden_units=['100', '40', '20']
                ),
                "bt": tf.estimator.BoostedTreesClassifier(
                    feature_columns=feature_columns,
                    config=config,
                    n_batches_per_layer=150
                )
            },
            max_iteration_steps=100
        )

        clf.train(train_input_fn, steps=100)
        get_metrics(clf.predict(eval_input_fn))

Running adanet on antenatal for _hie
Linear classifier
ROC AUC: 0.655
Brier loss: 0.659
NN
ROC AUC: 0.628
Brier loss: 0.291
BT
ROC AUC: 0.56
Brier loss: 0.11
ensemble
ROC AUC: 0.692
Brier loss: 0.005
Running adanet on antenatal for _lapgar
Linear classifier
ROC AUC: 0.537
Brier loss: 0.838
NN
ROC AUC: 0.57
Brier loss: 0.232
BT
ROC AUC: 0.483
Brier loss: 0.118
ensemble
ROC AUC: 0.594
Brier loss: 0.028
Running adanet on antenatal for _perinataldeath
Linear classifier
ROC AUC: 0.617
Brier loss: 0.826
NN
ROC AUC: 0.648
Brier loss: 0.332
BT
ROC AUC: 0.494
Brier loss: 0.171
ensemble
ROC AUC: 0.71
Brier loss: 0.012
Running adanet on antenatal for _resus
Linear classifier
ROC AUC: 0.506
Brier loss: 0.86
NN
ROC AUC: 0.524
Brier loss: 0.251
BT
ROC AUC: 0.481
Brier loss: 0.17
ensemble
ROC AUC: 0.615
Brier loss: 0.048
Running adanet on antenatal_growth for _hie
Linear classifier
ROC AUC: 0.669
Brier loss: 0.536
NN
ROC AUC: 0.659
Brier loss: 0.188
BT
ROC AUC: 0.585
Brier loss: 0.137
ensemble
ROC AU