# Predicting Hypoxic Ischemic Encephalopathy from Collaborative Perinatal Project using ML

In [95]:
import pandas as pd
import numpy as np
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.linear_model import LogisticRegression
import tensorflow as tf
from sklearn.metrics import brier_score_loss, roc_curve, auc
sns.set(rc={'figure.figsize':(11.7,8.27)})
sns.set(style="white")

ModuleNotFoundError: No module named 'tensorflow'

In [96]:
def get_feature_importance(pred, out):
    """Fit a random forest on every predictor and return its feature importances.

    Parameters
    ----------
    pred : pandas.DataFrame
        Predictor columns; the column names label the rows of the result.
    out : pandas.Series or pandas.DataFrame
        Outcome values; flattened with ``.values.ravel()`` before fitting.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``pred`` (in column order) with the columns
        ``predictor`` and ``feature_importance``.
    """
    target = out.values.ravel()

    # The importances come from a single fit on the full data. An earlier
    # five-fold cross_val_score call was dropped: its scores were never read
    # or returned, so it only added five extra forest fits per call.
    clf = RandomForestClassifier(random_state=0, n_estimators=100)
    clf.fit(pred, target)

    fi = pd.DataFrame(data={'predictor' : pred.columns, 'feature_importance': clf.feature_importances_})

    return fi

In [84]:
def fit(clf, x_train, y_train, x_test, y_test):
    """Train ``clf``, score it on the test data and print ROC AUC and Brier loss.

    Parameters
    ----------
    clf : estimator exposing ``fit`` and ``predict_proba``
    x_train, y_train : training predictors and labels
    x_test, y_test : held-out predictors and labels

    Returns
    -------
    tuple
        ``(y_test, y_test_pred)`` where ``y_test_pred`` is column 1 of
        ``clf.predict_proba(x_test)``.
    """
    clf.fit(x_train, y_train)

    # column 1 of predict_proba is used as the score for both metrics
    positive_proba = clf.predict_proba(x_test)[:, 1]

    # discrimination: area under the ROC curve
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, positive_proba)
    roc_auc = auc(false_pos_rate, true_pos_rate)

    # probability accuracy: Brier score
    brier_loss = brier_score_loss(y_test, positive_proba)

    report = "ROC AUC: {}\nBrier loss: {}".format(np.round(roc_auc, 3), np.round(brier_loss, 3))
    print(report)

    return y_test, positive_proba

In [None]:
def fit_nn(model, x_train, y_train, x_test, y_test):
    """Compile and train ``model``, then print ROC AUC and Brier loss on the test data.

    The model is compiled with the Adam optimiser and binary cross-entropy
    loss, trained for 15 epochs, and scored on ``x_test``.

    Parameters
    ----------
    model : object exposing ``compile``, ``fit`` and ``predict``
        (presumably a Keras model - confirm against the caller)
    x_train, y_train : training predictors and labels
    x_test, y_test : held-out predictors and labels

    Returns
    -------
    The raw output of ``model.predict(x_test)``.
    """
    sizes = [len(part) for part in (x_train, y_train, x_test, y_test)]
    print("x_train n={} y_train n={} x_test n={} y_test n={}".format(*sizes))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['AUC'])

    # train for a fixed number of epochs
    model.fit(x_train, y_train, epochs=15)

    # predicted scores for the held-out data
    test_scores = model.predict(x_test)

    # discrimination: area under the ROC curve
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, test_scores)
    roc_auc = auc(false_pos_rate, true_pos_rate)

    # probability accuracy: Brier score
    brier_loss = brier_score_loss(y_test, test_scores)

    report = "ROC AUC: {}\nBrier loss: {}".format(np.round(roc_auc, 3), np.round(brier_loss, 3))
    print(report)

    return test_scores

In [85]:
def recursive_feature_elimination_rf(train_x, train_y, test_x, test_y):
    """Select predictors by cross-validated recursive feature elimination.

    A random forest is wrapped in ``RFECV`` (five-fold CV, ROC AUC scoring) and
    fitted on the training data. The mean CV score per number of features is
    plotted, and both predictor frames are reduced to the selected columns.

    Parameters
    ----------
    train_x, test_x : pandas.DataFrame
        Predictor frames; ``test_x`` must contain the columns of ``train_x``.
    train_y : pandas.Series or pandas.DataFrame
        Training outcome; flattened with ``.values.ravel()`` before fitting.
    test_y :
        Returned unchanged.

    Returns
    -------
    tuple
        ``(train_x, train_y, test_x, test_y)`` with both predictor frames
        restricted to the columns RFECV kept.
    """
    clf = RandomForestClassifier(random_state=0, n_estimators=100)
    rfecv = RFECV(estimator=clf, cv=5, scoring='roc_auc', n_jobs=-1)
    rfecv.fit(train_x, train_y.values.ravel())

    print("Optimal number of features : %d" % rfecv.n_features_)

    # Newer scikit-learn exposes the per-step scores via cv_results_; older
    # releases only have grid_scores_. Support both.
    cv_results = getattr(rfecv, "cv_results_", None)
    if cv_results is not None:
        mean_scores = cv_results["mean_test_score"]
    else:
        mean_scores = rfecv.grid_scores_

    # Plot number of features VS. cross-validation scores
    # (x axis 1..n matches the RFECV defaults used above: one feature dropped
    # per step, down to a single feature)
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (roc auc)")
    plt.plot(range(1, len(mean_scores) + 1), mean_scores)
    plt.show()

    # Resolve the kept column names once, from the ORIGINAL training frame.
    # Previously train_x was overwritten first and the full-length support_
    # mask was then applied to the already-reduced columns when subsetting
    # test_x, which fails or picks the wrong columns.
    selected_columns = train_x.columns[rfecv.support_]
    train_x = train_x[selected_columns]
    test_x = test_x[selected_columns]

    return train_x, train_y, test_x, test_y

In [None]:
def process_data(df, numeric_features, means, stds):
    """Standardise the numeric columns of ``df`` and return the values as an array.

    Each column ``numeric_features[i]`` is replaced by
    ``(column - means[i]) / stds[i]``; all other columns pass through as-is.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is NOT modified; normalisation happens on a copy, so
        calling this twice on the same frame gives the same result.
    numeric_features : sequence of str
        Names of the columns to normalise.
    means, stds : sequences indexable by position
        Centre and scale for each entry of ``numeric_features``, in the same order.

    Returns
    -------
    numpy.ndarray
        ``.values`` of the normalised frame.
    """
    # work on a copy so the caller's frame is not normalised in place
    normalised = df.copy()

    # normalise continuous variables
    for i, f in enumerate(numeric_features):
        normalised[f] = (normalised[f] - means[i]) / stds[i]

    return normalised.values

## Read in data
The first (A) has all variables with >5% missing values removed, the second (B) is imputed from the most recent complete data point prior to that birth, and the third (C) is imputed using mode values.

Derived variables are:
- _cohort – Either 1 (born in the first deriving cohort) or 0 (in the second, testing cohort)
- _hie – 1 for HIE, 0 for not
- _id
- _lapgar – 1 for a low Apgar score, 0 for not
- _ne – Another measure of brain injury (not used at present)
- _neonataldeath – Not used at present
- _perinataldeath – 1 for perinatal death; 0 for not
- _resus – 1 for resus at birth, and 0 for not
- _stillborn – Not used at present
- _yearofbirth -  Year of birth

First letter is the stage of the variable: a (antenatal), g (growth) or i (intrapartum)
Second letter is the type of entry: c (categorical), o (ordinal) or l (linear)
Then _NAME (most have one given)
Then _#### – the number indicating where the extraction was performed in the [Variable File](<3. Index_Variable File_304.2ADV3A.pdf>)

In [86]:
# read in data from DO
# (Stata .dta file loaded into a DataFrame; the path is relative to the
# notebook's working directory)
dat = pd.read_stata("data/1_2_3_4A._Done.dta")

In [87]:
# Separate the predictor columns by name prefix.
# First character: a = antenatal, g = growth, i = intrapartum.
# Second character: c = categorical, o = ordinal, l = linear.
antenatal, antenatal_growth, antenatal_intrapartum = [], [], []
categorical, ordinal, linear = [], [], []

# antenatal variables belong to all three predictor sets
stage_buckets = {
    "a": (antenatal, antenatal_growth, antenatal_intrapartum),
    "g": (antenatal_growth,),
    "i": (antenatal_intrapartum,),
}
type_buckets = {"c": categorical, "o": ordinal, "l": linear}

for col in dat.columns:
    # derived variables (leading underscore) are not predictors
    if col[0] == "_":
        continue
    for bucket in stage_buckets.get(col[0], ()):
        bucket.append(col)
    type_bucket = type_buckets.get(col[1])
    if type_bucket is not None:
        type_bucket.append(col)

In [88]:
# Cast the categorical predictors and the outcome flags to the category dtype.
outcomes = ['_hie', '_lapgar', '_perinataldeath', '_resus']
for cols in (categorical, outcomes):
    dat[cols] = dat[cols].astype('category')

In [89]:
# split test and train
# _cohort == 1 is the first (deriving) cohort and is used for training;
# _cohort == 0 is the second (testing) cohort and is held out.
test = dat[dat['_cohort'] == 0]
train = dat[dat['_cohort'] == 1]

def split_data(df, x_cols, y_col):
    """Return complete-case predictors and outcome from ``df``.

    Only the columns ``x_cols`` plus ``y_col`` are considered; any row with a
    missing value in one of them is dropped. ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
    x_cols : list of str
        Predictor column names.
    y_col : str
        Outcome column name.

    Returns
    -------
    tuple
        ``(x, y)`` - the predictor frame and the outcome series, sharing the
        same (filtered) index.
    """
    wanted = x_cols + [y_col]
    complete_cases = df[wanted].dropna(axis='index')
    # pop removes the outcome column from the frame and hands it back
    outcome = complete_cases.pop(y_col)
    return complete_cases, outcome

## Models

All(?) we need from the ML model is one which takes the outcomes (HIE (primary outcome), Low Apgar, Perinatal Death and Resus) and builds 3 prediction models. The first uses only antenatal variables (a*), the next uses antenatal and growth variables (a* and g*), and the last uses antenatal and intrapartum variables (a* and i*). For each model the idea is to produce a prediction score from cohort 1, apply it to cohort 0, derive an ROC/AUC score for the prediction, and create a variable recording which decile of risk (1-10) each infant is placed in, so that the results can be compared with the other models being developed.

### Antenatal

#### HIE

##### Feature selection

In [92]:
# split out data
# Antenatal predictors only, HIE outcome. split_data drops every row with a
# missing value in any of these columns.
train_x, train_y = split_data(train, antenatal, '_hie')
test_x, test_y = split_data(test, antenatal, '_hie')

# Recursive feature elimination using Random Forest classifier
# (currently disabled - uncomment to reduce both frames to the selected columns)
#train_x, train_y, test_x, test_y = recursive_feature_elimination_rf(train_x, train_y, test_x, test_y)

In [91]:
# preview the first rows only rather than rendering the whole test frame
test_x.head(10)

Unnamed: 0,ac_alcoholism_0211,ac_antenataldisease_0305,ac_antenataldisease_0307,ac_antenataldisease_0308,ac_antenataldisease_0309,ac_antenataldisease_0310,ac_antenataldisease_0311,ac_breech_1347,ac_breech_1348,ac_breech_1349,...,ao_mses_0294,ao_msescat_0296,ao_multiple_0010,ao_mweightgain_0320,ao_personssupported_0360,ao_personssupportedcat_0362,ao_plurality_0009,ao_prenatalvisits_0055,ao_priorhospitalization_0318,ao_smoking_0052
3,0.0,0.0,0.0,0.0,4.0,0.0,1.0,6.0,6.0,0.0,...,53.0,3.0,1.0,28.0,6.0,6.0,0.0,5.0,1.0,10.0
4,0.0,4.0,0.0,0.0,0.0,0.0,0.0,6.0,6.0,0.0,...,63.0,4.0,1.0,32.0,7.0,6.0,0.0,6.0,0.0,4.0
13,0.0,1.0,5.0,0.0,0.0,0.0,1.0,6.0,6.0,0.0,...,43.0,3.0,1.0,22.0,4.0,4.0,0.0,14.0,1.0,1.0
16,0.0,4.0,0.0,0.0,4.0,0.0,0.0,6.0,6.0,0.0,...,50.0,3.0,1.0,10.0,7.0,6.0,0.0,7.0,1.0,0.0
23,0.0,0.0,0.0,0.0,4.0,0.0,0.0,6.0,6.0,0.0,...,63.0,4.0,1.0,18.0,4.0,4.0,0.0,12.0,1.0,20.0
43,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6.0,6.0,0.0,...,53.0,3.0,1.0,16.0,7.0,6.0,0.0,12.0,1.0,0.0
47,0.0,2.0,6.0,0.0,0.0,2.0,1.0,6.0,6.0,0.0,...,35.0,2.0,1.0,15.0,6.0,6.0,0.0,12.0,4.0,60.0
80,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6.0,6.0,0.0,...,37.0,2.0,1.0,9.0,5.0,5.0,0.0,9.0,2.0,0.0
108,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,6.0,0.0,...,53.0,3.0,1.0,16.0,4.0,4.0,0.0,13.0,0.0,0.0
119,0.0,2.0,0.0,0.0,6.0,0.0,0.0,6.0,6.0,0.0,...,43.0,3.0,1.0,30.0,3.0,3.0,0.0,17.0,3.0,3.0


In [None]:
# Bar chart of the 50 predictors with the highest random-forest feature importance.
fi = get_feature_importance(train_x, train_y)
top_fi = fi.sort_values('feature_importance', ascending=False).head(50)
ax = sns.barplot(y='predictor', x="feature_importance", data=top_fi)

##### Logistic regression

In [None]:
# evaluate model
# Uses the train_x/train_y/test_x/test_y frames produced by split_data in the
# feature-selection cell; the previous x_train/y_train/x_test/y_test names
# were never defined and raised NameError on a fresh kernel.
clf = LogisticRegression(random_state=0, solver='lbfgs')
fit(clf, train_x, train_y, test_x, test_y)

##### Random forest

In [None]:
# evaluate model
# Uses the train_x/train_y/test_x/test_y frames produced by split_data in the
# feature-selection cell; the previous x_train/y_train/x_test/y_test names
# were never defined and raised NameError on a fresh kernel.
clf = RandomForestClassifier(random_state=0, n_estimators=100)
fit(clf, train_x, train_y, test_x, test_y)

##### Neural net

In [None]:
# evaluate model

## Low Apgar

## Perinatal Death

## Resus