In [22]:
# install imbalance lib
! pip install imbalanced-learn==0.5.0



In [23]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier

## Read in data
There are three versions of the data: the first (A) has all variables with >5% missing values removed, the second (B) is imputed from the most recent complete data point prior to that birth, and the third (C) is imputed using mode values.

Derived variables are:
- _cohort – Either 1 (born in the first deriving cohort) or 0 (in the second, testing cohort)
- _hie – 1 for HIE, 0 for not
- _id
- _lapgar – 1 for a low Apgar score, 0 for not
- _ne – Another measure of brain injury (not used at present)
- _neonataldeath – Not used at present
- _perinataldeath – 1 for perinatal death; 0 for not
- _resus – 1 for resus at birth, and 0 for not
- _stillborn – Not used at present
- _yearofbirth -  Year of birth

First letter is either a (antenatal), g (growth) or i (intrapartum) variable
Second letter is the type of entry: c (categorical), o (ordinal) or l (linear)
Then _NAME (most have one given)
Then _#### – the number indicating where in the [Variable File](3.%20Index_Variable%20File_304.2ADV3A.pdf) the extraction was performed

In [24]:
# Read in data from DO: load the Stata (.dta) file into a pandas DataFrame.
# NOTE(review): the "A" in the file name presumably refers to dataset (A)
# from the "Read in data" notes (variables with >5% missing removed) — confirm.
dat = pd.read_stata("data/1_2_3_4A._Done.dta")

In [25]:
# Collect the unordered categorical columns: names that do not start with "_"
# (the derived variables) and whose second character is "c".
# Slicing (col[1:2]) rather than indexing (col[1]) means a one-character
# column name is simply skipped instead of raising IndexError.
categorical = [
    col for col in dat.columns
    if not col.startswith("_") and col[1:2] == "c"
]

# Convert the unordered categoricals to dummy (one-hot) columns in one call.
# The original columns are dropped and the dummies, named <column>_<level>,
# are appended after the remaining columns in the order of `categorical`.
# Doing this in a single call avoids re-copying the whole frame for every
# categorical column, which a concat/drop loop would do.
if categorical:
    dat = pd.get_dummies(dat, columns=categorical)

In [26]:
# Group the column names by stage (first character) and by variable type
# (second character). Columns starting with "_" are skipped.
antenatal, antenatal_growth, antenatal_intrapartum = [], [], []
categorical, ordinal, linear = [], [], []

# first character -> every stage list the column is added to
stage_targets = {
    "a": (antenatal, antenatal_growth, antenatal_intrapartum),
    "g": (antenatal_growth,),
    "i": (antenatal_intrapartum,),
}
# second character -> variable-type list
type_targets = {"c": categorical, "o": ordinal, "l": linear}

for col in dat.columns:
    if col[0] == "_":
        continue
    for bucket in stage_targets.get(col[0], ()):
        bucket.append(col)
    if col[1] in type_targets:
        type_targets[col[1]].append(col)

In [27]:
# split test and train on the `_cohort` flag (1 -> train, 0 -> test)
is_train = dat['_cohort'] == 1
is_test = dat['_cohort'] == 0
train = dat[is_train]
test = dat[is_test]

## mean and SD of the linear variables, taken from the **training** dataset
## only; they are used further down to standardise both train and test
desc = train[linear].describe()
stats_by_feature = desc.T
means, stds = (np.array(stats_by_feature[stat]) for stat in ('mean', 'std'))

def split_data(df, x_cols, y_col):
    """Extract complete-case predictors and outcome from ``df``.

    Selects the columns ``x_cols`` plus ``y_col``, drops every row that has a
    missing value in any of them, and separates the outcome column from the
    predictors. ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    x_cols : list of str
        Predictor column names (a list, as it is concatenated with ``[y_col]``).
    y_col : str
        Outcome column name.

    Returns
    -------
    tuple
        ``(x, y)``: the predictor frame and the outcome column, restricted to
        the rows without missing values.
    """
    complete = df[x_cols + [y_col]].dropna(axis='index')
    return complete.drop(columns=y_col), complete[y_col]

def resample(train_x, train_y):
    """Over-sample the minority class of the training data with SMOTE.

    Parameters
    ----------
    train_x : pandas.DataFrame
        Training predictors; its column names are re-applied to the result.
    train_y : pandas.Series
        Training outcome; its ``name`` becomes the output column name.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(x_resampled, y_resampled)``; the outcome is returned as a
        single-column DataFrame.
    """
    # fixed seed keeps the synthetic samples reproducible
    sampler = SMOTE(random_state=0)
    x_res, y_res = sampler.fit_resample(train_x, train_y)

    # re-wrap the sampler output with the original labels
    x_frame = pd.DataFrame(x_res, columns=train_x.columns)
    y_frame = pd.DataFrame(y_res, columns=[train_y.name])
    return x_frame, y_frame

def rf_feature_select_threshold(X, y, threshold=0.01):
    """Select the columns of ``X`` whose random-forest importance exceeds ``threshold``.

    Fits a 100-tree ``RandomForestClassifier`` (seed 0) on ``X``/``y`` and
    prints the ten most important predictors as a side effect.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictors.
    y : pandas.Series or pandas.DataFrame
        Outcome; its values are flattened to one dimension before fitting.
    threshold : float, default 0.01
        Columns with importance strictly greater than this are kept.

    Returns
    -------
    pandas.Index
        Names of the selected columns.
    """
    forest = RandomForestClassifier(random_state=0, n_estimators=100)
    forest.fit(X.values, y.values.ravel())
    importances = forest.feature_importances_

    # show the ten strongest predictors
    ranking = pd.DataFrame(data={'predictor' : X.columns, 'feature_importance': importances})
    top_ten = ranking.sort_values('feature_importance', ascending=False).head(10)
    print(top_ten)

    selected = importances > threshold
    return X.columns[selected]

def process_data(df, numeric_features, means, stds):
    """Standardise the continuous columns of ``df`` to z-scores.

    The input frame is left untouched and a standardised copy is returned.
    Callers pass in column slices of other frames, so assigning into ``df``
    directly would mutate the caller's data and trigger pandas'
    SettingWithCopyWarning.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to standardise.
    numeric_features : sequence of str
        Names of the continuous columns. Names not present in ``df`` are
        skipped.
    means, stds : array-like
        Mean and standard deviation per feature, aligned by position with
        ``numeric_features`` (``means[i]`` belongs to ``numeric_features[i]``).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with each present numeric feature replaced by
        ``(value - mean) / std``.
    """
    # work on a copy so the caller's frame is not modified
    df = df.copy()

    # normalise continuous variables; `i` indexes means/stds by the feature's
    # position in numeric_features, not by its position in df
    for i, f in enumerate(numeric_features):
        if f in df.columns:
            df[f] = (df[f] - means[i]) / stds[i]

    return df

def get_feature_importance(pred, out):
    """Report five-fold CV performance of a random forest and return its feature importances.

    Parameters
    ----------
    pred : pandas.DataFrame
        Predictors.
    out : pandas.Series or single-column pandas.DataFrame
        Outcome; flattened to one dimension before fitting.

    Returns
    -------
    pandas.DataFrame
        One row per predictor with columns ``predictor`` and
        ``feature_importance``, taken from a forest fitted on all of ``pred``.
    """
    # cross_val_score is not imported at the top of the notebook; import it
    # here so the function does not fail with NameError.
    from sklearn.model_selection import cross_val_score

    # flatten so a single-column DataFrame outcome is accepted as well as a
    # Series (same handling as rf_feature_select_threshold)
    target = out.to_numpy().ravel()

    # fit RF with all variables using five-fold CV
    clf = RandomForestClassifier(random_state=0, n_estimators=100)
    scores = cross_val_score(clf, pred.to_numpy(), target, cv=5, scoring='roc_auc')
    # report the CV result instead of silently discarding it
    print("5-fold CV ROC AUC: {:.3f} (+/- {:.3f})".format(scores.mean(), scores.std()))

    # get feature importance measures from a fit on the full data
    clf.fit(pred, target)
    fi = pd.DataFrame(data={'predictor' : pred.columns, 'feature_importance': clf.feature_importances_})

    return fi

In [28]:
# feature-set name -> list of predictor columns
feature_sets = {
    "antenatal" : antenatal,
    "antenatal_growth" : antenatal_growth,
    "antenatal_intrapartum" : antenatal_intrapartum,
}
outcomes = ['_hie', '_lapgar', '_perinataldeath', '_resus']

# one train/test CSV pair is written per (feature set, outcome) combination
for name, data in feature_sets.items():
    for outcome in outcomes:

        print("Working on {} for {}".format(name, outcome))

        ## complete-case predictors / outcome for each cohort
        train_x, train_y = split_data(train, data, outcome)
        test_x, test_y = split_data(test, data, outcome)

        # over-sample the training data only
        train_x, train_y = resample(train_x, train_y)

        ## feature selection using RF (fitted on the resampled training data);
        ## the same columns are then kept in the test data
        keep = rf_feature_select_threshold(train_x, train_y)
        train_x, test_x = train_x[keep], test_x[keep]

        # normalise continuous values with the training means / SDs
        train_x = process_data(train_x, linear, means, stds)
        test_x = process_data(test_x, linear, means, stds)

        ## write to csv
        train_path = "data/{}{}_train.csv".format(name, outcome)
        pd.concat([train_x, train_y], axis=1).to_csv(train_path, header=True)
        test_path = "data/{}{}_test.csv".format(name, outcome)
        pd.concat([test_x, test_y], axis=1).to_csv(test_path, header=True)

Working on antenatal for _hie
                        predictor  feature_importance
813       ac_mabnormalhr_0495_1.0            0.030856
1628  ac_presentationcat_1324_1.0            0.023957
1675               ac_race_0303_1            0.018321
408               ac_fhx_0433_0.0            0.016822
399               ac_fhx_0432_0.0            0.012723
812       ac_mabnormalhr_0495_0.0            0.012607
854       ac_mbloodgroup_0344_1.0            0.012422
1693              ac_sex_0554_1.0            0.011449
1588    ac_presentation_1321_11.0            0.011108
820       ac_mbirthplace_0278_1.0            0.010989
Working on antenatal for _lapgar
                         predictor  feature_importance
1588     ac_presentation_1321_11.0            0.019718
854        ac_mbloodgroup_0344_1.0            0.019223
813        ac_mabnormalhr_0495_1.0            0.018526
54    ac_antenataldisease_0307_0.0            0.014581
812        ac_mabnormalhr_0495_0.0            0.013970
1628   ac_pre