In [1]:
# install imbalance lib
! pip install -q imbalanced-learn==0.5.0

In [2]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## Read in data
The first (A) is one with all variables with >5% missing values removed; the second (B) is imputed from the most recent complete data-point prior to that birth; and the third (C) is imputed using mode values.

Derived variables are:
- _cohort – Either 1 (born in the first deriving cohort) or 0 (in the second, testing cohort)
- _hie – 1 for HIE, 0 for not
- _id
- _lapgar – 1 for a low Apgar score, 0 for not
- _ne – Another measure of brain injury (not used at present)
- _neonataldeath – Not used at present
- _perinataldeath – 1 for perinatal death; 0 for not
- _resus – 1 for resus at birth, and 0 for not
- _stillborn – Not used at present
- _yearofbirth -  Year of birth

First letter is the variable group: a (antenatal), g (growth) or i (intrapartum).
Second letter is the type of entry: c (categorical), o (ordinal) or l (linear).
Then _NAME (most have one given).
Then _#### – the number identifying where the variable was extracted from in the [Variable File]("3. Index_Variable File_304.2ADV3A.pdf").

In [3]:
def split_data(df, x_cols, y_col):
    """Extract predictors and outcome from `df`, keeping complete rows only.

    Parameters
    ----------
    df : DataFrame to select from (not modified).
    x_cols : list of predictor column names.
    y_col : name of the outcome column.

    Returns
    -------
    (features, target) : the predictor DataFrame and the outcome Series,
        restricted to rows with no missing value in any selected column.
    """
    complete_rows = df[x_cols + [y_col]].dropna(axis='index')
    # pop() removes the outcome column from the frame and hands it back
    target = complete_rows.pop(y_col)
    return complete_rows, target

def resample(train_x, train_y):
    """Oversample the minority class of the training data with SMOTE.

    Parameters
    ----------
    train_x : DataFrame of predictors.
    train_y : named Series holding the outcome.

    Returns
    -------
    (x, y) : two DataFrames holding the resampled predictors (same column
        names as `train_x`) and the resampled outcome (single column named
        after `train_y`).
    """
    sampler = SMOTE(random_state=0)
    balanced_x, balanced_y = sampler.fit_resample(train_x, train_y)

    # re-wrap the sampler output so the original column labels are kept
    return (
        pd.DataFrame(balanced_x, columns=train_x.columns),
        pd.DataFrame(balanced_y, columns=[train_y.name]),
    )

def rf_feature_select_threshold(X, y, threshold=0.01):
    """Select features whose random-forest importance exceeds `threshold`.

    A RandomForestClassifier (100 trees, random_state=0) is fitted on the
    full data passed in, and the columns are filtered by the fitted
    `feature_importances_`.

    Parameters
    ----------
    X : DataFrame of predictors.
    y : Series or single-column DataFrame holding the outcome; it is
        flattened with `.values.ravel()` before fitting.
    threshold : keep only columns with importance strictly greater than this.

    Returns
    -------
    The subset of `X.columns` passing the threshold, in original order.
    """
    clf = RandomForestClassifier(random_state=0, n_estimators=100)
    clf = clf.fit(X.values, y.values.ravel())
    return X.columns[clf.feature_importances_ > threshold]

def standardize_continuous_values(df, continuous_features, means, stds):
    """Return a copy of `df` with its continuous columns z-score standardised.

    Parameters
    ----------
    df : DataFrame to standardise. It is NOT modified; a copy is returned.
    continuous_features : sequence of column names that may be standardised.
        Names not present in `df` are skipped.
    means, stds : sequences indexed positionally in step with
        `continuous_features` (entry i belongs to continuous_features[i]).

    Returns
    -------
    A new DataFrame where each present continuous column has been replaced
    by (value - mean) / std.
    """
    # Work on a copy: assigning into the caller's frame mutates their data
    # (making repeated calls standardise twice) and triggers
    # SettingWithCopyWarning when the caller passes a slice.
    standardized = df.copy()
    for i, f in enumerate(continuous_features):
        if f in standardized.columns:
            standardized[f] = (standardized[f] - means[i]) / stds[i]
    return standardized

In [4]:
# Read the Stata .dta file (the original note says it comes "from DO") into a DataFrame.
# NOTE(review): the "4A" in the file name presumably corresponds to dataset (A)
# in the markdown above (variables with >5% missing values removed) — confirm.
dat = pd.read_stata("data/1_2_3_4A._Done.dta")

In [5]:
# Collect the unordered-categorical columns: those whose second character is
# "c". Derived columns (leading underscore) are skipped before the second
# character is inspected.
categorical = [col for col in dat.columns if col[0] != "_" and col[1] == "c"]

# Convert unordered categoricals to dummy (one-hot) columns in one call.
# get_dummies(columns=...) drops each source column and appends its dummies,
# prefixed with the source column name, after the remaining columns -- the same
# layout as concatenating and dropping column by column, without re-copying the
# whole frame on every iteration.
if categorical:
    dat = pd.get_dummies(dat, columns=categorical)

In [6]:
# Group column names by the naming convention: the first character gives the
# variable group (a / g / i) and the second the entry type (c / o / l).
antenatal = []
antenatal_growth = []
antenatal_intrapartum = []
categorical = []
ordinal = []
linear = []

for column in dat.columns:
    group = column[0]

    # derived variables start with an underscore and are never predictors
    if group == "_":
        continue

    # antenatal variables belong to all three predictor sets; growth and
    # intrapartum variables only extend their own combined set
    if group == "a":
        antenatal.append(column)
        antenatal_growth.append(column)
        antenatal_intrapartum.append(column)
    elif group == "g":
        antenatal_growth.append(column)
    elif group == "i":
        antenatal_intrapartum.append(column)

    entry_type = column[1]
    if entry_type == "c":
        categorical.append(column)
    elif entry_type == "o":
        ordinal.append(column)
    elif entry_type == "l":
        linear.append(column)

In [7]:
# split test and train on the derived cohort flag (_cohort == 1 is used for
# training, _cohort == 0 for testing)
test = dat[dat['_cohort'] == 0]
train = dat[dat['_cohort'] == 1]

## get mean and SD for **training** dataset to standardise variables
desc = train[linear + ordinal].describe()
# means/stds are later looked up by position against `linear + ordinal`.
# describe() silently omits non-numeric columns, which would shift every
# following statistic onto the wrong feature; reindexing pins one entry per
# feature (NaN for any column describe() left out) so positions always match.
means = np.array(desc.T['mean'].reindex(linear + ordinal))
stds = np.array(desc.T['std'].reindex(linear + ordinal))

In [None]:
# Absolute pairwise correlation above which one feature of a pair is dropped.
CORRELATION_THRESHOLD = 0.8

# Build one train/test CSV pair for every (predictor set, outcome) combination.
for name, variable_list in {"antenatal" : antenatal, "antenatal_growth" : antenatal_growth, "antenatal_intrapartum" : antenatal_intrapartum}.items():
    for outcome in ['_hie', '_lapgar', '_perinataldeath', '_resus']:
        print("Working on {} for {}".format(name, outcome))
        
        # select variables for this analysis (rows with missing values are dropped)
        train_x, train_y = split_data(train, variable_list, outcome)
        test_x, test_y = split_data(test, variable_list, outcome)
        
        # resample the minor class (training data only)
        train_x, train_y = resample(train_x, train_y)
        
        # identify highly correlated features: keep only the strict upper
        # triangle of the absolute correlation matrix so each pair is seen once.
        # The builtin `bool` is used because the `np.bool` alias was removed
        # from NumPy.
        corr_matrix = train_x.corr().abs()
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
        to_drop = [column for column in upper.columns if any(upper[column] > CORRELATION_THRESHOLD)]
        
        # drop one highly correlated feature from a pair
        print("Dropping correlated features: {}".format(to_drop))
        train_x = train_x.drop(to_drop, axis=1)
        test_x = test_x.drop(to_drop, axis=1)

        # feature selection using random forest (fitted on the training data)
        keep = rf_feature_select_threshold(train_x, train_y)
        print("Selected features: {}".format(list(keep)))
        train_x = train_x[keep]
        test_x = test_x[keep]
        
        # standardize continuous values using the training-set means and SDs
        train_x = standardize_continuous_values(train_x, linear + ordinal, means, stds)
        test_x = standardize_continuous_values(test_x, linear + ordinal, means, stds)

        # write to csv
        pd.concat([train_x, train_y], axis=1).to_csv("data/{}{}_train.csv".format(name, outcome), header=True)
        pd.concat([test_x, test_y], axis=1).to_csv("data/{}{}_test.csv".format(name, outcome), header=True)

Working on antenatal for _hie
Dropping correlated features: ['al_magecat_0033', 'al_mweightfinal_0341', 'ao_fagecat_0376', 'ao_mdbp_0337', 'ao_meducationcat_0272', 'ao_mhctlow_0092', 'ao_mincomecat_0275', 'ao_msbp_0332', 'ao_msescat_0296', 'ao_mweightgain_0320', 'ao_personssupported_0360', 'ao_personssupportedcat_0362', 'ao_plurality_0009', 'ac_antenataldisease_0308_8.0', 'ac_antenataldisease_0309_8.0', 'ac_antenataldisease_0310_0.0', 'ac_antenataldisease_0310_8.0', 'ac_breech_1347_7.0', 'ac_breech_1348_0.0', 'ac_breech_1348_1.0', 'ac_breech_1348_6.0', 'ac_breech_1348_7.0', 'ac_breech_1350_0.0', 'ac_breech_1351_0.0', 'ac_breech_1351_1.0', 'ac_breech_1352_0.0', 'ac_breech_1352_7.0', 'ac_breech_1352_8.0', 'ac_cigperday_0054_1.0', 'ac_consang_0287_8.0', 'ac_consang_0288_0.0', 'ac_consang_0288_8.0', 'ac_consang_0290_0.0', 'ac_consang_0292_0.0', 'ac_consang_0292_1.0', 'ac_fatherathome_0373_0.0', 'ac_fatherathome_0373_1.0', 'ac_fatherathome_0381_0.0', 'ac_fatherathome_0381_1.0', 'ac_fhneurol