In [1]:
# install imbalance lib
! pip install -q imbalanced-learn==0.5.0

In [2]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier

In [3]:
def resample(train_x, train_y):
    """Over-sample the minority class of a training set with SMOTE.

    Parameters
    ----------
    train_x
        Predictors; its ``.columns`` are reused to label the resampled
        predictors (so a pandas DataFrame is expected).
    train_y
        Outcome; its ``.name`` becomes the single column label of the
        resampled outcome (so a pandas Series is expected).

    Returns
    -------
    tuple
        ``(resampled_x, resampled_y)``, both wrapped as pandas DataFrames.
    """
    # fixed random_state so repeated runs generate the same synthetic samples
    oversampler = SMOTE(random_state=0)
    x_balanced, y_balanced = oversampler.fit_resample(train_x, train_y)

    # re-wrap the sampler output so the original column labels are kept
    return (
        pd.DataFrame(x_balanced, columns=train_x.columns),
        pd.DataFrame(y_balanced, columns=[train_y.name]),
    )

def rf_feature_select_threshold(X, y, threshold=0.01):
    """Select the columns of ``X`` whose random-forest importance exceeds ``threshold``.

    A ``RandomForestClassifier`` with 100 trees and ``random_state=0`` is
    fitted on ``X`` against ``y``; a column is kept when its
    ``feature_importances_`` value is strictly greater than ``threshold``.

    Parameters
    ----------
    X
        Predictors; ``.values`` and ``.columns`` are read (a pandas
        DataFrame is expected).
    y
        Outcome; ``.values.ravel()`` is used, so the target is flattened
        to one dimension before fitting.
    threshold : float, default 0.01
        Minimum (exclusive) feature importance for a column to be kept.

    Returns
    -------
    The entries of ``X.columns`` that passed the threshold.
    """
    clf = RandomForestClassifier(random_state=0, n_estimators=100)
    # flatten y to 1-D for fitting
    clf.fit(X.values, y.values.ravel())
    # boolean mask over the columns, in the same order as X.columns
    return X.columns[clf.feature_importances_ > threshold]

def standardize_continuous_values(df, continuous_features, means, stds):
    """Return a copy of ``df`` with its continuous columns standardised.

    Each feature in ``continuous_features`` that is present in ``df`` is
    replaced by ``(value - mean) / std``. Features that are not columns of
    ``df`` are skipped, and all other columns are returned unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to standardise. It is NOT modified; a copy is returned, so the
        function is safe to call on a slice of another frame.
    continuous_features : sequence of str
        Names of the continuous columns.
    means, stds : sequence of float
        Mean and standard deviation per feature, indexed by position so that
        ``means[i]`` / ``stds[i]`` belong to ``continuous_features[i]``.

    Returns
    -------
    pandas.DataFrame
        The standardised copy of ``df``.
    """
    # work on a copy: assigning into the caller's frame mutates its data and
    # raises SettingWithCopyWarning when ``df`` is a slice of another frame
    standardized = df.copy()
    for i, feature in enumerate(continuous_features):
        if feature in standardized.columns:
            # NOTE: a zero std (constant feature) is not special-cased here
            standardized[feature] = (standardized[feature] - means[i]) / stds[i]
    return standardized

In [None]:
## mean and SD of the linear + ordinal variables, taken from the **training** dataset only,
## used later to standardise variables
desc = train[linear + ordinal].describe()
# rows of the describe() table are the statistics; pull the two needed as plain arrays
means = np.array(desc.loc['mean'])
stds = np.array(desc.loc['std'])

In [None]:
# Build one train/test CSV pair for every (variable set, outcome) combination.
for name, variable_list in {"antenatal" : antenatal, "antenatal_growth" : antenatal_growth, "antenatal_intrapartum" : antenatal_intrapartum}.items():
    for outcome in ['_hie', '_lapgar', '_perinataldeath', '_resus']:
        print("Working on {} for {}".format(name, outcome))

        # select variables for this analysis
        train_x, train_y = split_data(train, variable_list, outcome)
        test_x, test_y = split_data(test, variable_list, outcome)

        # resample the minor class (training data only; the test set is left as-is)
        train_x, train_y = resample(train_x, train_y)

        # identify highly correlated features
        corr_matrix = train_x.corr().abs()
        # keep only the strict upper triangle (k=1) so every pair is looked at once
        # and the diagonal is ignored; the builtin `bool` replaces the `np.bool`
        # alias, which was deprecated in NumPy 1.20 and removed in 1.24
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
        to_drop = [column for column in upper.columns if (upper[column] > 0.8).any()]

        # drop one highly correlated feature from a pair
        print("Dropping correlated features: {}".format(to_drop))
        train_x = train_x.drop(to_drop, axis=1)
        test_x = test_x.drop(to_drop, axis=1)

        # feature selection using random forest (fitted on the resampled training data)
        keep = rf_feature_select_threshold(train_x, train_y)
        print("Selected features: {}".format(list(keep)))
        # explicit copies: the standardisation step below assigns columns, which
        # would raise SettingWithCopyWarning on a plain column subset
        train_x = train_x[keep].copy()
        test_x = test_x[keep].copy()

        # standardize continuous values (both sets use the training means / SDs)
        train_x = standardize_continuous_values(train_x, linear + ordinal, means, stds)
        test_x = standardize_continuous_values(test_x, linear + ordinal, means, stds)

        # write to csv
        pd.concat([train_x, train_y], axis=1).to_csv("data/{}{}_train.csv".format(name, outcome), header=True)
        pd.concat([test_x, test_y], axis=1).to_csv("data/{}{}_test.csv".format(name, outcome), header=True)