In [1]:
from tqdm import tqdm_notebook

import pandas as pd
import numpy as np

import os

from sksurv.ensemble import RandomSurvivalForest, GradientBoostingSurvivalAnalysis
from lifelines import CoxPHFitter, AalenAdditiveFitter, WeibullAFTFitter

import smote_variants as sv

from sklearn.feature_selection import RFECV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler

%matplotlib inline
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [2]:
features = []
for file in tqdm_notebook(os.listdir('examples/output')):
    features.append(pd.read_csv('examples/output/{}'.format(file), index_col=0))
features = pd.concat(features)

clin_features = ['id', 'channel', 'RecID', 'Gestation', 'Rectime', 'Age', 'Parity', 
                 'Abortions', 'Weight', 'Hypertension', 'Diabetes', 'Placental_position', 
                 'Bleeding_first_trimester', 'Bleeding_second_trimester', 'Funneling', 'Smoker']
features['Gestation'] = features['Gestation'].astype(float)
features['Rectime'] = features['Rectime'].astype(float)
features['TimeToBirth'] = features['Gestation'] - features['Rectime']

HBox(children=(IntProgress(value=0, max=298), HTML(value='')))




In [3]:
# Create a feature matrix & multiple target vectors (preterm <-> term, ttb < 7 <-> ttb >= 7, ...)
features[['Gestation', 'Rectime', 'Age', 'Parity', 'Abortions', 'Weight']] = features[['Gestation', 'Rectime', 'Age', 'Parity', 'Abortions', 'Weight']].replace(to_replace='None', value=np.NaN)

ids = set(features['id'])
channels = set(features['channel'])
joined_features = []
for _id in tqdm_notebook(ids):
    features_id = []
    features_filtered = features[features['id'] == _id]
    for channel in channels:
        channel_features = features_filtered[features_filtered['channel'] == channel]
        col_map = {}
        for col in channel_features:
            if col not in clin_features:
                col_map[col] = '{}_ch{}'.format(col, channel)
        channel_features = channel_features.rename(columns=col_map)
        features_id.append(channel_features)
    features_id = pd.concat(features_id, axis=1)
    joined_features.append(features_id)
joined_features = pd.concat(joined_features)
joined_features = joined_features.loc[:,~joined_features.columns.duplicated()]

joined_features = pd.get_dummies(joined_features, columns=['Hypertension', 'Diabetes', 'Placental_position', 'Bleeding_first_trimester', 'Bleeding_second_trimester', 'Funneling', 'Smoker'])
for col in ['Gestation', 'Rectime', 'Age', 'Parity', 'Abortions', 'Weight']:
    joined_features[col] = joined_features[col].fillna(joined_features[col].median())
    
for col in joined_features.columns[joined_features.isnull().sum() > 0]:
    joined_features[col] = joined_features[col].fillna(joined_features[col].mean())
    

term_preterm = joined_features['Gestation'] <= 37
ttb_10w = joined_features['TimeToBirth_ch3'] <= 10
ttb = joined_features['TimeToBirth_ch3']
feature_matrix = joined_features.drop(['TimeToBirth_ch3', 'TimeToBirth_ch2', 'TimeToBirth_ch1', 'Gestation', 
                                       'RecID', 'channel', 'id'], axis=1)

print(features.shape, joined_features.shape)

HBox(children=(IntProgress(value=0, max=298), HTML(value='')))


(894, 2125) (298, 6357)


In [4]:
# Util classes & functions for feature selection

def get_corr_features(X):
    """Get all coordinates in the X-matrix with correlation value equals 1
    (columns with equal values), excluding elements on the diagonal.

    Parameters:
    -----------
    - train_df: pd.DataFrame
        the feature matrix where correlated features need to be removed

    Returns
    -------
    - correlated_feature_pairs: list of tuples
        coordinates (row, col) where correlated features can be found
    """
    row_idx, col_idx = np.where(np.abs(X.corr()) > 0.95)
    self_corr = set([(i, i) for i in range(X.shape[1])])
    correlated_feature_pairs = set(list(zip(row_idx, col_idx))) - self_corr
    return correlated_feature_pairs


def get_uncorr_features(data):
    """Remove clusters of these correlated features, until only one feature 
    per cluster remains.

    Parameters:
    -----------
    - data: pd.DataFrame
        the feature matrix where correlated features need to be removed

    Returns
    -------
    - data_uncorr_cols: list of string
        the column names that are completely uncorrelated to eachother
    """
    X_train_corr = data.copy()
    correlated_features = get_corr_features(X_train_corr)

    corr_cols = set()
    for row_idx, col_idx in correlated_features:
        corr_cols.add(row_idx)
        corr_cols.add(col_idx)

    uncorr_cols = list(set(X_train_corr.columns) - set(X_train_corr.columns[list(corr_cols)]))
   
    col_mask = [False]*X_train_corr.shape[1]
    for col in corr_cols:
        col_mask[col] = True
    X_train_corr = X_train_corr.loc[:, col_mask]
  
    correlated_features = get_corr_features(X_train_corr)
    to_remove = set()
    for corr_row, corr_col in correlated_features:
        if corr_row in to_remove or corr_col in to_remove:
            continue

        for corr_row2, corr_col2 in correlated_features:
            if corr_row == corr_row2:
                to_remove.add(corr_col2)
            elif corr_row == corr_col2:
                to_remove.add(corr_row2)

    col_mask = [True]*X_train_corr.shape[1]
    for ix in to_remove:
        col_mask[ix] = False

    X_train_corr = X_train_corr.loc[:, col_mask]

    data_uncorr_cols = list(set(list(X_train_corr.columns) + uncorr_cols))

    return data_uncorr_cols

def remove_features(data):
    """Remove all correlated features and columns with only a single value.

    Parameters:
    -----------
    - data: pd.DataFrame
        the feature matrix where correlated features need to be removed

    Returns
    -------
    - useless_cols: list of string
        list of column names that have no predictive value
    """
    single_cols = list(data.columns[data.nunique() == 1])

    uncorr_cols = get_uncorr_features(data)
    corr_cols = list(set(data.columns) - set(uncorr_cols))

    useless_cols = list(set(single_cols + corr_cols))

    print('Removing {} features'.format(len(useless_cols)))

    return useless_cols

# print(sum(feature_matrix.std(axis=0) < 0.1))
# feature_matrix = feature_matrix.loc[:, feature_matrix.std(axis=0) >= 0.1]
useless_features = remove_features(feature_matrix)

feature_matrix = feature_matrix.drop(useless_features, axis=1)
feature_matrix.shape

Removing 2773 features


(298, 3577)

In [5]:
# Util classes & functions for feature selection
class PipelineRFE(Pipeline):

    def fit(self, X, y=None, **fit_params):
        super(PipelineRFE, self).fit(X, y, **fit_params)
        print(self)
#         self.feature_importances_ = self.steps[0][1].best_estimator_.steps[-1][1].classifier.coef_
        self.feature_importances = self.steps[1][1].classifier.coef_
        return self

In [6]:
X = feature_matrix.reset_index(drop=True)
y = feature_matrix['Rectime'] + ttb >= 37

In [17]:
import itertools

def tune_hyper_parameters(X, y, param_grid):
    best_params, best_features, best_score = None, None, 0
    combinations = list(itertools.product(*list(param_grid.values())))
    for combination in combinations:
        param_combination = dict(zip(param_grid.keys(), combination))
        skf = StratifiedKFold(n_splits=3, random_state=42)
        scores = []
        for fold_ix, (train_idx, test_idx) in enumerate(skf.split(X, y)):
            X_train = X[train_idx, :]
            X_test = X[test_idx, :]
            y_train = y.iloc[train_idx]
            y_test = y.iloc[test_idx]

            clf = sv.OversamplingClassifier(
                sv.SMOTE(proportion=param_combination['proportion'],
                         n_neighbors=param_combination['n_neighbors']),
                LogisticRegression(penalty=param_combination['penalty'],
                                   C=param_combination['C'])
            )
            clf.fit(X_train, y_train)

            prob_preds = clf.predict_proba(X_test)[:, 1]
            scores.append(roc_auc_score(y_test, prob_preds))
        
        agg_score = np.mean(scores)
        if agg_score > best_score:
            best_score = agg_score
            best_params = param_combination
            
    print(best_params, best_score)
    return best_params

import logging
logger = logging.getLogger('smote_variants')
logger.disabled = True

param_grid = {
    'C': [10**i for i in range(-3, 4)],
    'penalty': ['l1', 'l2'],
    'proportion': [0.5, 1.0, 1.5],
    'n_neighbors': [3, 5, 7],
}

skf = StratifiedKFold(n_splits=5, random_state=42)
for fold_ix, (train_idx, test_idx) in enumerate(skf.split(X, y)):
    X_train = X.iloc[train_idx, :]
    X_test = X.iloc[test_idx, :]
    y_train = y.iloc[train_idx]
    y_test = y.iloc[test_idx]

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    params = tune_hyper_parameters(X_train, y_train, param_grid)
    clf = sv.OversamplingClassifier(
        sv.SMOTE(proportion=params['proportion'],
                 n_neighbors=params['n_neighbors']),
        LogisticRegression(penalty=params['penalty'],
                           C=params['C'])
    )
    clf.fit(X_train, y_train)

    prob_preds = clf.predict_proba(X_test)[:, 1]
    preds = clf.predict(X_test)

    print('AUC = {} || Confusion Matrix ='.format(roc_auc_score(y_test, prob_preds)))
    print(confusion_matrix(y_test, preds))

{'C': 0.001, 'penalty': 'l2', 'proportion': 1.0, 'n_neighbors': 7} 0.6648378191856452
AUC = 0.6033653846153847 || Confusion Matrix =
[[ 7  1]
 [31 21]]
{'C': 0.001, 'penalty': 'l2', 'proportion': 0.5, 'n_neighbors': 3} 0.6428226363008972
AUC = 0.6754807692307693 || Confusion Matrix =
[[ 5  3]
 [17 35]]
{'C': 0.001, 'penalty': 'l2', 'proportion': 1.5, 'n_neighbors': 3} 0.6842443064182194
AUC = 0.4543269230769231 || Confusion Matrix =
[[ 4  4]
 [30 22]]
{'C': 0.001, 'penalty': 'l2', 'proportion': 1.0, 'n_neighbors': 5} 0.5263943785682916
AUC = 0.8214285714285714 || Confusion Matrix =
[[ 6  1]
 [18 34]]
{'C': 10, 'penalty': 'l1', 'proportion': 1.0, 'n_neighbors': 7} 0.59648033126294
AUC = 0.32692307692307687 || Confusion Matrix =
[[ 0  7]
 [ 6 46]]


In [10]:
"""SPY
{'C': 0.01, 'penalty': 'l2', 'threshold': 0.75, 'n_neighbors': 3}
AUC = 0.44711538461538464 || Confusion Matrix =
[[ 5  3]
 [30 22]]
 
{'C': 0.001, 'penalty': 'l2', 'threshold': 0.75, 'n_neighbors': 3}
AUC = 0.65625 || Confusion Matrix =
[[ 5  3]
 [17 35]]
 
{'C': 1, 'penalty': 'l1', 'threshold': 0.3, 'n_neighbors': 7}
AUC = 0.2764423076923077 || Confusion Matrix =
[[ 1  7]
 [ 4 48]]
 
{'C': 0.1, 'penalty': 'l1', 'threshold': 0.75, 'n_neighbors': 7}
AUC = 0.7197802197802198 || Confusion Matrix =
[[ 1  6]
 [ 2 50]]
 
{'C': 1, 'penalty': 'l1', 'threshold': 0.5, 'n_neighbors': 3}
AUC = 0.38736263736263743 || Confusion Matrix =
[[ 0  7]
 [ 6 46]]

"""

SyntaxError: invalid syntax (<ipython-input-10-1a0775922923>, line 1)

In [None]:
"""SMOTE
{'C': 0.001, 'penalty': 'l2', 'proportion': 1.0, 'n_neighbors': 3}
AUC = 0.6201923076923077 || Confusion Matrix =
[[ 7  1]
 [31 21]]
{'C': 0.001, 'penalty': 'l2', 'proportion': 0.5, 'n_neighbors': 3}
AUC = 0.65625 || Confusion Matrix =
[[ 4  4]
 [18 34]]
{'C': 0.001, 'penalty': 'l2', 'proportion': 1.5, 'n_neighbors': 5}
AUC = 0.4567307692307692 || Confusion Matrix =
[[ 4  4]
 [29 23]]
{'C': 0.001, 'penalty': 'l2', 'proportion': 1.5, 'n_neighbors': 5}
AUC = 0.8186813186813188 || Confusion Matrix =
[[ 6  1]
 [18 34]]
{'C': 0.1, 'penalty': 'l1', 'proportion': 1.0, 'n_neighbors': 3}
AUC = 0.39010989010989017 || Confusion Matrix =
[[ 0  7]
 [ 7 45]]

"""