In [1]:
%matplotlib inline
import pandas as pd
import numpy as np

In [2]:
df_all = pd.read_csv("../data/tia-ed-sel-imp-2020-01-09.csv", index_col=0)

In [3]:
df_all.head()

Unnamed: 0,my_outcome,adj_outcome_is7day,adj_carotidoutcome_is7day,sex_female,age,temperature,hr_rate,sbp,dbp,sa02,...,my_vertigo_syncope,my_lang_speech,my_afib,img_abn_l,img_abn_r,uni_weakness_l,uni_weakness_r,aphasia,peter_flag,learn
0,0,0,0.0,1,53.0,36.299988,70.0,121.0,79.0,99.0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0.0,1,81.0,36.5,57.0,215.0,55.0,98.0,...,0,1,0,0,0,0,0,0,0,1
2,0,0,0.0,0,45.0,36.599976,66.0,151.0,95.0,98.0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0.0,0,54.0,35.19998,108.0,123.0,89.0,97.5,...,0,1,0,0,0,0,0,1,0,1
4,0,0,0.0,1,74.0,36.799988,110.0,119.0,79.0,98.0,...,0,1,1,0,0,0,1,0,0,1


In [4]:
# Remove unused decision variables
df_all.drop(columns=['adj_outcome_is7day', 'adj_carotidoutcome_is7day'], inplace=True)

In [5]:
df_all['dursymptoms_encoded'] = df_all.dursymptoms.map({'lt_1min': 0, '1_5min': 1, '5_9min': 2, '10_29min': 3, '30_59min': 4, 'ge_60min': 5})
df_all[['dursymptoms', 'dursymptoms_encoded']].drop_duplicates()

Unnamed: 0,dursymptoms,dursymptoms_encoded
0,ge_60min,5
1,5_9min,2
7,10_29min,3
10,1_5min,1
12,30_59min,4
63,lt_1min,0


In [6]:
# Now remove the old dursymptoms columns and rename dursymptoms_encoded to dursymptoms
df_all.drop(columns=['dursymptoms'], inplace=True)
df_all.rename(columns={'dursymptoms_encoded': 'dursymptoms'}, inplace=True)

In [7]:
# We also change the type of inittia_numpast from float to int
df_all.inittia_numpast = pd.to_numeric(df_all.inittia_numpast, downcast='integer')

In [8]:
# Introduce dummy variables
df_all_encoded = pd.get_dummies(df_all, drop_first=True)

In [9]:
# Split the data into learning and training subsets
df_train = df_all_encoded[df_all_encoded.learn == 1].drop(columns=['learn'])
df_test = df_all_encoded[df_all_encoded.learn == 0].drop(columns=['learn'])

In [10]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

In [11]:
X_train = df_train.iloc[:, 1:]
y_train = df_train.iloc[:, 0]
X_test = df_test.iloc[:, 1:]
y_test = df_test.iloc[:, 0]

In [12]:
X_train.head()

Unnamed: 0,sex_female,age,temperature,hr_rate,sbp,dbp,sa02,wbcvalue,hgbvalue,pltvalue,...,med_clop_already_taken,med_clop_discont_ed,med_clop_started_ed,med_stat_discont_ed,med_stat_started_ed,med_anti_discont_ed,med_anti_started_ed,med_coum_already_taken,med_coum_discont_ed,med_coum_started_ed
0,1,53.0,36.299988,70.0,121.0,79.0,99.0,8.0,151.0,238.0,...,0,0,0,0,0,0,0,0,0,0
1,1,81.0,36.5,57.0,215.0,55.0,98.0,8.3,144.0,293.0,...,0,0,0,0,1,0,0,0,0,0
2,0,45.0,36.599976,66.0,151.0,95.0,98.0,10.3,162.0,316.0,...,0,0,0,0,0,0,0,0,0,0
3,0,54.0,35.19998,108.0,123.0,89.0,97.5,6.6,153.0,265.0,...,0,0,0,0,0,0,1,0,0,0
4,1,74.0,36.799988,110.0,119.0,79.0,98.0,7.9,153.0,463.0,...,0,0,0,0,0,0,0,0,0,1


In [13]:
df_train.columns.values

array(['my_outcome', 'sex_female', 'age', 'temperature', 'hr_rate', 'sbp',
       'dbp', 'sa02', 'wbcvalue', 'hgbvalue', 'pltvalue', 'creatinevalue',
       'glucosevalue', 'ckvalue', 'tntvalue', 'pmedhis_hyp',
       'pmedhis_cad', 'pmedhis_af', 'pmedhis_pvd', 'pmedhis_diab',
       'pmedhis_kps', 'pmedhis_smoker', 'pmedhis_cs', 'pmedhis_chf',
       'pmedhis_hchol', 'pmedhis_dem', 'pmedhis_vhd',
       'med_ibup_last_7days', 'my_infarct', 'inittia_numpast',
       'my_sensation', 'my_weakness', 'my_gait', 'my_vertigo_syncope',
       'my_lang_speech', 'my_afib', 'img_abn_l', 'img_abn_r',
       'uni_weakness_l', 'uni_weakness_r', 'aphasia', 'peter_flag',
       'dursymptoms', 'my_ecgtype_afib', 'my_ecgtype_afl',
       'my_ecgtype_conduction_abn', 'my_ecgtype_non_specific',
       'my_ecgtype_old_infarct', 'my_ecgtype_pace_rhythm',
       'my_ecgtype_sinus_rhythm', 'med_asa_discont_ed',
       'med_asa_started_ed', 'med_dipy_already_taken',
       'med_dipy_discont_ed', 'med_dipy_sta

In [14]:
df_test.columns

Index(['my_outcome', 'sex_female', 'age', 'temperature', 'hr_rate', 'sbp',
       'dbp', 'sa02', 'wbcvalue', 'hgbvalue', 'pltvalue', 'creatinevalue',
       'glucosevalue', 'ckvalue', 'tntvalue', 'pmedhis_hyp', 'pmedhis_cad',
       'pmedhis_af', 'pmedhis_pvd', 'pmedhis_diab', 'pmedhis_kps',
       'pmedhis_smoker', 'pmedhis_cs', 'pmedhis_chf', 'pmedhis_hchol',
       'pmedhis_dem', 'pmedhis_vhd', 'med_ibup_last_7days', 'my_infarct',
       'inittia_numpast', 'my_sensation', 'my_weakness', 'my_gait',
       'my_vertigo_syncope', 'my_lang_speech', 'my_afib', 'img_abn_l',
       'img_abn_r', 'uni_weakness_l', 'uni_weakness_r', 'aphasia',
       'peter_flag', 'dursymptoms', 'my_ecgtype_afib', 'my_ecgtype_afl',
       'my_ecgtype_conduction_abn', 'my_ecgtype_non_specific',
       'my_ecgtype_old_infarct', 'my_ecgtype_pace_rhythm',
       'my_ecgtype_sinus_rhythm', 'med_asa_discont_ed', 'med_asa_started_ed',
       'med_dipy_already_taken', 'med_dipy_discont_ed', 'med_dipy_started_ed',
    

In [15]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [17]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [18]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3891 entries, 3954 to 11507
Data columns (total 65 columns):
my_outcome                   3891 non-null int64
sex_female                   3891 non-null int64
age                          3891 non-null float64
temperature                  3891 non-null float64
hr_rate                      3891 non-null float64
sbp                          3891 non-null float64
dbp                          3891 non-null float64
sa02                         3891 non-null float64
wbcvalue                     3891 non-null float64
hgbvalue                     3891 non-null float64
pltvalue                     3891 non-null float64
creatinevalue                3891 non-null float64
glucosevalue                 3891 non-null float64
ckvalue                      3891 non-null float64
tntvalue                     3891 non-null float64
pmedhis_hyp                  3891 non-null int64
pmedhis_cad                  3891 non-null int64
pmedhis_af                   3

In [19]:

numeric_features = X_train.select_dtypes(include=['int64', 'float64', 'uint8', 'int8']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

In [20]:
from sklearn.compose import ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [21]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from xgboost import XGBClassifier

from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import NeighbourhoodCleaningRule
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN
from imblearn.combine import SMOTEENN
from sklearn.metrics import accuracy_score

from sklearn.metrics import auc, roc_curve, roc_auc_score

import pickle

from sklearn.compose import ColumnTransformer

In [22]:
samplers = {
#     "None": NoneSampler(),
    "RU": RandomUnderSampler(random_state=42),
    "SMOTE": SMOTE(random_state=42),
#     "BSMOTE": BorderlineSMOTE(random_state=42),
#     "SMOTEENN": SMOTEENN(random_state = 42),
#     "NCR": NeighbourhoodCleaningRule(),
#     "RO": RandomOverSampler(random_state=42),
    "ADASYN": ADASYN(random_state=42)
}

In [23]:
classifiers = { 
#     "1NN": KNeighborsClassifier(1), 
#     "3NN": KNeighborsClassifier(3), 
#     "AB": AdaBoostClassifier(random_state=42),
#     "DT-G": DecisionTreeClassifier(random_state=42, max_depth=10),
#     "DT-H": DecisionTreeClassifier(random_state=42, criterion='entropy', max_depth=10),
    "LR": LogisticRegression(random_state=42, solver='lbfgs', max_iter=500),
    "RF": RandomForestClassifier(random_state=42, n_estimators=100, max_depth=5),
#     "SVC": SVC(random_state=42, probability=True, gamma='auto'),
#     "LSVC": SVC(random_state=42, kernel='linear', probability=True, class_weight='balanced'),
#     "XGB": XGBClassifier(random_state=42)
}

In [24]:
def validate(name, clf, X_test, y_test, X_train = None, y_train = None):
    """Fits and evaluates a classifier on a holdout sample"""
    if not (X_train is None or y_train is None):
        clf.fit(X_train, y_train)
    pickle.dump(clf, open(f'{name}test.sav', 'wb'))
    y_prob = clf.predict_proba(X_test)[:, 1]
    y_pred = clf.predict(X_test)
    print(accuracy_score(y_test, y_pred))

In [25]:
results = []
for classifier_label, classifier in classifiers.items(): 
    print(classifier_label)
    if not (X_train is None or y_train is None):
        rf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', classifier)])
        rf.fit(X_train, y_train)
    

LR
RF


In [33]:
group1 = ['pmedhis_hyp', 'pmedhis_cad', 'pmedhis_af', 'pmedhis_pvd', 'pmedhis_diab', 'pmedhis_kps', 'pmedhis_smoker', 'pmedhis_cs', 'pmedhis_chf', 'pmedhis_hchol', 'pmedhis_dem', 'pmedhis_vhd', 'med_ibup_last_7days', 'my_infarct', 'inittia_numpast']
group1removed = ['med_clop', 'med_dipy', 'med_stat', 'med_anti', 'med_asa', 'med_coum']
group2 = ['temperature', 'hr_rate', 'sbp', 'dbp', 'sa02', 'dursymptoms', 'my_sensation', 'my_weakness', 'my_gait', 'my_vertigo_syncope', 'my_lang_speech', 'my_afib', 'uni_weakness_l', 'uni_weakness_r', 'aphasia']
group3 = ['peter_flag', 'wbcvalue', 'hgbvalue', 'pltvalue', 'creatinevalue', 'glucosevalue', 'ckvalue', 'tntvalue', 'img_abn_l', 'img_abn_r']
group3removed = ['ecgtype']

X_train[list(set(X_train.columns) - set(group1))]

group1DataFrame = X_train[group1]
group2DataFrame = X_train[group1 + group2]
group3DataFrame = X_train[group1 + group2 + group3]

groups = [group1DataFrame, group2DataFrame, group3DataFrame]
test_group_1 = X_test.copy()
test_group_1[group2 + group3] = None

In [34]:
results = []
for num, group in enumerate(groups):
    best_pipeline = None
    group_best_label = ''
    group_best = 0
    for classifier_label, classifier in classifiers.items():
        for sampler_label, sampler in samplers.items():

            if not (group is None or y_train is None):

                X_over, y_over = sampler.fit_resample(group, y_train)
                
                numeric_features = group.select_dtypes(include=['int64', 'float64', 'uint8', 'int8']).columns
                categorical_features = group.select_dtypes(include=['object']).columns
                preprocessor = ColumnTransformer(
                    transformers=[
                        ('num', numeric_transformer, numeric_features),
                        ('cat', categorical_transformer, categorical_features)])

                rf = Pipeline(steps=[('preprocessor', preprocessor),
                                  ('classifier', classifier)])
                rf.fit(X_over, y_over)

                y_prob = rf.predict_proba(test_group_1)[:, 1]
                y_pred = rf.predict(test_group_1)
                score = accuracy_score(y_test, y_pred)
                
                if (score > group_best):
                    group_best_label = f'{num} {classifier_label} {sampler_label} {score}'
                    best_pipeline = rf
    print (group_best_label)
    pickle.dump(rf, open(f'{group_best_label}.sav', 'wb'))



0 RF ADASYN 0.4479568234387047




1 RF ADASYN 0.32099717296324853




2 RF ADASYN 0.34489848368028786




In [132]:
rf['preprocessor'].transformers_[0][2]

Index(['sex_female', 'age', 'temperature', 'hr_rate', 'sbp', 'dbp', 'sa02',
       'wbcvalue', 'hgbvalue', 'pltvalue', 'creatinevalue', 'glucosevalue',
       'ckvalue', 'tntvalue', 'pmedhis_hyp', 'pmedhis_cad', 'pmedhis_af',
       'pmedhis_pvd', 'pmedhis_diab', 'pmedhis_kps', 'pmedhis_smoker',
       'pmedhis_cs', 'pmedhis_chf', 'pmedhis_hchol', 'pmedhis_dem',
       'pmedhis_vhd', 'med_ibup_last_7days', 'my_infarct', 'inittia_numpast',
       'my_sensation', 'my_weakness', 'my_gait', 'my_vertigo_syncope',
       'my_lang_speech', 'my_afib', 'img_abn_l', 'img_abn_r', 'uni_weakness_l',
       'uni_weakness_r', 'aphasia', 'peter_flag', 'dursymptoms',
       'my_ecgtype_afib', 'my_ecgtype_afl', 'my_ecgtype_conduction_abn',
       'my_ecgtype_non_specific', 'my_ecgtype_old_infarct',
       'my_ecgtype_pace_rhythm', 'my_ecgtype_sinus_rhythm',
       'med_asa_discont_ed', 'med_asa_started_ed', 'med_dipy_already_taken',
       'med_dipy_discont_ed', 'med_dipy_started_ed', 'med_clop_already_

In [133]:
test = X_test.copy()
test['peter_flag'] = None
print(test.iloc[[13]].shape)
rf.predict_proba(test.iloc[[13]])

(1, 64)


array([[0.66368077, 0.33631923]])

In [85]:
a = np.full(64, dtype=float, fill_value=0.0)

In [86]:
a.shape

(64,)

In [89]:
rf.predict_proba([a])

ValueError: Specifying the columns using strings is only supported for pandas DataFrames