In [2]:
import numpy as np
import pandas as pd
# plots
import seaborn as sns; sns.set_theme()
sns.set(palette='colorblind')
import matplotlib.pyplot as plt

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.preprocessing import OrdinalEncoder

Alexandre Dias Negretti	233609	
Daniel Godoy Marques	166213	
Gyovana Mayara Moriyama	216190	

In [3]:
# Load both scenario files and stack them into a single frame; the actual
# train/test partitions are drawn from this combined frame further down.
path = '../data/processed/'
df_train = pd.read_csv(path + 'scenario04.csv', sep=',')
df_test = pd.read_csv(path + 'scenario03.csv', sep=',')

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# each file keeps its own row labels, so labels from the two files overlap.
df = pd.concat([df_train, df_test], ignore_index=True)

Mixing scenario04 and scenario03

# Dealing with Categorical Features - Encoding

In [None]:
# Columns kept for modelling, in the order they appear in the final frame.
cols = [
    # columns read as-is from the processed scenario files
    'organization',
    'age',
    'cnt_encounters',
    'cts',
    'contraceptive',
    'anticoagulant',
    'cnt_medications',
    'cnt_procedures',
    'last_encounter_dur',
    'condition_dur',
    # indicator columns produced by the one-hot encoding step
    'gender_M',
    'encounterclass_ambulatory',
    'encounterclass_emergency',
    'encounterclass_inpatient',
    'encounterclass_outpatient',
    # label
    'target',
]

In [None]:
# One-hot encode the two categorical columns.
# dtype=int pins the indicator columns to 0/1 integers: the default dtype of
# get_dummies depends on the pandas version (bool since pandas 2.0), which
# would otherwise turn these columns into True/False.
# drop_first=True drops the first gender level; `cols` only expects gender_M.
df = pd.get_dummies(df, columns=['gender'], drop_first=True, dtype=int)
df = pd.get_dummies(df, columns=['encounterclass'], dtype=int)
# Keep only the modelling columns, in the order given by `cols`.
df = df[cols]
df.head()

Unnamed: 0,organization,age,cnt_encounters,cts,contraceptive,anticoagulant,cnt_medications,cnt_procedures,last_encounter_dur,condition_dur,gender_M,encounterclass_ambulatory,encounterclass_emergency,encounterclass_inpatient,encounterclass_outpatient,target
0,ATHOL MEMORIAL HOSPITAL,69,51,0,0,2,10,0,9,6,0,0,0,1,0,0
1,LAWRENCE GENERAL HOSPITAL,65,26,0,0,2,10,0,12,0,1,0,0,1,0,0
2,LAHEY HOSPITAL & MEDICAL CENTER BURLINGTON,32,11,0,0,2,5,0,10,0,1,0,0,1,0,0
3,UMASS MEMORIAL MEDICAL CENTER INC,53,43,0,2,2,16,0,12,2,0,0,0,1,0,0
4,EMERSON HOSPITAL -,59,25,0,0,2,9,0,9,3,1,0,0,1,0,0


# Models

For this composition we are testing three different data splits: 
* train size: 70 and test size: 630
* train size: 350 and test size: 350
* train size: 630 and test size: 70
 
For each of these splits we test the same subdivisions as the other compositions.
 
We only include the last split in our report, since it gives the best results and reporting all three would be redundant.

In [None]:
def run_exps(X_train: pd.DataFrame, y_train: pd.Series, X_test: pd.DataFrame, y_test: pd.Series) -> None:
  """Fit four classifiers on the training data and display their test metrics.

  The models are LogisticRegression, RandomForestClassifier, LGBMClassifier and
  XGBClassifier. For each one the function prints the model name, fits it on
  (X_train, y_train), predicts on X_test and displays a one-row table with
  'Fit time', 'Score time', 'Accuracy', 'AUC' and 'F1-Score'. For the XGB model
  it additionally prints one line per feature with its importance.

  Args:
    X_train: Training features; its column names label the XGB importances.
    y_train: Training labels.
    X_test: Test features, with the same columns as X_train.
    y_test: Test labels; must expose `.values`.

  Returns:
    None. Results are shown with `display` / `print` only.

  Note:
    `display` is the IPython display function available in notebook cells.
    NOTE(review): the saved outputs show LogisticRegression hitting the
    max_iter limit on some runs; the features are passed in unscaled.
  """
  models = [
      ('LogReg', LogisticRegression(max_iter=500)),
      ('RF', RandomForestClassifier(random_state=42)),
      ('LGBM', LGBMClassifier(random_state=42)),
      ('XGB', XGBClassifier(random_state=42)),
  ]

  y_true = y_test.values

  for name, model in models:
    print(f'{name}:')

    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # short intervals; time.time() can jump if the system clock is adjusted.
    start = time.perf_counter()
    clf = model.fit(X_train, y_train)
    fit_time = time.perf_counter() - start

    start = time.perf_counter()
    y_pred = clf.predict(X_test)
    score_time = time.perf_counter() - start
    # Probabilities are computed outside the timed section; only predict()
    # counts towards 'Score time'.
    y_proba = clf.predict_proba(X_test)

    # Build the single-row result table in one go (same columns, same order).
    results = pd.DataFrame([{
        'Fit time': fit_time,
        'Score time': score_time,
        'Accuracy': accuracy_score(y_true, y_pred),
        'AUC': roc_auc_score(y_true, y_proba[:, 1]),
        'F1-Score': f1_score(y_true, y_pred),
    }])
    display(results)
    print()

    if name == 'XGB':
      print()
      for feature, imp in zip(X_train.columns, clf.feature_importances_):
        print(f'{feature:<25} {imp}')


## Train size: 70 and Test size: 630


### Split

In [None]:
# Separate the predictors from the label, then keep 10% of the rows for
# training; the remaining rows form the test set.
X, y = df.drop(columns='target'), df['target']
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    train_size=0.1,
    random_state=42,
    shuffle=True,
)

In [None]:
# Ordinal-encode the organization column. The categories are learned from the
# training rows only; values that appear only in the test rows are mapped to
# the unknown_value (800).
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=800)
X_train[['organization']] = encoder.fit_transform(X_train[['organization']])
X_test[['organization']] = encoder.transform(X_test[['organization']])

### SMOTE

In [None]:
# Oversample the training split only; the test split is left untouched.
# random_state is fixed so the synthetic rows, and therefore every
# "With SMOTE" result, are identical on each run. It uses the same seed as
# the split and the models.
oversample = SMOTE(random_state=42)
X_train_over, y_train_over = oversample.fit_resample(X_train, y_train)

### Model Baseline

In [None]:
# Baseline that always predicts class 0, scored against the training labels.
predictions = np.zeros(len(y_train), dtype=int)
y_prob_n = predictions.copy()

print(
    f'Accuracy: {accuracy_score(y_train.values, predictions)}',
    f'AUC: {roc_auc_score(y_train.values, y_prob_n)}',
    f'F1-Score: {f1_score(y_train.values,predictions)}',
    sep='\n',
)

Accuracy: 0.7428571428571429
AUC: 0.5
F1-Score: 0.0


In [None]:
# Same all-zero baseline, scored against the SMOTE-resampled training labels.
predictions = np.zeros(len(y_train_over), dtype=int)
y_prob_n = predictions.copy()

print(
    f'Accuracy: {accuracy_score(y_train_over.values, predictions)}',
    f'AUC: {roc_auc_score(y_train_over.values, y_prob_n)}',
    f'F1-Score: {f1_score(y_train_over.values,predictions)}',
    sep='\n',
)

Accuracy: 0.5
AUC: 0.5
F1-Score: 0.0


### Creating models


In [None]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import time

#### Imbalanced and using all features

In [None]:
# Original class balance, every feature column.
run_exps(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

LogReg:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.027309,0.003815,0.996825,0.991527,0.992366



RF:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.16898,0.017158,0.992063,0.998114,0.981132



LGBM:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.014025,0.003511,0.988889,0.992006,0.973384



XGB:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.016544,0.001452,0.996825,0.992105,0.992366




organization              0.0
age                       0.0
cnt_encounters            0.017963288351893425
cts                       0.0
contraceptive             0.0
anticoagulant             0.0
cnt_medications           0.0
cnt_procedures            0.0
last_encounter_dur        0.0
condition_dur             0.9820367097854614
gender_M                  0.0
encounterclass_ambulatory 0.0
encounterclass_emergency  0.0
encounterclass_inpatient  0.0
encounterclass_outpatient 0.0


#### Imbalanced and using just features from original tables


In [None]:
# Restrict both splits to the gender / encounter-class indicator columns.
X_train_og, X_test_og = (
    frame[['gender_M', 'encounterclass_ambulatory', 'encounterclass_emergency',
           'encounterclass_inpatient', 'encounterclass_outpatient']].copy()
    for frame in (X_train, X_test)
)

run_exps(X_train=X_train_og, y_train=y_train, X_test=X_test_og, y_test=y_test)

LogReg:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.005844,0.001249,0.790476,0.565832,0.0



RF:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.137681,0.021655,0.790476,0.565953,0.0



LGBM:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.011453,0.003549,0.790476,0.566311,0.0



XGB:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.012109,0.001331,0.790476,0.566311,0.0




gender_M                  1.0
encounterclass_ambulatory 0.0
encounterclass_emergency  0.0
encounterclass_inpatient  0.0
encounterclass_outpatient 0.0


#### Imbalanced and removing condition_dur feature

In [None]:
# Same experiment with the condition_dur column removed from both splits.
X_train_c, X_test_c = (
    frame.drop(columns='condition_dur') for frame in (X_train, X_test)
)

run_exps(X_train=X_train_c, y_train=y_train, X_test=X_test_c, y_test=y_test)

LogReg:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.075861,0.001587,0.865079,0.90798,0.699647



RF:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.136173,0.017799,0.869841,0.920143,0.70073



LGBM:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.013344,0.003445,0.879365,0.927475,0.716418



XGB:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.016469,0.002026,0.873016,0.928494,0.710145




organization              0.045704007148742676
age                       0.329967737197876
cnt_encounters            0.2501243054866791
cts                       0.0
contraceptive             0.0
anticoagulant             0.08895249664783478
cnt_medications           0.04823610186576843
cnt_procedures            0.0
last_encounter_dur        0.11974809318780899
gender_M                  0.11726725101470947
encounterclass_ambulatory 0.0
encounterclass_emergency  0.0
encounterclass_inpatient  0.0
encounterclass_outpatient 0.0


#### With SMOTE and using all features

In [None]:
# SMOTE-resampled training data, every feature column; test split unchanged.
run_exps(X_train=X_train_over, y_train=y_train_over, X_test=X_test, y_test=y_test)

LogReg:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.02734,0.007023,0.996825,0.991542,0.992366



RF:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.174298,0.018368,0.987302,0.997437,0.970149



LGBM:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.015084,0.009519,0.990476,0.991215,0.977099



XGB:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.01421,0.001369,0.996825,0.992424,0.992366




organization              0.0
age                       0.0
cnt_encounters            0.0
cts                       0.0
contraceptive             0.0
anticoagulant             0.0
cnt_medications           0.0
cnt_procedures            0.0
last_encounter_dur        0.0
condition_dur             1.0
gender_M                  0.0
encounterclass_ambulatory 0.0
encounterclass_emergency  0.0
encounterclass_inpatient  0.0
encounterclass_outpatient 0.0


#### With SMOTE and using just features from original tables

In [None]:
# Restrict the resampled training data and the test split to the
# gender / encounter-class indicator columns.
X_train_over_og, X_test_og = (
    frame[['gender_M', 'encounterclass_ambulatory', 'encounterclass_emergency',
           'encounterclass_inpatient', 'encounterclass_outpatient']].copy()
    for frame in (X_train_over, X_test)
)

run_exps(X_train=X_train_over_og, y_train=y_train_over, X_test=X_test_og, y_test=y_test)

LogReg:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.00998,0.003983,0.473016,0.443661,0.242009



RF:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.150648,0.019029,0.473016,0.453184,0.242009



LGBM:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.009413,0.002479,0.452381,0.433689,0.235033



XGB:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.016764,0.001282,0.452381,0.433689,0.235033




gender_M                  1.0
encounterclass_ambulatory 0.0
encounterclass_emergency  0.0
encounterclass_inpatient  0.0
encounterclass_outpatient 0.0


#### With SMOTE and removing condition_dur feature

In [None]:
# Resampled training data and test split, both without condition_dur.
X_train_over_c, X_test_c = (
    frame.drop(columns='condition_dur') for frame in (X_train_over, X_test)
)

run_exps(X_train=X_train_over_c, y_train=y_train_over, X_test=X_test_c, y_test=y_test)

LogReg:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.071383,0.002149,0.773016,0.890273,0.618667



RF:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.142276,0.017866,0.866667,0.923421,0.710345



LGBM:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.015408,0.006537,0.888889,0.930548,0.740741



XGB:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.017758,0.002059,0.896825,0.931423,0.767025




organization              0.05048897862434387
age                       0.16533686220645905
cnt_encounters            0.018800651654601097
cts                       0.0
contraceptive             0.0
anticoagulant             0.12795451283454895
cnt_medications           0.03252775967121124
cnt_procedures            0.0
last_encounter_dur        0.6048912405967712
gender_M                  0.0
encounterclass_ambulatory 0.0
encounterclass_emergency  0.0
encounterclass_inpatient  0.0
encounterclass_outpatient 0.0


## Train size: 350 and Test size: 350


### Split

In [None]:
# Separate the predictors from the label, then split the rows in half:
# 50% for training, the rest for testing.
X, y = df.drop(columns='target'), df['target']
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    train_size=0.5,
    random_state=42,
    shuffle=True,
)

In [None]:
# Ordinal-encode the organization column. The categories are learned from the
# training rows only; values that appear only in the test rows are mapped to
# the unknown_value (800).
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=800)
X_train[['organization']] = encoder.fit_transform(X_train[['organization']])
X_test[['organization']] = encoder.transform(X_test[['organization']])

### SMOTE

In [None]:
# Oversample the training split only; the test split is left untouched.
# random_state is fixed so the synthetic rows, and therefore every
# "With SMOTE" result, are identical on each run. It uses the same seed as
# the split and the models.
oversample = SMOTE(random_state=42)
X_train_over, y_train_over = oversample.fit_resample(X_train, y_train)

### Model Baseline

In [None]:
# Baseline that always predicts class 0, scored against the training labels.
predictions = np.zeros(len(y_train), dtype=int)
y_prob_n = predictions.copy()

print(
    f'Accuracy: {accuracy_score(y_train.values, predictions)}',
    f'AUC: {roc_auc_score(y_train.values, y_prob_n)}',
    f'F1-Score: {f1_score(y_train.values,predictions)}',
    sep='\n',
)

Accuracy: 0.7828571428571428
AUC: 0.5
F1-Score: 0.0


In [None]:
# Same all-zero baseline, scored against the SMOTE-resampled training labels.
predictions = np.zeros(len(y_train_over), dtype=int)
y_prob_n = predictions.copy()

print(
    f'Accuracy: {accuracy_score(y_train_over.values, predictions)}',
    f'AUC: {roc_auc_score(y_train_over.values, y_prob_n)}',
    f'F1-Score: {f1_score(y_train_over.values,predictions)}',
    sep='\n',
)

Accuracy: 0.5
AUC: 0.5
F1-Score: 0.0


### Creating models


#### Imbalanced and using all features

In [None]:
# Original class balance, every feature column.
run_exps(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

LogReg:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.060815,0.00154,0.937143,0.989522,0.869048



RF:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.155268,0.015358,0.997143,0.997699,0.993197



LGBM:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.03832,0.004567,0.997143,0.994859,0.993197



XGB:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.028287,0.001426,0.997143,0.993782,0.993197




organization              0.008071950636804104
age                       0.007861930876970291
cnt_encounters            0.007651303894817829
cts                       0.0
contraceptive             0.0
anticoagulant             0.0
cnt_medications           0.008930639363825321
cnt_procedures            0.0
last_encounter_dur        0.022986896336078644
condition_dur             0.944497287273407
gender_M                  0.0
encounterclass_ambulatory 0.0
encounterclass_emergency  0.0
encounterclass_inpatient  0.0
encounterclass_outpatient 0.0


#### Imbalanced and using just features from original tables


In [None]:
# Restrict both splits to the gender / encounter-class indicator columns.
X_train_og, X_test_og = (
    frame[['gender_M', 'encounterclass_ambulatory', 'encounterclass_emergency',
           'encounterclass_inpatient', 'encounterclass_outpatient']].copy()
    for frame in (X_train, X_test)
)

run_exps(X_train=X_train_og, y_train=y_train, X_test=X_test_og, y_test=y_test)

LogReg:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.010863,0.001202,0.788571,0.539243,0.0



RF:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.170512,0.014481,0.782857,0.537823,0.0



LGBM:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.012246,0.002915,0.788571,0.533441,0.0



XGB:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.01754,0.001049,0.788571,0.539292,0.0




gender_M                  0.7340509295463562
encounterclass_ambulatory 0.0
encounterclass_emergency  0.0
encounterclass_inpatient  0.2659490704536438
encounterclass_outpatient 0.0


#### Imbalanced and removing condition_dur feature

In [None]:
# Same experiment with the condition_dur column removed from both splits.
X_train_c, X_test_c = (
    frame.drop(columns='condition_dur') for frame in (X_train, X_test)
)

run_exps(X_train=X_train_c, y_train=y_train, X_test=X_test_c, y_test=y_test)

LogReg:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.130358,0.002686,0.865714,0.929201,0.700637



RF:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.167726,0.017495,0.914286,0.962177,0.782609



LGBM:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.035312,0.00431,0.908571,0.959459,0.768116



XGB:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.037252,0.002014,0.911429,0.96558,0.780142




organization              0.0811707079410553
age                       0.16338829696178436
cnt_encounters            0.08832219988107681
cts                       0.0
contraceptive             0.022805282846093178
anticoagulant             0.09741183370351791
cnt_medications           0.09947078675031662
cnt_procedures            0.0
last_encounter_dur        0.3879295587539673
gender_M                  0.059501372277736664
encounterclass_ambulatory 0.0
encounterclass_emergency  0.0
encounterclass_inpatient  0.0
encounterclass_outpatient 0.0


#### With SMOTE and using all features

In [None]:
# SMOTE-resampled training data, every feature column; test split unchanged.
run_exps(X_train=X_train_over, y_train=y_train_over, X_test=X_test, y_test=y_test)

LogReg:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.063539,0.00142,0.937143,0.989522,0.869048



RF:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.165628,0.017733,0.997143,0.998654,0.993197



LGBM:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.045895,0.005585,0.991429,0.995936,0.979866



XGB:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.040561,0.001445,0.997143,0.991872,0.993197




organization              0.003561829449608922
age                       0.004660072270780802
cnt_encounters            0.004116642754524946
cts                       0.0
contraceptive             0.0
anticoagulant             0.0
cnt_medications           0.004700628109276295
cnt_procedures            0.0
last_encounter_dur        0.016487708315253258
condition_dur             0.9664731621742249
gender_M                  0.0
encounterclass_ambulatory 0.0
encounterclass_emergency  0.0
encounterclass_inpatient  0.0
encounterclass_outpatient 0.0


#### With SMOTE and using just features from original tables

In [None]:
# Restrict the resampled training data and the test split to the
# gender / encounter-class indicator columns.
X_train_over_og, X_test_og = (
    frame[['gender_M', 'encounterclass_ambulatory', 'encounterclass_emergency',
           'encounterclass_inpatient', 'encounterclass_outpatient']].copy()
    for frame in (X_train_over, X_test)
)

run_exps(X_train=X_train_over_og, y_train=y_train_over, X_test=X_test_og, y_test=y_test)

LogReg:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.017571,0.002119,0.545714,0.539047,0.323404



RF:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.158622,0.015173,0.54,0.531948,0.320675



LGBM:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.011772,0.00219,0.537143,0.533441,0.325



XGB:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.025477,0.001207,0.542857,0.54507,0.327731




gender_M                  0.11957531422376633
encounterclass_ambulatory 0.6529111266136169
encounterclass_emergency  0.0
encounterclass_inpatient  0.2275136262178421
encounterclass_outpatient 0.0


#### With SMOTE and removing condition_dur feature

In [None]:
# Resampled training data and test split, both without condition_dur.
X_train_over_c, X_test_c = (
    frame.drop(columns='condition_dur') for frame in (X_train_over, X_test)
)

run_exps(X_train=X_train_over_c, y_train=y_train_over, X_test=X_test_c, y_test=y_test)

LogReg:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.097316,0.001462,0.902857,0.957844,0.792683



RF:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.168787,0.017616,0.925714,0.964649,0.826667



LGBM:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.039734,0.004503,0.905714,0.961761,0.778523



XGB:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.047279,0.002294,0.897143,0.965922,0.772152




organization              0.02576737105846405
age                       0.08390796929597855
cnt_encounters            0.06748900562524796
cts                       0.0
contraceptive             0.04993510618805885
anticoagulant             0.09227531403303146
cnt_medications           0.06334460526704788
cnt_procedures            0.0
last_encounter_dur        0.4107513427734375
gender_M                  0.04245690256357193
encounterclass_ambulatory 0.16407237946987152
encounterclass_emergency  0.0
encounterclass_inpatient  0.0
encounterclass_outpatient 0.0


## Train size: 630 and Test size: 70


### Split

In [None]:
# Separate the predictors from the label, then hold out 10% of the rows for
# testing; the remaining rows form the training set.
X, y = df.drop(columns='target'), df['target']
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.1,
    random_state=42,
    shuffle=True,
)

In [None]:
# Ordinal-encode the organization column. The categories are learned from the
# training rows only; values that appear only in the test rows are mapped to
# the unknown_value (800).
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=800)
X_train[['organization']] = encoder.fit_transform(X_train[['organization']])
X_test[['organization']] = encoder.transform(X_test[['organization']])

### SMOTE

In [None]:
# Oversample the training split only; the test split is left untouched.
# random_state is fixed so the synthetic rows, and therefore every
# "With SMOTE" result, are identical on each run. It uses the same seed as
# the split and the models.
oversample = SMOTE(random_state=42)
X_train_over, y_train_over = oversample.fit_resample(X_train, y_train)

### Model Baseline

In [None]:
# Baseline that always predicts class 0, scored against the training labels.
predictions = np.zeros(len(y_train), dtype=int)
y_prob_n = predictions.copy()

print(
    f'Accuracy: {accuracy_score(y_train.values, predictions)}',
    f'AUC: {roc_auc_score(y_train.values, y_prob_n)}',
    f'F1-Score: {f1_score(y_train.values,predictions)}',
    sep='\n',
)

Accuracy: 0.780952380952381
AUC: 0.5
F1-Score: 0.0


In [None]:
# Same all-zero baseline, scored against the SMOTE-resampled training labels.
predictions = np.zeros(len(y_train_over), dtype=int)
y_prob_n = predictions.copy()

print(
    f'Accuracy: {accuracy_score(y_train_over.values, predictions)}',
    f'AUC: {roc_auc_score(y_train_over.values, y_prob_n)}',
    f'F1-Score: {f1_score(y_train_over.values,predictions)}',
    sep='\n',
)

Accuracy: 0.5
AUC: 0.5
F1-Score: 0.0


### Creating models


#### Imbalanced and using all features

In [None]:
# Original class balance, every feature column.
run_exps(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

LogReg:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.104794,0.00215,0.985714,1.0,0.96



RF:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.196934,0.014846,1.0,1.0,1.0



LGBM:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.035207,0.002457,1.0,1.0,1.0



XGB:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.044368,0.001051,1.0,1.0,1.0




organization              0.01324392668902874
age                       0.009765137918293476
cnt_encounters            0.004924918059259653
cts                       0.0
contraceptive             0.0
anticoagulant             0.0
cnt_medications           0.006602118723094463
cnt_procedures            0.0
last_encounter_dur        0.00819142535328865
condition_dur             0.9498820304870605
gender_M                  0.007390392944216728
encounterclass_ambulatory 0.0
encounterclass_emergency  0.0
encounterclass_inpatient  0.0
encounterclass_outpatient 0.0


#### Imbalanced and using just features from original tables


In [None]:
# Restrict both splits to the gender / encounter-class indicator columns.
X_train_og, X_test_og = (
    frame[['gender_M', 'encounterclass_ambulatory', 'encounterclass_emergency',
           'encounterclass_inpatient', 'encounterclass_outpatient']].copy()
    for frame in (X_train, X_test)
)

run_exps(X_train=X_train_og, y_train=y_train, X_test=X_test_og, y_test=y_test)

LogReg:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.010821,0.001245,0.828571,0.618534,0.0



RF:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.144982,0.014272,0.828571,0.618534,0.0



LGBM:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.01704,0.001878,0.828571,0.618534,0.0



XGB:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.029263,0.000783,0.828571,0.618534,0.0




gender_M                  0.6593327522277832
encounterclass_ambulatory 0.0
encounterclass_emergency  0.0
encounterclass_inpatient  0.340667188167572
encounterclass_outpatient 0.0


#### Imbalanced and removing condition_dur feature

In [None]:
# Same experiment with the condition_dur column removed from both splits.
X_train_c, X_test_c = (
    frame.drop(columns='condition_dur') for frame in (X_train, X_test)
)

run_exps(X_train=X_train_c, y_train=y_train, X_test=X_test_c, y_test=y_test)

LogReg:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.149785,0.001487,0.885714,0.961207,0.666667



RF:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.201756,0.016036,0.942857,0.96408,0.818182



LGBM:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.039542,0.00251,0.942857,0.968391,0.818182



XGB:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.049472,0.001153,0.942857,0.966954,0.833333




organization              0.06305547058582306
age                       0.15317782759666443
cnt_encounters            0.07054463028907776
cts                       0.0
contraceptive             0.05229188874363899
anticoagulant             0.09388680756092072
cnt_medications           0.07416640222072601
cnt_procedures            0.0
last_encounter_dur        0.4236450493335724
gender_M                  0.06923198699951172
encounterclass_ambulatory 0.0
encounterclass_emergency  0.0
encounterclass_inpatient  0.0
encounterclass_outpatient 0.0


#### With SMOTE and using all features

In [None]:
# SMOTE-resampled training data, every feature column; test split unchanged.
run_exps(X_train=X_train_over, y_train=y_train_over, X_test=X_test, y_test=y_test)

LogReg:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.113349,0.002015,0.985714,1.0,0.96



RF:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.205129,0.013411,1.0,1.0,1.0



LGBM:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.054345,0.002382,1.0,1.0,1.0



XGB:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.06684,0.001053,1.0,1.0,1.0




organization              0.011194324120879173
age                       0.006405092775821686
cnt_encounters            0.003515030490234494
cts                       0.0
contraceptive             0.00886967871338129
anticoagulant             0.0
cnt_medications           0.008657792583107948
cnt_procedures            0.0
last_encounter_dur        0.004320615436881781
condition_dur             0.9515252113342285
gender_M                  0.005512309726327658
encounterclass_ambulatory 0.0
encounterclass_emergency  0.0
encounterclass_inpatient  0.0
encounterclass_outpatient 0.0


#### With SMOTE and using just features from original tables

In [None]:
# Restrict the resampled training data and the test split to the
# gender / encounter-class indicator columns.
X_train_over_og, X_test_og = (
    frame[['gender_M', 'encounterclass_ambulatory', 'encounterclass_emergency',
           'encounterclass_inpatient', 'encounterclass_outpatient']].copy()
    for frame in (X_train_over, X_test)
)

run_exps(X_train=X_train_over_og, y_train=y_train_over, X_test=X_test_og, y_test=y_test)

LogReg:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.019817,0.001354,0.485714,0.398707,0.142857



RF:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.161567,0.01766,0.485714,0.398707,0.142857



LGBM:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.02179,0.001798,0.485714,0.398707,0.142857



XGB:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.03836,0.00084,0.485714,0.398707,0.142857




gender_M                  0.26016661524772644
encounterclass_ambulatory 0.21988803148269653
encounterclass_emergency  0.0
encounterclass_inpatient  0.33641237020492554
encounterclass_outpatient 0.18353301286697388


#### With SMOTE and removing condition_dur feature

In [None]:
# Resampled training data and test split, both without condition_dur.
X_train_over_c, X_test_c = (
    frame.drop(columns='condition_dur') for frame in (X_train_over, X_test)
)

run_exps(X_train=X_train_over_c, y_train=y_train_over, X_test=X_test_c, y_test=y_test)

LogReg:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.230419,0.003024,0.914286,0.966954,0.769231



RF:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.244641,0.013806,0.957143,0.984195,0.869565



LGBM:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.062709,0.002455,0.942857,0.965517,0.833333



XGB:


Unnamed: 0,Fit time,Score time,Accuracy,AUC,F1-Score
0,0.069787,0.001806,0.942857,0.965517,0.833333




organization              0.03061024844646454
age                       0.11664760112762451
cnt_encounters            0.05533641576766968
cts                       0.0
contraceptive             0.0419953353703022
anticoagulant             0.11844384670257568
cnt_medications           0.06529789417982101
cnt_procedures            0.0
last_encounter_dur        0.5408352017402649
gender_M                  0.030833423137664795
encounterclass_ambulatory 0.0
encounterclass_emergency  0.0
encounterclass_inpatient  0.0
encounterclass_outpatient 0.0
