In [1]:
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.metrics import plot_roc_curve

from sklean
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss

## Import cleaned dataset

In [2]:
final = pd.read_csv('./data/CHF_final_test.csv.gz', compression='gzip')
hadm_features = final.loc[:, '(\'min\', 50861)' : '(\'above_max\', 51491)']
hadm_target = final.loc[:, 'CHF']

## Train test split

In [3]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(hadm_features, hadm_target, test_size=.2, stratify=hadm_target, random_state=25)

NameError: name 'model_selection' is not defined

In [4]:
# Over and under sample the train set
sm = SMOTE()
X_res_over, y_res_over = sm.fit_resample(X_train, y_train)

nm = NearMiss()
X_res_under, y_res_under = nm.fit_resample(X_train, y_train)

NameError: name 'X_train' is not defined

In [None]:
# check target counts
hadm_target[hadm_target==0].count()
hadm_target[hadm_target==1].count()

In [None]:
def results(model):
    '''
    Function to report on the metrics of a GridSearch model.
    Input: GridSearchCV model
    Prints: CV score, best parameters, accuracy, precision, recall, F1, confusion matrix and ROCAUC
    '''
    
#     cross validation scores
    print("Cross Validation")
    print("-" * 20)
    print("Best parameter: ", model.best_params_)
    print("Best CV score:  %.4f" % model.best_score_)

#     confusion matrix & related scores
    pred = model.best_estimator_.predict(X_test)
    print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
    print("_______________________________________________")
    print("Classification Report:", end='')
    print(f"\tPrecision Score: {precision_score(y_test, pred) * 100:.2f}%")
    print(f"\t\t\tRecall Score: {recall_score(y_test, pred) * 100:.2f}%")
    print(f"\t\t\tF1 score: {f1_score(y_test, pred) * 100:.2f}%")
    print("_______________________________________________")
    print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")
    
# #     ROCAUC
#     if model.estimator.__class__== sklearn.linear_model._logistic.LogisticRegression:
#         from sklearn.metrics import roc_curve, auc

#         y_pred = model.best_estimator_.decision_function(X_test)
#         fpr, tpr, threshold = roc_curve(y_test, y_pred)
#         auc = auc(fpr, tpr)

#         print("_______________________________________________")
#         print(f"Area Under Curve: {auc:.2f}%")

#         svc_disp = plot_roc_curve(model, X_test, y_test)
#         plt.show()
#     else:
#         print('not logistic')
    svc_disp = plot_roc_curve(model, X_test, y_test)
    plt.show()

In [None]:
results(grid_search_RF_basic)

## Logistic Regression

### basic

In [None]:
%%time

params ={'C':range(1, 1000, 200)}
logReg = LogisticRegression(penalty = 'l1', max_iter = 2000, class_weight = "balanced", solver = 'liblinear')

grid_log = GridSearchCV(estimator = logReg, param_grid = params, cv = 3)
grid_log.fit(X_train, y_train)
grid_log.best_score_

results(grid_log)

In [None]:
list(zip(X_train.columns, grid_log.best_estimator_.coef_[0]))

### Over Sampling using SMOTE

In [None]:
%%time
params ={'C':range(1, 1000, 200)}
logReg = LogisticRegression(penalty = 'l1', max_iter = 2000, class_weight = "balanced", solver = 'liblinear')

grid_log_over = GridSearchCV(estimator = logReg, param_grid = params, cv = 3)
grid_log_over.fit(X_res_over, y_res_over)
grid_log_over.best_score_

results(grid_log_Over)

### Under Sampling using Near Miss

In [None]:
%%time
params ={'C':range(1, 1000, 100)}
logReg = LogisticRegression(penalty = 'l1', max_iter = 2000, class_weight = "balanced", solver = 'liblinear')

grid_log_under = GridSearchCV(estimator = logReg, param_grid = params, cv = 3)
grid_log_under.fit(X_res_under, y_res_under)
grid_log_under.best_score_

results(grid_log_under)

## Random Forest

### Basic

In [None]:
%%time
param_grid={'criterion': ['entropy','gini'],
            'n_estimators': range(1,31,10),
            'max_depth': range(10,110,20)
            }

grid_rf = model_selection.GridSearchCV(estimator = RandomForestClassifier(), param_grid = param_grid, cv=3)
grid_rf.fit(X_train, y_train)

results(grid_rf)

### Over Sampling using SMOTE

In [None]:
sm = SMOTE()
X_res, y_res = sm.fit_resample(X_train, y_train)

In [None]:
%%time
param_grid={'criterion': ['entropy','gini'],
            'n_estimators': range(1,31,10),
            'max_depth': range(10,110,20)
            }

grid_rf_over = model_selection.GridSearchCV(estimator = RandomForestClassifier(), param_grid = param_grid, cv=3)
grid_rf_over.fit(X_res_over, y_res_over)

results(grid_rf_over)

### Under Sampling using Near Miss

In [None]:
%%time
param_grid={'criterion': ['entropy','gini'],
            'n_estimators': range(1,31,10),
            'max_depth': range(10,110,20)
            }

grid_rf_under = model_selection.GridSearchCV(estimator = RandomForestClassifier(), param_grid = param_grid, cv=3)
grid_rf_under.fit(X_res_under, y_res_under)

results(grid_rf_under)

## Gradient Boost

In [None]:
# additive approach using 1. n_estimators 2. max_depth, min_samples
# perhaps use randomsearch instead of gridsearch
# look at area under curve (AUC)

### basic

In [None]:
%%time
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

param_grid={'n_estimators':range(20,81,20),
            'max_depth':range(5,16,4),
            'min_samples_split':range(20,100,30),
            'max_features':range(7,20,5),
            }

grid_gb = model_selection.GridSearchCV(estimator = GradientBoostingClassifier(), param_grid = param_grid, cv=3)
grid_gb.fit(X_train, y_train)

results(grid_gb)

### Over Sampling using SMOTE¶

In [None]:
%%time
param_grid={'n_estimators':range(20,81,20),
            'max_depth':range(5,16,4),
            'min_samples_split':range(20,100,30),
            'max_features':range(7,20,5),
            }

grid_gb_over = model_selection.GridSearchCV(estimator = GradientBoostingClassifier(), param_grid = param_grid, cv=3)
grid_gb_over.fit(X_res_over, y_res_over)

results(grid_gb_over)

### Under Sampling using Near Miss

In [None]:
%%time
param_grid={'n_estimators':range(20,81,20),
            'max_depth':range(5,16,4),
            'min_samples_split':range(20,100,30),
            'max_features':range(7,20,5),
            }

grid_gb_under = model_selection.GridSearchCV(estimator = GradientBoostingClassifier(), param_grid = param_grid, cv=3)
grid_gb_under.fit(X_res, y_res)

results(grid_gb_under)

# KNN Imputed 

## Import cleaned dataset

In [None]:
KNN_final = pd.read_csv('./CHF_KNN_final_test.csv.gz', compression='gzip')
hadm_features = KNN_final.loc[:, '(\'min\', 50861)' : '(\'above_max\', 51491)']
hadm_target = KNN_final.loc[:, 'CHF']

In [None]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(hadm_features, hadm_target, test_size=.2, stratify=hadm_target, random_state=25)

## Random Forest

### Basic

In [None]:
%%time
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

param_grid={'criterion': ['entropy','gini'],
            'n_estimators': range(1,31),
            'max_depth': range(10,110,10)
            }

grid_search = model_selection.GridSearchCV(estimator = RandomForestClassifier(), param_grid = param_grid, cv=3)
grid_search.fit(X_train, y_train)

print("Cross Validation")
print("-" * 20)
print("Best parameter: ", grid_search.best_params_)
print("Best CV score:  %.4f" % grid_search.best_score_)

pred = grid_search.best_estimator_.predict(X_test)

print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print("Classification Report:", end='')
print(f"\tPrecision Score: {precision_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tRecall Score: {recall_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tF1 score: {f1_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")

### undersampling using SMOTE

In [None]:
%%time

from imblearn.over_sampling import SMOTE
sm = SMOTE()
X_res, y_res = sm.fit_resample(X_train, y_train)



param_grid={'criterion': ['entropy','gini'],
            'n_estimators': range(1,31),
            'max_depth': range(10,110,10)
            }

grid_search = model_selection.GridSearchCV(estimator = RandomForestClassifier(), param_grid = param_grid, cv=3)
grid_search.fit(X_res, y_res)

print("Cross Validation")
print("-" * 20)
print("Best parameter: ", grid_search.best_params_)
print("Best CV score:  %.4f" % grid_search.best_score_)

pred = grid_search.best_estimator_.predict(X_test)

print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print("Classification Report:", end='')
print(f"\tPrecision Score: {precision_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tRecall Score: {recall_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tF1 score: {f1_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")

### Over sampling using Near Miss

In [None]:
%%time

from imblearn.under_sampling import NearMiss
nm = NearMiss()
X_res, y_res = nm.fit_resample(X_train, y_train)


param_grid={'criterion': ['entropy','gini'],
            'n_estimators': range(1,31),
            'max_depth': range(10,110,10)
            }

grid_search = model_selection.GridSearchCV(estimator = RandomForestClassifier(), param_grid = param_grid, cv=3)
grid_search.fit(X_res, y_res)

print("Cross Validation")
print("-" * 20)
print("Best parameter: ", grid_search.best_params_)
print("Best CV score:  %.4f" % grid_search.best_score_)

pred = grid_search.best_estimator_.predict(X_test)

print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print("Classification Report:", end='')
print(f"\tPrecision Score: {precision_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tRecall Score: {recall_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tF1 score: {f1_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")

## Gradient Boost

### basic

In [None]:
%%time

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

param_grid={'n_estimators':range(20,81,20),
            'max_depth':range(5,16,4),
            'min_samples_split':range(20,100,30),
            'max_features':range(7,20,5),

grid_search = model_selection.GridSearchCV(estimator = GradientBoostingClassifier(), param_grid = param_grid, cv=3)
grid_search.fit(X_train, y_train)

print("Cross Validation")
print("-" * 20)
print("Best parameter: ", grid_search.best_params_)
print("Best CV score:  %.4f" % grid_search.best_score_)

pred = grid_search.best_estimator_.predict(X_test)

print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print("Classification Report:", end='')
print(f"\tPrecision Score: {precision_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tRecall Score: {recall_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tF1 score: {f1_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")


### Over Sampling using SMOTE

In [None]:
%%time

from imblearn.over_sampling import SMOTE
sm = SMOTE()
X_res, y_res = sm.fit_resample(X_train, y_train)


param_grid={'n_estimators':range(20,81,20),
            'max_depth':range(5,16,4),
            'min_samples_split':range(20,100,30),
            'max_features':range(7,20,5),
            }

grid_search = model_selection.GridSearchCV(estimator = GradientBoostingClassifier(), param_grid = param_grid, cv=3)
grid_search.fit(X_res, y_res)

print("Cross Validation")
print("-" * 20)
print("Best parameter: ", grid_search.best_params_)
print("Best CV score:  %.4f" % grid_search.best_score_)

pred = grid_search.best_estimator_.predict(X_test)

print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print("Classification Report:", end='')
print(f"\tPrecision Score: {precision_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tRecall Score: {recall_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tF1 score: {f1_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")


### Under sampling using Near Miss

In [None]:
%%time

from imblearn.under_sampling import NearMiss
nm = NearMiss()
X_res, y_res = nm.fit_resample(X_train, y_train)

param_grid={'n_estimators':range(20,81,20),
            'max_depth':range(5,16,4),
            'min_samples_split':range(20,100,30),
            'max_features':range(7,20,5),
            }

grid_search = model_selection.GridSearchCV(estimator = GradientBoostingClassifier(), param_grid = param_grid, cv=3)
grid_search.fit(X_res, y_res)

print("Cross Validation")
print("-" * 20)
print("Best parameter: ", grid_search.best_params_)
print("Best CV score:  %.4f" % grid_search.best_score_)

pred = grid_search.best_estimator_.predict(X_test)

print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print("Classification Report:", end='')
print(f"\tPrecision Score: {precision_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tRecall Score: {recall_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tF1 score: {f1_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")

### Try running with just creatinine items