In [2]:
import pandas as pd
import pickle

## Import cleaned dataset

In [3]:
# Load the cleaned table, then carve out the model inputs and the label.
final = pd.read_csv("./CHF_final_test.csv.gz", compression="gzip")

# The features are the contiguous run of columns between these two labels
# (label-based slicing with .loc includes both end points).
feature_span = slice("('min', 50861)", "('above_max', 51491)")
hadm_features = final.loc[:, feature_span]
hadm_target = final["CHF"]

In [4]:
from sklearn import model_selection

# Hold out 20% of the rows for testing. Stratifying on the label keeps the class
# ratio the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    hadm_features,
    hadm_target,
    test_size=0.2,
    stratify=hadm_target,
    random_state=25,
)

## Random Forest

### Basic

In [5]:
%%time
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

param_grid={'criterion': ['entropy','gini'],
            'n_estimators': range(1,31,10),
            'max_depth': range(10,110,20)
            }

grid_search = model_selection.GridSearchCV(estimator = RandomForestClassifier(), param_grid = param_grid, cv=3)
grid_search.fit(X_train, y_train)

print("Cross Validation")
print("-" * 20)
print("Best parameter: ", grid_search.best_params_)
print("Best CV score:  %.4f" % grid_search.best_score_)

pred = grid_search.best_estimator_.predict(X_test)

print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print("Classification Report:", end='')
print(f"\tPrecision Score: {precision_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tRecall Score: {recall_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tF1 score: {f1_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")

Cross Validation
--------------------
Best parameter:  {'criterion': 'entropy', 'max_depth': 30, 'n_estimators': 21}
Best CV score:  0.9124
Accuracy Score: 91.55%
_______________________________________________
Classification Report:	Precision Score: 67.19%
			Recall Score: 26.88%
			F1 score: 38.39%
_______________________________________________
Confusion Matrix: 
 [[1453   21]
 [ 117   43]]

CPU times: user 25.6 s, sys: 744 ms, total: 26.4 s
Wall time: 33.2 s


### Oversampling using SMOTE

In [6]:
from imblearn.over_sampling import SMOTE

# SMOTE synthesises new minority-class rows, so it is stochastic; seed it so the
# resampled training set (and everything fitted on it) is repeatable.
sm = SMOTE(random_state=25)
X_res, y_res = sm.fit_resample(X_train, y_train)

In [7]:
%%time
param_grid={'criterion': ['entropy','gini'],
            'n_estimators': range(1,31,10),
            'max_depth': range(10,110,20)
            }

grid_search = model_selection.GridSearchCV(estimator = RandomForestClassifier(), param_grid = param_grid, cv=3)
grid_search.fit(X_res, y_res)

print("Cross Validation")
print("-" * 20)
print("Best parameter: ", grid_search.best_params_)
print("Best CV score:  %.4f" % grid_search.best_score_)

pred = grid_search.best_estimator_.predict(X_test)

print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print("Classification Report:", end='')
print(f"\tPrecision Score: {precision_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tRecall Score: {recall_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tF1 score: {f1_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")

Cross Validation
--------------------
Best parameter:  {'criterion': 'gini', 'max_depth': 90, 'n_estimators': 21}
Best CV score:  0.9441
Accuracy Score: 89.60%
_______________________________________________
Classification Report:	Precision Score: 47.28%
			Recall Score: 54.37%
			F1 score: 50.58%
_______________________________________________
Confusion Matrix: 
 [[1377   97]
 [  73   87]]

CPU times: user 1min 4s, sys: 2.11 s, total: 1min 6s
Wall time: 1min 15s


### Undersampling using NearMiss

In [8]:
from imblearn.under_sampling import NearMiss

# NearMiss balances the training split by dropping majority-class rows
# (version 1 is the library default, spelled out here for clarity).
nm = NearMiss(version=1)
X_res, y_res = nm.fit_resample(X_train, y_train)

In [9]:
%%time
param_grid={'criterion': ['entropy','gini'],
            'n_estimators': range(1,31,10),
            'max_depth': range(10,110,20)
            }

grid_search = model_selection.GridSearchCV(estimator = RandomForestClassifier(), param_grid = param_grid, cv=3)
grid_search.fit(X_res, y_res)

print("Cross Validation")
print("-" * 20)
print("Best parameter: ", grid_search.best_params_)
print("Best CV score:  %.4f" % grid_search.best_score_)

pred = grid_search.best_estimator_.predict(X_test)

print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print("Classification Report:", end='')
print(f"\tPrecision Score: {precision_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tRecall Score: {recall_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tF1 score: {f1_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")

Cross Validation
--------------------
Best parameter:  {'criterion': 'entropy', 'max_depth': 10, 'n_estimators': 11}
Best CV score:  0.9953
Accuracy Score: 33.78%
_______________________________________________
Classification Report:	Precision Score: 12.76%
			Recall Score: 98.75%
			F1 score: 22.60%
_______________________________________________
Confusion Matrix: 
 [[ 394 1080]
 [   2  158]]

CPU times: user 4.06 s, sys: 98 ms, total: 4.16 s
Wall time: 4.53 s


## Gradient Boost

### Basic

In [16]:
%%time
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

param_grid={'n_estimators':range(20,81,20),
            'max_depth':range(5,16,4),
            'min_samples_split':range(20,100,30),
            'max_features':range(7,20,5),
            }

grid_search = model_selection.GridSearchCV(estimator = GradientBoostingClassifier(), param_grid = param_grid, cv=3)
grid_search.fit(X_train, y_train)

print("Cross Validation")
print("-" * 20)
print("Best parameter: ", grid_search.best_params_)
print("Best CV score:  %.4f" % grid_search.best_score_)

pred = grid_search.best_estimator_.predict(X_test)

print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print("Classification Report:", end='')
print(f"\tPrecision Score: {precision_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tRecall Score: {recall_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tF1 score: {f1_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")

Cross Validation
--------------------
Best parameter:  {'max_depth': 5, 'max_features': 12, 'min_samples_split': 20, 'n_estimators': 80}
Best CV score:  0.9176
Accuracy Score: 92.23%
_______________________________________________
Classification Report:	Precision Score: 68.97%
			Recall Score: 37.50%
			F1 score: 48.58%
_______________________________________________
Confusion Matrix: 
 [[1447   27]
 [ 100   60]]

CPU times: user 6min 44s, sys: 9.9 s, total: 6min 54s
Wall time: 7min 39s


### Oversampling using SMOTE

In [11]:
from imblearn.over_sampling import SMOTE

# SMOTE synthesises new minority-class rows, so it is stochastic; seed it so the
# resampled training set (and everything fitted on it) is repeatable.
sm = SMOTE(random_state=25)
X_res, y_res = sm.fit_resample(X_train, y_train)

In [12]:
%%time
param_grid={'n_estimators':range(20,81,20),
            'max_depth':range(5,16,4),
            'min_samples_split':range(20,100,30),
            'max_features':range(7,20,5),
            }

grid_search = model_selection.GridSearchCV(estimator = GradientBoostingClassifier(), param_grid = param_grid, cv=3)
grid_search.fit(X_res, y_res)

print("Cross Validation")
print("-" * 20)
print("Best parameter: ", grid_search.best_params_)
print("Best CV score:  %.4f" % grid_search.best_score_)

pred = grid_search.best_estimator_.predict(X_test)

print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print("Classification Report:", end='')
print(f"\tPrecision Score: {precision_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tRecall Score: {recall_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tF1 score: {f1_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")

Cross Validation
--------------------
Best parameter:  {'max_depth': 13, 'max_features': 12, 'min_samples_split': 20, 'n_estimators': 80}
Best CV score:  0.9584
Accuracy Score: 91.25%
_______________________________________________
Classification Report:	Precision Score: 56.39%
			Recall Score: 46.88%
			F1 score: 51.19%
_______________________________________________
Confusion Matrix: 
 [[1416   58]
 [  85   75]]

CPU times: user 15min 20s, sys: 21.7 s, total: 15min 41s
Wall time: 17min 39s


### Undersampling using NearMiss

In [13]:
from imblearn.under_sampling import NearMiss

# NearMiss balances the training split by dropping majority-class rows
# (version 1 is the library default, spelled out here for clarity).
nm = NearMiss(version=1)
X_res, y_res = nm.fit_resample(X_train, y_train)

In [14]:
%%time
param_grid={'n_estimators':range(20,81,20),
            'max_depth':range(5,16,4),
            'min_samples_split':range(20,100,30),
            'max_features':range(7,20,5),
            }

grid_search = model_selection.GridSearchCV(estimator = GradientBoostingClassifier(), param_grid = param_grid, cv=3)
grid_search.fit(X_res, y_res)

print("Cross Validation")
print("-" * 20)
print("Best parameter: ", grid_search.best_params_)
print("Best CV score:  %.4f" % grid_search.best_score_)

pred = grid_search.best_estimator_.predict(X_test)

print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print("Classification Report:", end='')
print(f"\tPrecision Score: {precision_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tRecall Score: {recall_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tF1 score: {f1_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")

Cross Validation
--------------------
Best parameter:  {'max_depth': 13, 'max_features': 7, 'min_samples_split': 80, 'n_estimators': 20}
Best CV score:  0.9945
Accuracy Score: 36.11%
_______________________________________________
Classification Report:	Precision Score: 13.23%
			Recall Score: 99.38%
			F1 score: 23.35%
_______________________________________________
Confusion Matrix: 
 [[ 431 1043]
 [   1  159]]

CPU times: user 1min 29s, sys: 2.02 s, total: 1min 31s
Wall time: 1min 41s


# KNN Imputed 

## Import cleaned dataset

In [17]:
# Load the KNN-imputed table, then carve out the model inputs and the label.
# Note: this rebinds hadm_features / hadm_target used by the earlier section.
KNN_final = pd.read_csv("./CHF_KNN_final_test.csv.gz", compression="gzip")

# The features are the contiguous run of columns between these two labels
# (label-based slicing with .loc includes both end points).
feature_span = slice("('min', 50861)", "('above_max', 51491)")
hadm_features = KNN_final.loc[:, feature_span]
hadm_target = KNN_final["CHF"]

In [18]:
# Same 80/20 stratified split and seed as the first section, now applied to the
# currently bound hadm_features / hadm_target.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    hadm_features,
    hadm_target,
    test_size=0.2,
    stratify=hadm_target,
    random_state=25,
)

## Random Forest

### Basic

In [19]:
%%time
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

param_grid={'criterion': ['entropy','gini'],
            'n_estimators': range(1,31),
            'max_depth': range(10,110,10)
            }

grid_search = model_selection.GridSearchCV(estimator = RandomForestClassifier(), param_grid = param_grid, cv=3)
grid_search.fit(X_train, y_train)

print("Cross Validation")
print("-" * 20)
print("Best parameter: ", grid_search.best_params_)
print("Best CV score:  %.4f" % grid_search.best_score_)

pred = grid_search.best_estimator_.predict(X_test)

print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print("Classification Report:", end='')
print(f"\tPrecision Score: {precision_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tRecall Score: {recall_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tF1 score: {f1_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")

Cross Validation
--------------------
Best parameter:  {'criterion': 'gini', 'max_depth': 40, 'n_estimators': 23}
Best CV score:  0.9146
Accuracy Score: 91.25%
_______________________________________________
Classification Report:	Precision Score: 65.45%
			Recall Score: 22.50%
			F1 score: 33.49%
_______________________________________________
Confusion Matrix: 
 [[1455   19]
 [ 124   36]]

CPU times: user 13min 17s, sys: 20.3 s, total: 13min 37s
Wall time: 15min 4s


### Oversampling using SMOTE

In [22]:
%%time

from imblearn.over_sampling import SMOTE
sm = SMOTE()
X_res, y_res = sm.fit_resample(X_train, y_train)



param_grid={'criterion': ['entropy','gini'],
            'n_estimators': range(1,31),
            'max_depth': range(10,110,10)
            }

grid_search = model_selection.GridSearchCV(estimator = RandomForestClassifier(), param_grid = param_grid, cv=3)
grid_search.fit(X_res, y_res)

print("Cross Validation")
print("-" * 20)
print("Best parameter: ", grid_search.best_params_)
print("Best CV score:  %.4f" % grid_search.best_score_)

pred = grid_search.best_estimator_.predict(X_test)

print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print("Classification Report:", end='')
print(f"\tPrecision Score: {precision_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tRecall Score: {recall_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tF1 score: {f1_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")

Cross Validation
--------------------
Best parameter:  {'criterion': 'gini', 'max_depth': 30, 'n_estimators': 30}
Best CV score:  0.9487
Accuracy Score: 90.21%
_______________________________________________
Classification Report:	Precision Score: 50.00%
			Recall Score: 44.38%
			F1 score: 47.02%
_______________________________________________
Confusion Matrix: 
 [[1403   71]
 [  89   71]]

CPU times: user 34min 22s, sys: 1min 1s, total: 35min 23s
Wall time: 38min 38s


### Undersampling using NearMiss

In [27]:
%%time

from imblearn.under_sampling import NearMiss
nm = NearMiss()
X_res, y_res = nm.fit_resample(X_train, y_train)


param_grid={'criterion': ['entropy','gini'],
            'n_estimators': range(1,31),
            'max_depth': range(10,110,10)
            }

grid_search = model_selection.GridSearchCV(estimator = RandomForestClassifier(), param_grid = param_grid, cv=3)
grid_search.fit(X_res, y_res)

print("Cross Validation")
print("-" * 20)
print("Best parameter: ", grid_search.best_params_)
print("Best CV score:  %.4f" % grid_search.best_score_)

pred = grid_search.best_estimator_.predict(X_test)

print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print("Classification Report:", end='')
print(f"\tPrecision Score: {precision_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tRecall Score: {recall_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tF1 score: {f1_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")

Cross Validation
--------------------
Best parameter:  {'criterion': 'entropy', 'max_depth': 10, 'n_estimators': 26}
Best CV score:  0.9657
Accuracy Score: 56.24%
_______________________________________________
Classification Report:	Precision Score: 18.07%
			Recall Score: 98.12%
			F1 score: 30.52%
_______________________________________________
Confusion Matrix: 
 [[762 712]
 [  3 157]]

CPU times: user 2min 48s, sys: 4.07 s, total: 2min 52s
Wall time: 4min 16s


## Gradient Boost

### Basic

In [23]:
%%time

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

param_grid={'n_estimators':range(20,81,20),
            'max_depth':range(5,16,4),
            'min_samples_split':range(20,100,30),
            'max_features':range(7,20,5),

grid_search = model_selection.GridSearchCV(estimator = GradientBoostingClassifier(), param_grid = param_grid, cv=3)
grid_search.fit(X_train, y_train)

print("Cross Validation")
print("-" * 20)
print("Best parameter: ", grid_search.best_params_)
print("Best CV score:  %.4f" % grid_search.best_score_)

pred = grid_search.best_estimator_.predict(X_test)

print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print("Classification Report:", end='')
print(f"\tPrecision Score: {precision_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tRecall Score: {recall_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tF1 score: {f1_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")


Cross Validation
--------------------
Best parameter:  {'max_depth': 13, 'max_features': 7, 'min_samples_split': 80, 'n_estimators': 60}
Best CV score:  0.9164
Accuracy Score: 92.35%
_______________________________________________
Classification Report:	Precision Score: 73.33%
			Recall Score: 34.38%
			F1 score: 46.81%
_______________________________________________
Confusion Matrix: 
 [[1454   20]
 [ 105   55]]

CPU times: user 7min 24s, sys: 10.6 s, total: 7min 34s
Wall time: 8min 12s


### Oversampling using SMOTE

In [25]:
%%time

from imblearn.over_sampling import SMOTE
sm = SMOTE()
X_res, y_res = sm.fit_resample(X_train, y_train)


param_grid={'n_estimators':range(20,81,20),
            'max_depth':range(5,16,4),
            'min_samples_split':range(20,100,30),
            'max_features':range(7,20,5),
            }

grid_search = model_selection.GridSearchCV(estimator = GradientBoostingClassifier(), param_grid = param_grid, cv=3)
grid_search.fit(X_res, y_res)

print("Cross Validation")
print("-" * 20)
print("Best parameter: ", grid_search.best_params_)
print("Best CV score:  %.4f" % grid_search.best_score_)

pred = grid_search.best_estimator_.predict(X_test)

print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print("Classification Report:", end='')
print(f"\tPrecision Score: {precision_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tRecall Score: {recall_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tF1 score: {f1_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")


Cross Validation
--------------------
Best parameter:  {'max_depth': 13, 'max_features': 7, 'min_samples_split': 20, 'n_estimators': 80}
Best CV score:  0.9579
Accuracy Score: 91.37%
_______________________________________________
Classification Report:	Precision Score: 57.14%
			Recall Score: 47.50%
			F1 score: 51.88%
_______________________________________________
Confusion Matrix: 
 [[1417   57]
 [  84   76]]

CPU times: user 17min 42s, sys: 29 s, total: 18min 11s
Wall time: 23min 39s


### Undersampling using NearMiss

In [26]:
%%time

from imblearn.under_sampling import NearMiss
nm = NearMiss()
X_res, y_res = nm.fit_resample(X_train, y_train)

param_grid={'n_estimators':range(20,81,20),
            'max_depth':range(5,16,4),
            'min_samples_split':range(20,100,30),
            'max_features':range(7,20,5),
            }

grid_search = model_selection.GridSearchCV(estimator = GradientBoostingClassifier(), param_grid = param_grid, cv=3)
grid_search.fit(X_res, y_res)

print("Cross Validation")
print("-" * 20)
print("Best parameter: ", grid_search.best_params_)
print("Best CV score:  %.4f" % grid_search.best_score_)

pred = grid_search.best_estimator_.predict(X_test)

print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print("Classification Report:", end='')
print(f"\tPrecision Score: {precision_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tRecall Score: {recall_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tF1 score: {f1_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")

Cross Validation
--------------------
Best parameter:  {'max_depth': 5, 'max_features': 12, 'min_samples_split': 20, 'n_estimators': 80}
Best CV score:  0.9727
Accuracy Score: 57.04%
_______________________________________________
Classification Report:	Precision Score: 18.27%
			Recall Score: 97.50%
			F1 score: 30.77%
_______________________________________________
Confusion Matrix: 
 [[776 698]
 [  4 156]]

CPU times: user 1min 48s, sys: 2.4 s, total: 1min 50s
Wall time: 2min 37s


### Try running with just creatinine items