In [2]:
import pandas as pd
import pickle

## Import cleaned dataset

In [16]:
final = pd.read_csv('./AKF_final_test.csv.gz', compression='gzip')
hadm_features = final.loc[:, '(\'min\', 50861)' : '(\'above_max\', 51498)']
hadm_target = final.loc[:, 'AKF']

In [18]:
from sklearn import model_selection

X_train, X_test, y_train, y_test = model_selection.train_test_split(hadm_features, hadm_target, test_size=.2, stratify=hadm_target, random_state=25)

# Random Forest

## No Over/under sampling

In [43]:
%%time
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

param_grid={'criterion': ['entropy','gini'],
            'n_estimators': range(1,31),
            'max_depth': range(10,110,10)
            }

grid_search = model_selection.GridSearchCV(estimator = RandomForestClassifier(), param_grid = param_grid, cv=3)
grid_search.fit(X_train, y_train)

print("Cross Validation")
print("-" * 20)
print("Best parameter: ", grid_search.best_params_)
print("Best CV score:  %.4f" % grid_search.best_score_)

pred = grid_search.best_estimator_.predict(X_test)

print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print("Classification Report:", end='')
print(f"\tPrecision Score: {precision_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tRecall Score: {recall_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tF1 score: {f1_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")

Cross Validation
--------------------
Best parameter:  {'criterion': 'entropy', 'max_depth': 90, 'n_estimators': 19}
Best CV score:  0.9204
Accuracy Score: 89.68%
_______________________________________________
Classification Report:	Precision Score: 65.00%
			Recall Score: 57.02%
			F1 score: 60.75%
_______________________________________________
Confusion Matrix: 
 [[665  35]
 [ 49  65]]

CPU times: user 4min 47s, sys: 4.35 s, total: 4min 51s
Wall time: 5min 2s


## undersampling using SMOTE

In [31]:
from imblearn.over_sampling import SMOTE
sm = SMOTE()
X_res, y_res = sm.fit_resample(X_train, y_train)

In [32]:
%%time
param_grid={'criterion': ['entropy','gini'],
            'n_estimators': range(1,31),
            'max_depth': range(10,110,10)
            }

grid_search = model_selection.GridSearchCV(estimator = RandomForestClassifier(), param_grid = param_grid, cv=3)
grid_search.fit(X_res, y_res)

print("Cross Validation")
print("-" * 20)
print("Best parameter: ", grid_search.best_params_)
print("Best CV score:  %.4f" % grid_search.best_score_)

pred = grid_search.best_estimator_.predict(X_test)

print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print("Classification Report:", end='')
print(f"\tPrecision Score: {precision_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tRecall Score: {recall_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tF1 score: {f1_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")

Cross Validation
--------------------
Best parameter:  {'criterion': 'gini', 'max_depth': 70, 'n_estimators': 15}
Best CV score:  0.9536
Accuracy Score: 90.91%
_______________________________________________
Classification Report:	Precision Score: 63.89%
			Recall Score: 80.70%
			F1 score: 71.32%
_______________________________________________
Confusion Matrix: 
 [[648  52]
 [ 22  92]]

CPU times: user 11min 42s, sys: 12.4 s, total: 11min 54s
Wall time: 13min 23s


## Over sampling using Near Miss

In [34]:
from imblearn.under_sampling import NearMiss
nm = NearMiss()
X_res, y_res = nm.fit_resample(X_train, y_train)

In [35]:
%%time
param_grid={'criterion': ['entropy','gini'],
            'n_estimators': range(1,31),
            'max_depth': range(10,110,10)
            }

grid_search = model_selection.GridSearchCV(estimator = RandomForestClassifier(), param_grid = param_grid, cv=3)
grid_search.fit(X_res, y_res)

print("Cross Validation")
print("-" * 20)
print("Best parameter: ", grid_search.best_params_)
print("Best CV score:  %.4f" % grid_search.best_score_)

pred = grid_search.best_estimator_.predict(X_test)

print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print("Classification Report:", end='')
print(f"\tPrecision Score: {precision_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tRecall Score: {recall_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tF1 score: {f1_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")

Cross Validation
--------------------
Best parameter:  {'criterion': 'gini', 'max_depth': 80, 'n_estimators': 22}
Best CV score:  0.9714
Accuracy Score: 69.04%
_______________________________________________
Classification Report:	Precision Score: 30.62%
			Recall Score: 95.61%
			F1 score: 46.38%
_______________________________________________
Confusion Matrix: 
 [[453 247]
 [  5 109]]

CPU times: user 1min 39s, sys: 1.69 s, total: 1min 40s
Wall time: 1min 44s


# Gradient Boost

## basic

In [45]:
%%time
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

param_grid={'n_estimators':range(20,81,20),
            'max_depth':range(5,16,4),
            'min_samples_split':range(20,100,30),
            'max_features':range(7,20,5),
            }

grid_search = model_selection.GridSearchCV(estimator = GradientBoostingClassifier(), param_grid = param_grid, cv=3)
grid_search.fit(X_train, y_train)

print("Cross Validation")
print("-" * 20)
print("Best parameter: ", grid_search.best_params_)
print("Best CV score:  %.4f" % grid_search.best_score_)

pred = grid_search.best_estimator_.predict(X_test)

print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print("Classification Report:", end='')
print(f"\tPrecision Score: {precision_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tRecall Score: {recall_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tF1 score: {f1_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")

Cross Validation
--------------------
Best parameter:  {'max_depth': 5, 'max_features': 17, 'min_samples_split': 80, 'n_estimators': 60}
Best CV score:  0.9219
Accuracy Score: 92.01%
_______________________________________________
Classification Report:	Precision Score: 72.07%
			Recall Score: 70.18%
			F1 score: 71.11%
_______________________________________________
Confusion Matrix: 
 [[669  31]
 [ 34  80]]

CPU times: user 3min 19s, sys: 2.7 s, total: 3min 22s
Wall time: 3min 28s


## Undersampling using SMOTE¶

In [46]:
from imblearn.over_sampling import SMOTE
sm = SMOTE()
X_res, y_res = sm.fit_resample(X_train, y_train)

In [47]:
%%time
param_grid={'n_estimators':range(20,81,20),
            'max_depth':range(5,16,4),
            'min_samples_split':range(20,100,30),
            'max_features':range(7,20,5),
            }

grid_search = model_selection.GridSearchCV(estimator = GradientBoostingClassifier(), param_grid = param_grid, cv=3)
grid_search.fit(X_res, y_res)

print("Cross Validation")
print("-" * 20)
print("Best parameter: ", grid_search.best_params_)
print("Best CV score:  %.4f" % grid_search.best_score_)

pred = grid_search.best_estimator_.predict(X_test)

print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print("Classification Report:", end='')
print(f"\tPrecision Score: {precision_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tRecall Score: {recall_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tF1 score: {f1_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")

Cross Validation
--------------------
Best parameter:  {'max_depth': 9, 'max_features': 7, 'min_samples_split': 20, 'n_estimators': 80}
Best CV score:  0.9571
Accuracy Score: 91.28%
_______________________________________________
Classification Report:	Precision Score: 66.17%
			Recall Score: 77.19%
			F1 score: 71.26%
_______________________________________________
Confusion Matrix: 
 [[655  45]
 [ 26  88]]

CPU times: user 7min 28s, sys: 9.4 s, total: 7min 37s
Wall time: 9min 26s


## Over sampling using Near Miss

In [48]:
from imblearn.under_sampling import NearMiss
nm = NearMiss()
X_res, y_res = nm.fit_resample(X_train, y_train)

In [49]:
%%time
param_grid={'n_estimators':range(20,81,20),
            'max_depth':range(5,16,4),
            'min_samples_split':range(20,100,30),
            'max_features':range(7,20,5),
            }

grid_search = model_selection.GridSearchCV(estimator = GradientBoostingClassifier(), param_grid = param_grid, cv=3)
grid_search.fit(X_res, y_res)

print("Cross Validation")
print("-" * 20)
print("Best parameter: ", grid_search.best_params_)
print("Best CV score:  %.4f" % grid_search.best_score_)

pred = grid_search.best_estimator_.predict(X_test)

print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print("Classification Report:", end='')
print(f"\tPrecision Score: {precision_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tRecall Score: {recall_score(y_test, pred) * 100:.2f}%")
print(f"\t\t\tF1 score: {f1_score(y_test, pred) * 100:.2f}%")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")

Cross Validation
--------------------
Best parameter:  {'max_depth': 13, 'max_features': 7, 'min_samples_split': 80, 'n_estimators': 60}
Best CV score:  0.9747
Accuracy Score: 64.86%
_______________________________________________
Classification Report:	Precision Score: 28.06%
			Recall Score: 96.49%
			F1 score: 43.48%
_______________________________________________
Confusion Matrix: 
 [[418 282]
 [  4 110]]

CPU times: user 1min 19s, sys: 844 ms, total: 1min 20s
Wall time: 1min 32s


### Try running with just creatinine items