# Heart Failure Clinical Records
This notebook will take the cleaned data and create, train, and save various machine learning models.

Each model is implemented as a scikit-learn pipeline so that it can be exported and easily used in a Flask application.

## Data Loading

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Notes on the choice of predictors:
#   * Technically only `serum_creatinine` and `ejection_fraction` should be used, but that
#     gives average results around 74% accurate.
#   * Including `age` bumps it up a little, which makes sense.
#   * Adding `time` really helps, but that's sketchy as discussed in the EDA, so it is not loaded.
all_data = pd.read_csv(
    './data/cleaned.csv',
    usecols=['serum_creatinine', 'ejection_fraction', 'age', 'DEATH_EVENT'],
)

In [None]:
# NOTE(review): disabled experiment. If re-enabled, these lines would replace each column x with 10**x,
# presumably to undo a log10 transform applied when the data was cleaned - confirm against the
# cleaning notebook. While they stay commented out, the models train on the values exactly as
# stored in cleaned.csv.
# all_data['serum_creatinine'] = 10 ** all_data.serum_creatinine
# all_data['ejection_fraction'] = 10 ** all_data.ejection_fraction

## Helper Functions

Draws a learning curve to help visualize how model quality and fit time change as the number of training examples grows.

In [None]:
# https://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html
from sklearn.model_selection import learning_curve
def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None, n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5), scoring=None):
    """Plot a learning curve, a scalability curve and a performance curve.

    Adapted from the scikit-learn ``plot_learning_curve`` example. The estimator
    is evaluated with ``sklearn.model_selection.learning_curve`` and the results
    are drawn on three axes:

    * ``axes[0]``: training and cross-validation score vs. number of training examples
    * ``axes[1]``: fit time vs. number of training examples
    * ``axes[2]``: cross-validation score vs. fit time

    Shaded bands show one standard deviation across the cross-validation folds.

    Parameters
    ----------
    estimator : estimator object
        Passed unchanged to ``learning_curve``.
    title : str
        Title of the first (learning curve) panel.
    X, y : array-like
        Training data and target, passed unchanged to ``learning_curve``.
    axes : sequence of 3 matplotlib axes, optional
        Axes to draw on. When ``None`` a new 1x3 figure of size (20, 5) is created.
    ylim : tuple (ymin, ymax), optional
        Y-limits applied to the first panel only.
    cv, n_jobs, train_sizes
        Passed unchanged to ``learning_curve``.
    scoring : str or callable, optional
        Metric passed to ``learning_curve``. ``None`` (the default) keeps the
        previous behaviour of using the estimator's own default scorer, so
        existing calls are unaffected. Pass e.g. ``'f1'`` to draw the curve with
        the same metric that was used for model selection.

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module, so the caller can chain ``.show()``.
    """
    if axes is None:
        _, axes = plt.subplots(1, 3, figsize=(20, 5))

    # Name the metric on the y-axis when it is given as a string; otherwise keep the generic label.
    score_label = f"Score ({scoring})" if isinstance(scoring, str) else "Score"

    axes[0].set_title(title)
    if ylim is not None:
        axes[0].set_ylim(*ylim)
    axes[0].set_xlabel("Training examples")
    axes[0].set_ylabel(score_label)

    # The fifth return value (score times) is not plotted.
    train_sizes, train_scores, test_scores, fit_times, _ = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes,
        scoring=scoring, return_times=True
    )
    # Each result has one row per training size and one column per fold;
    # aggregate across folds.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    fit_times_mean = np.mean(fit_times, axis=1)
    fit_times_std = np.std(fit_times, axis=1)

    # Plot learning curve
    axes[0].grid()
    axes[0].fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1,
                         color="r")
    axes[0].fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1,
                         color="g")
    axes[0].plot(train_sizes, train_scores_mean, 'o-', color="r",
                 label="Training score")
    axes[0].plot(train_sizes, test_scores_mean, 'o-', color="g",
                 label="Cross-validation score")
    axes[0].legend(loc="best")

    # Plot n_samples vs fit_times
    axes[1].grid()
    axes[1].plot(train_sizes, fit_times_mean, 'o-')
    axes[1].fill_between(train_sizes, fit_times_mean - fit_times_std,
                         fit_times_mean + fit_times_std, alpha=0.1)
    axes[1].set_xlabel("Training examples")
    axes[1].set_ylabel("fit_times")
    axes[1].set_title("Scalability of the model")

    # Plot fit_time vs score
    axes[2].grid()
    axes[2].plot(fit_times_mean, test_scores_mean, 'o-')
    axes[2].fill_between(fit_times_mean, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1)
    axes[2].set_xlabel("fit_times")
    axes[2].set_ylabel(score_label)
    axes[2].set_title("Performance of the model")

    return plt

## Data Preparation
Splitting the data into train and test sets.  Values are chosen arbitrarily but sensibly.

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the predictors.
y = all_data['DEATH_EVENT']
X = all_data.drop(columns=['DEATH_EVENT'])

# Hold out 20% of the rows for testing; the split is seeded and stratified on the target.
TEST_SIZE = 0.2
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y,
)

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV # Used by all as well.
from sklearn.pipeline import Pipeline # Same^
from sklearn.preprocessing import StandardScaler

# A single cross-validation splitter reused by every model below:
# 10 stratified folds, taken in order (no shuffling, hence no seed needed).
kfold = StratifiedKFold(n_splits=10, shuffle=False, random_state=None)

In [None]:
# Metrics recorded for every candidate during the hyper-parameter searches.
# References:
#   https://scikit-learn.org/stable/modules/model_evaluation.html
#   https://neptune.ai/blog/f1-score-accuracy-roc-auc-pr-auc
#
#   accuracy - Doesn't work well on imbalanced data, but nice to have around.
#   f1       - Precision+Recall combined (kinda). Perfect for binary classification, which is what we have.
#   roc_auc  - Again good for classification (TPR & FPR), but not good with imbalanced datasets.
SCORING_METRICS = ['accuracy', 'f1', 'roc_auc']

# The metric used to pick (and refit) the best candidate of each search.
MAIN_METRIC = 'f1'

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Pipeline: standardize the features, then fit a logistic regression.
# random_state is fixed because the 'sag', 'saga' and 'liblinear' solvers (all part of the
# search space below) shuffle the data; without a seed, re-running the notebook could
# produce different scores and a different selected candidate.
pl_logreg = Pipeline([
    ('sc', StandardScaler()),
    ('lr', LogisticRegression(random_state=RANDOM_STATE))
])

# Randomized search space: 30 values of C x 5 solvers = 150 combinations, of which
# n_iter=100 are sampled.
params_logreg = {
    'lr__C': np.linspace(0.0001, 1.0, 30),
    'lr__solver': ['newton-cg', 'liblinear', 'sag', 'saga', 'lbfgs']
}

# Every candidate is scored on all SCORING_METRICS; the best one by MAIN_METRIC is refit
# on the full training set and exposed as `best_estimator_`.
cv_logreg = RandomizedSearchCV(
    pl_logreg,
    params_logreg,
    cv=kfold, scoring=SCORING_METRICS, refit=MAIN_METRIC, random_state=RANDOM_STATE, n_iter=100, n_jobs=-1
)

cv_logreg.fit(X_train, y_train)

In [None]:
# cv_logreg.best_score_
# cv_logreg.cv_results_.keys()

# Report the cross-validated scores of the candidate that was actually selected and refit.
# Each `mean_test_*` array in cv_results_ holds one entry per sampled hyper-parameter setting
# (already averaged over the folds), so averaging the whole array would blend the selected
# model with every other candidate. `best_index_` is the row of the candidate chosen by the
# refit metric.
best_logreg = cv_logreg.best_index_

mean_accuracy = cv_logreg.cv_results_['mean_test_accuracy'][best_logreg]
print(f'Mean accuracy: {mean_accuracy}')

mean_f1 = cv_logreg.cv_results_['mean_test_f1'][best_logreg]
print(f'Mean F1: {mean_f1}')

mean_roc = cv_logreg.cv_results_['mean_test_roc_auc'][best_logreg]
print(f'Mean ROC: {mean_roc}')

plot_learning_curve(cv_logreg.best_estimator_, 'Logistic Regression', X_train, y_train, cv=kfold)
plt.show()

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Pipeline: standardize the features, then fit a random forest.
# random_state is fixed because the forest is built from random bootstrap samples and random
# feature subsets; without a seed the scores change from run to run.
pl_forest = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier(random_state=RANDOM_STATE))
])

# Randomized search space (4 x 2 x 4 x 4 = 128 combinations).
# 'auto' is no longer a valid `max_features` value in current scikit-learn (removed in 1.3),
# and for classifiers it used to mean the same as 'sqrt', so the old list searched one
# setting twice. None (consider every feature at each split) gives a distinct alternative.
params_forest = {
    'rf__n_estimators': [50, 100, 150, 200],
    'rf__max_features': ['sqrt', None],
    'rf__max_depth': [2, 4, 8, 16],
    'rf__min_samples_split': [2, 3, 4, 5]
}

# Every candidate is scored on all SCORING_METRICS; the best one by MAIN_METRIC is refit
# on the full training set and exposed as `best_estimator_`.
cv_forest = RandomizedSearchCV(
    pl_forest,
    params_forest,
    cv=kfold, scoring=SCORING_METRICS, refit=MAIN_METRIC, random_state=RANDOM_STATE, n_iter=100, n_jobs=-1
)
cv_forest.fit(X_train, y_train)

In [None]:
# Report the cross-validated scores of the candidate that was actually selected and refit.
# Each `mean_test_*` array in cv_results_ holds one entry per sampled hyper-parameter setting
# (already averaged over the folds), so averaging the whole array would blend the selected
# model with every other candidate. `best_index_` is the row of the candidate chosen by the
# refit metric.
best_forest = cv_forest.best_index_

mean_accuracy = cv_forest.cv_results_['mean_test_accuracy'][best_forest]
print(f'Mean accuracy: {mean_accuracy}')

mean_f1 = cv_forest.cv_results_['mean_test_f1'][best_forest]
print(f'Mean F1: {mean_f1}')

mean_roc = cv_forest.cv_results_['mean_test_roc_auc'][best_forest]
print(f'Mean ROC: {mean_roc}')

plot_learning_curve(cv_forest.best_estimator_, 'Random Forest', X_train, y_train, cv=kfold)
plt.show()