# Build and Validate a Model

In [None]:
import pandas as pd
import numpy as np

# Allows charts to appear in the notebook
%matplotlib inline

In [None]:
# data source: UCI
# https://archive.ics.uci.edu/ml/datasets.php
# Read the heart-disease CSV into a DataFrame; the path is relative, so it is
# resolved against the notebook's working directory.
data = pd.read_csv("heart_disease/heart.csv")

## Check Data

In [None]:
# Preview the first 10 rows as a quick sanity check of the loaded columns.
data.head(10)

In [None]:
# (number of rows, number of columns) of the loaded frame.
data.shape

In [None]:
# Count the missing values in each column (isna is the same check as isnull).
data.isna().sum()

In [None]:
# Number of rows per value of the label column, to check class balance.
data['target'].value_counts() 

In [None]:
# Same class counts as a horizontal bar chart.
data['target'].value_counts().plot.barh()

In [None]:
# Separate the predictors from the label.
# X keeps every column except 'target' (still a DataFrame, so column names are
# preserved); y is the 'target' column on its own.
X = data.drop(columns=['target'])
y = data['target']

In [None]:
# Report the dimensions of the feature frame and the label series.
for _name, _part in (('X', X), ('y', y)):
    print(f'{_name} : {_part.shape}')

In [None]:
# Display the feature frame.
X

In [None]:
# Display the label series.
y

## Divide Data into Train and Test Sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the split
# repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Report the dimensions of each piece of the train/test split.
for _name, _part in (
    ('X_train', X_train),
    ('y_train', y_train),
    ('X_test', X_test),
    ('y_test', y_test),
):
    print(f'{_name} : {_part.shape}')

## Build Basic Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline forest with default hyper-parameters, fitted on the training split.
# Bootstrap sampling and random feature selection make training stochastic, so
# the seed is pinned: without it every re-run produces a different model and
# the accuracy, confusion matrix and curves computed from rf_Model change from
# run to run.
rf_Model = RandomForestClassifier(random_state=0).fit(X_train, y_train)

- In random forests, each tree in the ensemble is built from a sample drawn with replacement (i.e., a bootstrap sample) from the training set.

- Furthermore, when splitting each node during the construction of a tree, the best split is found either from all input features or a random subset of size max_features.

- The purpose of these two sources of randomness is to **decrease the variance of the forest estimator**. (High-dimensional data!)

- Indeed, individual decision trees typically exhibit high variance and tend to overfit. The injected randomness in forests yields decision trees with somewhat decoupled prediction errors. By taking an average of those predictions, some errors can cancel out. 

- Random forests achieve a reduced variance by combining diverse trees, sometimes at the cost of a slight increase in bias. In practice, the variance reduction is often significant, hence yielding an overall better model.

## Prediction

In [None]:
# Create a function to guess when an observation has heart disease 
def heartdisease(age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal):
    """Print the fitted forest's verdict for a single patient.

    The thirteen arguments are handed to the notebook-level ``rf_Model`` as one
    row, in the order of the signature.
    NOTE(review): this assumes the signature order matches the column order of
    the frame the model was fitted on -- verify against heart.csv.

    Prints 'You have heart disease!' when the predicted class is 1 and
    'You're fine!' otherwise. Returns None.
    """
    row = [[age, sex, cp, trestbps, chol, fbs, restecg, thalach, exang, oldpeak, slope, ca, thal]]

    # A model fitted on a DataFrame records the column names (scikit-learn >= 1.0)
    # and warns on every call that passes a bare list. Re-attach the recorded
    # names when they exist; otherwise fall back to the plain row as before.
    feature_names = getattr(rf_Model, 'feature_names_in_', None)
    if feature_names is not None:
        row = pd.DataFrame(row, columns=feature_names)

    # predict returns one label per input row; there is exactly one row here.
    if rf_Model.predict(row)[0] == 1:
        print('You have heart disease!')
    else:
        print('You\'re fine!')

In [None]:
# Example patient; the values follow the order of the heartdisease signature.
heartdisease(63,1,3,145,233,1,0,150,0,2.3,0,0,1)

In [None]:
# Second example patient (age 56).
heartdisease(56,0,1,130,236,0,0,174,0,0.0,1,1,2)

In [None]:
# Same inputs as the age-56 example, with only the age changed to 58.
heartdisease(58,0,1,130,236,0,0,174,0,0.0,1,1,2)

## Check Accuracy

In [None]:
# Accuracy of the baseline forest on the data it was fitted on and on the
# held-out split.
print(f'Train Accuracy - : {rf_Model.score(X_train, y_train):.3f}')
print(f'Test Accuracy - : {rf_Model.score(X_test, y_test):.3f}')

In [None]:
# Test-split accuracy of the baseline forest, shown unrounded as the cell output.
rf_Model.score(X_test,y_test)

## Cross Validation

<img src="./img/cross_val.png" width="600"/>

In [None]:
from sklearn.model_selection import cross_validate

# 5-fold cross-validation of a fresh default forest on the full data set;
# return_train_score=True adds the per-fold training scores to the result.
val_res = cross_validate(
    estimator=RandomForestClassifier(),
    X=X,
    y=y,
    cv=5,
    return_train_score=True,
)

In [None]:
# Tabulate the cross-validation results: one row per fold, one column per metric.
pd.DataFrame(val_res)

In [None]:
# Average held-out score across the folds.
np.mean(val_res['test_score'])

Note that cross-validation doesn't give us a 'better' model. Its purpose is to assess how well a given model generalizes. 

## Parameter Tuning & Model Selection 

In [None]:
# A simple manual example: try a few forest sizes and keep the best scorer.
# NOTE(review): candidates are ranked on the test split, so the winning score
# is an optimistic estimate of generalisation; the cross-validated grid search
# further down tunes on the training split only.

best_score = 0
# Defined up front so the final print cannot raise a NameError when no
# candidate scores above the initial best_score.
best_parameters = None

for n in [20, 50, 100, 300]:
    # Seeded so the comparison between forest sizes is repeatable across runs.
    rf_model = RandomForestClassifier(n_estimators = n, random_state = 0).fit(X_train, y_train)
    score = rf_model.score(X_test, y_test)
    # Strict '>' keeps the first (smallest) forest when several sizes tie.
    if score > best_score:
        best_score = score
        best_parameters = {'n_estimators': n}

print("Best score: {:.4f}".format(best_score))
print("Best parameters: {}".format(best_parameters))

In [None]:
# Create the param grid

# Number of trees in random forest: 10 evenly spaced values from 20 to 300,
# truncated to whole numbers
n_estimators = np.linspace(20, 300, 10).astype(int).tolist()
# Maximum number of levels in tree
max_depth = [2, 4]
# Minimum number of samples required to split a node
min_samples_split = [2, 5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 3, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Keys are RandomForestClassifier keyword names; values are the candidates to try.
param_grid = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)
param_grid


- **GridSearch**
    - Exhaustive search over specified parameter values for an estimator.
    - GridSearchCV implements a “fit” and a “score” method. It also implements “predict”, “predict_proba”, “decision_function”, “transform” and “inverse_transform” if they are implemented in the estimator used.
    - The parameters of the estimator used to apply these methods are optimized by cross-validated grid-search over a parameter grid.
    - details: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

- cv: the cross-validation strategy
    - here I use 3-fold cross-validation
    - for more info, please see: https://machinelearningmastery.com/k-fold-cross-validation/

- n_jobs = int, default=None
    - Number of jobs to run in parallel. 
    - None means 1 unless in a joblib.parallel_backend context. 
    - -1 means using all processors.

- verbose = int
    - Controls the verbosity: the higher, the more messages.

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over param_grid: every combination is scored with 3-fold
# cross-validation, using 6 parallel jobs and per-fit progress messages.
rf_Grid = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=param_grid,
    cv=3,
    n_jobs=6,
    verbose=2,
)

In [None]:
# Run the grid search on the training split only; the test split stays untouched.
rf_Grid.fit(X_train, y_train)

In [None]:
# Parameter combination that scored best in the cross-validated search.
rf_Grid.best_params_

## Check Accuracy Again

In [None]:
# Accuracy of the tuned model (the fitted grid search) on both splits.
print(f'Train Accuracy - : {rf_Grid.score(X_train, y_train):.4f}')
print(f'Test Accuracy - : {rf_Grid.score(X_test, y_test):.4f}')


# Alternative Evaluation Metrics

In [None]:
# Test accuracy of the baseline forest (rf_Model), as a reference point for the
# other metrics in this section.
rf_Model.score(X_test,y_test)

In [None]:
# Confusion matrix

from sklearn.metrics import confusion_matrix

# Tabulate the true test labels against the baseline forest's predictions.
confusion = confusion_matrix(
    y_true=y_test,
    y_pred=rf_Model.predict(X_test),
)

In [None]:
# Show the matrix: rows are the true classes, columns are the predicted classes.
confusion

Note that the orientation of the confusion matrix here is different from the one we see in class. In scikit-learn, rows are the true classes and columns are the predicted classes:

<img src="./img/confusion_matrix.png" width="400"/>

In [None]:
# accuracy score: correct predictions (the diagonal of the confusion matrix)
# divided by all predictions. Computed from `confusion` rather than typed-in
# counts, which go stale whenever the model or the split changes.
confusion.diagonal().sum() / confusion.sum()

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the baseline forest on the test split.
print(
    classification_report(
        y_true=y_test,
        y_pred=rf_Model.predict(X_test),
        target_names=["no disease", "disease"],
    )
)

## Precision-Recall Curve

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve

# Precision/recall pairs over all decision thresholds, scored with the forest's
# predicted probability for column 1 of predict_proba.
precision, recall, thresholds = precision_recall_curve(y_test, rf_Model.predict_proba(X_test)[:, 1])

Note that `precision_recall_curve` requires its second argument to be a certainty measure for the positive class (class = 1), such as a predicted probability or a decision-function value.

In [None]:
# Predicted class probabilities for every test row, one column per class
# (columns follow the order of rf_Model.classes_).
rf_Model.predict_proba(X_test)

In [None]:
# Precision on the x-axis against recall on the y-axis.
plt.plot(precision, recall, label="precision recall curve")
plt.gca().set(xlabel="Precision", ylabel="Recall")
plt.legend(loc="best")

## ROC Curve

In [None]:
from sklearn.metrics import roc_curve

# False/true positive rates over all thresholds for the baseline forest.
# Note: this rebinds `thresholds`, replacing the precision-recall thresholds.
fpr, tpr, thresholds = roc_curve(y_test, rf_Model.predict_proba(X_test)[:, 1])

In [None]:
# ROC curve of the baseline forest.
plt.plot(fpr, tpr, label="ROC Curve")
plt.gca().set(xlabel="FPR", ylabel="TPR (recall)")
plt.legend(loc="best")

## Compare two models 

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with default settings, fitted on the same training
# split as the forest so the two models can be compared.
svm_Model = SVC().fit(X_train, y_train)

In [None]:
# ROC points for the SVM, using its decision_function values as ranking scores.
fpr_svm, tpr_svm, thresholds_svm = roc_curve(y_test, svm_Model.decision_function(X_test))

In [None]:
# Overlay both ROC curves on one set of axes.
for _fpr, _tpr, _label in (
    (fpr, tpr, "ROC Curve RF"),
    (fpr_svm, tpr_svm, "ROC Curve SVM"),
):
    plt.plot(_fpr, _tpr, label=_label)

plt.gca().set(xlabel="FPR", ylabel="TPR (recall)")
plt.legend(loc="best")

## AUC Score

In [None]:
from sklearn.metrics import roc_auc_score

# Area under the ROC curve for each model, from the same scores used to draw
# the curves: class-1 probability for the forest, decision_function for the SVM.
rf_auc = roc_auc_score(y_true=y_test, y_score=rf_Model.predict_proba(X_test)[:, 1])
svc_auc = roc_auc_score(y_true=y_test, y_score=svm_Model.decision_function(X_test))

print(f"AUC for Random Forest: {rf_auc:.3f}")
print(f"AUC for SVC: {svc_auc:.3f}")

In [None]:
# Same grid search as rf_Grid, but candidates are ranked by ROC AUC.
rf_Grid_2 = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    n_jobs=6,
    verbose=2,
)

In [None]:
# Fit the AUC-scored grid search on the training split.
rf_Grid_2.fit(X_train, y_train)

In [None]:
# Best parameter combination when candidates are ranked by ROC AUC.
rf_Grid_2.best_params_

In [None]:
# Best combination from the earlier search (default scoring), for comparison.
rf_Grid.best_params_

# Resources in R

- Algorithms: [rpart](https://cran.r-project.org/web/packages/rpart/) for trees, [randomForest](https://cran.r-project.org/web/packages/randomForest/) for RF, [e1071](https://cran.r-project.org/web/packages/e1071/index.html) or [LiblineaR](https://cran.r-project.org/web/packages/LiblineaR/index.html) for SVM
- Model Building: 
    - see [PRROC](https://cran.r-project.org/web/packages/PRROC/vignettes/PRROC.pdf), [ROCR](https://cran.r-project.org/web/packages/ROCR/index.html), or [cutpointr](https://cran.r-project.org/web/packages/cutpointr/vignettes/cutpointr.html) for scoring tools
    - see [mlr3](https://mlr3book.mlr-org.com/basics.html#learners) for an integrated interface