# Random Forest algorithm to predict future decline in eGFR

In [None]:
# Split the PROFIL table into features / target and into training and test sets.
data = PROFIL
# Features are every column except the outcome column 'eGFR'.
X = data.drop('eGFR', axis=1)
target = data['eGFR']
# Hold out 20% of the rows for testing.
# - stratify=target keeps the outcome class proportions the same in both
#   subsets (the outcome is treated as a class label by the classifier below).
# - random_state fixes the shuffle so the split, and every metric computed from
#   it, is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, target, test_size=0.2, stratify=target, random_state=0
)

In [None]:
# Fit a 500-tree random forest on the training set and predict the test labels.
# random_state pins the bootstrap and feature sampling so that the predictions
# (and the report built from them) are reproducible between runs.
rfc = RandomForestClassifier(n_estimators=500, random_state=0)
rfc.fit(X_train, y_train)
pred_rfc = rfc.predict(X_test)
print(pred_rfc)

In [None]:
# Classification report and confusion matrix for the test-set predictions
# (printed in that order).
for summarise in (classification_report, confusion_matrix):
    print(summarise(y_test, pred_rfc))

In [None]:
# SHAP importance for eGFR decline
explainer = shap.TreeExplainer(rfc)
shap_values = explainer.shap_values(X)

# Select the SHAP values that explain class index 0 (i.e. rfc.classes_[0]).
# NOTE(review): presumably that class is eGFR=0, the outcome of interest --
# confirm against rfc.classes_.
# The return type of shap_values() depends on the installed SHAP version:
#   * list with one (n_samples, n_features) array per class -> take item 0
#   * single 3-D array (n_samples, n_features, n_classes)   -> slice last axis
# Indexing a 3-D array with [0] would pick the first *sample*, not the class.
if isinstance(shap_values, list):
    class0_shap = shap_values[0]
elif getattr(shap_values, "ndim", 2) == 3:
    class0_shap = shap_values[:, :, 0]
else:
    # Already a single (n_samples, n_features) matrix.
    class0_shap = shap_values

# Plot
shap.summary_plot(class0_shap, X)

In [None]:
# Run classifier with K-fold cross-validation and plot ROC curves
cv = StratifiedKFold(n_splits=6)

# Reference model fitted on the whole training set. It is kept separate from
# the per-fold models below so that `rfc` stays fitted on all of X_train.
rfc = RandomForestClassifier(n_estimators=500, random_state=0)
rfc.fit(X_train, y_train)
# Test-set probability of class index 0 (rfc.classes_[0]).
# NOTE(review): presumably eGFR=0 -- confirm against rfc.classes_.
pred_rfc = rfc.predict_proba(X_test)[:, 0]

tprs = []
aucs = []
# Common FPR grid on which every fold's ROC curve is interpolated so the
# curves can be averaged point by point.
mean_fpr = np.linspace(0, 1, 100)

# Create the figure and axes in one call so figsize/dpi apply to the axes that
# are actually drawn on (a separate plt.figure() would only leave an empty
# figure behind).
fig, ax = plt.subplots(figsize=(10, 7), dpi=100)
for i, (train, test) in enumerate(cv.split(X_train, y_train)):
    # Fit a fresh model on this fold's training rows and evaluate it on the
    # fold's held-out rows; `train` / `test` are positional indices, hence iloc.
    fold_rfc = RandomForestClassifier(n_estimators=500, random_state=0)
    fold_rfc.fit(X_train.iloc[train], y_train.iloc[train])
    # NOTE: plot_roc_curve was removed in scikit-learn 1.2; on newer versions
    # RocCurveDisplay.from_estimator is the replacement.
    viz = plot_roc_curve(fold_rfc, X_train.iloc[test], y_train.iloc[test],
                         name='ROC fold {}'.format(i),
                         alpha=0.3, lw=1, ax=ax)
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0  # force the curve through the origin
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)

# Diagonal = performance of a random classifier.
ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
        label='Chance', alpha=.8)

# Mean ROC across folds, with the spread of the per-fold AUCs.
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0  # force the curve through (1, 1)
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
ax.plot(mean_fpr, mean_tpr, color='b',
        label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
        lw=2, alpha=.8)

# Shaded band: mean TPR +/- one standard deviation, clipped to [0, 1].
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                label=r'$\pm$ 1 std. dev.')

ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
       title="Receiver operating characteristic (ROC) curves")
ax.legend(loc="lower right")
plt.show()

In [None]:
# Monte Carlo cross-validation: 50 random 80/20 splits, scored by ROC AUC.
n_samples = X.shape[0]
cv = ShuffleSplit(n_splits=50, random_state=0, test_size=0.2)
# Resample from the full dataset (X, target) rather than from the 20% hold-out:
# splitting X_test again would train each model on only a small fraction of the
# data. cross_val_score clones `rfc` for every split, so the fitted `rfc`
# itself is left untouched.
cross_val_score(rfc, X, target, cv=cv, scoring='roc_auc')