In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

for dirname, _, filenames in os.walk("/kaggle/input"):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.gridspec as grid_spec
import seaborn as sns
from imblearn.over_sampling import SMOTE


from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score


from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC


from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import (
    accuracy_score,
    recall_score,
    roc_auc_score,
    precision_score,
    f1_score,
)
import warnings

warnings.filterwarnings("ignore")

In [236]:
df = pd.read_csv("data.csv")
df.head(3)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,82,0,1,Yes,Self-employed,Rural,126.46,14.3,never smoked,1.0
1,Female,70,1,0,Yes,Private,Urban,214.77,15.0,formerly smoked,0.0
2,Female,72,0,0,Yes,Private,Rural,63.98,15.1,smokes,1.0


In [237]:
df.isnull().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  139
smoking_status       138
stroke                 1
dtype: int64

In [238]:
# A really fantsatic and intelligent way to deal with blanks, from Thoman Konstantin in: https://www.kaggle.com/thomaskonstantin/analyzing-and-modeling-stroke-data

DT_bmi_pipe = Pipeline(
    steps=[("scale", StandardScaler()), ("lr", DecisionTreeRegressor(random_state=42))]
)
X = df[["age", "gender", "bmi"]].copy()
X.gender = X.gender.replace({"Male": 0, "Female": 1, "Other": -1}).astype(np.uint8)

Missing = X[X.bmi.isna()]
X = X[~X.bmi.isna()]
Y = X.pop("bmi")
DT_bmi_pipe.fit(X, Y)
predicted_bmi = pd.Series(
    DT_bmi_pipe.predict(Missing[["age", "gender"]]), index=Missing.index
)
df.loc[Missing.index, "bmi"] = predicted_bmi

In [241]:
variables = [variable for variable in df.columns if variable not in ["stroke"]]

conts = ["age", "avg_glucose_level", "bmi"]

In [244]:
str_only = df[df["stroke"] == 1]
no_str_only = df[df["stroke"] == 0]

In [245]:
df["gender"] = (
    df["gender"].replace({"Male": 0, "Female": 1, "Other": -1}).astype(np.uint8)
)
df["Residence_type"] = (
    df["Residence_type"].replace({"Rural": 0, "Urban": 1}).astype(np.uint8)
)

In [246]:
print("Inverse of Null Accuracy: ", 249 / (249 + 4861))
print("Null Accuracy: ", 4861 / (4861 + 249))

Inverse of Null Accuracy:  0.0487279843444227
Null Accuracy:  0.9512720156555773


In [247]:
X = df[["gender", "age", "hypertension", "heart_disease", "avg_glucose_level", "bmi"]]
y = df["stroke"]


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [248]:
X_test.head(2)

Unnamed: 0,gender,age,hypertension,heart_disease,avg_glucose_level,bmi
1505,0,31,0,0,108.62,29.2
2398,1,55,1,0,99.82,34.2


In [249]:
oversample = SMOTE()
X_train_resh, y_train_resh = oversample.fit_resample(X_train, y_train.ravel())

In [250]:
# Models

# Scale our data in pipeline, then split

rf_pipeline = Pipeline(
    steps=[("scale", StandardScaler()), ("RF", RandomForestClassifier(random_state=42))]
)
svm_pipeline = Pipeline(
    steps=[("scale", StandardScaler()), ("SVM", SVC(random_state=42))]
)
logreg_pipeline = Pipeline(
    steps=[("scale", StandardScaler()), ("LR", LogisticRegression(random_state=42))]
)
dt_pipeline = Pipeline(
    steps=[("scale", StandardScaler()), ("DT", DecisionTreeClassifier(random_state=42))]
)

In [251]:
rf_cv = cross_val_score(rf_pipeline, X_train_resh, y_train_resh, cv=10, scoring="f1")
svm_cv = cross_val_score(svm_pipeline, X_train_resh, y_train_resh, cv=10, scoring="f1")
logreg_cv = cross_val_score(
    logreg_pipeline, X_train_resh, y_train_resh, cv=10, scoring="f1"
)
dt_cv = cross_val_score(dt_pipeline, X_train_resh, y_train_resh, cv=10, scoring="f1")

In [252]:
print("Mean f1 scores:")
print(
    "Random Forest mean :",
    cross_val_score(
        rf_pipeline, X_train_resh, y_train_resh, cv=10, scoring="f1"
    ).mean(),
)
print(
    "SVM mean :",
    cross_val_score(
        svm_pipeline, X_train_resh, y_train_resh, cv=10, scoring="f1"
    ).mean(),
)
print(
    "Logistic Regression mean :",
    cross_val_score(
        logreg_pipeline, X_train_resh, y_train_resh, cv=10, scoring="f1"
    ).mean(),
)
print(
    "Decision Tree mean :",
    cross_val_score(
        dt_pipeline, X_train_resh, y_train_resh, cv=10, scoring="f1"
    ).mean(),
)

Mean f1 scores:
Random Forest mean : 0.8479320320167847
SVM mean : 0.7901025971057598
Logistic Regression mean : 0.7941763000427062
Decision Tree mean : 0.8101010299090584


In [253]:
rf_pipeline.fit(X_train_resh, y_train_resh)
svm_pipeline.fit(X_train_resh, y_train_resh)
logreg_pipeline.fit(X_train_resh, y_train_resh)
dt_pipeline.fit(X_train_resh, y_train_resh)

rf_pred = rf_pipeline.predict(X_test)
svm_pred = svm_pipeline.predict(X_test)
logreg_pred = logreg_pipeline.predict(X_test)
destree_pred = dt_pipeline.predict(X_test)

rf_cm = confusion_matrix(y_test, rf_pred)
svm_cm = confusion_matrix(y_test, svm_pred)
logreg_cm = confusion_matrix(y_test, logreg_pred)
destree_cm = confusion_matrix(y_test, destree_pred)

rf_f1 = f1_score(y_test, rf_pred)
svm_f1 = f1_score(y_test, svm_pred)
logreg_f1 = f1_score(y_test, logreg_pred)
destree_f1 = f1_score(y_test, destree_pred)

ValueError: Input y_true contains NaN.

In [None]:
print("Mean f1 scores:")

print("RF mean :", rf_f1)
print("SVM mean :", svm_f1)
print("LR mean :", logreg_f1)

In [None]:
print(classification_report(y_test, rf_pred))

print("Accuracy Score: ", accuracy_score(y_test, rf_pred))

In [None]:
from sklearn.model_selection import GridSearchCV

n_estimators = [64, 100, 128, 200]
max_features = [2, 3, 5, 7]
bootstrap = [True, False]

param_grid = {
    "n_estimators": n_estimators,
    "max_features": max_features,
    "bootstrap": bootstrap,
}

In [None]:
rfc = RandomForestClassifier()

In [None]:
rfc = RandomForestClassifier(max_features=2, n_estimators=100, bootstrap=True)

rfc.fit(X_train_resh, y_train_resh)

rfc_tuned_pred = rfc.predict(X_test)

In [None]:
print(classification_report(y_test, rfc_tuned_pred))

print("Accuracy Score: ", accuracy_score(y_test, rfc_tuned_pred))
print("F1 Score: ", f1_score(y_test, rfc_tuned_pred))

In [None]:
penalty = ["l1", "l2"]
C = [0.001, 0.01, 0.1, 1, 10, 100]

log_param_grid = {"penalty": penalty, "C": C}
logreg = LogisticRegression()
grid = GridSearchCV(logreg, log_param_grid)

In [None]:
# Let's use those params now

logreg_pipeline = Pipeline(
    steps=[
        ("scale", StandardScaler()),
        ("LR", LogisticRegression(C=0.1, penalty="l2", random_state=42)),
    ]
)

logreg_pipeline.fit(X_train_resh, y_train_resh)

# logreg.fit(X_train_resh,y_train_resh)

logreg_tuned_pred = logreg_pipeline.predict(X_test)

In [None]:
print(classification_report(y_test, logreg_tuned_pred))

print("Accuracy Score: ", accuracy_score(y_test, logreg_tuned_pred))
print("F1 Score: ", f1_score(y_test, logreg_tuned_pred))

In [None]:
from sklearn.preprocessing import binarize

for i in range(1, 6):
    cm1 = 0
    y_pred1 = logreg_pipeline.predict_proba(X_test)[:, 1]
    y_pred1 = y_pred1.reshape(-1, 1)
    y_pred2 = binarize(y_pred1, threshold=i / 10)
    y_pred2 = np.where(y_pred2 == 1, 1, 0)
    cm1 = confusion_matrix(y_test, y_pred2)

    print(
        "With",
        i / 10,
        "threshold the Confusion Matrix is ",
        "\n\n",
        cm1,
        "\n\n",
        "with",
        cm1[0, 0] + cm1[1, 1],
        "correct predictions, ",
        "\n\n",
        cm1[0, 1],
        "Type I errors( False Positives), ",
        "\n\n",
        cm1[1, 0],
        "Type II errors( False Negatives), ",
        "\n\n",
        "Accuracy score: ",
        (accuracy_score(y_test, y_pred2)),
        "\n\n",
        "F1 score: ",
        (f1_score(y_test, y_pred2)),
        "\n\n",
        "Sensitivity: ",
        cm1[1, 1] / (float(cm1[1, 1] + cm1[1, 0])),
        "\n\n",
        "Specificity: ",
        cm1[0, 0] / (float(cm1[0, 0] + cm1[0, 1])),
        "\n\n",
        "====================================================",
        "\n\n",
    )

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve

ns_probs = [0 for _ in range(len(y_test))]
lr_probs = logreg_pipeline.predict_proba(X_test)
lr_probs = lr_probs[:, 1]
ns_auc = roc_auc_score(y_test, ns_probs)
lr_auc = roc_auc_score(y_test, lr_probs)

# calculate roc curves
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)


y_scores = logreg_pipeline.predict_proba(X_train)[:, 1]
precisions, recalls, thresholds = precision_recall_curve(y_train, y_scores)


# Plots

fig = plt.figure(figsize=(12, 4))
gs = fig.add_gridspec(1, 2, wspace=0.1, hspace=0)
ax = gs.subplots()

background_color = "#f6f6f6"
fig.patch.set_facecolor(background_color)  # figure background color
ax[0].set_facecolor(background_color)
ax[1].set_facecolor(background_color)

ax[0].grid(color="gray", linestyle=":", axis="y", zorder=0, dashes=(1, 5))
ax[1].grid(color="gray", linestyle=":", axis="y", dashes=(1, 5))


y_scores = logreg_pipeline.predict_proba(X_train)[:, 1]


precisions, recalls, thresholds = precision_recall_curve(y_train, y_scores)

ax[0].plot(thresholds, precisions[:-1], "b--", label="Precision", color="#9bb7d4")
ax[0].plot(thresholds, recalls[:-1], ".", linewidth=1, label="Recall", color="#0f4c81")
ax[0].set_ylabel("True Positive Rate", loc="bottom")
ax[0].set_xlabel("Thresholds", loc="left")
# plt.legend(loc='center left')
ax[0].set_ylim([0, 1])


# plot the roc curve for the model
ax[1].plot(ns_fpr, ns_tpr, linestyle="--", label="Dummy Classifer", color="gray")
ax[1].plot(lr_fpr, lr_tpr, marker=".", linewidth=2, color="#0f4c81")
ax[1].set_xlabel("False Positive Rate", loc="left")
ax[1].set_ylabel("")
ax[1].set_ylim([0, 1])

for s in ["top", "right", "left"]:
    ax[0].spines[s].set_visible(False)
    ax[1].spines[s].set_visible(False)


ax[0].text(
    -0.1,
    2,
    "Model Selection: Considerations",
    fontsize=18,
    fontfamily="serif",
    fontweight="bold",
)
ax[0].text(
    -0.1,
    1.26,
    """
Here we observe how our Logistic Regression model performs when we change the threshold.

We'd like a model that predicts all strokes, but in reality, this would come at a cost.
In fact we can create a model that succeeds in that goal, but it would mean predicting
most people to have a stroke - which in itself would have negative effects.

Therefore, we need to choose a model which not only predicts, correctly, those who have
strokes, but also those who do not.
""",
    fontsize=14,
    fontfamily="serif",
)


ax[0].text(
    -0.1, 1.1, "Precision & Recall", fontsize=14, fontfamily="serif", fontweight="bold"
)
ax[1].text(
    -0.1,
    1.1,
    "ROC: True Positives & False Positives",
    fontsize=14,
    fontfamily="serif",
    fontweight="bold",
)

ax[1].tick_params(axis="y", colors=background_color)

plt.show()

In [None]:
rf_df = pd.DataFrame(
    data=[
        f1_score(y_test, rf_pred),
        accuracy_score(y_test, rf_pred),
        recall_score(y_test, rf_pred),
        precision_score(y_test, rf_pred),
        roc_auc_score(y_test, rf_pred),
    ],
    columns=["Random Forest Score"],
    index=["F1", "Accuracy", "Recall", "Precision", "ROC AUC Score"],
)

svm_df = pd.DataFrame(
    data=[
        f1_score(y_test, svm_pred),
        accuracy_score(y_test, svm_pred),
        recall_score(y_test, svm_pred),
        precision_score(y_test, svm_pred),
        roc_auc_score(y_test, svm_pred),
    ],
    columns=["Support Vector Machine (SVM) Score"],
    index=["F1", "Accuracy", "Recall", "Precision", "ROC AUC Score"],
)

lr_df = pd.DataFrame(
    data=[
        f1_score(y_test, logreg_tuned_pred),
        accuracy_score(y_test, logreg_tuned_pred),
        recall_score(y_test, logreg_tuned_pred),
        precision_score(y_test, logreg_tuned_pred),
        roc_auc_score(y_test, logreg_tuned_pred),
    ],
    columns=["Tuned Logistic Regression Score"],
    index=["F1", "Accuracy", "Recall", "Precision", "ROC AUC Score"],
)

In [None]:
df_models = round(pd.concat([rf_df, svm_df, lr_df], axis=1), 3)
import matplotlib

colors = ["lightgray", "lightgray", "#0f4c81"]
colormap = matplotlib.colors.LinearSegmentedColormap.from_list("", colors)

background_color = "#fbfbfb"

fig = plt.figure(figsize=(10, 8))  # create figure
gs = fig.add_gridspec(4, 2)
gs.update(wspace=0.1, hspace=0.5)
ax0 = fig.add_subplot(gs[0, :])

sns.heatmap(
    df_models.T,
    cmap=colormap,
    annot=True,
    fmt=".1%",
    vmin=0,
    vmax=0.95,
    linewidths=2.5,
    cbar=False,
    ax=ax0,
    annot_kws={"fontsize": 12},
)
fig.patch.set_facecolor(background_color)  # figure background color
ax0.set_facecolor(background_color)

ax0.text(
    0, -2.15, "Model Comparison", fontsize=18, fontweight="bold", fontfamily="serif"
)
ax0.text(
    0,
    -0.9,
    "Random Forest performs the best for overall Accuracy,\nbut is this enough? Is Recall more important in this case?",
    fontsize=14,
    fontfamily="serif",
)
ax0.tick_params(axis="both", which="both", length=0)


plt.show()

In [None]:
# Plotting our results

colors = [
    "lightgray",
    "#0f4c81",
    "#0f4c81",
    "#0f4c81",
    "#0f4c81",
    "#0f4c81",
    "#0f4c81",
    "#0f4c81",
]
colormap = matplotlib.colors.LinearSegmentedColormap.from_list("", colors)


background_color = "#fbfbfb"

fig = plt.figure(figsize=(10, 14))  # create figure
gs = fig.add_gridspec(4, 2)
gs.update(wspace=0.1, hspace=0.8)
ax0 = fig.add_subplot(gs[0, :])
ax1 = fig.add_subplot(gs[1, :])
ax2 = fig.add_subplot(gs[2, :])
ax3 = fig.add_subplot(gs[3, :])
ax0.set_facecolor(background_color)  # axes background color

# Overall
sns.heatmap(
    rf_cm,
    cmap=colormap,
    annot=True,
    fmt="d",
    linewidths=5,
    cbar=False,
    ax=ax0,
    yticklabels=["Actual Non-Stroke", "Actual Stroke"],
    xticklabels=["Predicted Non-Stroke", "Predicted Stroke"],
    annot_kws={"fontsize": 12},
)

sns.heatmap(
    logreg_cm,
    cmap=colormap,
    annot=True,
    fmt="d",
    linewidths=5,
    cbar=False,
    ax=ax1,
    yticklabels=["Actual Non-Stroke", "Actual Stroke"],
    xticklabels=["Predicted Non-Stroke", "Predicted Stroke"],
    annot_kws={"fontsize": 12},
)

sns.heatmap(
    svm_cm,
    cmap=colormap,
    annot=True,
    fmt="d",
    linewidths=5,
    cbar=False,
    ax=ax2,
    yticklabels=["Actual Non-Stroke", "Actual Stroke"],
    xticklabels=["Predicted Non-Stroke", "Predicted Stroke"],
    annot_kws={"fontsize": 12},
)

sns.heatmap(
    destree_cm,
    cmap=colormap,
    annot=True,
    fmt="d",
    linewidths=5,
    cbar=False,
    ax=ax3,
    yticklabels=["Actual Non-Stroke", "Actual Stroke"],
    xticklabels=["Predicted Non-Stroke", "Predicted Stroke"],
    annot_kws={"fontsize": 12},
)


ax0.tick_params(axis="both", which="both", length=0)
background_color = "#fbfbfb"
fig.patch.set_facecolor(background_color)  # figure background color
ax0.set_facecolor(background_color)
ax1.tick_params(axis="both", which="both", length=0)
ax1.set_facecolor(background_color)
ax2.tick_params(axis="both", which="both", length=0)
ax2.set_facecolor(background_color)

ax0.text(
    0,
    -0.75,
    "Random Forest Performance",
    fontsize=18,
    fontweight="bold",
    fontfamily="serif",
)
ax0.text(
    0,
    -0.2,
    "The model has the highest accuracy, and predicts non-Strokes well.\nThe recall is poor though.",
    fontsize=14,
    fontfamily="serif",
)

ax1.text(
    0,
    -0.75,
    "Logistic Regression Performance",
    fontsize=18,
    fontweight="bold",
    fontfamily="serif",
)
ax1.text(
    0,
    -0.2,
    "This model predicts strokes with most success.\nHowever, it gives a lot of false-positives.",
    fontsize=14,
    fontfamily="serif",
)

ax2.text(
    0,
    -0.75,
    "Support Vector Machine Performance",
    fontsize=18,
    fontweight="bold",
    fontfamily="serif",
)
ax2.text(
    0,
    -0.2,
    "A very similar performance to Logistic Regression.\nThe recall is slightly less though.",
    fontsize=14,
    fontfamily="serif",
)

ax3.text(
    0,
    -0.75,
    "Decision Tree Performance",
    fontsize=18,
    fontweight="bold",
    fontfamily="serif",
)
ax3.text(
    0,
    -0.2,
    "A very similar performance to Logistic Regression.\nThe recall is slightly less though.",
    fontsize=14,
    fontfamily="serif",
)


plt.show()

In [None]:
colors = ["lightgray", "lightgray", "#0f4c81"]
colormap = matplotlib.colors.LinearSegmentedColormap.from_list("", colors)

background_color = "#fbfbfb"

fig = plt.figure(figsize=(10, 8))  # create figure
gs = fig.add_gridspec(4, 2)
gs.update(wspace=0.1, hspace=0.5)
ax0 = fig.add_subplot(gs[0, :])
ax1 = fig.add_subplot(gs[1, :])

sns.heatmap(
    lr_df.T,
    cmap=colormap,
    annot=True,
    fmt=".1%",
    vmin=0,
    vmax=0.95,
    yticklabels="",
    linewidths=2.5,
    cbar=False,
    ax=ax0,
    annot_kws={"fontsize": 12},
)
fig.patch.set_facecolor(background_color)  # figure background color
ax0.set_facecolor(background_color)
ax1.set_facecolor(background_color)

ax0.text(
    0,
    -2,
    "Tuned Logistic Regression Overview",
    fontsize=18,
    fontweight="bold",
    fontfamily="serif",
)
ax0.text(
    0,
    -0.3,
    """
A reminder of the results that the tuned model acheived.
The results are not perfect, but they do the best job at predicting those that will
suffer a stroke without sacrificing overall accuracy too much.

It has the highest f1 score of all models too - which is a weighted average of both
precision and recall.
""",
    fontsize=14,
    fontfamily="serif",
)
ax0.tick_params(axis="both", which="both", length=0)


# Overall

sns.heatmap(
    logreg_cm,
    cmap=colormap,
    annot=True,
    fmt="d",
    linewidths=5,
    cbar=False,
    ax=ax1,
    yticklabels=["Actual Non-Stroke", "Actual Stroke"],
    vmax=500,
    vmin=0,
    xticklabels=["Predicted Non-Stroke", "Predicted Stroke"],
    annot_kws={"fontsize": 12},
)
ax0.tick_params(axis="both", which="both", length=0)
ax1.tick_params(axis="both", which="both", length=0)
plt.show()

In [None]:
def rf_feat_importance(m, df):
    return pd.DataFrame(
        {"Feature": df.columns, "Importance": m.feature_importances_}
    ).sort_values("Importance", ascending=False)


fi = rf_feat_importance(rf_pipeline["RF"], X)
fi[:10].style.background_gradient(cmap=colormap)

In [None]:
background_color = "#fbfbfb"

fig, ax = plt.subplots(1, 1, figsize=(10, 8), facecolor=background_color)

color_map = ["lightgray" for _ in range(10)]
color_map[0] = color_map[1] = color_map[2] = "#0f4c81"  # color highlight

sns.barplot(data=fi, x="Importance", y="Feature", ax=ax, palette=color_map)
ax.set_facecolor(background_color)
for s in ["top", "left", "right"]:
    ax.spines[s].set_visible(False)

fig.text(
    0.12,
    0.92,
    "Feature Importance: Random Forest Stroke Prediction",
    fontsize=18,
    fontweight="bold",
    fontfamily="serif",
)


plt.xlabel(" ", fontsize=12, fontweight="light", fontfamily="serif", loc="left", y=-1.5)
plt.ylabel(" ", fontsize=12, fontweight="light", fontfamily="serif")


fig.text(1.1, 0.92, "Insight", fontsize=18, fontweight="bold", fontfamily="serif")

fig.text(
    1.1,
    0.315,
    """
It is always interesting to view what features
a predictive model utilises the most, that is, 
what features are the most important. 
This not only helps understand how the model
works, but importantly can help us to explain
the model results.

In this case, we see that Age, Average Glucose Level,
and BMI are the most important factors for our model.

One also notices just how important Age is for our model,
it is by far the most significant variable.

It is also interesting that Work Type is more salient
than Gender - this is a surprise.

Having a history of Heart Disease and Hypertension
are also low in the importance ranking which again
is very surprising.
""",
    fontsize=14,
    fontweight="light",
    fontfamily="serif",
)

ax.tick_params(axis="both", which="both", length=0)


import matplotlib.lines as lines

l1 = lines.Line2D(
    [0.98, 0.98], [0, 1], transform=fig.transFigure, figure=fig, color="black", lw=0.2
)
fig.lines.extend([l1])


plt.show()

In [None]:
import shap

explainer = shap.TreeExplainer(rfc)

# calculate shap values. This is what we will plot.
shap_values = explainer.shap_values(X_test)

In [None]:
# %pip install lime

import lime
import lime.lime_tabular

# LIME has one explainer for all the models
explainer = lime.lime_tabular.LimeTabularExplainer(
    X.values,
    feature_names=X.columns.values.tolist(),
    class_names=["stroke"],
    verbose=True,
    mode="classification",
)

In [None]:
import lime
import lime.lime_tabular
from IPython.display import display
import webbrowser

# Assume X and model pipeline are already defined
explainer = lime.lime_tabular.LimeTabularExplainer(
    training_data=X.values,
    feature_names=X.columns,
    class_names=["no stroke", "stroke"],
    mode="classification",
)

j = 2890  # instance index
exp = explainer.explain_instance(
    X.values[j], logreg_pipeline.predict_proba, num_features=10
)

exp.save_to_file("lime_explanation.html")
webbrowser.open("lime_explanation.html")