# Required Models 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Load the stroke dataset from a CSV file located in the working directory.
DATA_PATH = "healthcare-dataset-stroke-data.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows of the raw data.
df.head(n=5)

BMI was missing for about 200 patients, which is a small number for this dataset, so the missing values were replaced with the mean BMI.

In [None]:
# Number of missing values in each column, checked before any imputation.
null_counts_before = df.isnull().sum()
print(null_counts_before)

In [None]:
# Impute missing BMI values with the column mean.
# NOTE: the mean is taken over the whole dataset, i.e. before the train/test split.
mean_bmi = df['bmi'].mean()

# Assign the filled column back to the frame. Calling fillna(..., inplace=True)
# on the selection df['bmi'] is a chained in-place update: recent pandas versions
# warn about it, and with Copy-on-Write enabled it leaves df unchanged, so the
# NaN values would survive and reach the models.
df['bmi'] = df['bmi'].fillna(mean_bmi)

In [None]:
# Re-count missing values per column now that BMI has been filled.
null_counts_after = df.isnull().sum()
print(null_counts_after)

Note: dataset is imbalanced

In [None]:
# Row count for each value of the target column, to show the class balance.
stroke_counts = df["stroke"].value_counts()
stroke_counts

In [None]:
# Data type of every column; used to decide which columns need encoding.
column_dtypes = df.dtypes
column_dtypes

For the models to work, I one-hot encoded these columns.

In [None]:
# Columns to one-hot encode.
columns_to_encode = [
    "gender",
    "ever_married",
    "work_type",
    "Residence_type",
    "smoking_status",
]

# Replace each listed column with indicator columns. drop_first=True omits the
# first level of every column, so a column with k levels yields k-1 indicators.
df = pd.get_dummies(data=df, columns=columns_to_encode, drop_first=True)


In [None]:
# Preview the first five rows after one-hot encoding.
df.head(n=5)

The dataset is split 80/20 into training and test sets.

In [None]:


# Separate the features from the target column.
# The "id" column (assumed to be a per-patient identifier, not a predictor) is
# removed as well so the models cannot fit on it; errors='ignore' keeps this
# cell working if the column is absent.
X = df.drop(columns='stroke').drop(columns='id', errors='ignore')
y = df['stroke']

# Hold out 20% of the rows for testing. stratify=y keeps the same share of
# stroke vs non-stroke rows in train and test; random_state fixes the shuffle
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


The baseline model (logistic regression) shows a weighted-average recall of only 74%.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

# Baseline classifier: logistic regression with class_weight='balanced' to
# compensate for the imbalanced target. fit() returns the fitted estimator.
log_reg = LogisticRegression(class_weight='balanced', max_iter=1000).fit(X_train, y_train)

# Hard class predictions on the held-out test set.
y_pred = log_reg.predict(X_test)

# Second column of predict_proba, i.e. the probability assigned to the second
# class in log_reg.classes_; used as the score for ROC AUC.
test_probabilities = log_reg.predict_proba(X_test)
y_proba = test_probabilities[:, 1]

# Report test-set metrics.
baseline_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", baseline_accuracy)
baseline_auc = roc_auc_score(y_test, y_proba)
print("ROC AUC:", baseline_auc)
baseline_report = classification_report(y_test, y_pred)
print(baseline_report)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 200 trees, a fixed seed for reproducibility, and
# class_weight='balanced' to compensate for the imbalanced target.
# fit() returns the fitted estimator.
rf = RandomForestClassifier(
    n_estimators=200,
    class_weight='balanced',
    random_state=42,
).fit(X_train, y_train)

# Hard class predictions on the held-out test set.
y_pred_rf = rf.predict(X_test)

# Second column of predict_proba, i.e. the probability assigned to the second
# class in rf.classes_; used as the score for ROC AUC.
rf_test_probabilities = rf.predict_proba(X_test)
y_proba_rf = rf_test_probabilities[:, 1]

# Report test-set metrics.
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print("RF Accuracy:", rf_accuracy)
rf_auc = roc_auc_score(y_test, y_proba_rf)
print("RF ROC AUC:", rf_auc)
rf_report = classification_report(y_test, y_pred_rf)
print(rf_report)


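Random Forest shows no false positives, which is a good sign for a model based on a medical dataset. Because the dataset is imbalanced, the false negatives (missed strokes) in the confusion matrix below should be checked as well.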

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Confusion matrix of the random forest's test-set predictions
# (rows = actual class, columns = predicted class).
cm = confusion_matrix(y_test, y_pred_rf)

# Draw the matrix as an annotated heatmap on an explicit Axes object.
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Random Forest Confusion Matrix")
plt.show()


Age and glucose level seem to have the biggest effect on strokes. BMI was replaced with the mean for about 200 patients, so its feature importance is uncertain.

In [None]:


# Pair each feature name with the importance score from the fitted random forest.
importances = pd.Series(rf.feature_importances_, index=X.columns)

# Bar chart of the importances, largest first.
ranked_importances = importances.sort_values(ascending=False)
ax = ranked_importances.plot(kind='bar', figsize=(12, 5))
ax.set_title("Feature Importance")
plt.show()


Author:
# Hadi Faheem Farooqi

Date: 24/09/2025

Dataset: https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset