In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('../data/stroke_data.csv')
df.head()


In [14]:
# 1. Drop the 'id' column (not useful for prediction)
if 'id' in df.columns:
    df.drop('id', axis=1, inplace=True)

# 2. Fill missing values in 'bmi' with the median
df['bmi'].fillna(df['bmi'].median(), inplace=True)

# 3. Optional sanity check: confirm no more nulls
print("Missing values after cleaning:")
print(df.isnull().sum())


Missing values after cleaning:
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['bmi'].fillna(df['bmi'].median(), inplace=True)


EDA

In [None]:
# Import libraries (if not already at top of notebook)
import matplotlib.pyplot as plt
import seaborn as sns

# Filter only numeric columns for correlation analysis
numeric_df = df.select_dtypes(include='number')

# Create the heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()


In [None]:
# KDE plot: avg_glucose_level distribution by stroke
plt.figure(figsize=(10, 6))
sns.kdeplot(data=df, x='avg_glucose_level', hue='stroke', fill=True)
plt.title('Glucose Level Distribution by Stroke')
plt.xlabel('Average Glucose Level')
plt.ylabel('Density')
plt.show()


Encoding

In [17]:
df_encoded = pd.get_dummies(df, drop_first=True)


 Train/Test Split + SMOTE Resampling

In [18]:
X = df_encoded.drop('stroke', axis=1)
y = df_encoded['stroke']


Spliting

In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)


SMOTE to training data only

In [20]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


Training model (Random Forest Classifier)

In [None]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=42)
model.fit(X_train_resampled, y_train_resampled)


Making predictions

In [22]:
y_pred = model.predict(X_test)


 Evaluate the Model


In [23]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nAccuracy Score:")
print(accuracy_score(y_test, y_pred))


Confusion Matrix:
[[1388   70]
 [  62   13]]

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.95      0.95      1458
           1       0.16      0.17      0.16        75

    accuracy                           0.91      1533
   macro avg       0.56      0.56      0.56      1533
weighted avg       0.92      0.91      0.92      1533


Accuracy Score:
0.913894324853229
