In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats 
import seaborn as sns
from scipy.stats import ttest_ind

Import the cleaned data as a dataframe:

In [None]:
# Load the cleaned dataset into a DataFrame; the path is relative to the
# notebook's working directory.
# NOTE(review): unpickling can execute arbitrary code, so this file should only
# ever come from a trusted source — confirm how cleaned_data.pkl is produced.
df = pd.read_pickle("cleaned_data.pkl")

We wish to visualise what relationships exist between our target variable ("stroke") and the features.
Experimenting with catplots of the numerical features, we see that age, hypertension and bmi show a clear difference in distribution when we compare patients who have experienced a stroke with those who haven't (the plot for age is shown below as an example):

In [None]:
# Violin plot of age split by stroke outcome, so the two distributions can be
# compared side by side.
g = sns.catplot(
    data=df,
    x="stroke",
    y="age",
    kind="violin",
    color="green",
)
# Title the plot so the figure stands alone when the notebook is skimmed;
# the trailing semicolon suppresses the FacetGrid repr in the cell output.
g.set(title="Age distribution by stroke outcome");


We can verify this by running a t-test for each feature (Welch's t-test, which does not assume equal variances) to see whether the difference between the two populations of patients (stroke vs no stroke) is statistically significant, using a standard p-value threshold of 0.05:

In [None]:
# Collect every numeric column, then drop the identifier and the target so
# that only candidate features remain.
numeric_columns = df.select_dtypes(include=np.number).columns.tolist()
for excluded in ("id", "stroke"):
    numeric_columns.remove(excluded)

# Partition the patients by outcome using a boolean mask on "stroke".
pos_mask = df["stroke"] > 0.5
positive_stroke, negative_stroke = df[pos_mask], df[~pos_mask]

# Compare the two groups feature by feature. equal_var=False means the test
# does not assume both groups share the same variance.
for col in numeric_columns:
    stroke_values = positive_stroke[col]
    no_stroke_values = negative_stroke[col]
    t_statistic, p_value = ttest_ind(stroke_values, no_stroke_values, equal_var=False)
    print(f"Results for {col}: \n t_statistic = {t_statistic}, p_value = {p_value} ")

Results for age: 
 t_statistic = 21.91012813839172, p_value = 7.371986041841149e-61 
Results for hypertension: 
 t_statistic = 5.886958676489632, p_value = 1.4809111107960078e-08 
Results for heart_disease: 
 t_statistic = 5.069316213180781, p_value = 8.624300407089754e-07 
Results for avg_glucose_level: 
 t_statistic = 1.2236024168406514, p_value = 0.2223516620988339 
Results for bmi: 
 t_statistic = 0.9910682785512911, p_value = 0.3226877381798354 

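These are encouraging results: three of our numerical features (age, hypertension and heart_disease) differ significantly between the two groups, which suggests they are associated with stroke and are likely to have good predictive power when it comes to training classification models later. The differences for avg_glucose_level and bmi are not significant at the 0.05 threshold.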

We can now do the same for our categorical features. For each feature, we can split the data by category and stroke outcome (0 or 1).
Any strong patterns will then hopefully become clear to see.

In [None]:
# List of categorical (object-dtype) features.
text_columns = df.select_dtypes(include=object).columns.tolist()

# First create pie charts for those with stroke = 1: one pie per categorical
# feature. The number of panels follows the number of features instead of
# being hard-coded, and squeeze=False keeps `axes` indexable even when there
# is only a single feature.
fig, axes_grid = plt.subplots(nrows=1, ncols=len(text_columns), figsize=(15, 6), squeeze=False)
axes = axes_grid.ravel()
fig.suptitle('Categorical features for stroke sufferers')
for ax, col in zip(axes, text_columns):
    pos_data = positive_stroke[col]
    # Count each category once and reuse the result for both wedge sizes and labels.
    pos_counts = pos_data.value_counts()
    ax.pie(pos_counts, labels=pos_counts.index.tolist(), autopct='%1.0f%%')
    ax.set_title(col, fontsize=12)
# Adjust the layout once, after every panel has been drawn.
fig.tight_layout()
    
    
    

In [None]:
# Now do the same for stroke = 0: one pie per categorical feature. The number
# of panels follows the number of features instead of being hard-coded, and
# squeeze=False keeps `axes` indexable even when there is only a single feature.
fig, axes_grid = plt.subplots(nrows=1, ncols=len(text_columns), figsize=(15, 6), squeeze=False)
axes = axes_grid.ravel()
fig.suptitle('Categorical features for non stroke sufferers')
for ax, col in zip(axes, text_columns):
    neg_data = negative_stroke[col]
    # Count each category once and reuse the result for both wedge sizes and labels.
    neg_counts = neg_data.value_counts()
    ax.pie(neg_counts, labels=neg_counts.index.tolist(), autopct='%1.0f%%')
    ax.set_title(col, fontsize=12)
# Adjust the layout once, after every panel has been drawn.
fig.tight_layout()

The differences in distribution are less pronounced here, with only ever_married and smoking_status showing a moderate difference between the two groups.
The marriage correlation could plausibly arise simply because marital status is a weak proxy for age, which we already know to be a strong predictor.