In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.feature_selection import SelectKBest, chi2

# Load the stroke dataset (note: this reads the file under data/raw, so the
# imputation and encoding below are the only preprocessing applied here).
df = pd.read_csv('/content/data/raw/StrokeData.csv')

# Median-impute missing BMI values.  Assign the result back instead of using
# `df['bmi'].fillna(..., inplace=True)`: the chained in-place form operates on
# a temporary under pandas copy-on-write and would leave the NaNs in `df`.
df['bmi'] = df['bmi'].fillna(df['bmi'].median())  # Group task placeholder

# One-hot encode the categorical columns; drop_first removes one level per
# column so the dummies are not perfectly collinear.
df = pd.get_dummies(
    df,
    columns=['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'],
    drop_first=True,
)

# Prepare features and target
X = df.drop(['stroke', 'id'], axis=1)
y = df['stroke']

# Feature selection with SelectKBest.
# chi2 requires non-negative feature values; presumably every remaining column
# (ages, glucose, BMI, 0/1 flags) satisfies that -- TODO confirm on new data.
# Cap k at the number of available columns so the selector does not raise
# ValueError when the encoded frame has fewer than 10 features.
k_best = SelectKBest(score_func=chi2, k=min(10, X.shape[1]))
X_selected = k_best.fit_transform(X, y)
selected_features = X.columns[k_best.get_support()].tolist()

# Display selected features
print("Selected Features:", selected_features)

# EDA: Correlation heatmap (pairwise correlations among the selected features
# only; the target column is not part of this matrix).
plt.figure(figsize=(12, 8))
sns.heatmap(df[selected_features].corr(), annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Heatmap of Selected Features')

# Make sure the output directory exists before saving; savefig does not
# create missing parent directories.
heatmap_path = Path('/content/results/eda_visualizations/feature_correlation.png')
heatmap_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(heatmap_path)
plt.show()

# Interpretation
# Print the measured feature/target correlations so the figure quoted in the
# hard-coded interpretation below can be checked against the actual data.
target_corr = df[selected_features].astype(float).corrwith(y.astype(float))
print("Correlation of selected features with stroke:")
print(target_corr.sort_values(ascending=False).round(3).to_string())
print("Interpretation: Features like 'age' (corr 0.25 with stroke) and 'avg_glucose_level' are highly correlated, justifying selection for model focus.")
