In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Load the raw data.
df = pd.read_csv('/content/data/raw/StrokeData.csv')

# Fill missing BMI values with the column median (group task placeholder).
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on df['bmi']: that chained in-place form acts on an
# intermediate Series, emits a FutureWarning in pandas 2.x, and leaves df
# unchanged once copy-on-write is enabled.
df['bmi'] = df['bmi'].fillna(df['bmi'].median())

# Numerical columns that go through outlier handling.
numerical_cols = ['avg_glucose_level', 'bmi']

# Draw one box plot per numerical column, side by side, on the untreated data.
plt.figure(figsize=(12, 5))
for position, col in enumerate(numerical_cols, start=1):
    # Each column gets its own panel in a 1x2 grid.
    axis = plt.subplot(1, 2, position)
    sns.boxplot(x=df[col], ax=axis)
    axis.set_title(f'Box plot of {col} before outlier handling')
plt.tight_layout()
# Write the figure to disk, then display it.
plt.savefig('/content/results/eda_visualizations/before_outlier_box.png')
plt.show()

# IQR method to cap outliers: values outside
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] are clipped to the nearest bound instead of
# being dropped, so the row count is unchanged. The capping is applied to a
# copy so that df keeps the original values.
df_cleaned = df.copy()
outlier_counts = {}  # column name -> number of values outside the IQR bounds
for col in numerical_cols:
    Q1 = df_cleaned[col].quantile(0.25)
    Q3 = df_cleaned[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df_cleaned[(df_cleaned[col] < lower_bound) | (df_cleaned[col] > upper_bound)]
    # Remember the count so the interpretation below reports what was actually
    # detected instead of numbers typed in by hand.
    outlier_counts[col] = len(outliers)
    print(f"{col} - Outliers detected: {outlier_counts[col]}")
    df_cleaned[col] = df_cleaned[col].clip(lower_bound, upper_bound)

# Visualize after: same layout as the "before" figure, drawn from the capped data.
plt.figure(figsize=(12, 5))
for i, col in enumerate(numerical_cols):
    plt.subplot(1, 2, i+1)
    sns.boxplot(x=df_cleaned[col])
    plt.title(f'Box plot of {col} after outlier handling')
plt.tight_layout()
plt.savefig('/content/results/eda_visualizations/after_outlier_box.png')
plt.show()

# Interpretation: the counts come from outlier_counts, so the message stays
# consistent with the "Outliers detected" lines printed by the loop above.
print(
    f"Interpretation: Capping {outlier_counts['avg_glucose_level']} glucose and "
    f"{outlier_counts['bmi']} bmi outliers reduces variance, "
    "retaining high-risk cases for better model fit."
)
