In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Load data
df = pd.read_csv('/content/data/raw/StrokeData.csv')

# Group: Handle Missing Data
# Group decision: Median imputation for bmi skewness
print("Missing values before:", df['bmi'].isnull().sum())
# Assign the filled column back rather than calling fillna(inplace=True) on
# the df['bmi'] selection: the in-place form operates on an intermediate
# object, raises a FutureWarning in pandas 2.x and leaves df untouched once
# copy-on-write is enabled.
df['bmi'] = df['bmi'].fillna(df['bmi'].median())
print("Missing values after:", df['bmi'].isnull().sum())

# EDA: Group-level histogram for bmi
plt.figure(figsize=(8, 6))
sns.histplot(df['bmi'], kde=True)
plt.title('BMI Distribution After Imputation')
plt.xlabel('BMI')
plt.ylabel('Frequency')
# savefig does not create missing folders, so make sure the target exists.
os.makedirs('/content/results/eda_visualizations', exist_ok=True)
# Save before show(): show() may leave an empty figure behind.
plt.savefig('/content/results/eda_visualizations/bmi_distribution.png')
plt.show()
# Report the median found in the data instead of a hard-coded number.
# Filling NaNs with the median does not move the median, so this is the
# value that was used for imputation.
bmi_median = df['bmi'].median()
print(f"Group Interpretation: Median imputation ({bmi_median:.1f}) preserves bmi distribution, reducing bias in stroke class.")

# Member 4: Encoding Categorical Variables
# Member 4 - Withana W.Y.P
# One-hot encode each nominal column. drop_first=True omits the first level
# of every column, so k levels become k-1 indicator columns.
categorical_cols = [
    'gender',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
]
df = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)

# Member 5: Outlier Removal
# Member 5 - De Silva P.K.N
def cap_outliers(df, cols=('avg_glucose_level', 'bmi')):
    """Return a copy of *df* with outliers in *cols* clipped to the IQR fences.

    For every listed column the lower fence is ``Q1 - 1.5 * IQR`` and the
    upper fence is ``Q3 + 1.5 * IQR``; values beyond a fence are replaced by
    that fence. Rows are never dropped.

    Args:
        df: DataFrame holding the columns to cap. It is not modified.
        cols: Column name, or iterable of column names, to cap. Defaults to
            ``avg_glucose_level`` and ``bmi``.

    Returns:
        A new DataFrame with the same shape and index as *df*.

    Raises:
        KeyError: If a name in *cols* is not a column of *df*.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(cols, str):
        cols = [cols]

    # Work on a copy so the caller's frame is not changed as a side effect.
    capped = df.copy()
    for col in cols:
        q1 = capped[col].quantile(0.25)
        q3 = capped[col].quantile(0.75)
        iqr = q3 - q1
        capped[col] = capped[col].clip(lower=q1 - 1.5 * iqr, upper=q3 + 1.5 * iqr)
    return capped
df = cap_outliers(df)

# Member 1: Feature Engineering - Binning
# Member 1 - Dilshan R.M.R
# Binning has to run BEFORE scaling. The age edges are expressed in years;
# after standardization age is a z-score centred on 0, so every
# below-average age would fall outside the first edge and become NaN, and
# all remaining rows would collapse into the '0-30' bin.
# include_lowest keeps an age of exactly 0, and the open upper edge keeps
# ages above 82 in '61+' instead of turning them into NaN.
df['age_bin'] = pd.cut(
    df['age'],
    bins=[0, 30, 60, np.inf],
    labels=['0-30', '31-60', '61+'],
    include_lowest=True,
)
# Quartile bins are rank based, so they are the same before or after scaling.
df['glucose_bin'] = pd.qcut(df['avg_glucose_level'], q=4, labels=['Low', 'Medium', 'High', 'Very High'])

# Member 6: Normalization/Scaling
# Member 6 - Inshaf M J M
# Standardize the continuous columns to zero mean and unit variance.
scaler = StandardScaler()
numerical_cols = ['age', 'avg_glucose_level', 'bmi']
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Member 3: Feature Selection
# Member 3 - Pihara H.G.T
# chi2 needs a purely numeric, non-negative matrix. The label-valued bin
# columns (age_bin, glucose_bin) are therefore one-hot encoded first;
# get_dummies without `columns` converts every object/string/category column.
X = pd.get_dummies(df.drop(['stroke', 'id'], axis=1), drop_first=True).astype(float)
y = df['stroke']
# Standardized columns are negative below their mean, which chi2 rejects.
# Score on a copy rescaled to [0, 1]; X itself keeps the standardized values
# for the later steps.
X_chi2 = MinMaxScaler().fit_transform(X)
# Never ask for more features than exist.
selector = SelectKBest(chi2, k=min(10, X.shape[1]))
selector.fit(X_chi2, y)
selected_features = X.columns[selector.get_support()].tolist()
X_selected = X[selected_features].to_numpy()
print("Selected Features:", selected_features)

# Member 2: Dimension Reduction (PCA)
# Member 2 - Kuruppuarachchige K.A.H.B
# Requesting as many components as there are selected features only rotates
# the data (no reduction) and fails when fewer features are available.
# A float n_components keeps the smallest number of components whose
# cumulative explained variance reaches that fraction; this requires the
# full SVD solver.
pca = PCA(n_components=0.95, svd_solver='full')
X_pca_reduced = pca.fit_transform(X[selected_features])
print(f"PCA kept {pca.n_components_} components explaining {pca.explained_variance_ratio_.sum():.1%} of the variance.")

# Final DataFrame
# One column per principal component, named PC1..PCn.
df_final = pd.DataFrame(X_pca_reduced, columns=[f'PC{i+1}' for i in range(X_pca_reduced.shape[1])])
# The PCA output carries a fresh 0..n-1 index; reset y so the target lines
# up with it by position.
df_final['stroke'] = y.reset_index(drop=True)

# Save final dataset
# to_csv does not create missing folders, so make sure the target exists.
os.makedirs('/content/results/outputs', exist_ok=True)
df_final.to_csv('/content/results/outputs/stroke_processed_pca.csv', index=False)
print("Final dataset saved as stroke_processed_pca.csv")

# Split for model training (optional for demo)
from sklearn.model_selection import train_test_split

# Stratified 80/20 split with a fixed seed, so the class ratio of `stroke`
# is kept in both parts and the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df_final.drop('stroke', axis=1),
    df_final['stroke'],
    test_size=0.2,
    random_state=42,
    stratify=df_final['stroke'],
)

# Write each part to its own CSV, without the index column.
for split_part, destination in (
    (X_train, '/content/results/outputs/X_train.csv'),
    (y_train, '/content/results/outputs/y_train.csv'),
    (X_test, '/content/results/outputs/X_test.csv'),
    (y_test, '/content/results/outputs/y_test.csv'),
):
    split_part.to_csv(destination, index=False)
print("Training and testing sets saved.")
