# Group Pipeline for Stroke Prediction Data Preprocessing and Feature Engineering
This notebook integrates all the individual data preprocessing and feature engineering steps into a single, cohesive pipeline. It covers handling missing data, encoding categorical variables, outlier removal, normalization/scaling, feature selection, and dimensionality reduction using PCA.

In [1]:
# Load the dataset once at the beginning
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns

data_path = '../data/raw/StrokeData.csv'  # Use relative path from notebooks directory
df = pd.read_csv(data_path)

# Step 1: Handling missing BMI data (mean imputation)
print('\nMissing values before imputation:')
print(df.isnull().sum())

bmi_mean = df['bmi'].mean()
df['bmi'].fillna(bmi_mean, inplace=True)

print('\nMissing values after imputation:')
print(df.isnull().sum())

# Step 2: Encoding categorical variables (one-hot encoding)
categorical_cols = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Drop unnecessary columns
df_encoded.drop(['id'], axis=1, inplace=True)  # Assuming id is not needed

# Step 3: Outlier removal (IQR method, before scaling)
numerical_cols = ['age', 'avg_glucose_level', 'bmi']
for col in numerical_cols:
    Q1 = df_encoded[col].quantile(0.25)
    Q3 = df_encoded[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df_encoded[col] = np.clip(df_encoded[col], lower_bound, upper_bound)

print('\nData after outlier removal:')

# Step 4: Normalization/Scaling (after outlier removal)
scaler = StandardScaler()
df_encoded[numerical_cols] = scaler.fit_transform(df_encoded[numerical_cols])

print('\nData after scaling:')

# Step 5: Feature Selection (using RandomForest)
X = df_encoded.drop('stroke', axis=1)
y = df_encoded['stroke']

model = RandomForestClassifier(random_state=42)
model.fit(X, y)

importances = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)
selected_features = importances[importances > 0.01].index.tolist()

print('\nSelected features:', selected_features)

# Step 6: Dimension Reduction (PCA on selected features)
X_selected = X[selected_features]
pca = PCA(n_components=0.95)  # Retain 95% variance
X_pca = pca.fit_transform(X_selected)

print('\nShape after PCA:', X_pca.shape)

# Final preprocessed data
df_final = pd.DataFrame(X_pca)
df_final['stroke'] = y.values

print('\nFinal preprocessed dataframe head:')
print(df_final.head())

# Save the final dataset
df_final.to_csv('../results/outputs/preprocessed_stroke_data.csv', index=False)


Missing values before imputation:
id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

Missing values after imputation:
id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

Data after outlier removal:

Data after scaling:


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['bmi'].fillna(bmi_mean, inplace=True)



Selected features: ['age', 'bmi', 'avg_glucose_level', 'gender_Male', 'Residence_type_Urban', 'hypertension', 'smoking_status_never smoked', 'work_type_Private', 'heart_disease', 'smoking_status_formerly smoked', 'work_type_Self-employed', 'smoking_status_smokes', 'ever_married_Yes']

Shape after PCA: (5110, 10)

Final preprocessed dataframe head:
          0         1         2         3         4         5         6  \
0  2.305606  1.264816  0.250546 -0.430617  0.834524  0.333307  0.320894   
1  1.453986  1.647613 -0.484745  0.160239 -1.163737 -0.412336 -0.231320   
2  1.571330 -0.416363 -0.638432  0.597845  0.169697 -0.612380  0.748779   
3  1.537974  1.474840  0.641771  0.057266  0.514872  0.671681 -0.587822   
4  1.642192  1.709071 -1.520156  0.258872 -1.103096 -0.424797 -0.188216   

          7         8         9  stroke  
0 -0.643058 -0.062396 -0.212238       1  
1  0.114952  0.208955 -0.247852       1  
2  0.043821 -0.296528 -0.280570       1  
3  0.692612  0.170372 -0.12592