In [1]:
# Step 0: Install and Import Libraries
# No additional installs needed for basics, but for visualization:
!pip install seaborn matplotlib

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')



In [2]:
# Step 1: Load the Dataset
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
df = pd.read_csv('/content/drive/MyDrive/healthcare-dataset-stroke-data.csv')

In [5]:
# Step 5: Normalization / Scaling
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# First, let's identify numerical features if they haven't been defined
# Assuming you have a DataFrame called 'df' with your data

# If numerical_features is not defined, let's identify numerical columns
try:
    # Try to use existing numerical_features if available
    numerical_features_scaled = numerical_features
except NameError:
    # If not defined, identify numerical columns from the dataset
    print("numerical_features not found. Identifying numerical columns...")

    # Assuming you have df_no_outliers from previous steps
    # If not, we need to work with the original df or create it

    try:
        # Try to use df_no_outliers if available
        df_to_use = df_no_outliers
    except NameError:
        # If df_no_outliers not available, use original df
        print("df_no_outliers not found. Using original DataFrame.")
        df_to_use = df.copy()

    # Identify numerical columns (excluding target if it exists)
    numerical_cols = df_to_use.select_dtypes(include=['int64', 'float64']).columns

    # Remove target column if it's numerical and exists
    target_cols = [col for col in numerical_cols if 'stroke' in col.lower()]
    if target_cols:
        numerical_cols = numerical_cols.drop(target_cols)

    numerical_features_scaled = numerical_cols.tolist()
    print(f"Identified numerical features: {numerical_features_scaled}")

print(f"\nNumerical features to scale: {numerical_features_scaled}")

# ColumnTransformer for scaling only numerical
scaler = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features_scaled)
    ],
    remainder='passthrough'  # Keep categorical encoded and target
)

# Apply scaling (using df_no_outliers if available, otherwise use appropriate DataFrame)
try:
    scaled_data = scaler.fit_transform(df_no_outliers)
except NameError:
    print("df_no_outliers not found. Scaling original DataFrame.")
    scaled_data = scaler.fit_transform(df)

# Get feature names after scaling
scaled_cols = scaler.get_feature_names_out()

# Clean up column names (remove transformer prefixes)
cleaned_cols = []
for col in scaled_cols:
    if col.startswith('num__'):
        cleaned_cols.append(col.replace('num__', ''))
    elif col.startswith('remainder__'):
        cleaned_cols.append(col.replace('remainder__', ''))
    else:
        cleaned_cols.append(col)

df_scaled = pd.DataFrame(scaled_data, columns=cleaned_cols)

print("\nScaled dataset shape:", df_scaled.shape)
print("\nFirst few rows of scaled data:")
print(df_scaled.head())
print("\nScaled dataset info:")
print(df_scaled.info())

# Show summary statistics of scaled numerical features
print("\nSummary statistics of scaled numerical features:")
for feature in numerical_features_scaled:
    if feature in df_scaled.columns:
        print(f"\n{feature}:")
        print(f"  Mean: {df_scaled[feature].mean():.4f}")
        print(f"  Std: {df_scaled[feature].std():.4f}")
        print(f"  Min: {df_scaled[feature].min():.4f}")
        print(f"  Max: {df_scaled[feature].max():.4f}")

numerical_features not found. Identifying numerical columns...
df_no_outliers not found. Using original DataFrame.
Identified numerical features: ['id', 'age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']

Numerical features to scale: ['id', 'age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']
df_no_outliers not found. Scaling original DataFrame.

Scaled dataset shape: (5110, 12)

First few rows of scaled data:
         id       age hypertension heart_disease avg_glucose_level       bmi  \
0 -1.298312  1.051434    -0.328602      4.185032          2.706375  0.981345   
1  0.716371   0.78607    -0.328602     -0.238947          2.121559       NaN   
2 -0.255478   1.62639    -0.328602      4.185032         -0.005028  0.459269   
3  1.118363  0.255342    -0.328602     -0.238947          1.437358  0.701207   
4 -1.647136  1.582163     3.043196     -0.238947          1.501184 -0.623083   

   gender ever_married      work_type Residence_type   smoking_status str