In [1]:
# Step 0: Install and Import Libraries
# No additional installs needed for basics, but for visualization:
!pip install seaborn matplotlib

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')



In [2]:
# Step 1: Load the Dataset
# Attach Google Drive to this Colab runtime; files under the mount point
# become readable through ordinary filesystem paths.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [3]:
# Location of the stroke dataset on the mounted Drive; change it here if the
# file lives somewhere else.
DATA_PATH = '/content/drive/MyDrive/healthcare-dataset-stroke-data.csv'

# Raw dataset as read from disk; later cells derive new frames from it.
df = pd.read_csv(DATA_PATH)

In [4]:
# Step 3: Encoding Categorical Variables
# One-hot encode the string-valued columns listed below; every other column
# (including 'id' and 'stroke') is passed through unchanged.
categorical_cols = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
print("Categorical columns:", categorical_cols)

# Fail fast with a readable message if the loaded frame lacks a column this
# cell relies on, instead of an opaque error from inside scikit-learn.
missing_cols = [col for col in categorical_cols + ['id'] if col not in df.columns]
if missing_cols:
    raise KeyError(f"Columns missing from df: {missing_cols}")

# Handle 'Other' in gender if present (rare, but encode)
# OneHotEncoder
# drop='first' removes one dummy column per feature (the first category becomes
# the implicit baseline). NOTE: together with handle_unknown='ignore', a
# category not seen during fit is encoded as all zeros, i.e. exactly like the
# dropped baseline category.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_cols)
    ],
    remainder='passthrough',  # Keep numerical columns as is
    # Always return a dense array. With the default threshold (0.3) the result
    # becomes a SciPy sparse matrix whenever the stacked output is sparse
    # enough, which would break the pd.DataFrame(...) construction below.
    sparse_threshold=0,
)

# Apply encoding (we'll fit on full data for simplicity; in practice, fit on train)
encoded_data = preprocessor.fit_transform(df)
encoded_cols = preprocessor.get_feature_names_out()

# Reuse df's index so encoded rows stay aligned with the raw rows.
df_encoded = pd.DataFrame(encoded_data, columns=encoded_cols, index=df.index)
print("\nEncoded dataset shape:", df_encoded.shape)
print("\nFirst few rows of encoded data:")
print(df_encoded.head())

# Note: 'id' and 'stroke' are passthrough (prefixed 'remainder__' by the
# ColumnTransformer). Drop 'id' here as it's not a feature.
df_processed = df_encoded.drop(columns=['remainder__id'])  # Remove id
print("\nProcessed shape after dropping id:", df_processed.shape)

Categorical columns: ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

Encoded dataset shape: (5110, 18)

First few rows of encoded data:
   cat__gender_Male  cat__gender_Other  cat__ever_married_Yes  \
0               1.0                0.0                    1.0   
1               0.0                0.0                    1.0   
2               1.0                0.0                    1.0   
3               0.0                0.0                    1.0   
4               0.0                0.0                    1.0   

   cat__work_type_Never_worked  cat__work_type_Private  \
0                          0.0                     1.0   
1                          0.0                     0.0   
2                          0.0                     1.0   
3                          0.0                     1.0   
4                          0.0                     0.0   

   cat__work_type_Self-employed  cat__work_type_children  \
0                           0.0    