In [1]:
import pandas as pd

# Load the dataset
heart_2022 = "C:/Users/ashwa/Downloads/heart/2022/heart_2022_with_nans.csv"



In [None]:
heart_data = pd.read_csv(heart_2022)

# Display basic information about the dataset
heart_data.info(), heart_data.head()

In [3]:
missing_values = heart_data.isnull().sum()

missing_values

State                            0
Sex                              0
GeneralHealth                 1198
PhysicalHealthDays           10927
MentalHealthDays              9067
LastCheckupTime               8308
PhysicalActivities            1093
SleepHours                    5453
RemovedTeeth                 11360
HadHeartAttack                3065
HadAngina                     4405
HadStroke                     1557
HadAsthma                     1773
HadSkinCancer                 3143
HadCOPD                       2219
HadDepressiveDisorder         2812
HadKidneyDisease              1926
HadArthritis                  2633
HadDiabetes                   1087
DeafOrHardOfHearing          20647
BlindOrVisionDifficulty      21564
DifficultyConcentrating      24240
DifficultyWalking            24012
DifficultyDressingBathing    23915
DifficultyErrands            25656
SmokerStatus                 35462
ECigaretteUsage              35660
ChestScan                    56046
RaceEthnicityCategor

In [4]:
# Identify columns with more than 50% missing values
threshold = 0.5 * len(heart_data)  # 50% of the dataset size
columns_to_drop = [col for col in heart_data.columns if heart_data[col].isnull().sum() > threshold]

# Drop columns with excessive missing data
heart_data_cleaned = heart_data.drop(columns=columns_to_drop)

# Check the remaining columns and their missing values
remaining_missing_values = heart_data_cleaned.isnull().sum()

columns_to_drop, remaining_missing_values


([],
 State                            0
 Sex                              0
 GeneralHealth                 1198
 PhysicalHealthDays           10927
 MentalHealthDays              9067
 LastCheckupTime               8308
 PhysicalActivities            1093
 SleepHours                    5453
 RemovedTeeth                 11360
 HadHeartAttack                3065
 HadAngina                     4405
 HadStroke                     1557
 HadAsthma                     1773
 HadSkinCancer                 3143
 HadCOPD                       2219
 HadDepressiveDisorder         2812
 HadKidneyDisease              1926
 HadArthritis                  2633
 HadDiabetes                   1087
 DeafOrHardOfHearing          20647
 BlindOrVisionDifficulty      21564
 DifficultyConcentrating      24240
 DifficultyWalking            24012
 DifficultyDressingBathing    23915
 DifficultyErrands            25656
 SmokerStatus                 35462
 ECigaretteUsage              35660
 ChestScan             

In [5]:
# Separate columns into categorical and numerical
categorical_cols = heart_data_cleaned.select_dtypes(include='object').columns
numerical_cols = heart_data_cleaned.select_dtypes(include=['float64', 'int64']).columns

# Correct imputation without inplace=True
for col in categorical_cols:
    heart_data_cleaned[col] = heart_data_cleaned[col].fillna(heart_data_cleaned[col].mode()[0])

for col in numerical_cols:
    heart_data_cleaned[col] = heart_data_cleaned[col].fillna(heart_data_cleaned[col].median())

# Verify that there are no missing values left
final_missing_values_corrected = heart_data_cleaned.isnull().sum().sum()

final_missing_values_corrected



0

In [6]:
import numpy as np
from sklearn.preprocessing import RobustScaler
# Step 2: Apply Transformations to Handle Outliers (Log Transform for Skewness)
skewed_cols = ['PhysicalHealthDays', 'MentalHealthDays', 'SleepHours', 'BMI']
for col in skewed_cols:
    heart_data[col] = heart_data[col].apply(lambda x: np.log1p(x) if x > 0 else x)

# Step 3: Scale Numerical Features Using RobustScaler
scaler = RobustScaler()
heart_data[numerical_cols] = scaler.fit_transform(heart_data[numerical_cols])
# Step 4: Validate and Standardize Data Types
# Ensure numerical columns are numeric and categorical columns are strings
for col in categorical_cols:
    heart_data[col] = heart_data[col].astype(str)

# Step 5: Encode Categorical Variables
# One-hot encode 'Sex' and label encode 'GeneralHealth'
heart_data = pd.get_dummies(heart_data, columns=['Sex'], drop_first=True)  # One-hot encoding for 'Sex'
heart_data['GeneralHealth'] = heart_data['GeneralHealth'].astype('category').cat.codes  # Label encoding

# Step 6: Feature Engineering - Create BMI Categories
heart_data['BMICategory'] = pd.cut(
    heart_data['BMI'],
    bins=[0, 18.5, 25, 30, np.inf],
    labels=['Underweight', 'Normal', 'Overweight', 'Obese']
)

# Step 7: Prepare Target Variable
# Convert 'HadHeartAttack' to binary (0/1)
heart_data['HadHeartAttack'] = heart_data['HadHeartAttack'].map({'No': 0, 'Yes': 1}).fillna(0).astype(int)

# Verify the preprocessed dataset
heart_data.info(), heart_data['HadHeartAttack'].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 445132 entries, 0 to 445131
Data columns (total 41 columns):
 #   Column                     Non-Null Count   Dtype   
---  ------                     --------------   -----   
 0   State                      445132 non-null  object  
 1   GeneralHealth              445132 non-null  int8    
 2   PhysicalHealthDays         434205 non-null  float64 
 3   MentalHealthDays           436065 non-null  float64 
 4   LastCheckupTime            445132 non-null  object  
 5   PhysicalActivities         445132 non-null  object  
 6   SleepHours                 439679 non-null  float64 
 7   RemovedTeeth               445132 non-null  object  
 8   HadHeartAttack             445132 non-null  int32   
 9   HadAngina                  445132 non-null  object  
 10  HadStroke                  445132 non-null  object  
 11  HadAsthma                  445132 non-null  object  
 12  HadSkinCancer              445132 non-null  object  
 13  HadCOPD       

(None,
 HadHeartAttack
 0    420024
 1     25108
 Name: count, dtype: int64)

In [7]:
# Step 8: Standardize Column Names
heart_data.columns = (
    heart_data.columns
    .str.strip()  # Remove leading/trailing whitespaces
    .str.lower()  # Convert to lowercase
    .str.replace(' ', '_')  # Replace spaces with underscores
    .str.replace('[^a-zA-Z0-9_]', '')  # Remove special characters
)

# Step 9: Save the preprocessed dataset as a CSV file
output_file_path = 'C:/Users/ashwa/OneDrive/Desktop/cleaned_standardized_heart_data.csv'
heart_data.to_csv(output_file_path, index=False)