In [76]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

In [77]:
# Load the balanced synthetic heart-attack dataset from the sibling
# "Dataset Creation" directory (path is relative to the notebook's working dir).
data = pd.read_csv(
    filepath_or_buffer='../Dataset Creation/synthetic_heart_attack_data_balanced.csv',
)

In [78]:
# Sensor / environment columns that are treated as continuous features by the
# rest of this notebook (float cast, outlier capping, Min-Max scaling).
numerical_cols = [
    "Heart_Rate_BPM", "HRV_ms", "SpO2_Percentage", "Skin_Temperature_C",
    "EDA_GSR", "Ambient_Temperature_C", "Humidity_Percentage",
    "IMU_X", "IMU_Y", "IMU_Z", "Stress_Level",
]

# Impute missing readings with each column's median instead of a -1 sentinel.
# A -1 placeholder does not survive the later cells: the outlier-capping step
# lifts it to the column's lower bound (e.g. 40 for Heart_Rate_BPM), so missing
# values would masquerade as real boundary readings, and for the uncapped
# columns (IMU_*, Stress_Level) it would be mixed in with real values and could
# set or skew the minimum used by the Min-Max scaler.
# The cast to float comes first so the median is computed on numeric data even
# if a column was read with a non-float dtype.
# NOTE: a column that is entirely missing has no median and therefore stays NaN.
numeric_values = data[numerical_cols].astype(float)
data[numerical_cols] = numeric_values.fillna(numeric_values.median())

In [79]:
# Zero out all three IMU axes on rows where the device reported "Offline".
offline_rows = data["Device_Status"].eq("Offline")
imu_axes = ["IMU_X", "IMU_Y", "IMU_Z"]
data.loc[offline_rows, imu_axes] = 0

In [80]:
# String-valued columns; any missing entry is replaced by the literal "Unknown".
# NOTE(review): "Heart Attack Risk" is label-encoded later in this notebook, so
# a missing label would end up as its own "Unknown" class - confirm that is intended.
categorical_cols = ["Device_Status", "Heart Attack Risk"]
filled_categoricals = data[categorical_cols].fillna(value="Unknown")
data[categorical_cols] = filled_categoricals

In [81]:
# Force every numerical feature column to 64-bit float so the capping and
# scaling steps below operate on a uniform dtype.
numeric_as_float = data[numerical_cols].astype("float64")
data[numerical_cols] = numeric_as_float

In [82]:
# Cap outliers: each listed column is clamped to its (lower, upper) bounds.
# Columns not listed here (IMU_X/Y/Z, Stress_Level) are left unclamped.
outlier_limits = {
    "Heart_Rate_BPM": (40, 200),
    "HRV_ms": (10, 150),
    "SpO2_Percentage": (70, 100),
    "Skin_Temperature_C": (34, 40),
    "EDA_GSR": (0.1, 20),
    "Ambient_Temperature_C": (10, 45),
    "Humidity_Percentage": (10, 100)
}
for column_name, bounds in outlier_limits.items():
    lower_bound, upper_bound = bounds
    data[column_name] = data[column_name].clip(lower=lower_bound, upper=upper_bound)

In [83]:
# Min-Max scaling: fit the scaler on the numerical columns of the full frame,
# then overwrite those columns with their scaled values.
scaler = MinMaxScaler()
numeric_block = data[numerical_cols]
scaler.fit(numeric_block)
data[numerical_cols] = scaler.transform(numeric_block)

In [84]:
# Replace the "Heart Attack Risk" labels with integer codes.
# Only this column is encoded here; "Device_Status" keeps its string values.
encoder = LabelEncoder()
risk_labels = data["Heart Attack Risk"]
encoder.fit(risk_labels)
data["Heart Attack Risk"] = encoder.transform(risk_labels)

In [85]:
# Shuffle all rows with a fixed seed (reproducible order) and renumber the
# index from 0 so the old row positions are discarded.
shuffled_rows = data.sample(frac=1, random_state=42)
data = shuffled_rows.reset_index(drop=True)

In [86]:
# Report how many rows fall into each encoded risk class.
class_counts = data["Heart Attack Risk"].value_counts()
print("Class Distribution:", class_counts, sep="\n")

Class Distribution:
Heart Attack Risk
2    20000
0    17500
1    12500
Name: count, dtype: int64


In [87]:
# Persist the preprocessed frame next to the notebook, omitting the row index.
data.to_csv(
    path_or_buf="synthetic_heart_attack_data_balanced_preprocess.csv",
    index=False,
)