In [17]:
import numpy as np
import pandas as pd

In [18]:
# Seed NumPy's global (legacy) random state so the np.random.* draws made
# while generating the dataset are repeatable on Restart & Run All.
np.random.seed(42)

In [19]:
# Total number of synthetic rows to generate; passed to
# generate_conditional_data() further down.
num_rows = 50000

In [32]:
def generate_conditional_data(num_rows, rng=None):
    """Build a synthetic wearable-sensor dataset with a fixed class split.

    Each row is one simulated device reading. The ``"Heart Attack Risk"``
    label is assigned purely by quota -- 40% ``"Yes"``, 35% ``"No"`` and the
    remainder ``"N/A"`` -- and is not derived from the sensor values.

    Device status and readings always agree with the label:

    * ``"N/A"`` rows are ``"Offline"`` devices; every sensor column and
      ``Stress_Level`` is ``NaN``.
    * ``"Yes"`` / ``"No"`` rows are ``"Online"`` devices with readings drawn
      uniformly from the ranges listed in ``sensor_ranges``.

    Deriving the status from the label (instead of drawing it independently)
    guarantees that an offline device never carries readings and that an
    online device with readings is never labelled ``"N/A"``.

    Parameters
    ----------
    num_rows : int
        Number of rows to generate. Values <= 0 produce an empty frame that
        still has the full column set.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. Defaults to NumPy's global state, so a prior
        ``np.random.seed(...)`` call controls the output.

    Returns
    -------
    pandas.DataFrame
        ``num_rows`` rows in random order with the columns
        ``Device_Status``, the ten sensor columns, ``Stress_Level`` and
        ``Heart Attack Risk``.
    """
    if rng is None:
        # The numpy.random module exposes the same uniform()/permutation()
        # calls as Generator/RandomState, backed by the global seeded state.
        rng = np.random

    num_rows = max(int(num_rows), 0)

    yes_percentage = 0.40  # 40% for "Yes"
    no_percentage = yes_percentage - 0.05  # "No" gets 5 points less
    yes_count = int(yes_percentage * num_rows)
    no_count = int(no_percentage * num_rows)
    # "N/A" absorbs whatever int() truncation leaves over, so the three
    # classes always add up to exactly num_rows.
    na_count = num_rows - yes_count - no_count

    # Lay out the exact quota of each label, then shuffle so the classes are
    # interleaved rather than grouped.
    labels = rng.permutation(
        np.repeat(np.array(["Yes", "No", "N/A"]), [yes_count, no_count, na_count])
    )
    online = labels != "N/A"
    online_count = int(online.sum())

    # (column name, low, high) for each uniformly distributed reading, in
    # output column order.
    sensor_ranges = [
        ("Heart_Rate_BPM", 60, 120),           # normal to elevated heart rate
        ("HRV_ms", 30, 100),                   # heart rate variability
        ("SpO2_Percentage", 90, 100),          # blood oxygen saturation
        ("Skin_Temperature_C", 35.5, 37.8),    # body temperature
        ("EDA_GSR", 0.2, 10.0),                # electrodermal activity
        ("Ambient_Temperature_C", 20.0, 35.0),
        ("Humidity_Percentage", 30, 80),
        ("IMU_X", -3, 3),                      # 3-axis acceleration
        ("IMU_Y", -3, 3),
        ("IMU_Z", -3, 3),
    ]

    columns = {"Device_Status": np.where(online, "Online", "Offline")}
    for name, low, high in sensor_ranges:
        # Start from all-NaN so offline rows stay empty; fill online rows only.
        readings = np.full(num_rows, np.nan)
        readings[online] = rng.uniform(low, high, size=online_count)
        columns[name] = readings

    # Stress score: lower HRV and higher EDA both raise it. NaN inputs on
    # offline rows propagate to NaN here.
    columns["Stress_Level"] = (100 - columns["HRV_ms"]) * 0.5 + columns["EDA_GSR"] * 10
    columns["Heart Attack Risk"] = labels

    return pd.DataFrame(columns)

In [33]:
# Generate the full synthetic dataset (one DataFrame row per simulated reading).
data = generate_conditional_data(num_rows)

In [34]:
# Shuffle every row (frac=1 samples the whole frame without replacement) with
# a fixed seed, then rebuild a clean 0..n-1 index.
# Note: this rebinds `data`, so re-running this cell on its own permutes the
# already-shuffled frame again; re-run the generation cell first for a
# reproducible row order.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

In [35]:
# Sanity check: show how many rows ended up in each risk class.
print(
    "Final Class Distribution:",
    data['Heart Attack Risk'].value_counts(),
    sep="\n",
)

Final Class Distribution:
Heart Attack Risk
Yes    20000
No     17500
N/A    12500
Name: count, dtype: int64


In [36]:
# Save the dataset to the current working directory.
# index=False keeps the row index out of the file. NaN values are written as
# empty fields (to_csv's default na_rep), while the "N/A" label is written
# literally.
# NOTE(review): pandas.read_csv treats the literal string "N/A" as missing by
# default, so that class loads back as NaN unless the reader passes e.g.
# keep_default_na=False, na_values=[''] -- confirm downstream loaders.
data.to_csv('synthetic_heart_attack_data_balanced.csv', index=False)
print("\nDataset saved as 'synthetic_heart_attack_data_balanced.csv'.")


Dataset saved as 'synthetic_heart_attack_data_balanced.csv'.
