In [19]:
import numpy as np
import pandas as pd

In [20]:
# Seed NumPy's global random state so every np.random.* draw made after this
# cell is repeatable from one run to the next.
np.random.seed(seed=42)

In [21]:
# Total number of rows the class shares are applied to.
num_rows = 50_000

In [22]:
# Target share of each label. Whatever is left after 'Yes' and 'No' goes to 'N/A'.
yes_percentage = 0.38
no_percentage = 0.37
remaining_percentage = 1 - (yes_percentage + no_percentage)  # Remaining for "N/A"

# round() rather than int(): int() truncates, so a product that lands just
# below a whole number because of floating-point representation
# (e.g. 0.29 * 100 == 28.999999999999996) would silently lose a row.
class_counts = {
    'Yes': round(yes_percentage * num_rows),
    'No': round(no_percentage * num_rows),
}

# Derive 'N/A' as the integer remainder instead of rounding a third float, so
# the three counts always add up to exactly num_rows.
class_counts['N/A'] = num_rows - class_counts['Yes'] - class_counts['No']

if class_counts['N/A'] < 0:
    raise ValueError(
        "yes_percentage + no_percentage must not exceed 1 "
        f"(got {yes_percentage} + {no_percentage})"
    )

In [23]:
# Sampling parameters for each labelled class. The normally distributed
# readings are given as (mean, std); the GSR reading is given as the
# (low, high) bounds of a uniform distribution.
# Insertion order matters: rows are generated 'Yes' first, then 'No'.
_CLASS_PROFILES = {
    'Yes': {  # Heart attack risk
        'hr': (90, 5),
        'spo2': (93, 1.5),
        'prv': (45, 5),
        'temperature': (38, 0.5),
        'gsr': (2.0, 10.0),
    },
    'No': {  # Healthy
        'hr': (75, 5),
        'spo2': (98, 1),
        'prv': (60, 5),
        'temperature': (36.8, 0.3),
        'gsr': (0.5, 5.0),
    },
}

_COLUMNS = [
    'HR (BPM)', 'SpO2 (%)', 'PRV (ms)', 'Temperature (°C)', 'GSR (µS)',
    'MAX30102 Status', 'TMP117 Status', 'GSR Status', 'Heart Attack Risk'
]

# SpO2 is a percentage, so a sampled value above 100 is not a valid reading.
_SPO2_MAX = 100.0


def _sample_status():
    """Draw one sensor status: "Online" with probability 0.9, otherwise "Offline"."""
    return np.random.choice(["Online", "Offline"], p=[0.9, 0.1])


def _generate_labelled_row(label, profile):
    """Build one row for `label`, sampling readings from `profile`.

    The three sensor statuses are drawn first, then the readings in column
    order. A reading is only sampled when its sensor is online; otherwise the
    placeholder 0 is stored and no random number is consumed for it.
    """
    max30102_status = _sample_status()
    tmp117_status = _sample_status()
    gsr_status = _sample_status()

    # HR, SpO2 and PRV all depend on the MAX30102 status.
    hr = spo2 = prv = 0
    if max30102_status == "Online":
        hr = np.random.normal(*profile['hr'])
        # Cap at 100 %: with the 'No' profile (mean 98, std 1) an uncapped
        # draw regularly lands above the physical maximum.
        spo2 = min(np.random.normal(*profile['spo2']), _SPO2_MAX)
        prv = np.random.normal(*profile['prv'])

    temperature = 0
    if tmp117_status == "Online":
        temperature = np.random.normal(*profile['temperature'])

    gsr_value = 0
    if gsr_status == "Online":
        gsr_value = np.random.uniform(*profile['gsr'])

    return [hr, spo2, prv, temperature, gsr_value,
            max30102_status, tmp117_status, gsr_status, label]


def generate_conditional_data(class_counts):
    """Generate a synthetic sensor dataset with one block of rows per label.

    Rows are produced in label order ('Yes', then 'No', then 'N/A') and are
    not shuffled. Random values come from NumPy's global random state, so seed
    it with np.random.seed beforehand for a reproducible result.

    Parameters
    ----------
    class_counts : dict
        Number of rows to generate per label. Must contain the keys 'Yes',
        'No' and 'N/A'.

    Returns
    -------
    pandas.DataFrame
        Five reading columns, three sensor-status columns and the
        'Heart Attack Risk' label column. Readings from an offline sensor are
        stored as 0. 'N/A' rows have every sensor offline and every reading 0.
        SpO2 values are capped at 100.

    Raises
    ------
    KeyError
        If `class_counts` lacks one of the required keys.

    Notes
    -----
    NOTE(review): each sensor of a 'Yes'/'No' row is offline independently
    with probability 0.1, so about 0.1 % of those rows have all sensors offline
    and are then indistinguishable from an 'N/A' row apart from the label.
    That behaviour is left unchanged here.
    """
    rows = []

    # 'Yes' and 'No' cases share the same procedure and differ only in the
    # distribution parameters.
    for label, profile in _CLASS_PROFILES.items():
        rows.extend(
            _generate_labelled_row(label, profile)
            for _ in range(class_counts[label])
        )

    # 'N/A' cases: all sensors offline, all readings 0.
    rows.extend(
        [0, 0, 0, 0, 0, "Offline", "Offline", "Offline", 'N/A']
        for _ in range(class_counts['N/A'])
    )

    return pd.DataFrame(rows, columns=_COLUMNS)

In [24]:
# Build the full dataset; rows come back grouped by label, not shuffled.
data = generate_conditional_data(class_counts=class_counts)

In [25]:
# Shuffle all rows with a fixed seed, then renumber the index 0..n-1 so the
# pre-shuffle positions are discarded.
data = (
    data
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [26]:
# Report how many rows carry each label.
label_counts = data['Heart Attack Risk'].value_counts()
print("Final Class Distribution:", label_counts, sep="\n")

Final Class Distribution:
Heart Attack Risk
Yes    19000
No     18500
N/A    12500
Name: count, dtype: int64


In [27]:
# Write the dataset to CSV without the index column, then confirm on stdout.
output_csv = 'synthetic_heart_attack_data_balanced.csv'
data.to_csv(output_csv, index=False)
print("\nDataset saved as 'synthetic_heart_attack_data_balanced.csv'.")


Dataset saved as 'synthetic_heart_attack_data_balanced.csv'.
