In [2]:
import numpy as np
import pandas as pd

In [3]:
# Seed NumPy's legacy global generator so every np.random.* draw made by the
# cells below is reproducible after Restart Kernel -> Run All.
np.random.seed(seed=42)

In [4]:
# Total number of synthetic rows to generate across both risk classes.
num_rows = 50_000

In [5]:
# Desired share of each label in the generated dataset.
yes_percentage = 0.49
no_percentage = 1 - yes_percentage

# Round (instead of truncating with int()) so floating-point error such as
# 0.29 * 100 == 28.999999999999996 cannot silently drop a row, and derive the
# 'No' count as the remainder so the two counts always sum to num_rows.
_yes_rows = round(yes_percentage * num_rows)

class_counts = {
    'Yes': _yes_rows,
    'No': num_rows - _yes_rows,
}

In [6]:
def generate_conditional_data(class_counts):
    """Generate synthetic sensor readings labelled with heart-attack risk.

    Candidates are produced by rejection sampling: each one receives
    independent uniform readings (HR 60-100, SpO2 90-100, PRV 40-100,
    temperature 35.5-38.5), is labelled from those readings, and is kept only
    while its own class still has unfilled quota.

    A candidate is labelled 'Yes' only when HR > 85, SpO2 < 95, PRV < 50 and
    temperature > 37.5 all hold; every other candidate is 'No'. 'Yes' rows get
    a GSR drawn from 10-25, 'No' rows from 2-10.

    Parameters
    ----------
    class_counts : dict
        Number of rows wanted per label, keyed by 'Yes' and 'No'. A missing
        key is treated as a quota of zero; any other keys are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per kept sample, in generation order (classes are not
        interleaved evenly, so callers may want to shuffle).
    """
    quota = {
        'Yes': class_counts.get('Yes', 0),
        'No': class_counts.get('No', 0),
    }
    # Only the two labels this function can produce count towards the target;
    # summing every value of class_counts would never terminate if the dict
    # carried an extra key.
    target_rows = quota['Yes'] + quota['No']

    data = []

    # Track how many of each class we've added
    counts = {'Yes': 0, 'No': 0}

    while len(data) < target_rows:
        hr = np.random.uniform(60, 100)
        spo2 = np.random.uniform(90, 100)
        prv = np.random.uniform(40, 100)
        temperature = np.random.uniform(35.5, 38.5)
        device_status = 'Online'

        # Determine heart attack risk from the readings alone, so the label
        # never depends on how full each class already is.
        meets_risk_criteria = (
            (hr > 85) and (spo2 < 95) and (prv < 50) and (temperature > 37.5)
        )
        heart_attack_risk = 'Yes' if meets_risk_criteria else 'No'

        if counts[heart_attack_risk] >= quota[heart_attack_risk]:
            # This class is already full: discard the candidate rather than
            # storing a high-risk reading under the 'No' label.
            continue

        # GSR is drawn only for kept rows, after the four readings, which keeps
        # the order of random draws stable for a given seed.
        if meets_risk_criteria:
            gsr = np.random.uniform(10, 25)  # Higher stress
        else:
            gsr = np.random.uniform(2, 10)  # Normal stress levels
        counts[heart_attack_risk] += 1

        # Append the generated row
        data.append({
            'HR (BPM)': hr,
            'SpO2 (%)': spo2,
            'PRV (ms)': prv,
            'Skin Temperature (°C)': temperature,
            'GSR (µS)': gsr,
            'MAX30102 Status': device_status,
            'TMP117 Status': device_status,
            'GSR Status': device_status,
            'Heart Attack Risk': heart_attack_risk
        })

    return pd.DataFrame(data)

In [7]:
# Build the labelled dataset using the per-class row quotas configured above.
data = generate_conditional_data(class_counts=class_counts)

In [8]:
# Shuffle all rows with a fixed seed, then renumber the index from 0 so the
# generation order no longer shows in the row positions.
data = (
    data
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [9]:
# Report how many rows ended up in each risk class (heading, then the counts
# on the following lines).
print("Final Class Distribution:", data['Heart Attack Risk'].value_counts(), sep="\n")

Final Class Distribution:
Heart Attack Risk
No     25500
Yes    24500
Name: count, dtype: int64


In [10]:
# Write the shuffled dataset to a CSV file in the working directory, leaving
# out the index column, then confirm where it was written.
data.to_csv(
    path_or_buf='synthetic_heart_attack_data_balanced.csv',
    index=False,
)
print("\nDataset saved as 'synthetic_heart_attack_data_balanced.csv'.")


Dataset saved as 'synthetic_heart_attack_data_balanced.csv'.
