# In [7]:
import pandas as pd
import random

# Define ranges and options.
# Ages, heights and weights are each a random subset of a fixed grid, so every
# run draws its rows from a different pool of candidate values.
ages = random.sample(range(20, 81, 5), 7)  # 7 distinct ages out of 20, 25, ..., 80
genders = [0, 1]  # 0=Male, 1=Female (numeric)
heights = random.sample(range(100, 251, 10), 10)  # 10 distinct heights (cm)
weights = random.sample(range(30, 301, 10), 10)  # 10 distinct weights
smoking_options = [0, 1]
time_of_smoking_options = [0, 10, 20, 30, 40]
frequency_of_smoking_options = [0, 5, 10, 15, 20]
health_options = [0, 1, 2]  # 0=No, 1=Yes, 2=Not sure
chest_pain_options = [0, 1, 2, 3]  # Never, Rarely, Sometimes, Often
chest_pain_severity_options = [0, 1, 2, 3, 4]  # Mild to Severe
short_breath_options = [0, 1, 2, 3]  # Never, Rarely, Sometimes, Often
short_breath_duration_options = [0, 1, 2, 3, 4]  # 0 min to 31-60 min
exercise_options = [0, 1, 2, 3]  # Never, Rarely, Sometimes, Regularly
fatty_food_options = [0, 1, 2, 3]  # Rarely to Very Often
stress_options = [0, 1, 2, 3]  # Rarely to Very Often

# Lower bound of each 5-year age bracket; every bracket reached adds one point.
AGE_BRACKET_STARTS = (35, 40, 45, 50, 55, 60, 65, 70)


def age_risk_points(age, gender):
    """Return the age-based risk points (Framingham approximation).

    Males (``gender == 0``) score 0 below 35 and one more point for each
    5-year bracket reached, up to 8 from age 70 onwards. Any other gender
    value is scored as female: one point less than a male of the same age,
    never below 0.

    The brackets are contiguous and open-ended at the top, so ages such as
    34 or 39, and ages above 74, no longer fall through every branch and
    score 0 as they did with the previous ``if``/``elif`` chain.
    """
    brackets_reached = sum(age >= start for start in AGE_BRACKET_STARTS)
    if gender == 0:
        return brackets_reached
    return max(brackets_reached - 1, 0)


def smoking_risk_points(smoke, time_of_smoking, frequency_of_smoking):
    """Return the smoking risk points from a pack-years approximation.

    Pack-years are computed as ``time_of_smoking * frequency_of_smoking / 20``.
    Non-smokers (``smoke != 1``) and smokers with zero pack-years score 0.
    """
    if smoke != 1:
        return 0
    pack_years = (time_of_smoking * frequency_of_smoking) / 20
    if pack_years > 20:
        return 3
    if pack_years > 10:
        return 2
    if pack_years > 0:
        return 1
    return 0


def body_mass_index(height, weight):
    """Return ``weight / (height in metres) ** 2`` for a height given in cm.

    A non-positive height yields 0 instead of raising ZeroDivisionError;
    the height pool above never produces one, so this is only a safeguard.
    """
    height_m = height / 100
    if height_m > 0:
        return weight / (height_m ** 2)
    return 0


# Generate 20,000 random rows
data = []
for _ in range(20000):
    age = random.choice(ages)
    gender = random.choice(genders)  # Numeric 0 or 1
    height = random.choice(heights)
    weight = random.choice(weights)
    smoke = random.choice(smoking_options)
    # Non-smokers always get 0 for both smoking details.
    time_of_smoking = random.choice(time_of_smoking_options) if smoke == 1 else 0
    frequency_of_smoking = random.choice(frequency_of_smoking_options) if smoke == 1 else 0
    hypertension = random.choice(health_options)
    diabetes = random.choice(health_options)
    high_cholesterol = random.choice(health_options)
    family_history = random.choice(health_options)
    chest_pain = random.choice(chest_pain_options)
    chest_pain_severity = random.choice(chest_pain_severity_options)
    short_breath = random.choice(short_breath_options)
    short_breath_duration = random.choice(short_breath_duration_options)
    exercise = random.choice(exercise_options)
    fatty_food = random.choice(fatty_food_options)
    stress = random.choice(stress_options)

    bmi = body_mass_index(height, weight)

    # Age-based risk plus smoking risk
    score = age_risk_points(age, gender)
    score += smoking_risk_points(smoke, time_of_smoking, frequency_of_smoking)

    # Health conditions: only a definite "Yes" (1) adds risk; "Not sure" (2) does not
    if hypertension == 1:
        score += 2  # Moderate hypertension
    if diabetes == 1:
        score += 3  # Significant risk
    if high_cholesterol == 1:
        score += 2  # Elevated LDL
    if family_history == 1:
        score += 2  # Premature heart disease

    # BMI risk (uses the unrounded BMI; the stored value is rounded below)
    if bmi >= 30:
        score += 1  # Obesity only

    # Symptoms (modest weight as warning signs)
    if chest_pain >= 2 and chest_pain_severity >= 2:  # Sometimes + Moderate or worse
        score += 1  # Flag for potential angina
    if short_breath >= 2 and short_breath_duration >= 2:  # Sometimes + 6-15 min or more
        score += 1  # Flag for possible heart failure

    # Lifestyle
    if exercise == 3:
        score -= 2  # Regular exercise benefit
    if fatty_food >= 2:
        score += 1  # Often or Very Often
    if stress >= 2:
        score += 1  # Often or Very Often

    # One row per person; the value order must match the column labels used
    # when the DataFrame is built.
    data.append([
        age, gender, height, weight, round(bmi, 2), smoke,
        time_of_smoking, frequency_of_smoking, hypertension,
        diabetes, high_cholesterol, family_history,
        chest_pain, chest_pain_severity, short_breath,
        short_breath_duration, exercise, fatty_food, stress,
        score,
    ])

# Column labels, grouped by topic and listed in the same order as the values
# stored in each generated row.
columns = (
    ["Age", "Gender", "Height", "Weight", "BMI"]
    + ["Smoke", "Time_of_Smoking", "Frequency_of_smoking"]
    + ["High_Blood_Pressure", "Diabetes", "High_Cholesterol", "Family_History"]
    + ["Chest_Pain", "Chest_Pain_Severity", "Short_Breath", "Short_Breath_Duration"]
    + ["Exercise", "Fatty_Food", "Stress"]
    + ["Score"]
)

# Build the table: one DataFrame row per generated record.
df = pd.DataFrame(data=data, columns=columns)

# Guard against missing values: report them and replace them with 0.
has_missing_values = df.isna().any().any()
if has_missing_values:
    print("Warning: NaNs detected in the dataset. Filling with 0.")
    df = df.fillna(0)
    print("NaN counts after filling:")
    print(df.isna().sum())

# Order the rows from lowest to highest risk score.
df = df.sort_values(by="Score")
df = df.reset_index(drop=True)

# Label the lower-scoring half as Result = 0 (healthy) and the remaining rows
# as Result = 1 (unhealthy). Rows that share the score at the cut-off can end
# up on either side of the split.
half = len(df) // 2
df["Result"] = [int(position >= half) for position in range(len(df))]

# The score was only needed to derive the label, so it is not exported.
df = df.drop(columns="Score")

# Randomise the row order so the labels are no longer sorted.
from sklearn.utils import shuffle
df = shuffle(df)
df = df.reset_index(drop=True)

# Save the dataset
df.to_csv("medical_dataset.csv", index=False)

# Summary of the class balance
healthy_count = int((df["Result"] == 0).sum())
unhealthy_count = int((df["Result"] == 1).sum())
print("✅ Medically balanced dataset created:")
print("Healthy (0):", healthy_count)
print("Unhealthy (1):", unhealthy_count)
print("Total:", len(df))

# Output of the cell above:
# ✅ Medically balanced dataset created:
# Healthy (0): 10000
# Unhealthy (1): 10000
# Total: 20000
