In [2]:
import pandas as pd
import itertools
import random

# Seed the global RNG so the sampled feature combinations (and every later
# random draw made through the `random` module) are identical on each run;
# without this the generated dataset cannot be reproduced.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Define binary options
binary_options = [0, 1]

# Reduced age and gender
ages = list(range(20, 81, 10))  # 20, 30, 40, 50, 60, 70, 80 (7 values)
genders = ['Male', 'Female']

# Binary health feature names (this order is the order of values inside
# each tuple of `binary_combinations`)
columns_binary = [
    "High_Blood_Pressure", "Diabetes", "High_Cholesterol", "Family_History",
    "Chest_Pain", "Short_Breath", "Exercise", "Fatty_Food", "Stress"
]

# Severity/duration options
chest_pain_severity_options = list(range(0, 5))  # 0-4
short_breath_duration_options = list(range(0, 5))  # 0-4

# Reduced smoking ranges
time_of_smoking_options = [0, 10, 20, 30, 40]  # 5 values
frequency_of_smoking_options = [0, 5, 10, 15, 20]  # 5 values

# Sample a subset of binary combinations (100 of the 2**9 = 512 possible).
# The sample size is clamped because random.sample raises ValueError when
# asked for more items than the population holds.
all_binary_combinations = list(
    itertools.product(binary_options, repeat=len(columns_binary))
)
binary_combinations = random.sample(
    all_binary_combinations, min(100, len(all_binary_combinations))
)

def _risk_score(age, gender, bmi, smoke, time_of_smoking, frequency_of_smoking,
                flags, chest_pain_severity, short_breath_duration):
    """Return the additive risk score for one synthetic profile.

    `flags` is one tuple from `binary_combinations`, ordered like
    `columns_binary`: high blood pressure, diabetes, high cholesterol,
    family history, chest pain, short breath, exercise, fatty food, stress.
    The chest-pain and short-breath flags themselves do not affect the score;
    only their severity/duration values do.
    """
    (high_bp, diabetes, high_chol, fam_hist,
     _chest_pain, _short_breath, exercise, fatty_food, stress) = flags

    score = 0

    # Age-based risk
    if age >= 70:
        score += 3
    elif age >= 60:
        score += 2
    elif age >= 45:
        score += 1

    # Gender-based risk
    if gender == "Male":
        score += 1

    # BMI-based risk: below 18.5 or in [25, 30) adds 1; 30 and above adds 2
    if bmi >= 30:
        score += 2
    elif bmi >= 25 or bmi < 18.5:
        score += 1

    # Smoking history only counts for smokers
    if smoke == 1:
        if time_of_smoking > 30:
            score += 2
        elif time_of_smoking > 10:
            score += 1
        if frequency_of_smoking > 10:
            score += 1

    # One point per condition that is present
    score += sum(1 for flag in (high_bp, diabetes, high_chol, fam_hist) if flag == 1)

    # Exercise is protective, fatty food and stress add risk: one point per
    # level (the flags are 0/1 with the current binary options).
    score -= exercise
    score += fatty_food + stress

    # Add risk for chest pain severity and short breath duration
    score += chest_pain_severity + short_breath_duration
    return score


# Data storage
data = []

# itertools.product walks the combinations in the same order as nested
# for-loops written in this argument order, without eight levels of nesting.
profiles = itertools.product(
    ages, genders, binary_options,
    time_of_smoking_options, frequency_of_smoking_options,
    binary_combinations,
    chest_pain_severity_options, short_breath_duration_options,
)

for (age, gender, smoke, time_of_smoking, frequency_of_smoking, comb,
     chest_pain_severity, short_breath_duration) in profiles:
    # Non-smokers are recorded with zero smoking exposure
    if smoke != 1:
        time_of_smoking = 0
        frequency_of_smoking = 0

    # Height & Weight are drawn at random for every row
    height_cm = random.randint(145, 200)
    weight_kg = random.randint(40, 130)
    bmi = weight_kg / (height_cm / 100) ** 2

    score = _risk_score(
        age, gender, bmi, smoke, time_of_smoking, frequency_of_smoking,
        comb, chest_pain_severity, short_breath_duration,
    )

    (high_bp, diabetes, high_chol, fam_hist,
     chest_pain, short_breath, exercise, fatty_food, stress) = comb

    # Values must follow the order of the `columns` list used to build the
    # DataFrame: severity and duration sit directly after their own flag.
    # Appending the whole flag tuple followed by severity/duration would put
    # six values under the wrong column names.
    data.append([
        age, gender, height_cm, weight_kg, bmi,
        smoke, time_of_smoking, frequency_of_smoking,
        high_bp, diabetes, high_chol, fam_hist,
        chest_pain, chest_pain_severity,
        short_breath, short_breath_duration,
        exercise, fatty_food, stress,
        score,
    ])

# Define columns
columns = [
    "Age", "Gender", "Height", "Weight", "BMI", "Smoke", "Time_of_Smoking", "Frequency_of_Smoking",
    "High_Blood_Pressure", "Diabetes", "High_Cholesterol", "Family_History",
    "Chest_Pain", "Chest_Pain_Severity", "Short_Breath", "Short_Breath_Duration",
    "Exercise", "Fatty_Food", "Stress", "Score"
]

df = pd.DataFrame(data, columns=columns)

# Label on the FULL population before reducing it. Truncating the sorted
# frame first would keep only the lowest-scoring rows, so "unhealthy" would
# merely be the upper half of the lowest-risk slice. Using the median score
# as the cut-off also guarantees that rows with equal scores always receive
# the same label.
score_cutoff = df["Score"].median()
df["Result"] = (df["Score"] > score_cutoff).astype(int)

# Limit to 20,000 rows (10,000 healthy, 10,000 unhealthy) with a seeded
# random draw from each class so the reduced set is balanced and reproducible.
ROWS_PER_CLASS = 10000
SAMPLING_SEED = 42
balanced_parts = []
for label in (0, 1):
    members = df[df["Result"] == label]
    balanced_parts.append(
        members.sample(n=min(ROWS_PER_CLASS, len(members)), random_state=SAMPLING_SEED)
    )
df = pd.concat(balanced_parts)

# Sort by score (stable sort, so ties keep a deterministic order); healthy
# rows end up first, followed by unhealthy rows.
df = df.sort_values(by="Score", kind="mergesort").reset_index(drop=True)

# Final cleanup: the score is only a labeling aid and is not exported
df = df.drop("Score", axis=1)

# Save reduced balanced dataset
df.to_csv("final_balanced_reduced_dataset.csv", index=False)

print("✅ Reduced balanced dataset created:")
print("Healthy (0):", int((df['Result'] == 0).sum()))
print("Unhealthy (1):", int((df['Result'] == 1).sum()))
print("Total:", len(df))

✅ Reduced balanced dataset created:
Healthy (0): 10000
Unhealthy (1): 10000
Total: 20000
