In [2]:
import numpy as np
import pandas as pd

# Seed NumPy's legacy global generator so every run rebuilds the same data.
# All draws below share that one stream, so their order fixes the values.
np.random.seed(42)

# Dataset size, and the number of rows meant to carry a positive label.
num_samples = 1000
half_samples = num_samples // 2

# Age: uniform integers from 5 to 18 (randint excludes the upper bound).
ages = np.random.randint(5, 19, num_samples)

# Height and weight: normal draws rounded to one decimal place. Both draws
# are made before any clamping, so the stream is consumed heights-first.
heights = np.random.normal(140, 15, num_samples).round(1)
weights = np.random.normal(40, 12, num_samples).round(1)

# Clamp to 100-180 for height and 15-80 for weight.
heights = heights.clip(100, 180)
weights = weights.clip(15, 80)

# BMI = weight / (height / 100) squared, rounded to one decimal place.
bmis = (weights / (heights / 100) ** 2).round(1)

# Glucose, insulin and blood pressure: uniform integers, upper bound exclusive.
glucoses = np.random.randint(70, 160, num_samples)
insulins = np.random.randint(20, 180, num_samples)
blood_pressures = np.random.randint(90, 130, num_samples)

# Categorical features; each option is drawn with equal probability.
activity_levels = np.random.choice(['Low', 'Medium', 'High'], num_samples)
family_histories = np.random.choice([0, 1], num_samples)
sexes = np.random.choice(['Male', 'Female'], num_samples)

# Label each row from a rule-based risk score, capped for class balance.
# The labelling is fully determined by the features; no noise is added here.
# One point is scored for each condition that holds in a row.
risk_factors = np.column_stack([
    glucoses > 120,             # glucose above 120
    bmis > 22,                  # BMI above 22
    family_histories == 1,      # family history flag set
    activity_levels == 'Low',   # activity level is 'Low'
    blood_pressures > 120,      # blood pressure above 120
])
scores = risk_factors.sum(axis=1)

# A row qualifies as positive when at least two conditions hold.
qualifies = scores >= 2

# Rows are taken in order: a qualifying row is labelled 1 only while the
# running count of qualifying rows has not passed half_samples. The
# cumulative sum replaces re-scanning the whole label list for every row.
# NOTE: every row after the cap is labelled 0 whatever its score, and the
# classes come out balanced only if at least half_samples rows qualify.
within_cap = np.cumsum(qualifies) <= half_samples

# Plain list of Python ints (0/1), the same type the loop-built list had.
diabetes_risks = (qualifies & within_cap).astype(int).tolist()

# Integer encodings for the two string-valued features.
activity_codes = {'Low': 0, 'Medium': 1, 'High': 2}
activity_level_codes = [activity_codes[level] for level in activity_levels]
sex_codes = np.where(sexes == 'Male', 0, 1)  # 'Male' -> 0, anything else -> 1

# One column per feature, with the label last.
columns = {
    'Age': ages,
    'Sex': sex_codes,
    'Height_cm': heights,
    'Weight_kg': weights,
    'BMI': bmis,
    'Glucose': glucoses,
    'Insulin': insulins,
    'BloodPressure': blood_pressures,
    'PhysicalActivityLevel': activity_level_codes,
    'FamilyHistory': family_histories,
    'DiabetesRisk': diabetes_risks,
}
df = pd.DataFrame(columns)

# Shuffle all rows with a fixed random_state so the order is repeatable,
# then renumber the index from 0.
df = df.sample(frac=1, random_state=42)
df = df.reset_index(drop=True)

# Write the frame to CSV in the working directory, without the index column.
csv_path = "balanced_pediatric_diabetes_dataset.csv"
df.to_csv(csv_path, index=False)

# Report the row count and the number of rows per label value.
row_count = len(df)
print(f"Dataset created with {row_count} rows and saved as 'balanced_pediatric_diabetes_dataset.csv'")
label_counts = df['DiabetesRisk'].value_counts()
print(label_counts)


Dataset created with 1000 rows and saved as 'balanced_pediatric_diabetes_dataset.csv'
DiabetesRisk
0    500
1    500
Name: count, dtype: int64
