In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math, copy

In [3]:
# Build a small synthetic "diabetes" dataset for practice.
# The legacy global NumPy seed pins every draw below; the feature arrays must
# keep being sampled in this exact order for the values to stay reproducible.
np.random.seed(42)

# Sample size shared by every feature array
n = 500

# --- Feature sampling -------------------------------------------------------
age = np.random.randint(low=21, high=81, size=n)               # integer ages, 21..80 (upper bound exclusive)
bmi = np.random.normal(loc=30, scale=6, size=n)                # BMI: mean 30, sd 6
blood_pressure = np.random.normal(loc=75, scale=10, size=n)    # diastolic BP: mean 75, sd 10
blood_sugar = np.random.normal(loc=120, scale=30, size=n)      # fasting blood sugar: mean 120, sd 30
insulin = np.random.normal(loc=80, scale=25, size=n)           # insulin level: mean 80, sd 25
skin_thickness = np.random.normal(loc=25, scale=8, size=n)     # skin fold thickness: mean 25, sd 8
pregnancies = np.random.poisson(lam=2, size=n)                 # count data, Poisson with rate 2

# --- Assemble the feature table (column order is the display order) ---------
df = pd.DataFrame(dict(
    Age=age,
    BMI=bmi,
    BloodPressure=blood_pressure,
    BloodSugar=blood_sugar,
    Insulin=insulin,
    SkinThickness=skin_thickness,
    Pregnancies=pregnancies,
))

# --- Artificial target ------------------------------------------------------
# Linear score pushed through a logistic function. This is made up for
# teaching purposes. Note that blood_pressure and skin_thickness do not enter
# the score. Terms are accumulated one at a time, left to right.
z = 0.03 * age
z = z + 0.2 * bmi
z = z + 0.05 * blood_sugar
z = z + 0.02 * insulin
z = z + 0.1 * pregnancies
z = z - 20  # bias term to keep the probability in a reasonable range

probability = 1.0 / (1.0 + np.exp(-z))

# Hard 0/1 label: 1 wherever the logistic probability is above 0.5
diabetes = np.greater(probability, 0.5).astype(int)

df = df.assign(Diabetes=diabetes)

# Preview the first five rows
df.head(5)

Unnamed: 0,Age,BMI,BloodPressure,BloodSugar,Insulin,SkinThickness,Pregnancies,Diabetes
0,59,33.090286,54.180706,77.921842,76.684156,22.495536,3,0
1,72,53.116389,91.964564,172.487302,55.636767,24.994331,2,1
2,49,33.425343,77.110175,82.684103,107.677017,14.996738,3,0
3,35,36.813394,74.032869,99.212844,76.990471,29.836123,6,0
4,63,35.724011,69.550809,98.447782,25.683261,32.058664,3,0
