In [2]:
# heart_disease_synthetic.py

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score, recall_score, classification_report
)
import joblib

def generate_synthetic_heart_data(n_samples=20000, random_state=42):
    """
    Generate a synthetic heart-disease style dataset with a binary target.

    Twelve feature columns are sampled independently of one another from
    simple parametric distributions (uniform integers, Bernoulli draws,
    clipped normals, a clipped Poisson and categorical choices). A noisy
    linear risk score is then computed and thresholded to give ``target``.

    Parameters
    ----------
    n_samples : int, default 20000
        Number of rows to generate.
    random_state : int or None, default 42
        Seed for the private random generator. The same seed always yields
        the same DataFrame; ``None`` seeds from OS entropy.

    Returns
    -------
    pandas.DataFrame
        Columns ``age, sex, trestbps, chol, fbs, restecg, thalach, exang,
        oldpeak, slope, ca, thal, target`` with ``n_samples`` rows.

    Notes
    -----
    * Only ``age``, ``trestbps``, ``chol``, ``exang``, ``ca`` and
      ``thalach`` enter the risk score; the remaining features are pure
      noise with respect to ``target``.
    * The global ``np.random`` state is left untouched.
    """
    # A private legacy generator yields exactly the stream that
    # np.random.seed(random_state) followed by np.random.* calls would,
    # but without reseeding the process-wide RNG behind the caller's back.
    rng = np.random.RandomState(random_state)

    # NOTE: the draw order below determines the generated values for a given
    # seed; do not reorder these statements.
    age = rng.randint(29, 78, size=n_samples)  # upper bound exclusive: 29..77
    sex = rng.binomial(1, 0.54, size=n_samples)
    trestbps = rng.normal(130, 20, size=n_samples).clip(80, 200)
    chol = rng.normal(246, 50, size=n_samples).clip(100, 400)
    fbs = rng.binomial(1, 0.15, size=n_samples)
    restecg = rng.choice([0, 1, 2], size=n_samples, p=[0.5, 0.42, 0.08])
    thalach = rng.normal(150, 22, size=n_samples).clip(70, 210)
    exang = rng.binomial(1, 0.33, size=n_samples)
    oldpeak = rng.normal(1.0, 1.0, size=n_samples).clip(0.0, 6.0)
    slope = rng.choice([0, 1, 2], size=n_samples)  # uniform over the levels
    ca = rng.poisson(1.0, size=n_samples).clip(0, 3)
    thal = rng.choice([1, 2, 3], size=n_samples, p=[0.54, 0.28, 0.18])

    df = pd.DataFrame({
        'age': age,
        'sex': sex,
        'trestbps': trestbps,
        'chol': chol,
        'fbs': fbs,
        'restecg': restecg,
        'thalach': thalach,
        'exang': exang,
        'oldpeak': oldpeak,
        'slope': slope,
        'ca': ca,
        'thal': thal,
    })

    # Linear risk score plus Gaussian noise (sd 1.5); the noise is what keeps
    # the labels from being perfectly separable.
    risk_score = (
        0.02 * age +
        0.01 * trestbps +
        0.005 * chol +
        0.5 * exang +
        0.3 * ca -
        0.01 * thalach +
        rng.normal(0, 1.5, size=n_samples)
    )

    # The sigmoid is monotonic and centred on the median score, so the 0.5
    # threshold amounts to a median split: roughly half the rows get target=1.
    prob = 1 / (1 + np.exp(-(risk_score - np.median(risk_score)) / 5))
    df['target'] = (prob > 0.5).astype(int)

    return df

def train_and_evaluate(df):
    """
    Train a logistic regression classifier on ``df`` and report test metrics.

    The frame is split 80/20 (stratified on ``target``), the continuous
    columns are standardised with statistics fitted on the training split
    only, the categorical columns are one-hot encoded, and a
    ``LogisticRegression`` model is fitted. Accuracy, ROC AUC, precision,
    recall and a classification report for the held-out split are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a binary ``target`` column plus the feature columns
        ``age, trestbps, chol, thalach, oldpeak`` (continuous) and
        ``sex, fbs, restecg, exang, slope, ca, thal`` (categorical).
        ``df`` itself is not modified.

    Returns
    -------
    None

    Side effects
    ------------
    Writes ``heart_disease_model.pkl`` and ``scaler.pkl`` to the current
    working directory and prints the metrics to stdout.
    """
    num_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
    cat_cols = ['sex', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

    X = df.drop(columns='target')
    y = df['target']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=1, stratify=y
    )

    # astype() hands back new, independent frames with float numeric columns.
    # Assigning scaled values straight into the frames returned by
    # train_test_split triggers pandas' chained-assignment warning and would
    # write floats over the integer 'age' column.
    float_dtypes = dict.fromkeys(num_cols, 'float64')
    X_train = X_train.astype(float_dtypes)
    X_test = X_test.astype(float_dtypes)

    # Fit the scaler on the training split only so no test statistics leak
    # into training.
    scaler = StandardScaler()
    X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
    X_test[num_cols] = scaler.transform(X_test[num_cols])

    # The training split defines the encoding: the lowest level of each
    # categorical is dropped as the baseline.
    X_train = pd.get_dummies(X_train, columns=cat_cols, drop_first=True)

    # Encode the test split WITHOUT drop_first and then align it to the
    # training columns. Dropping "the first level" independently on the test
    # split would drop a different level whenever the test split happens to
    # lack the training baseline, silently mis-encoding those rows. After the
    # reindex, levels unseen in training are discarded and levels missing
    # from the test split are filled with 0.
    X_test = pd.get_dummies(X_test, columns=cat_cols)
    X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

    # Train logistic regression model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Hard predictions drive accuracy/precision/recall; ROC AUC needs the
    # positive-class probability.
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    print("Accuracy:       ", accuracy_score(y_test, y_pred))
    print("ROC AUC:        ", roc_auc_score(y_test, y_proba))
    print("Precision:      ", precision_score(y_test, y_pred))
    print("Recall:         ", recall_score(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

    # NOTE(review): only the model and scaler are persisted. Anyone loading
    # them for inference must rebuild the same one-hot column layout the
    # model was fitted on (X_train.columns), which is not saved here.
    joblib.dump(model, 'heart_disease_model.pkl')
    joblib.dump(scaler, 'scaler.pkl')
    print("\nModel and scaler saved to disk.")

if __name__ == "__main__":
    # Script entry point: build the synthetic dataset, show its first rows,
    # then fit and evaluate the classifier (which also saves it to disk).
    # The name `df` is kept at module level on purpose so that interactive
    # sessions can keep using it after this cell/script has run.
    df = generate_synthetic_heart_data(n_samples=20000)
    print("Sample of generated data:", df.head(), sep="\n")

    train_and_evaluate(df)


Sample of generated data:
   age  sex    trestbps        chol  fbs  restecg     thalach  exang  \
0   67    1  161.422676  184.774113    0        0  152.399155      1   
1   57    0  126.550049  326.584251    1        1  103.227918      0   
2   43    1  146.891063  202.569424    0        1  168.658125      1   
3   71    1  134.556393  236.731622    1        0  110.976761      0   
4   36    1  138.488759  314.857487    1        0   93.606141      1   

    oldpeak  slope  ca  thal  target  
0  1.250268      2   2     3       1  
1  1.430917      1   0     1       1  
2  1.257099      0   1     1       0  
3  0.000000      2   2     2       1  
4  0.000000      0   0     1       0  
Accuracy:        0.624
ROC AUC:         0.67395725
Precision:       0.6241241241241241
Recall:          0.6235

Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.62      0.62      2000
           1       0.62      0.62      0.62      2000

    acc