In [1]:
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv("heart.csv")
print("Shape:", df.shape)
print(df.head())

print("\nDataset info:")
print(df.info())
print("\nMissing values per column:")
print(df.isna().sum())

Shape: (918, 12)
   Age Sex ChestPainType  RestingBP  Cholesterol  FastingBS RestingECG  MaxHR  \
0   40   M           ATA        140          289          0     Normal    172   
1   49   F           NAP        160          180          0     Normal    156   
2   37   M           ATA        130          283          0         ST     98   
3   48   F           ASY        138          214          0     Normal    108   
4   54   M           NAP        150          195          0     Normal    122   

  ExerciseAngina  Oldpeak ST_Slope  HeartDisease  
0              N      0.0       Up             0  
1              N      1.0     Flat             1  
2              N      0.0       Up             0  
3              Y      1.5     Flat             1  
4              N      0.0       Up             0  

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          ----

## Mock data

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from typing import Dict, Any


def generate_features_X(n_rows: int = 500, seed: int = 42, spec: Dict[str, Any] = None) -> pd.DataFrame:
    """
    Generates independent features (X), applies One-Hot Encoding, and 
    Standard Scaling to return a fully processed feature matrix.
    """
    DEFAULT_SPEC = {
        "Age":          {"dtype": "int",   "low": 29,  "high": 77},
        "Sex":          {"dtype": "cat",   "values": ["M", "F"], "probs": [0.6, 0.4]},
        "ChestPainType":{"dtype": "cat",   "values": ["TA", "ATA", "NAP", "ASY"]},
        "RestingBP":    {"dtype": "int",   "low": 90,  "high": 200},
        "Cholesterol":  {"dtype":"int",   "low": 100, "high": 400},
        "FastingBS":    {"dtype":"cat",   "values": [0, 1], "probs":[0.8, 0.2]},
        "RestingECG":   {"dtype":"cat",   "values":["Normal","ST","LVH"]},
        "MaxHR":        {"dtype":"int",   "low": 71,  "high": 202},
        "ExerciseAngina":{"dtype":"cat", "values":["N","Y"],"probs":[0.7,0.3]},
        "Oldpeak":      {"dtype":"float", "low": 0.0, "high": 6.2},
        "ST_Slope":     {"dtype":"cat",   "values":["Up","Flat","Down"]},
    }
    
    np.random.seed(seed)
    if spec is None:
        spec = DEFAULT_SPEC

    data = {}
    for col, cfg in spec.items():
        dt = cfg.get("dtype")
        if dt == "int":
            data[col] = np.random.randint(cfg["low"], cfg["high"], size=n_rows)
        elif dt == "float":
            data[col] = np.random.uniform(cfg["low"], cfg["high"], size=n_rows)
        elif dt == "cat":
            vals = np.array(cfg["values"])
            probs = cfg.get("probs", np.repeat(1/len(vals), len(vals)))
            data[col] = np.random.choice(vals, size=n_rows, p=probs)
    
    df_raw = pd.DataFrame(data)


    # Processing (One-Hot Encoding + Standard Scaling)
    X_processed = pd.get_dummies(df_raw, drop_first=True, dtype=int)
    
    numerical_cols = [col for col, cfg in spec.items() if cfg['dtype'] in ['int', 'float']]
    numerical_cols = [col for col in numerical_cols if col in X_processed.columns]
    
    if numerical_cols:
        scaler = StandardScaler()
        X_processed[numerical_cols] = scaler.fit_transform(X_processed[numerical_cols])

    return X_processed


def generate_target_y(X_processed: pd.DataFrame, seed: int = 42, weights: Dict[str, float] = None) -> pd.Series:
    """
    Generates the HeartDisease target (y) based on a weighted, logistic 
    combination of the processed features (X).
    """
    # How much (and in which direction) each feature affects heart disease risk
    DEFAULT_WEIGHTS = {
        "Age": 0.5, "RestingBP": 0.01, "Cholesterol": 0.005, 
        "FastingBS_1": 0.8, "ExerciseAngina_Y": 3.0, 
        "Oldpeak": 0.3, "ST_Slope_Flat": 3, "ST_Slope_Up": -2,
        "Sex_F": -0.5, "MaxHR": -0.5, "ChestPainType_ATA": -0.7,
        "ChestPainType_NAP": -0.4, "ChestPainType_TA": -0.005,
        "RestingECG_Normal": 0.01, "RestingECG_ST": 0.4,
    }
    
    np.random.seed(seed)
    if weights is None:
        weights = DEFAULT_WEIGHTS

    n_rows = len(X_processed)

    linear_predictor = np.zeros(n_rows)
    intercept = -1.0 # Tune to adjust mean of output
    linear_predictor += intercept

    for feature, weight in weights.items():
        if feature in X_processed.columns:
            linear_predictor += X_processed[feature].values * weight
    
    p_heart_disease = 1 / (1 + np.exp(-linear_predictor))
    
    noise_strength = 0.05
    p_heart_disease = np.clip(p_heart_disease + np.random.uniform(-noise_strength, noise_strength, n_rows), 0, 1)

    y_target = pd.Series(np.where(p_heart_disease >= 0.5, 1, 0), name='HeartDisease')
    
    return y_target

In [3]:
from sklearn.model_selection import train_test_split

X = generate_features_X(n_rows=1000)
y = generate_target_y(X)
X = pd.get_dummies(X, drop_first=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(np.mean(y_train))

0.4775


## Classifier

In [4]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

model = SVC(kernel='rbf', random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.945


In [5]:
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.935
