In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
import joblib

# Load and clean data
def load_data():
    try:
        df = pd.read_csv('heart.csv')
    except:
        df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data',
                        header=None)
        columns = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
                   'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']
        df.columns = columns

    # Clean data
    df = df.replace('?', np.nan)
    for col in ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'ca', 'thal']:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Fill missing values
    num_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'ca']
    for col in num_cols:
        df[col].fillna(df[col].median(), inplace=True)

    cat_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']
    for col in cat_cols:
        df[col].fillna(df[col].mode()[0], inplace=True)

    # Convert target to binary
    df['target'] = df['target'].apply(lambda x: 1 if x > 0 else 0)

    return df

# Train and save model
def train_and_save_model():
    df = load_data()

    # Select features and target
    X = df.drop('target', axis=1)
    y = df['target']

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train Decision Tree
    dt_model = DecisionTreeClassifier(max_depth=5, random_state=42)
    dt_model.fit(X_train, y_train)

    # Evaluate
    y_pred = dt_model.predict(X_test)
    y_proba = dt_model.predict_proba(X_test)[:, 1]

    print("Model Evaluation:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"ROC AUC: {roc_auc_score(y_test, y_proba):.4f}")
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Save model
    joblib.dump(dt_model, 'decision_tree_model.joblib')
    print("\nModel saved as 'decision_tree_model.joblib'")

if __name__ == "__main__":
    train_and_save_model()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


Model Evaluation:
Accuracy: 0.7213
ROC AUC: 0.7161

Confusion Matrix:
[[22  7]
 [10 22]]

Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.76      0.72        29
           1       0.76      0.69      0.72        32

    accuracy                           0.72        61
   macro avg       0.72      0.72      0.72        61
weighted avg       0.72      0.72      0.72        61


Model saved as 'decision_tree_model.joblib'
