In [None]:
# For classic ML models
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from collections import Counter 
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay, roc_auc_score, precision_recall_curve
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE

# For deep learning models
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read the cleaned dataset from disk into a DataFrame
data = pd.read_csv(filepath_or_buffer='data_cleaned.csv')

# Preview the leading rows to inspect the columns and value formats
data.head(n=5)

In [None]:
# Model evaluation helper
def evaluate_model(model, X_test, y_test, model_name):
    """Evaluate a fitted classifier on held-out data and report its metrics.

    Prints the accuracy and the classification report, displays the
    confusion matrix, and prints the AUC-ROC when the model can produce a
    continuous score (``predict_proba`` or, failing that,
    ``decision_function``).

    Parameters
    ----------
    model : fitted estimator
        Must implement ``predict``. ``predict_proba`` (column 1 is used) or
        ``decision_function`` is used for the AUC-ROC when available.
    X_test : array-like
        Held-out features, passed to the model unchanged.
    y_test : array-like
        True labels. The confusion-matrix tick labels assume a binary target
        whose first (sorted) class means "no heart disease" -- TODO confirm
        against the dataset's label encoding.
    model_name : str
        Name used in the printed output and in the plot title.

    Returns
    -------
    float
        Accuracy of ``model.predict(X_test)`` measured against ``y_test``.
    """
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Build the confusion matrix and the per-class report.
    # zero_division=1 makes undefined precision/recall report as 1 instead of
    # emitting a warning.
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred, zero_division=1)

    # Print the headline results
    print(f'{model_name} Accuracy: {accuracy:.2f}')
    print(f'{model_name} Classification Report:\n{class_report}')

    # Plot the confusion matrix
    disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=['No Heart Disease', 'Heart Disease'])
    disp.plot()
    plt.title(f'{model_name} Confusion Matrix')
    plt.show()

    # Compute the AUC-ROC from a continuous score. Prefer positive-class
    # probabilities; fall back to decision_function so margin-based models
    # without predict_proba are still scored (same selection rule as
    # plot_pr_curve).
    if hasattr(model, "predict_proba"):
        y_scores = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_scores = model.decision_function(X_test)
    else:
        y_scores = None

    if y_scores is not None:
        roc_auc = roc_auc_score(y_test, y_scores)
        print(f"{model_name} AUC-ROC: {roc_auc:.2f}")

    return accuracy

In [None]:
# Plot the precision-recall (PR) curve
def plot_pr_curve(model, X_test, y_test, model_name):
    """Draw and show the precision-recall curve of a fitted classifier.

    Scores come from column 1 of ``model.predict_proba(X_test)`` when the
    model has ``predict_proba``; otherwise ``model.decision_function(X_test)``
    is used. The curve is drawn with pyplot and shown immediately; nothing is
    returned.
    """
    # Pick the score source: probabilities if available, margins otherwise
    scores = (
        model.predict_proba(X_test)[:, 1]
        if hasattr(model, "predict_proba")
        else model.decision_function(X_test)
    )

    # The thresholds returned alongside the curve are not needed for the plot
    pr_precision, pr_recall, _ = precision_recall_curve(y_test, scores)

    plt.plot(pr_recall, pr_precision, label=f"{model_name}")
    plt.title(f"Precision-Recall Curve for {model_name}")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.legend()
    plt.show()

In [None]:
# Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier

# Pin the seed so the fitted model and its reported metrics are reproducible
# across kernel restarts.
gbc = GradientBoostingClassifier(random_state=42)
gbc.fit(X_train, y_train)

# evaluate_model prints accuracy, the classification report and AUC-ROC, and
# returns the accuracy for later comparison.
print("Gradient Boosting Results:")
gbc_accuracy = evaluate_model(gbc, X_test, y_test, 'Gradient Boosting Classifier')
plot_pr_curve(gbc, X_test, y_test, "Gradient Boosting")


# XGBoost
import xgboost as xgb

# Seed set explicitly so the run does not depend on library defaults.
# NOTE(review): `use_label_encoder` is deprecated in recent XGBoost releases
# and may only trigger a warning there -- confirm against the installed
# version before dropping it.
xgboost_model = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
)
xgboost_model.fit(X_train, y_train)

# evaluate_model prints the metrics and returns the accuracy.
xgboost_accuracy = evaluate_model(xgboost_model, X_test, y_test, 'XGBoost Classifier')
plot_pr_curve(xgboost_model, X_test, y_test, "XGBoost")

In [None]:
# Randomized hyper-parameter search for LightGBM
from sklearn.model_selection import RandomizedSearchCV
import lightgbm as lgb

# Candidate values sampled by the search. Several keys use LightGBM's native
# parameter names (min_data_in_leaf, bagging_fraction, feature_fraction),
# which LGBMClassifier accepts as extra keyword parameters.
param_dist = {
    'learning_rate': [0.005, 0.01, 0.05, 0.1],
    'num_leaves': [20, 31, 64, 128, 256],
    'max_depth': [5, 10, 15, 20],
    'min_data_in_leaf': [10, 50, 100, 200],
    'bagging_fraction': [0.5, 0.7, 0.9],
    'feature_fraction': [0.5, 0.7, 0.9],
    'reg_alpha': [0.1, 0.5, 1, 5],
    'reg_lambda': [0.1, 0.5, 1, 5]
}

# subsample_freq=1: LightGBM only applies bagging_fraction when the bagging
#   frequency is non-zero; with the default of 0 the bagging_fraction
#   candidates above would all train identical models.
# random_state=42 / verbose=-1: same seed and log level as the final LightGBM
#   model, so the search is reproducible and its fits (n_iter * cv of them) do
#   not flood the cell output.
model = lgb.LGBMClassifier(subsample_freq=1, random_state=42, verbose=-1)
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=20,
    scoring='accuracy',
    cv=5,
    random_state=42,
)
random_search.fit(X_train, y_train)

# Report the best parameter combination found
print("最佳参数:", random_search.best_params_)

In [None]:
# LightGBM
import lightgbm as lgb

# Hyper-parameters for the final LightGBM model.
lgbm_params = {
    'subsample': 0.95,
    # Row subsampling is only applied when the bagging frequency is non-zero;
    # without this entry the 'subsample' value above has no effect.
    'subsample_freq': 1,
    'reg_lambda': 0.1,
    'reg_alpha': 0.5,
    'n_estimators': 550,
    'min_data_in_leaf': 50,
    'max_depth': 15,
    'learning_rate': 0.05,
    # 'colsample_bytree' is an alias of 'feature_fraction' in LightGBM. The
    # previous extra entry 'colsample_bytree': 0.9 conflicted with this value
    # and was ignored in favour of it, so only the effective setting is kept.
    'feature_fraction': 0.7,
}

lightgbm_model = lgb.LGBMClassifier(**lgbm_params, random_state=42, verbose=-1)
lightgbm_model.fit(X_train, y_train)

# evaluate_model prints the metrics and returns the accuracy.
lightgbm_accuracy = evaluate_model(lightgbm_model, X_test, y_test, 'LightGBM Classifier')
plot_pr_curve(lightgbm_model, X_test, y_test, "LightGBM")