In [1]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.metrics import make_scorer, f1_score

In [2]:
# Read the prepared dataset from the project's processed-data folder.
prepared_path = '../data/processed/df_prepared.xlsx'
df = pd.read_excel(prepared_path)

In [3]:
# Class balance of the outcome column: number of rows per distinct value.
outcome_col = 'FU1+2+3 coronary event (Death+AMI+ACS+Revasc)'
df[outcome_col].value_counts()

FU1+2+3 coronary event (Death+AMI+ACS+Revasc)
0    159
1     51
Name: count, dtype: int64

In [4]:
# Split the frame into the outcome vector (y) and the remaining columns (X).
target_col = 'FU1+2+3 coronary event (Death+AMI+ACS+Revasc)'
y = df[target_col]
X = df.drop(columns=[target_col])

In [9]:
# L1-penalised logistic regression on all columns of X.
# C is tuned with a 5-fold stratified, shuffled CV scored by F1; afterwards the
# features whose coefficient in the refitted best model is non-zero are listed,
# ordered by absolute coefficient size.

# Cross-validation scheme and metric shared by the search.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scorer = make_scorer(f1_score)

# 15 log-spaced candidates for C between 1e-3 and 1e1.
# NOTE(review): if the selected C lands on an edge of this grid, consider widening it.
param_grid = {"logreg__C": np.logspace(-3, 1, 15)}

l1_logreg = LogisticRegression(
    penalty="l1",
    solver="liblinear",
    max_iter=1000,
    class_weight="balanced",
    random_state=42,
)
# Scaling lives inside the pipeline, so it is refitted on each training fold.
pipe_l1 = Pipeline(steps=[("scaler", StandardScaler()), ("logreg", l1_logreg)])

grid_l1 = GridSearchCV(
    pipe_l1,
    param_grid,
    scoring=scorer,
    cv=cv,
    n_jobs=-1,
    refit=True,
)
grid_l1.fit(X, y)

print("Лучший C:", grid_l1.best_params_["logreg__C"])
print("Лучший F1 (CV):", grid_l1.best_score_)

# Coefficients of the best pipeline (refitted on all of X because refit=True).
best_model = grid_l1.best_estimator_
feature_names = X.columns
coef = best_model.named_steps["logreg"].coef_.ravel()

all_weights = pd.Series(coef, index=feature_names)
surviving = all_weights[all_weights != 0]
selected_features = surviving.sort_values(key=np.abs, ascending=False)

selected_features

Лучший C: 10.0
Лучший F1 (CV): 0.46672904483430794


LVEDV rest                   7.781066
LVEDVi at rest              -7.491336
Weight                      -6.384917
Body mass index              3.044573
LVESVi at rest               2.488959
LVESV  rest                 -2.020454
Height                       1.693633
Cholesterol                  1.091242
LPLD_Ch                     -0.970976
ESC Pre-test Probability     0.827847
LVEF% at rest                0.669067
Dyslipidemia                 0.468955
Obesity                      0.394543
Peripheral artery disease    0.362888
Hypertension                 0.342336
RVSP rest                   -0.304068
LPHD_Ch                     -0.253665
LAVI rest                    0.240942
Carotid artery disease       0.239834
Creatinine clearance         0.210838
E/e' rest                   -0.182637
Trigliceride                -0.166551
Smoker                       0.104761
IMM rest                     0.080205
Diabetes                    -0.031897
Creatinine                  -0.021371
Glukosa     

In [None]:
# Feature lists selected by RFECV.
# NOTE(review): presumably one list per estimator (logistic regression,
# random forest, boosting) — the RFECV runs themselves are not in this notebook view.

logreg_rfecv = [
    'Hypertension',
    'Dyslipidemia',
    'Peripheral artery disease',
    'Cholesterol',
    'LPLD_Ch',
    'Creatinine clearance',
    'ESC Pre-test Probability',
    'LVEDV rest',
    'RVSP rest',
]
rf_rfecv = [
    'Body mass index',
    'Cholesterol',
    'LPLD_Ch',
    'LPHD_Ch',
    'Trigliceride',
    'Glukosa',
    'ESC Pre-test Probability',
    'LVEDV rest',
    "E/e' rest",
]
boost_rfecv = [
    'Body mass index',
    'LPHD_Ch',
    'Trigliceride',
    'ESC Pre-test Probability',
    "E/e' rest",
]

In [13]:
# Repeat the L1 grid search using only the columns listed in logreg_rfecv.
# The shared grid_l1 object is refitted in place, so its best_* attributes
# describe this feature subset after the cell runs.
X_logreg = X.loc[:, logreg_rfecv]
grid_l1.fit(X_logreg, y)

print("Лучший C:", grid_l1.best_params_["logreg__C"])
print("Лучший F1 (CV):", grid_l1.best_score_)

best_model = grid_l1.best_estimator_
feature_names = X_logreg.columns
coef = best_model.named_steps["logreg"].coef_.ravel()

# Keep only features with a non-zero coefficient, largest magnitude first.
all_weights = pd.Series(coef, index=feature_names)
surviving = all_weights[all_weights != 0]
selected_features = surviving.sort_values(key=np.abs, ascending=False)

selected_features

Лучший C: 0.7196856730011514
Лучший F1 (CV): 0.4737701149425287


ESC Pre-test Probability     0.670072
Dyslipidemia                 0.417796
LVEDV rest                  -0.397017
Peripheral artery disease    0.286080
Hypertension                 0.212020
Creatinine clearance         0.115034
RVSP rest                   -0.114980
Cholesterol                  0.058351
dtype: float64

In [15]:
# L1 grid search restricted to the columns listed in rf_rfecv.
# The columns are taken from X (the frame with the outcome column dropped)
# rather than from df, matching the X_logreg cell: the outcome column can then
# never enter the design matrix, even if the feature list is edited later.
X_rf = X[rf_rfecv]

# Refits the shared grid_l1 object in place; its best_* attributes now
# describe this feature subset.
grid_l1.fit(X_rf, y)
print("Лучший C:", grid_l1.best_params_["logreg__C"])
print("Лучший F1 (CV):", grid_l1.best_score_)

best_model = grid_l1.best_estimator_

coef = best_model.named_steps["logreg"].coef_.ravel()
feature_names = X_rf.columns

# Keep only features with a non-zero coefficient, largest magnitude first.
selected_features = pd.Series(coef, index=feature_names)
selected_features = selected_features[selected_features != 0].sort_values(key=np.abs, ascending=False)

selected_features

Лучший C: 0.19306977288832497
Лучший F1 (CV): 0.4991532684758491


ESC Pre-test Probability    0.633174
LVEDV rest                 -0.265888
Cholesterol                 0.101176
Trigliceride                0.019636
dtype: float64

In [16]:
# L1 grid search restricted to the columns listed in boost_rfecv.
# The columns are taken from X (the frame with the outcome column dropped)
# rather than from df, matching the X_logreg cell: the outcome column can then
# never enter the design matrix, even if the feature list is edited later.
X_boost = X[boost_rfecv]

# Refits the shared grid_l1 object in place; its best_* attributes now
# describe this feature subset.
grid_l1.fit(X_boost, y)
print("Лучший C:", grid_l1.best_params_["logreg__C"])
print("Лучший F1 (CV):", grid_l1.best_score_)

best_model = grid_l1.best_estimator_

coef = best_model.named_steps["logreg"].coef_.ravel()
feature_names = X_boost.columns

# Keep only features with a non-zero coefficient, largest magnitude first.
selected_features = pd.Series(coef, index=feature_names)
selected_features = selected_features[selected_features != 0].sort_values(key=np.abs, ascending=False)

selected_features

Лучший C: 0.19306977288832497
Лучший F1 (CV): 0.4663106395520188


ESC Pre-test Probability    0.579081
Trigliceride                0.044962
dtype: float64

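Строим модель на основе лучшего набора признаков — это признаки с ненулевыми коэффициентами L1-модели, обученной на списке `rf_rfecv` (лучший F1 на кросс-валидации, ≈ 0.499):

```python
best_features = [
    "ESC Pre-test Probability",
    "LVEDV rest",
    "Cholesterol",
    "Trigliceride"
]
```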

In [25]:
# Baseline model: L1 logistic regression on the four chosen features with a
# fixed C, evaluated by 5-fold stratified CV (F1), then fitted on all rows to
# inspect the coefficients.
# NOTE(review): the features and C appear to have been chosen on this same data,
# so the CV score below is likely optimistic — confirm with a nested CV.
best_features = [
    "ESC Pre-test Probability",
    "LVEDV rest",
    "Cholesterol",
    "Trigliceride",
]
X_best = df[best_features]

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Regularisation strength, hard-coded here rather than read from a search object.
fixed_C = 0.19306977288832497

baseline_logreg = LogisticRegression(
    penalty="l1",
    solver="liblinear",
    C=fixed_C,
    max_iter=1000,
    class_weight="balanced",
    random_state=42,
)
pipe_baseline = Pipeline(steps=[("scaler", StandardScaler()), ("logreg", baseline_logreg)])

scores = cross_val_score(
    pipe_baseline,
    X_best,
    y,
    scoring=make_scorer(f1_score),
    cv=cv,
    n_jobs=-1,
)

print(f"F1 CV mean: {scores.mean():.4f}")
print(f"F1 CV std : {scores.std():.4f}")

# Fit on the full sample and rank coefficients by absolute value.
pipe_baseline.fit(X_best, y)
coef = pipe_baseline.named_steps["logreg"].coef_.ravel()

raw_weights = pd.Series(coef, index=best_features)
coef_df = raw_weights.sort_values(key=np.abs, ascending=False)

coef_df

F1 CV mean: 0.4970
F1 CV std : 0.1301


ESC Pre-test Probability    0.633175
LVEDV rest                 -0.265873
Cholesterol                 0.101172
Trigliceride                0.019626
dtype: float64

In [26]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score, accuracy_score

# Cross-validate the baseline pipeline on several metrics at once and
# summarise each metric as mean / std over the folds in a one-row table.
metric_funcs = {
    "f1": f1_score,
    "precision": precision_score,
    "recall": recall_score,
    "accuracy": accuracy_score,
}
scoring = {metric: make_scorer(func) for metric, func in metric_funcs.items()}

cv_results = cross_validate(pipe_baseline, X_best, y, scoring=scoring, cv=cv, n_jobs=-1)

# (column label, key in cv_results) in the order the columns should appear.
metric_columns = [
    ("F1", "test_f1"),
    ("Precision", "test_precision"),
    ("Recall", "test_recall"),
    ("Accuracy", "test_accuracy"),
]

summary = {}
for label, result_key in metric_columns:
    fold_scores = cv_results[result_key]
    summary[label + " mean"] = [fold_scores.mean()]
    summary[label + " std"] = [fold_scores.std()]

metrics_df = pd.DataFrame(summary)

metrics_df

Unnamed: 0,F1 mean,F1 std,Precision mean,Precision std,Recall mean,Recall std,Accuracy mean,Accuracy std
0,0.497037,0.130115,0.387903,0.104919,0.703636,0.201941,0.657143,0.085978


Проверяем, улучшатся ли метрики, если на том же наборе признаков обучить случайный лес.

In [41]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score, accuracy_score

# Random-forest counterpart of the baseline: same four features, same
# 5-fold stratified CV, same metrics, summarised as mean / std over folds.
best_features = [
    "ESC Pre-test Probability",
    "LVEDV rest",
    "Cholesterol",
    "Trigliceride"
]

X_best = df[best_features]

rf_baseline = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features="sqrt",
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)

# Precision and F1 are undefined in a fold where the model predicts no
# positives. zero_division=0 scores such a fold as 0 explicitly instead of
# relying on the implicit "warn" default (same value, no warning).
scoring = {
    "f1": make_scorer(f1_score, zero_division=0),
    "precision": make_scorer(precision_score, zero_division=0),
    "recall": make_scorer(recall_score),
    "accuracy": make_scorer(accuracy_score)
}

cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42
)

rf_cv_results = cross_validate(
    rf_baseline,
    X_best,
    y,
    scoring=scoring,
    cv=cv,
    n_jobs=-1
)

# (column label, key in rf_cv_results) in the order the columns should appear.
metric_columns = [
    ("F1", "test_f1"),
    ("Precision", "test_precision"),
    ("Recall", "test_recall"),
    ("Accuracy", "test_accuracy"),
]

rf_summary = {}
for label, result_key in metric_columns:
    fold_scores = rf_cv_results[result_key]
    rf_summary[label + " mean"] = [fold_scores.mean()]
    rf_summary[label + " std"] = [fold_scores.std()]

rf_metrics_df = pd.DataFrame(rf_summary)

rf_metrics_df

Unnamed: 0,F1 mean,F1 std,Precision mean,Precision std,Recall mean,Recall std,Accuracy mean,Accuracy std
0,0.229641,0.16481,0.469048,0.343683,0.172727,0.136424,0.738095,0.045175


In [40]:
# Fit the forest on every column of X and rank the columns by the fitted
# model's feature_importances_, highest first.
# Note: this fits rf_baseline itself, in place, on the full feature matrix.
rf_baseline.fit(X, y)

column_labels = list(X.columns)
unsorted_importances = pd.Series(rf_baseline.feature_importances_, index=column_labels)
importances = unsorted_importances.sort_values(ascending=False)

importances

ESC Pre-test Probability     0.111625
LVESVi at rest               0.067624
LPHD_Ch                      0.055149
Trigliceride                 0.050244
Body surface area            0.049449
Glukosa                      0.047284
LVEF% at rest                0.044397
IMM rest                     0.043301
Height                       0.040862
LVESV  rest                  0.039238
E/e' rest                    0.038915
Weight                       0.038652
Cholesterol                  0.038283
LVEDVi at rest               0.037590
Body mass index              0.037434
Creatinine clearance         0.035591
LPLD_Ch                      0.034896
LAVI rest                    0.033854
LVEDV rest                   0.031758
Creatinine                   0.029208
RVSP rest                    0.028888
Peripheral artery disease    0.018228
Carotid artery disease       0.015925
Dyslipidemia                 0.011240
Smoker                       0.008891
Obesity                      0.004568
Hypertension