In [18]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.metrics import make_scorer, f1_score

In [4]:
# Load the prepared dataset from the processed-data folder (path is relative to the notebook).
prepared_path = '../data/processed/df_prepared.xlsx'
df = pd.read_excel(prepared_path)

In [7]:
# Count rows per value of the coronary-event column to see the class balance.
outcome_col = 'FU1+2+3 coronary event (Death+AMI+ACS+Revasc)'
df[outcome_col].value_counts()

FU1+2+3 coronary event (Death+AMI+ACS+Revasc)
0    159
1     51
Name: count, dtype: int64

In [8]:
# Split the frame into the target vector and the feature matrix (every other column).
target_col = 'FU1+2+3 coronary event (Death+AMI+ACS+Revasc)'
y = df[target_col]
X = df.drop(columns=[target_col])

In [9]:
# L1-penalised logistic regression on standardised features.
# class_weight="balanced" is used because the target is imbalanced (159 vs 51 rows).
l1_logreg = LogisticRegression(
    penalty="l1",
    solver="liblinear",
    max_iter=1000,
    class_weight="balanced",
    random_state=42,
)
pipe_l1 = Pipeline(steps=[("scaler", StandardScaler()), ("logreg", l1_logreg)])

# Candidate inverse-regularisation strengths: 15 log-spaced values from 1e-3 to 1e1.
# NOTE(review): the best C reported for the full feature set is 10.0, i.e. the upper
# edge of this grid — consider whether the range should be widened.
param_grid = {"logreg__C": np.logspace(-3, 1, 15)}

# Shuffled, stratified 5-fold split with a fixed seed so the folds are reproducible.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# F1 of the positive class is the model-selection metric.
scorer = make_scorer(f1_score)

grid_l1 = GridSearchCV(
    pipe_l1,
    param_grid,
    scoring=scorer,
    cv=cv,
    n_jobs=-1,
    refit=True,
)

grid_l1.fit(X, y)

0,1,2
,estimator,Pipeline(step...liblinear'))])
,param_grid,{'logreg__C': array([1.0000...00000000e+01])}
,scoring,make_scorer(f...hod='predict')
,n_jobs,-1
,refit,True
,cv,StratifiedKFo... shuffle=True)
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l1'
,dual,False
,tol,0.0001
,C,10.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [10]:
# Report the selected regularisation strength and its mean cross-validated F1.
best_c = grid_l1.best_params_["logreg__C"]
best_f1 = grid_l1.best_score_
print("Лучший C:", best_c)
print("Лучший F1 (CV):", best_f1)

Лучший C: 10.0
Лучший F1 (CV): 0.46672904483430794


In [11]:
# Take the pipeline refit with the best C and read the logistic-regression coefficients.
best_model = grid_l1.best_estimator_
feature_names = X.columns
coef = best_model.named_steps["logreg"].coef_.ravel()

# Keep only features whose coefficient is non-zero, ordered by absolute magnitude.
# Coefficients refer to standardised inputs because the scaler precedes the model.
selected_features = (
    pd.Series(coef, index=feature_names)
    .loc[lambda s: s != 0]
    .sort_values(key=np.abs, ascending=False)
)

selected_features

LVEDV rest                   7.781066
LVEDVi at rest              -7.491336
Weight                      -6.384917
Body mass index              3.044573
LVESVi at rest               2.488959
LVESV  rest                 -2.020454
Height                       1.693633
Cholesterol                  1.091242
LPLD_Ch                     -0.970976
ESC Pre-test Probability     0.827847
LVEF% at rest                0.669067
Dyslipidemia                 0.468955
Obesity                      0.394543
Peripheral artery disease    0.362888
Hypertension                 0.342336
RVSP rest                   -0.304068
LPHD_Ch                     -0.253665
LAVI rest                    0.240942
Carotid artery disease       0.239834
Creatinine clearance         0.210838
E/e' rest                   -0.182637
Trigliceride                -0.166551
Smoker                       0.104761
IMM rest                     0.080205
Diabetes                    -0.031897
Creatinine                  -0.021371
Glukosa     

In [12]:
# Hard-coded feature subsets; presumably produced by earlier RFECV runs with
# logistic regression, random forest and boosting respectively — TODO confirm provenance.
logreg_rfecv = [
    'Hypertension',
    'Dyslipidemia',
    'Peripheral artery disease',
    'Cholesterol',
    'LPLD_Ch',
    'Creatinine clearance',
    'ESC Pre-test Probability',
    'LVEDV rest',
    'RVSP rest',
]
rf_rfecv = [
    'Body mass index',
    'Cholesterol',
    'LPLD_Ch',
    'LPHD_Ch',
    'Trigliceride',
    'Glukosa',
    'ESC Pre-test Probability',
    'LVEDV rest',
    "E/e' rest",
]
boost_rfecv = [
    'Body mass index',
    'LPHD_Ch',
    'Trigliceride',
    'ESC Pre-test Probability',
    "E/e' rest",
]

In [15]:
# Re-run the L1 grid search restricted to the `logreg_rfecv` feature subset.
# The subset gets its own name instead of rebinding the notebook-level `X`:
# overwriting `X` would make the earlier cells that call `grid_l1.fit(X, y)` and
# read `X.columns` silently operate on this subset when re-run.
X_logreg_rfecv = df[logreg_rfecv]

# `grid_l1` is refit in place, so its best_* attributes now describe this subset.
grid_l1.fit(X_logreg_rfecv, y)
print("Лучший C:", grid_l1.best_params_["logreg__C"])
print("Лучший F1 (CV):", grid_l1.best_score_)

best_model = grid_l1.best_estimator_

# Coefficients of the refit model, aligned with the subset's column order.
coef = best_model.named_steps["logreg"].coef_.ravel()
feature_names = X_logreg_rfecv.columns

# Keep features with a non-zero coefficient, largest absolute value first.
selected_features = pd.Series(coef, index=feature_names)
selected_features = selected_features[selected_features != 0].sort_values(key=np.abs, ascending=False)

selected_features

Лучший C: 0.7196856730011514
Лучший F1 (CV): 0.4737701149425287


ESC Pre-test Probability     0.670072
Dyslipidemia                 0.417796
LVEDV rest                  -0.397017
Peripheral artery disease    0.286080
Hypertension                 0.212020
Creatinine clearance         0.115034
RVSP rest                   -0.114980
Cholesterol                  0.058351
dtype: float64

In [16]:
# Re-run the L1 grid search restricted to the `rf_rfecv` feature subset.
# The subset gets its own name instead of rebinding the notebook-level `X`:
# overwriting `X` would make the earlier cells that call `grid_l1.fit(X, y)` and
# read `X.columns` silently operate on this subset when re-run.
X_rf_rfecv = df[rf_rfecv]

# `grid_l1` is refit in place, so its best_* attributes now describe this subset.
grid_l1.fit(X_rf_rfecv, y)
print("Лучший C:", grid_l1.best_params_["logreg__C"])
print("Лучший F1 (CV):", grid_l1.best_score_)

best_model = grid_l1.best_estimator_

# Coefficients of the refit model, aligned with the subset's column order.
coef = best_model.named_steps["logreg"].coef_.ravel()
feature_names = X_rf_rfecv.columns

# Keep features with a non-zero coefficient, largest absolute value first.
selected_features = pd.Series(coef, index=feature_names)
selected_features = selected_features[selected_features != 0].sort_values(key=np.abs, ascending=False)

selected_features

Лучший C: 0.19306977288832497
Лучший F1 (CV): 0.4991532684758491


ESC Pre-test Probability    0.633174
LVEDV rest                 -0.265888
Cholesterol                 0.101176
Trigliceride                0.019636
dtype: float64

In [17]:
# Re-run the L1 grid search restricted to the `boost_rfecv` feature subset.
# The subset gets its own name instead of rebinding the notebook-level `X`:
# overwriting `X` would make the earlier cells that call `grid_l1.fit(X, y)` and
# read `X.columns` silently operate on this subset when re-run.
X_boost_rfecv = df[boost_rfecv]

# `grid_l1` is refit in place, so its best_* attributes now describe this subset.
grid_l1.fit(X_boost_rfecv, y)
print("Лучший C:", grid_l1.best_params_["logreg__C"])
print("Лучший F1 (CV):", grid_l1.best_score_)

best_model = grid_l1.best_estimator_

# Coefficients of the refit model, aligned with the subset's column order.
coef = best_model.named_steps["logreg"].coef_.ravel()
feature_names = X_boost_rfecv.columns

# Keep features with a non-zero coefficient, largest absolute value first.
selected_features = pd.Series(coef, index=feature_names)
selected_features = selected_features[selected_features != 0].sort_values(key=np.abs, ascending=False)

selected_features

Лучший C: 0.19306977288832497
Лучший F1 (CV): 0.4663106395520188


ESC Pre-test Probability    0.579081
Trigliceride                0.044962
dtype: float64

Строим модель на основе лучшего набора признаков:
best_features = [
    "ESC Pre-test Probability",
    "LVEDV rest",
    "Cholesterol",
    "Trigliceride"
]

In [19]:
# Final feature set: the four features that kept a non-zero coefficient in the
# L1 model fitted on the `rf_rfecv` subset.
best_features = ["ESC Pre-test Probability", "LVEDV rest", "Cholesterol", "Trigliceride"]
X_best = df[best_features]

# Baseline model: same scaler + L1 logistic regression, with C fixed.
# NOTE(review): this C is the value the grid search reported for the `rf_rfecv`
# subset, tuned with the same fold definition as below, so the CV score printed
# here is not an independent estimate of generalisation.
baseline_logreg = LogisticRegression(
    penalty="l1",
    solver="liblinear",
    C=0.19306977288832497,
    max_iter=1000,
    class_weight="balanced",
    random_state=42,
)
pipe_baseline = Pipeline(steps=[("scaler", StandardScaler()), ("logreg", baseline_logreg)])

# Shuffled, stratified 5-fold split with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold F1 of the baseline pipeline on the selected features.
scores = cross_val_score(
    pipe_baseline,
    X_best,
    y,
    scoring=make_scorer(f1_score),
    cv=cv,
    n_jobs=-1,
)

print(f"F1 CV mean: {scores.mean():.4f}")
print(f"F1 CV std : {scores.std():.4f}")

F1 CV mean: 0.4970
F1 CV std : 0.1301


In [20]:
# Fit the baseline pipeline on all rows to inspect its final coefficients.
pipe_baseline.fit(X_best, y)

logreg_step = pipe_baseline.named_steps["logreg"]
coef = logreg_step.coef_.ravel()

# Coefficients (on standardised features) labelled by feature name and ordered
# by absolute magnitude, largest first.
coef_df = pd.Series(coef, index=best_features).sort_values(key=np.abs, ascending=False)

coef_df

ESC Pre-test Probability    0.633175
LVEDV rest                 -0.265873
Cholesterol                 0.101172
Trigliceride                0.019626
dtype: float64