In [None]:
import pandas as pd
import os
import pickle
import shap
import re
import time
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from scipy.stats import shapiro
from pandas.api.types import is_integer_dtype
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_predict
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC
from sklearn.metrics import  (
    fbeta_score, f1_score, precision_score, recall_score, roc_auc_score,
    confusion_matrix, make_scorer, average_precision_score
)
from statsmodels.stats.outliers_influence import variance_inflation_factor
import shap
from typing import Optional, List

import warnings
warnings.filterwarnings('ignore')

In [None]:
# ---- Global experiment settings ----
# Seed passed to the train/validation splitter and to the random forest.
RANDOM_STATE = 42
# NOTE(review): TEST_SIZE is not referenced by any code visible in this part of
# the notebook; the hold-out split passes the literal 0.2 instead — confirm
# which value is intended.
TEST_SIZE = 0.25
# Name of the target column (snake_case form of the raw header).
TARGET_COL = "heart_attack_risk_binary"
# Number of features kept by SelectKBest.
K_BEST = 15
# Univariate scoring function handed to SelectKBest.
SCORE_FUNC = f_classif

In [None]:
# Folder that holds the raw CSV files. The default is the original author's
# local folder; set the HEART_DATA_DIR environment variable to run the notebook
# on another machine without editing this cell.
raw_data_dir = os.environ.get(
    'HEART_DATA_DIR',
    'C:/Users/malen/OneDrive/Desktop/data_since/мастерская 1/data/raw',
)

# Training data; the `id` column becomes the index.
heart_train = pd.read_csv(f'{raw_data_dir}/heart_train.csv', index_col='id')
heart_train.head()

In [None]:
# Folder that holds the raw CSV files. The default is the original author's
# local folder; set the HEART_DATA_DIR environment variable to run the notebook
# on another machine without editing this cell.
raw_data_dir = os.environ.get(
    'HEART_DATA_DIR',
    'C:/Users/malen/OneDrive/Desktop/data_since/мастерская 1/data/raw',
)

# Test data; the `id` column becomes the index.
heart_test = pd.read_csv(f'{raw_data_dir}/heart_test.csv', index_col='id')
heart_test.head()

In [None]:
# Remove the leftover row-number column written by a previous `to_csv`.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# is already gone no longer raises KeyError.
heart_train = heart_train.drop(columns='Unnamed: 0', errors='ignore')
heart_train.info()

In [None]:
# Remove the leftover row-number column written by a previous `to_csv`.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# is already gone no longer raises KeyError.
heart_test = heart_test.drop(columns='Unnamed: 0', errors='ignore')
heart_test.info()

In [None]:
def to_snake_case(name):
    """Convert a column header to lower-case snake_case.

    Steps, applied in order to the stripped input:
      1. every run of whitespace and/or hyphens becomes a single underscore;
      2. an underscore is inserted between a lower-case letter or digit and a
         following upper-case letter (camelCase -> camel_Case);
      3. every remaining non-word character is removed;
      4. the result is lower-cased.
    """
    substitutions = (
        (r'[\s\-]+', '_'),                     # spaces and hyphens -> _
        (r'([a-z0-9])([A-Z])', r'\1_\2'),      # camelCase -> camel_Case
        (r'[^\w_]', ''),                       # drop any other punctuation
    )
    result = name.strip()
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result.lower()

# Rename the columns of both frames in place so they share the same
# snake_case naming.
for frame in (heart_train, heart_test):
    frame.columns = [to_snake_case(column) for column in frame.columns]

In [None]:
def normalize_gender(series: pd.Series) -> pd.Series:
    """
    Convert a mixed-format gender column to numeric codes.

    Recognised spellings (case-insensitive, surrounding whitespace ignored):
    'male' -> 1, 'female' -> 0, and the numeric forms '1'/'0', '1.0'/'0.0'.
    Missing values and anything that cannot be parsed as a number become NaN.

    Parameters
    ----------
    series : pd.Series
        Raw gender values (strings, numbers, or a mix of both).

    Returns
    -------
    pd.Series
        Numeric series with the same index as ``series``.
    """
    text = series.astype(str).str.strip().str.lower()
    text = text.replace({"male": "1", "female": "0"})
    # Remove only a *trailing* ".0"/".00"... suffix. The previous literal
    # replacement deleted ".0" anywhere in the string, turning e.g. "1.05"
    # into 15 and "1.00" into 10.
    text = text.str.replace(r"\.0+$", "", regex=True)
    # errors="coerce" already maps "nan"/"none" and other junk to NaN, so no
    # separate "nan" replacement is needed.
    return pd.to_numeric(text, errors="coerce")
    
# Normalise the gender column of each frame, skipping a frame that has no
# such column.
for frame in (heart_train, heart_test):
    if "gender" in frame.columns:
        frame["gender"] = normalize_gender(frame["gender"])

In [None]:
# === Remove the leakage features (CK-MB and Troponin) directly from the DataFrame ===
# NOTE(review): only heart_train is modified here; heart_test keeps these
# columns if it has them.
heart_train = heart_train.drop(["ck_mb", "troponin"], axis=1, errors="ignore")

In [None]:
# ---------------- custom transformers ---------------- #
class GroupMedianImputer(BaseEstimator, TransformerMixin):
    """Fill missing values with per-column medians learned in ``fit``.

    Despite the name, no grouping is performed: each median is computed over
    all rows of its column.
    """

    def fit(self, X, y=None):
        """Remember each column's median and the column labels of ``X``."""
        frame = pd.DataFrame(X)
        self.medians_ = frame.median(numeric_only=False)
        self.feature_names_in_ = frame.columns.tolist()
        return self

    def transform(self, X):
        """Return ``X`` as a NumPy array with NaNs replaced by the stored medians."""
        frame = pd.DataFrame(X, columns=self.feature_names_in_)
        imputed = frame.fillna(self.medians_)
        return imputed.values

    def get_feature_names_out(self, input_features=None):
        """Return the output names: ``input_features`` if given, else the fitted ones."""
        if input_features is None:
            return np.array(self.feature_names_in_)
        return np.array(input_features)

class ModeImputer(BaseEstimator, TransformerMixin):
    """Fill missing values with each column's most frequent value."""

    def fit(self, X, y=None):
        """Remember the first mode of every column and the column labels of ``X``."""
        frame = pd.DataFrame(X)
        # DataFrame.mode may return several rows on ties; row 0 holds the
        # first mode of each column.
        all_modes = frame.mode(dropna=True)
        self.modes_ = all_modes.iloc[0]
        self.feature_names_in_ = frame.columns.tolist()
        return self

    def transform(self, X):
        """Return ``X`` as a NumPy array with NaNs replaced by the stored modes."""
        frame = pd.DataFrame(X, columns=self.feature_names_in_)
        imputed = frame.fillna(self.modes_)
        return imputed.values

    def get_feature_names_out(self, input_features=None):
        """Return the output names: ``input_features`` if given, else the fitted ones."""
        if input_features is None:
            return np.array(self.feature_names_in_)
        return np.array(input_features)

class BinaryCleaner(BaseEstimator, TransformerMixin):
    """Force values into {0, 1}: cast to float, round, clip, cast to int8."""

    def fit(self, X, y=None):
        """Stateless apart from recording the column labels of ``X``."""
        self.feature_names_in_ = pd.DataFrame(X).columns.tolist()
        return self

    def transform(self, X):
        """Return an int8 array where every entry is rounded and clipped to [0, 1]."""
        values = np.asarray(X, dtype=float)
        rounded = np.rint(values)
        bounded = np.clip(rounded, 0, 1)
        return bounded.astype(np.int8)

    def get_feature_names_out(self, input_features=None):
        """Return the output names: ``input_features`` if given, else the fitted ones."""
        if input_features is None:
            return np.array(self.feature_names_in_)
        return np.array(input_features)

class MissingIndicatorSimple(BaseEstimator, TransformerMixin):
    """Emit one 0/1 flag per input column marking where the value was missing.

    Not referenced by the pipelines visible in this part of the notebook.
    """

    def fit(self, X, y=None):
        """Record input column labels and derive the ``<col>__was_missing`` names."""
        self.feature_names_in_ = pd.DataFrame(X).columns.tolist()
        self.out_names_ = [f"{name}__was_missing" for name in self.feature_names_in_]
        return self

    def transform(self, X):
        """Return an int8 array: 1 where ``X`` is NaN, 0 elsewhere."""
        frame = pd.DataFrame(X, columns=self.feature_names_in_)
        missing_mask = frame.isna()
        return missing_mask.astype(np.int8).values

    def get_feature_names_out(self, input_features=None):
        """Return the ``__was_missing`` names built in ``fit`` (``input_features`` is ignored)."""
        return np.array(self.out_names_)

# ---------------- feature lists for this project ---------------- #
# NB: ck_mb and troponin have already been dropped from the DataFrame itself,
# so the feature lists are built without them.
all_cols = [col for col in heart_train.columns if col != TARGET_COL]

# 0/1 flags; only the names actually present in the data are kept.
binary_features = [
    col
    for col in (
        "diabetes", "family_history", "smoking", "obesity", "alcohol_consumption",
        "previous_heart_problems", "medication_use", "gender",
    )
    if col in all_cols
]

# Features treated as ordinal; again limited to the columns that exist.
ordinal_features = [
    col
    for col in ("diet", "stress_level", "physical_activity_days_per_week")
    if col in all_cols
]

# Every remaining non-target column is handled as numeric.
numeric_features = [
    col
    for col in all_cols
    if col not in binary_features and col not in ordinal_features
]

# ---------------- per-type pipelines ---------------- #
# Numeric columns: median imputation only. No scaler is applied (the original
# cell carried a commented-out StandardScaler step).
num_pipe = Pipeline(steps=[
    ("imp", GroupMedianImputer()),
])

# Ordinal columns: median imputation.
ord_pipe = Pipeline(steps=[
    ("imp", GroupMedianImputer()),
])

# Binary columns: mode imputation, then rounding/clipping to {0, 1}.
bin_pipe = Pipeline(steps=[
    ("imp", ModeImputer()),
    ("bin", BinaryCleaner()),
])

# ---------------- combined preprocessor ---------------- #
# Each column group goes through its own pipeline. Columns not listed in any
# group are dropped, and output names are not prefixed with the group name.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipe, numeric_features),   # numeric columns
        ("ord", ord_pipe, ordinal_features),   # ordinal columns
        ("bin", bin_pipe, binary_features),    # 0/1 flags
    ],
    verbose_feature_names_out=False,
    remainder="drop",
)


In [None]:
# ====== data ======
# Leakage columns that are still present in the frame.
LEAK_COLS = [col for col in ("ck_mb", "troponin") if col in heart_train.columns]

# Model inputs: every column except the leakage columns and the target.
use_cols = [
    col
    for col in heart_train.columns
    if col not in LEAK_COLS and col != TARGET_COL
]
X = heart_train[use_cols].copy()
# Target cast to int.
y = heart_train[TARGET_COL].astype(int).copy()

# ====== pipeline: preprocessor -> SelectKBest -> RandomForest ======
# Random forest with fixed hyper-parameters (the original comment calls them
# the best parameters found — presumably from an earlier search; verify).
rf_best = RandomForestClassifier(
    n_estimators=100,
    max_depth=8,
    min_samples_split=2,
    class_weight="balanced",
    random_state=RANDOM_STATE,
    n_jobs=-1,
)

pipe = Pipeline(steps=[
    ("prep", preprocessor),                                    # the ColumnTransformer
    ("select", SelectKBest(score_func=SCORE_FUNC, k=K_BEST)),  # keep the K_BEST top-scoring features
    ("clf", rf_best),                                          # final classifier
])

# ====== hold-out validation ======
# Stratified split with 20% held out for validation.
# NOTE(review): the split size is the literal 0.2, not the TEST_SIZE constant
# (0.25) from the configuration cell — confirm which one is intended.
X_tr, X_val, y_tr, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

# Fit the whole pipeline on the training part only.
pipe.fit(X_tr, y_tr)