In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature
import os

from sklearn.preprocessing import PolynomialFeatures
from autofeat import AutoFeatClassifier
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.feature_selection import RFE

from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from sklearn.model_selection import cross_val_score

from hyperopt import fmin, tpe, hp, Trials
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

from mlflow.tracking import MlflowClient
import mlflow.catboost
from catboost import CatBoostClassifier

  import pkg_resources


In [2]:
os.chdir('..')
os.getcwd()

'D:\\git repos\\lab1'

In [3]:
def plot_monthly_bucket_stats_by_count(
    spark_df, score_col, target_col, date_col,
    obs_in_bin=1000, figsize=(12, 6), score_min=None, score_max=None
):
    import matplotlib.pyplot as plt
    import pandas as pd
    import numpy as np
    from pyspark.sql import functions as F

    sdf = spark_df.withColumn("month", F.date_format(F.col(date_col), "yyyy-MM-01"))
    months = [row["month"] for row in sdf.select("month").orderBy(F.asc('month')).distinct().collect()]
    
    for month in sorted(months):
        sdf_month = sdf.filter(F.col("month") == month)
        pdf = sdf_month.select(score_col, target_col).toPandas()
        pdf = pdf.dropna(subset=[score_col, target_col])

        # Количество бакетов
        n_obs = len(pdf)
        n_bins = max(1, int(n_obs / obs_in_bin))

        # Квантильное разбиение
        pdf["bin"], bins = pd.qcut(pdf[score_col], q=n_bins, retbins=True, duplicates='drop')
        grouped = pdf.groupby("bin", observed=True).agg(
            obs_count=(target_col, "size"),
            sum_target=(target_col, "sum"),
            dr=(target_col, "mean")
        )
        grouped['mean_target'] = grouped['sum_target'] / n_obs
        grouped["label"] = [
            f"{round(cnt/n_obs*100, 4)} ({int(cnt)})" for cnt in grouped["sum_target"]
        ]

        fig, ax1 = plt.subplots(figsize=figsize)
        ax2 = ax1.twinx()


        # Бар по категориям (индексам, не координатам)
        grouped["obs_count"].plot(
            kind="bar", color="skyblue", alpha=0.90, ax=ax1, width=0.8, label="Количество наблюдений"
        )
        ax1.set_ylabel("Количество наблюдений", color="steelblue")
        ax1.set_ylim(0, grouped["obs_count"].max() * 1.04)
        ax1.tick_params(axis="y", labelcolor="steelblue")

        # Линия по позициям баров (по индексу от 0 до кол-ва бинов)
        indices = range(len(grouped))
        ax2.plot(
            indices, grouped["mean_target"], color="crimson", marker="o", linestyle="--", label="DR (mean target) %"
        )
        ax2.set_ylabel("DR (mean target) %", color="crimson")
        ax2.tick_params(axis="y", labelcolor="crimson")
        ax2.set_ylim(0, grouped["mean_target"].max() * 1.1)

        for i, (y, label) in enumerate(zip(grouped["mean_target"], grouped["label"])):
            ax2.annotate(label, (i, y), color="crimson", fontsize=9, ha="center", xytext=(0,10), textcoords="offset points")

        plt.title(f"Срез {month}")
        ax1.legend(loc="upper left")
        ax2.legend(loc="upper right")
        plt.xlabel("Бакеты оценок")
        ax1.set_xticks(indices)
        ax1.set_xticklabels([str(b) for b in grouped.index], rotation=30, ha="center")
        plt.grid(axis="y", alpha=0.3)
        plt.tight_layout()
        plt.show()

In [4]:
df = pd.read_pickle('data/clean_dataset.pkl')

In [13]:
df[df['target'] == 0]

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,high_age
165,67,1,0,160,286,0,0,108,1,1.5,1,3,2,0,1
166,67,1,0,120,229,0,0,129,1,2.6,1,2,3,0,1
167,62,0,0,140,268,0,0,160,0,3.6,0,2,2,0,1
168,63,1,0,130,254,0,0,147,0,1.4,1,1,3,0,1
169,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0,1
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0,0


In [6]:
target = 'target'
X = df.drop(columns=[target], axis=1)
y = df[target]

In [7]:
y

0      1
1      1
2      1
3      1
4      1
      ..
298    0
299    0
300    0
301    0
302    0
Name: target, Length: 302, dtype: int64

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [9]:
print("\nУникальные значения в категориальных признаках:")
for col in df.columns:
    if df[col].dtype == "object" or len(df[col].unique()) < 10:
        df[col] = df[col].astype('category')
        print(f"{col}: {df[col].unique()}")


Уникальные значения в категориальных признаках:
sex: [1, 0]
Categories (2, int64): [0, 1]
cp: [3, 2, 1, 0]
Categories (4, int64): [0, 1, 2, 3]
fbs: [1, 0]
Categories (2, int64): [0, 1]
restecg: [0, 1, 2]
Categories (3, int64): [0, 1, 2]
exang: [0, 1]
Categories (2, int64): [0, 1]
slope: [0, 2, 1]
Categories (3, int64): [0, 1, 2]
ca: [0, 2, 1, 3, 4]
Categories (5, int64): [0, 1, 2, 3, 4]
thal: [1, 2, 3, 0]
Categories (4, int64): [0, 1, 2, 3]
target: [1, 0]
Categories (2, int64): [0, 1]
high_age: [1, 0]
Categories (2, int64): [0, 1]


In [10]:
num_features  = df.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_features  = [col for col in df.columns if col not in num_features + ['target']]
print("\nЧисловые признаки:", num_features)
print("Категориальные признаки:", cat_features)


Числовые признаки: ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
Категориальные признаки: ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal', 'high_age']


In [11]:
numeric_transformer = StandardScaler()
categorical_transformer = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

In [12]:
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_features),
        ("cat", categorical_transformer, cat_features)
    ]
)

baseline_model = Pipeline(steps=[
    ("preprocess", preprocess), 
    ("model", RandomForestClassifier(n_estimators=100, random_state=42))
])

In [11]:
os.makedirs("./mlflow", exist_ok=True)
mlflow.set_tracking_uri("sqlite:///./mlflow/mlruns.db")
mlflow.set_experiment("IIS_Lab2")

2025/10/15 10:39:02 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/10/15 10:39:02 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  

<Experiment: artifact_location='file:D:/git repos/lab1/mlruns/1', creation_time=1760513944024, experiment_id='1', last_update_time=1760513944024, lifecycle_stage='active', name='IIS_Lab2', tags={}>

In [18]:
with mlflow.start_run(run_name="Baseline_RandomForest_Classifier"):
    baseline_model.fit(X_train, y_train)
    
    y_pred = baseline_model.predict(X_test)
    y_proba = baseline_model.predict_proba(X_test)[:, 1]  # для roc_auc

    # Вычисляем метрики
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_proba)

    mlflow.log_metric("Accuracy", accuracy)
    mlflow.log_metric("F1_score", f1)
    mlflow.log_metric("ROC_AUC", roc_auc)
    
    signature = infer_signature(X_train, baseline_model.predict(X_train.head()))

    mlflow.sklearn.log_model(baseline_model, artifact_path="model", signature=signature, input_example=X_train.head())

    print(f"Accuracy={accuracy:.4f}, F1-score={f1:.4f}, ROC_AUC={roc_auc:.4f}")



Accuracy=0.8553, F1-score=0.8642, ROC_AUC=0.9265


In [23]:
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_num = X_train[num_features]
X_test_num = X_test[num_features]

X_train_poly = poly.fit_transform(X_train_num)
X_test_poly = poly.transform(X_test_num)

# соединяем с категориальными признаками
X_train_new = np.hstack([X_train_poly, X_train[cat_features]])
X_test_new = np.hstack([X_test_poly, X_test[cat_features]])

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_new, y_train)

y_pred = clf.predict(X_test_new)
y_proba = clf.predict_proba(X_test_new)[:, 1] 

f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)
accuracy = accuracy_score(y_test, y_pred)

with mlflow.start_run(run_name="PolyFeatures_RF"):
    signature = infer_signature(X_train_new, clf.predict(X_train_new[:5]))
    mlflow.log_metric("Accuracy", accuracy)
    mlflow.log_metric("F1_score", f1)
    mlflow.log_metric("ROC_AUC", roc_auc)
    mlflow.sklearn.log_model(clf, artifact_path="model", signature=signature, input_example=X_train_new[:5])




In [26]:
af_clf = AutoFeatClassifier(verbose=1)
af_clf.fit(X_train, y_train)
X_train_af = af_clf.transform(X_train)
X_test_af = af_clf.transform(X_test)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_af, y_train)

y_pred = clf.predict(X_test_af)
y_proba = clf.predict_proba(X_test_af)[:, 1] 

f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)
accuracy = accuracy_score(y_test, y_pred)

with mlflow.start_run(run_name="autofeat_RF"):
    signature = infer_signature(X_train_af, clf.predict(X_train_af[:5]))
    mlflow.log_metric("Accuracy", accuracy)
    mlflow.log_metric("F1_score", f1)
    mlflow.log_metric("ROC_AUC", roc_auc)
    mlflow.sklearn.log_model(clf, artifact_path="model", signature=signature, input_example=X_train_af[:5])

[AutoFeat] It is much more efficient to call fit_transform() instead of fit() and transform()!
[AutoFeat] The 2 step feature engineering process could generate up to 4851 features.
[AutoFeat] With 226 data points this new feature matrix would use about 0.00 gb of space.
[feateng] Step 1: transformation of original features
[feateng] Generated 44 transformed features from 14 original features - done.
[feateng] Step 2: first combination of features
[feateng] Generated 1629 feature combinations from 1653 original feature tuples - done.
[feateng] Generated altogether 1682 new features in 2 steps
[feateng] Removing correlated features, as well as additions at the highest level
[feateng] Generated a total of 902 additional features
[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 16 features after 5 feature selection runs


  if np.max(np.abs(correlations[c].ravel()[:i])) < 0.9:


[featsel] 6 features after noise filtering
[AutoFeat] Computing 6 new features.
[AutoFeat]     6/    6 new features ...done.
[AutoFeat] Final dataframe with 20 feature columns (6 new).
[AutoFeat] Training final classification model.
[AutoFeat] Trained model: largest coefficients:
[-0.04944054]
1.957567 * sqrt(oldpeak)*sex
0.849511 * sqrt(ca)*sex
0.229975 * sqrt(oldpeak)*slope**3
0.179075 * ca**3*sqrt(restecg)
[AutoFeat] Final score: 0.8274
[AutoFeat] Computing 6 new features.
[AutoFeat]     6/    6 new features ...done.
[AutoFeat] Computing 6 new features.
[AutoFeat]     6/    6 new features ...done.




In [29]:
n_features = int(X_train.shape[1] * 0.6)

sfs = SFS(RandomForestClassifier(n_estimators=50), 
          k_features=n_features, 
          forward=True, scoring='accuracy', cv=3, n_jobs=-1)
sfs.fit(X_train, y_train)

selected_features = list(sfs.k_feature_names_)
X_train_sel = X_train[selected_features]
X_test_sel = X_test[selected_features]

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_sel, y_train)

y_pred = clf.predict(X_test_sel)
y_proba = clf.predict_proba(X_test_sel)[:, 1] 

f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)
accuracy = accuracy_score(y_test, y_pred)

with mlflow.start_run(run_name="Selector_RF"):
    signature = infer_signature(X_train_sel, clf.predict(X_train_sel[:5]))
    mlflow.log_metric("Accuracy", accuracy)
    mlflow.log_metric("F1_score", f1)
    mlflow.log_metric("ROC_AUC", roc_auc)
    mlflow.sklearn.log_model(clf, artifact_path="model", signature=signature, input_example=X_train_sel[:5])



In [31]:
rfe = RFE(RandomForestClassifier(n_estimators=50), n_features_to_select=n_features)
rfe.fit(X_train, y_train)

rfe_features = X_train.columns[rfe.support_]
X_train_rfe = X_train[rfe_features]
X_test_rfe = X_test[rfe_features]

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_rfe, y_train)

y_pred = clf.predict(X_test_rfe)
y_proba = clf.predict_proba(X_test_rfe)[:, 1] 

f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)
accuracy = accuracy_score(y_test, y_pred)

with mlflow.start_run(run_name="RFE_RF"):
    signature = infer_signature(X_train_rfe, clf.predict(X_train_rfe[:5]))
    mlflow.log_metric("Accuracy", accuracy)
    mlflow.log_metric("F1_score", f1)
    mlflow.log_metric("ROC_AUC", roc_auc)
    mlflow.sklearn.log_model(clf, artifact_path="model", signature=signature, input_example=X_train_rfe[:5])

# сравнение выбранных feature_sets:
print("SFS:", selected_features)
print("RFE:", list(rfe_features))



SFS: ['cp', 'chol', 'restecg', 'exang', 'oldpeak', 'slope', 'ca', 'thal']
RFE: ['age', 'cp', 'trestbps', 'chol', 'thalach', 'oldpeak', 'ca', 'thal']


In [33]:
set(selected_features) - set(list(rfe_features))

{'exang', 'restecg', 'slope'}

In [43]:
from sklearn.metrics import make_scorer, f1_score

f1_scorer = make_scorer(f1_score)
num_total_features = X_train.shape[1]

In [44]:
def objective(params):
    n_features = int(params['n_features'])
    kbest = SelectKBest(score_func=f_classif, k=n_features)
    X_train_kbest = kbest.fit_transform(X_train, y_train)

    clf = RandomForestClassifier(
        n_estimators=int(params['n_estimators']),
        max_depth=int(params['max_depth'])
    )
    # Используем scoring=f1_scorer
    score = cross_val_score(clf, X_train_kbest, y_train, cv=3, scoring=f1_scorer).mean()
    return -score

In [45]:
space = {
    'n_estimators': hp.quniform('n_estimators', 50, 300, 1),
    'max_depth': hp.quniform('max_depth', 3, 20, 1),
    'n_features': hp.quniform('n_features', int(0.2 * num_total_features), int(0.7 * num_total_features), 1)
}

trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=40, trials=trials)

100%|██████████| 40/40 [00:18<00:00,  2.17trial/s, best loss: -0.851874462985574] 


In [46]:
n_features_opt = int(best['n_features'])
kbest_final = SelectKBest(score_func=f_classif, k=n_features_opt)
X_train_kbest = kbest_final.fit_transform(X_train, y_train)
X_test_kbest = kbest_final.transform(X_test)

In [52]:
kbest_final.get_feature_names_out().tolist()

['age', 'sex', 'cp', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']

In [53]:
X_train_kbest = pd.DataFrame(X_train_kbest, columns=kbest_final.get_feature_names_out().tolist())
X_test_kbest = pd.DataFrame(X_test_kbest, columns=kbest_final.get_feature_names_out().tolist())

In [57]:
best

{'max_depth': np.float64(3.0),
 'n_estimators': np.float64(166.0),
 'n_features': np.float64(9.0)}

In [64]:
best_clf = RandomForestClassifier(
    n_estimators=int(best['n_estimators']),
    max_depth=int(best['max_depth'])
)
best_clf.fit(X_train_kbest, y_train)

y_pred = best_clf.predict(X_test_kbest)
y_proba = best_clf.predict_proba(X_test_kbest)[:, 1] 

f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)
accuracy = accuracy_score(y_test, y_pred)

with mlflow.start_run(run_name="hyperopt_RF_kfeatures"):
    signature = infer_signature(X_train_kbest, best_clf.predict(X_train_kbest[:5]))
    mlflow.log_metric("Accuracy", accuracy)
    mlflow.log_metric("F1_score", f1)
    mlflow.log_metric("ROC_AUC", roc_auc)
    mlflow.sklearn.log_model(best_clf, artifact_path="model", signature=signature, input_example=X_train_kbest[:5])



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

In [69]:
cat_clf = CatBoostClassifier(
    iterations=300,
    depth=6,
    learning_rate=0.1,
    eval_metric='F1',
    verbose=False
)
# X_test_kbest
cat_clf.fit(X_train_kbest, y_train)

y_pred = cat_clf.predict(X_test_kbest)
y_proba = cat_clf.predict_proba(X_test_kbest)[:, 1] 

f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)
accuracy = accuracy_score(y_test, y_pred)

print(f"F1 на тесте: {f1:.4f}")

F1 на тесте: 0.8537


In [72]:
with mlflow.start_run(run_name="CatBoost_Model") as run:
    signature = infer_signature(X_train_kbest, cat_clf.predict(X_train_kbest))
    mlflow.log_metric("Accuracy", accuracy)
    mlflow.log_metric("F1_score", f1)
    mlflow.log_metric("ROC_AUC", roc_auc)
    mlflow.catboost.log_model(cat_clf, artifact_path="model", signature=signature, input_example=X_train_kbest.head())

    mlflow.register_model(f"runs:/{run.info.run_id}/model", "CatBoost_Model")



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'CatBoost_Model' already exists. Creating a new version of this model...
Created version '3' of model 'CatBoost_Model'.


### После анализа метрик лучшая модель -> полученная с помощью подбора гиперпараметров на hyperopt в RF

In [76]:
best_model = RandomForestClassifier(
    n_estimators=int(best['n_estimators']),
    max_depth=int(best['max_depth'])
)

In [78]:
X_best = pd.concat([X_train_kbest, X_test_kbest], axis=0)
y_best = pd.concat([y_train, y_test], axis=0)

In [81]:
X_best.columns.tolist()

['age', 'sex', 'cp', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']

In [79]:
best_model.fit(X_best, y_best)

0,1,2
,n_estimators,166
,criterion,'gini'
,max_depth,3
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [82]:
with mlflow.start_run(run_name="Final_model_best_run") as run:
    signature = infer_signature(X_best, best_model.predict(X_best))
    
    mlflow.sklearn.log_model(
        best_model, 
        artifact_path="model", 
        signature=signature, 
        input_example=X_best.head(),
        registered_model_name="RF_Final_Production"
    )
    
    # Лог requirements.txt как артефакт
    mlflow.log_artifact("requirements.txt")
    
    # Лог списка признаков
    import json
    with open("best_features.json", "w") as f:
        json.dump(X_best.columns.tolist(), f)
    mlflow.log_artifact("best_features.json")



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Successfully registered model 'RF_Final_Production'.
Created version '1' of model 'RF_Final_Production'.


In [84]:
client = MlflowClient()
model_name = "RF_Final_Production"
latest_version = client.get_latest_versions(name=model_name, stages=["None"])[0].version
client.transition_model_version_stage(
    name=model_name, 
    version=latest_version, 
    stage="Production"
)

  latest_version = client.get_latest_versions(name=model_name, stages=["None"])[0].version
  client.transition_model_version_stage(


<ModelVersion: aliases=[], creation_timestamp=1760560083550, current_stage='Production', deployment_job_state=None, description=None, last_updated_timestamp=1760560141136, metrics=None, model_id=None, name='RF_Final_Production', params=None, run_id='be20b1f6e6474e7b9f90129d3bc5ec39', run_link=None, source='models:/m-9d3eb6a2f65e46929109907ba32b7499', status='READY', status_message=None, tags={}, user_id=None, version=1>