In [7]:
import json
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score,
    average_precision_score,
    confusion_matrix, ConfusionMatrixDisplay,
    RocCurveDisplay, PrecisionRecallDisplay
)
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [17]:
# --- Configuration: artifact locations and the input dataset ---
ART_DIR = "artifacts"
FIG_DIR = "artifacts/figures"
DATA_PATH = "s6/S06-hw-dataset-04.csv"

# Later cells write figures, JSON files and the serialized model into these
# directories; create them up front so a fresh checkout survives "Run All".
os.makedirs(ART_DIR, exist_ok=True)
os.makedirs(FIG_DIR, exist_ok=True)

# --- Load the dataset and take a first look at it ---
df = pd.read_csv(DATA_PATH)

display(df.head())
df.info()
display(df.describe(include="all").T)

print("Missing values per column (top-10):")
display(df.isna().sum().sort_values(ascending=False).head(10))

# Absolute and relative class counts of the target column.
print("Target distribution:")
display(df["target"].value_counts(dropna=False))
display(df["target"].value_counts(normalize=True))

Unnamed: 0,id,f01,f02,f03,f04,f05,f06,f07,f08,f09,...,f52,f53,f54,f55,f56,f57,f58,f59,f60,target
0,1,-1.25021,1.423474,-0.225004,-4.023138,-0.832729,-0.550874,1.77209,2.76169,-0.69875,...,10.938269,0.501178,1.600001,0.314212,1.209735,1.355697,-5.338924,1.153944,-0.153934,0
1,2,0.074328,0.376429,0.212831,-0.502074,2.017405,0.625496,1.943785,1.24203,-0.52409,...,7.775262,-4.550195,6.272586,-0.932162,-0.228543,1.73522,-3.827828,0.292165,0.27372,0
2,3,0.638481,0.060968,0.74676,2.479653,-0.292858,-0.078139,-2.918423,-0.013186,1.009135,...,-4.448447,-9.593179,-3.093519,0.029321,0.605511,0.829103,-0.085985,2.891408,0.766221,0
3,4,1.712916,-1.350969,-0.256473,1.622074,-0.445141,0.911932,-3.440345,1.505192,-1.104348,...,-1.619072,-3.237479,-5.474038,-1.582475,0.198137,3.823409,0.880395,1.14861,0.136732,0
4,5,0.905676,-0.206545,-0.068806,4.086026,-1.010045,-0.772644,-4.207688,2.506104,1.589143,...,-2.396844,-10.540129,-5.532811,-1.231203,0.000119,4.298572,-1.558235,0.924673,0.111668,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 62 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      25000 non-null  int64  
 1   f01     25000 non-null  float64
 2   f02     25000 non-null  float64
 3   f03     25000 non-null  float64
 4   f04     25000 non-null  float64
 5   f05     25000 non-null  float64
 6   f06     25000 non-null  float64
 7   f07     25000 non-null  float64
 8   f08     25000 non-null  float64
 9   f09     25000 non-null  float64
 10  f10     25000 non-null  float64
 11  f11     25000 non-null  float64
 12  f12     25000 non-null  float64
 13  f13     25000 non-null  float64
 14  f14     25000 non-null  float64
 15  f15     25000 non-null  float64
 16  f16     25000 non-null  float64
 17  f17     25000 non-null  float64
 18  f18     25000 non-null  float64
 19  f19     25000 non-null  float64
 20  f20     25000 non-null  float64
 21  f21     25000 non-null  float64
 22

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,25000.0,12500.500000,7217.022701,1.000000,6250.750000,12500.500000,18750.250000,25000.000000
f01,25000.0,-0.000386,1.001623,-4.370993,-0.680165,0.001859,0.679702,4.208888
f02,25000.0,-0.004872,0.995606,-4.087073,-0.675100,-0.000247,0.659523,3.984564
f03,25000.0,0.003202,1.004367,-4.103875,-0.675426,0.013272,0.683437,3.793442
f04,25000.0,0.335329,3.207537,-13.249937,-1.750048,0.403483,2.486453,15.288250
...,...,...,...,...,...,...,...,...
f57,25000.0,0.893365,2.445185,-9.508509,-0.735473,0.888535,2.516790,11.880651
f58,25000.0,-0.909479,1.962618,-7.919287,-2.226959,-0.923354,0.395648,6.778980
f59,25000.0,0.000570,0.994320,-4.038312,-0.666367,0.004381,0.666474,3.834922
f60,25000.0,-0.000754,0.997167,-3.812255,-0.665861,0.002420,0.665918,4.012639


Missing values per column (top-10):


id     0
f46    0
f33    0
f34    0
f35    0
f36    0
f37    0
f38    0
f39    0
f40    0
dtype: int64

Target distribution:


target
0    23770
1     1230
Name: count, dtype: int64

target
0    0.9508
1    0.0492
Name: proportion, dtype: float64

In [25]:
# Column roles and the seed reused by every stochastic step below.
TARGET_COL = "target"
ID_COL = "id"
RANDOM_STATE = 42

# Everything except the target and the row identifier is a model feature.
_non_feature_cols = [TARGET_COL, ID_COL]
X = df.drop(columns=_non_feature_cols, errors="ignore")
Y = df[TARGET_COL]

print(X.shape, Y.shape)

(25000, 60) (25000,)


In [27]:
# Hold out 25% of the rows; stratify on the target and fix the seed.
_split_kwargs = dict(test_size=0.25, random_state=RANDOM_STATE, stratify=Y)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, **_split_kwargs)

# Print the class shares of both parts to confirm that they match.
for _label, _part in (("Train", Y_train), ("Test ", Y_test)):
    print(f"{_label} target share:", _part.value_counts(normalize=True).to_dict())

Train target share: {0: 0.9507733333333334, 1: 0.04922666666666667}
Test  target share: {0: 0.95088, 1: 0.04912}


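Мы фиксируем `random_state`, чтобы можно было корректно сравнивать результаты при разных прогонах моделей, гиперпараметрах и т. д., а не списывать различия на изменчивость, связанную с другим разбиением данных. `stratify` нужен, чтобы доля классов в обучающей и тестовой выборках совпадала с исходной: положительный класс составляет всего около 5% данных, и без стратификации его доля в выборках могла бы заметно отличаться.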

In [32]:
def eval_binary(y_true, y_pred, y_proba=None):
    """Compute evaluation metrics for a binary classifier.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted hard labels.
    y_proba : array-like, optional
        Scores for the positive class. When given, the ranking metrics
        ``roc_auc`` and ``average_precision`` are added to the result.

    Returns
    -------
    dict
        ``{"accuracy", "f1"}`` and, if ``y_proba`` is not None, also
        ``{"roc_auc", "average_precision"}``; all values are Python floats.
    """
    out = {
        "accuracy": float(accuracy_score(y_true, y_pred)),
        # A model that never predicts the positive class (e.g. the
        # most-frequent baseline) makes F1 ill-defined. Pin the result to 0
        # explicitly instead of relying on the default "warn" mode, which
        # yields the same value but may emit UndefinedMetricWarning.
        "f1": float(f1_score(y_true, y_pred, zero_division=0)),
    }
    if y_proba is not None:
        out["roc_auc"] = float(roc_auc_score(y_true, y_proba))
        out["average_precision"] = float(average_precision_score(y_true, y_proba))
    return out

In [34]:
def _write_figure(fig, path):
    """Apply a tight layout and write ``fig`` to ``path`` at 150 dpi.

    The parent directory of ``path`` is created when it does not exist yet,
    so saving does not fail just because the output folder is missing.
    """
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    fig.tight_layout()
    fig.savefig(path, dpi=150)


def save_confusion(y_true, y_pred, path):
    """Plot the confusion matrix of ``y_pred`` against ``y_true`` and save it to ``path``."""
    fig, ax = plt.subplots(figsize=(5, 5))
    try:
        ConfusionMatrixDisplay.from_predictions(y_true, y_pred, ax=ax, values_format="d")
        ax.set_title("Confusion matrix")
        _write_figure(fig, path)
    finally:
        # Close the figure even if plotting or saving fails, so it is not
        # left open in the kernel.
        plt.close(fig)


def save_roc(y_true, y_proba, path):
    """Plot the ROC curve for the scores ``y_proba`` and save it to ``path``."""
    fig, ax = plt.subplots(figsize=(6, 5))
    try:
        RocCurveDisplay.from_predictions(y_true, y_proba, ax=ax)
        ax.set_title("ROC curve")
        _write_figure(fig, path)
    finally:
        plt.close(fig)


def save_pr(y_true, y_proba, path):
    """Plot the precision-recall curve for the scores ``y_proba`` and save it to ``path``."""
    fig, ax = plt.subplots(figsize=(6, 5))
    try:
        PrecisionRecallDisplay.from_predictions(y_true, y_proba, ax=ax)
        ax.set_title("Precision-Recall curve")
        _write_figure(fig, path)
    finally:
        plt.close(fig)

In [46]:
# Baseline 1: DummyClassifier that always predicts the most frequent class.
dummy = DummyClassifier(strategy="most_frequent", random_state=RANDOM_STATE).fit(X_train, Y_train)

dummy_pred = dummy.predict(X_test)
# No probability scores are passed, so only accuracy and F1 are reported.
dummy_metrics = eval_binary(Y_test, dummy_pred)
dummy_metrics

{'accuracy': 0.95088, 'f1': 0.0}

In [50]:
# Baseline 2: LogisticRegression with balanced class weights, preceded by
# median imputation and standardization.
_logreg_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(max_iter=2000, class_weight="balanced", random_state=RANDOM_STATE)),
]
logreg = Pipeline(steps=_logreg_steps)
logreg.fit(X_train, Y_train)

# Column 1 of predict_proba is used as the positive-class score.
lr_proba = logreg.predict_proba(X_test)[:, 1]
lr_pred = logreg.predict(X_test)

lr_metrics = eval_binary(Y_test, lr_pred, y_proba=lr_proba)
lr_metrics

{'accuracy': 0.7792,
 'f1': 0.2572658772874058,
 'roc_auc': 0.8418608704517014,
 'average_precision': 0.45700182544850415}

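Хотя accuracy у обеих моделей выглядит достаточно высокой, по F1-мере видно, что из-за доминирования одного класса модели очень плохо предсказывают редкий класс: высокая доля верных ответов достигается в основном за счёт мажоритарного класса (DummyClassifier получает accuracy ≈ 0.95, ни разу не предсказав класс 1).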

In [54]:
# Shared cross-validation setup for every grid search below:
# the metric to optimize and a seeded, shuffled 5-fold stratified splitter.
SCORING = "roc_auc"
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=RANDOM_STATE,
)

In [63]:
# DecisionTreeClassifier: grid search over depth, leaf size and pruning strength.
_dt_model = DecisionTreeClassifier(random_state=RANDOM_STATE, class_weight="balanced")
dt_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("model", _dt_model),
])

dt_param_grid = dict(
    model__max_depth=[2, 3, 4, 5, 7, 10, None],
    model__min_samples_leaf=[1, 5, 10, 20, 50],
    model__ccp_alpha=[0.0, 1e-4, 1e-3, 1e-2],
)

dt_search = GridSearchCV(
    estimator=dt_pipe,
    param_grid=dt_param_grid,
    scoring=SCORING,
    cv=cv,
    n_jobs=-1,
    refit=True,
)
dt_search.fit(X_train, Y_train)

# refit=True: best_estimator_ is the pipeline refitted with the best parameters.
dt_best = dt_search.best_estimator_
dt_search.best_params_, dt_search.best_score_

({'model__ccp_alpha': 0.001,
  'model__max_depth': 7,
  'model__min_samples_leaf': 50},
 0.8326008576370301)

In [69]:
# Hold-out evaluation of the tuned decision tree.
dt_proba = dt_best.predict_proba(X_test)[:, 1]
dt_pred = dt_best.predict(X_test)
dt_metrics = eval_binary(Y_test, dt_pred, y_proba=dt_proba)
dt_metrics

{'accuracy': 0.872,
 'f1': 0.35691318327974275,
 'roc_auc': 0.8396391670928106,
 'average_precision': 0.37644537781268905}

In [86]:
# RandomForestClassifier: 100 trees, grid search over depth, leaf size and
# the number of features considered per split.
_rf_model = RandomForestClassifier(
    n_estimators=100,
    class_weight="balanced_subsample",
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
rf_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("model", _rf_model),
])

rf_param_grid = dict(
    model__max_depth=[4, 6],
    model__min_samples_leaf=[5, 10],
    model__max_features=["sqrt", 0.5],
)

rf_search = GridSearchCV(
    estimator=rf_pipe,
    param_grid=rf_param_grid,
    scoring=SCORING,
    cv=cv,
    n_jobs=-1,
    refit=True,
)
rf_search.fit(X_train, Y_train)

# refit=True: best_estimator_ is the pipeline refitted with the best parameters.
rf_best = rf_search.best_estimator_
rf_search.best_params_, rf_search.best_score_

({'model__max_depth': 6,
  'model__max_features': 0.5,
  'model__min_samples_leaf': 5},
 0.8872548669808442)

In [88]:
# Hold-out evaluation of the tuned random forest.
rf_proba = rf_best.predict_proba(X_test)[:, 1]
rf_pred = rf_best.predict(X_test)
rf_metrics = eval_binary(Y_test, rf_pred, y_proba=rf_proba)
rf_metrics

{'accuracy': 0.95248,
 'f1': 0.6013422818791946,
 'roc_auc': 0.8976048793615349,
 'average_precision': 0.7178859576372012}

In [90]:
# HistGradientBoostingClassifier: grid search over learning rate, tree depth,
# number of boosting iterations and minimum leaf size.
_hgb_model = HistGradientBoostingClassifier(random_state=RANDOM_STATE)
hgb_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("model", _hgb_model),
])

hgb_param_grid = dict(
    model__learning_rate=[0.03, 0.05],
    model__max_depth=[2, 3, 5],
    model__max_iter=[200, 400],
    model__min_samples_leaf=[20, 50],
)

hgb_search = GridSearchCV(
    estimator=hgb_pipe,
    param_grid=hgb_param_grid,
    scoring=SCORING,
    cv=cv,
    n_jobs=-1,
    refit=True,
)
hgb_search.fit(X_train, Y_train)

# refit=True: best_estimator_ is the pipeline refitted with the best parameters.
hgb_best = hgb_search.best_estimator_
hgb_search.best_params_, hgb_search.best_score_

({'model__learning_rate': 0.05,
  'model__max_depth': 5,
  'model__max_iter': 200,
  'model__min_samples_leaf': 50},
 0.8959266845606839)

In [91]:
# Hold-out evaluation of the tuned gradient boosting model.
hgb_proba = hgb_best.predict_proba(X_test)[:, 1]
hgb_pred = hgb_best.predict(X_test)
hgb_metrics = eval_binary(Y_test, hgb_pred, y_proba=hgb_proba)
hgb_metrics

{'accuracy': 0.9792,
 'f1': 0.7368421052631579,
 'roc_auc': 0.9024187983454107,
 'average_precision': 0.7912076992493047}

In [94]:
# Collect the hold-out metrics of every model under a stable name.
metrics_test = dict(
    dummy_most_frequent=dummy_metrics,
    logreg_balanced=lr_metrics,
    decision_tree=dt_metrics,
    random_forest=rf_metrics,
    hist_gb=hgb_metrics,
)

# One row per model, best ROC AUC first. The dummy baseline has no
# probability-based metrics, so its NaN row sorts to the bottom.
_metrics_table = pd.DataFrame(metrics_test).T
_metrics_table.sort_values("roc_auc", ascending=False)

Unnamed: 0,accuracy,f1,roc_auc,average_precision
hist_gb,0.9792,0.736842,0.902419,0.791208
random_forest,0.95248,0.601342,0.897605,0.717886
logreg_balanced,0.7792,0.257266,0.841861,0.457002
decision_tree,0.872,0.356913,0.839639,0.376445
dummy_most_frequent,0.95088,0.0,,


In [143]:
# Class balance recap: absolute counts and shares of the target.
# NOTE(review): repeats the check from the data-loading cell and was executed
# out of order; consider removing it from the final notebook.
_target = df["target"]
_target.value_counts(), _target.value_counts(normalize=True)

(target
 0    23770
 1     1230
 Name: count, dtype: int64,
 target
 0    0.9508
 1    0.0492
 Name: proportion, dtype: float64)

In [96]:
# Pick the model with the highest hold-out ROC AUC. Models without a
# "roc_auc" entry (the dummy baseline) are not candidates.
# NOTE(review): the winner is chosen on the same test split whose metrics are
# reported afterwards, so those metrics are optimistically biased; selecting
# on cross-validation scores would avoid this.
_auc_by_model = {
    name: scores["roc_auc"]
    for name, scores in metrics_test.items()
    if "roc_auc" in scores
}
leader_name = max(_auc_by_model, key=_auc_by_model.get)
leader_name

'hist_gb'

In [98]:
# Fitted estimators keyed by the same names that metrics_test uses.
models = dict(
    dummy_most_frequent=dummy,
    logreg_balanced=logreg,
    decision_tree=dt_best,
    random_forest=rf_best,
    hist_gb=hgb_best,
)

best_model = models[leader_name]

In [104]:
# Hold-out predictions of the selected model: positive-class scores and hard labels.
best_proba = best_model.predict_proba(X_test)[:, 1]
best_pred = best_model.predict(X_test)

# Save the diagnostic plots for the selected model.
save_confusion(Y_test, best_pred, f"{FIG_DIR}/confusion_matrix_best.png")
save_roc(Y_test, best_proba, f"{FIG_DIR}/roc_best.png")
save_pr(Y_test, best_proba, f"{FIG_DIR}/pr_best.png")

In [108]:
# Permutation importance of the selected model on the hold-out set,
# scored with the same metric that drove the hyperparameter search.
perm = permutation_importance(
    best_model, X_test, Y_test,
    n_repeats=10,
    random_state=RANDOM_STATE,
    scoring=SCORING,
    n_jobs=-1
)

# Label the importances with the columns of the frame that was actually
# permuted (X_test), so names and values cannot get out of sync.
imp = pd.DataFrame({
    "feature": X_test.columns,
    "importance_mean": perm.importances_mean,
    "importance_std": perm.importances_std
}).sort_values("importance_mean", ascending=False)

top15 = imp.head(15)
display(top15)

fig, ax = plt.subplots(figsize=(8, 5))
# Reverse the order so the most important feature is drawn at the top of the chart.
topk = top15.iloc[::-1]
ax.barh(topk["feature"], topk["importance_mean"], xerr=topk["importance_std"])
ax.set_title("Permutation importance (top-15)")
# State what the bars measure so the saved figure can be read on its own.
ax.set_xlabel(f"Mean decrease in {SCORING} (error bars: std over repeats)")
fig.tight_layout()
fig.savefig(FIG_DIR + "/permutation_importance_top15.png", dpi=150)
plt.close(fig)

Unnamed: 0,feature,importance_mean,importance_std
53,f54,0.02449,0.003639
24,f25,0.018275,0.004006
46,f47,0.010805,0.005613
57,f58,0.010796,0.004066
32,f33,0.009541,0.001883
37,f38,0.00913,0.002338
3,f04,0.00699,0.001899
52,f53,0.00533,0.003316
40,f41,0.004257,0.002045
15,f16,0.002557,0.001449


In [113]:
# Persist the hold-out metrics of all models as pretty-printed UTF-8 JSON.
_metrics_path = f"{ART_DIR}/metrics_test.json"
with open(_metrics_path, "w", encoding="utf-8") as metrics_file:
    json.dump(metrics_test, metrics_file, ensure_ascii=False, indent=2)

In [117]:
# Best hyperparameters and mean CV ROC AUC for each tuned model.
_fitted_searches = {
    "decision_tree": dt_search,
    "random_forest": rf_search,
    "hist_gb": hgb_search,
}
search_summaries = {
    model_name: {
        "best_params": search.best_params_,
        "best_cv_score_roc_auc": float(search.best_score_),
    }
    for model_name, search in _fitted_searches.items()
}

_summaries_path = f"{ART_DIR}/search_summaries.json"
with open(_summaries_path, "w", encoding="utf-8") as summaries_file:
    json.dump(search_summaries, summaries_file, ensure_ascii=False, indent=2)

In [119]:
# Serialize the selected fitted model to the artifacts directory.
_model_path = f"{ART_DIR}/best_model.joblib"
joblib.dump(best_model, _model_path)

['artifacts/best_model.joblib']

In [127]:
# Metadata describing the selected model and how it was chosen.
# Models without a grid search (dummy, logreg) have no entry in
# search_summaries, so their best_params fall back to None.
_leader_summary = search_summaries.get(leader_name, {})
best_meta = dict(
    best_model_name=leader_name,
    selection_criterion="roc_auc",
    test_metrics=metrics_test[leader_name],
    best_params=_leader_summary.get("best_params", None),
    random_state=RANDOM_STATE,
    test_size=0.25,  # mirrors test_size in the train_test_split call
)

meta_path = f"{ART_DIR}/best_model_meta.json"
with open(meta_path, "w", encoding="utf-8") as meta_file:
    json.dump(best_meta, meta_file, ensure_ascii=False, indent=2)

In [150]:
# Raw confusion-matrix counts for the selected model on the hold-out set.
confusion_matrix(y_true=Y_test, y_pred=best_pred)

array([[5938,    5],
       [ 125,  182]], dtype=int64)