In [30]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier

from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score, average_precision_score,
    ConfusionMatrixDisplay, RocCurveDisplay, PrecisionRecallDisplay
)

from sklearn.inspection import permutation_importance
import joblib

RANDOM_STATE = 42
DATA_PATH = "S06-hw-dataset-04.csv"

import os
from pathlib import Path

# Папка, где лежит текущий HW06.ipynb
NOTEBOOK_DIR = Path().resolve()
os.chdir(NOTEBOOK_DIR)

print("cwd fixed to:", os.getcwd())

ART_DIR = Path("artifacts")
FIG_DIR = ART_DIR / "figures"
ART_DIR.mkdir(parents=True, exist_ok=True)
FIG_DIR.mkdir(parents=True, exist_ok=True)

cwd fixed to: /Users/lejlakacaeva/Desktop/aie-leyla-2025/homeworks/HW06


In [6]:
df = pd.read_csv(DATA_PATH)

display(df.head())
df.info()
display(df.describe(include="all").T)

# баланс классов
display(df["target"].value_counts())
display(df["target"].value_counts(normalize=True))

# пропуски
display(df.isna().sum().sort_values(ascending=False).head(20))

Unnamed: 0,id,f01,f02,f03,f04,f05,f06,f07,f08,f09,...,f52,f53,f54,f55,f56,f57,f58,f59,f60,target
0,1,-1.25021,1.423474,-0.225004,-4.023138,-0.832729,-0.550874,1.77209,2.76169,-0.69875,...,10.938269,0.501178,1.600001,0.314212,1.209735,1.355697,-5.338924,1.153944,-0.153934,0
1,2,0.074328,0.376429,0.212831,-0.502074,2.017405,0.625496,1.943785,1.24203,-0.52409,...,7.775262,-4.550195,6.272586,-0.932162,-0.228543,1.73522,-3.827828,0.292165,0.27372,0
2,3,0.638481,0.060968,0.74676,2.479653,-0.292858,-0.078139,-2.918423,-0.013186,1.009135,...,-4.448447,-9.593179,-3.093519,0.029321,0.605511,0.829103,-0.085985,2.891408,0.766221,0
3,4,1.712916,-1.350969,-0.256473,1.622074,-0.445141,0.911932,-3.440345,1.505192,-1.104348,...,-1.619072,-3.237479,-5.474038,-1.582475,0.198137,3.823409,0.880395,1.14861,0.136732,0
4,5,0.905676,-0.206545,-0.068806,4.086026,-1.010045,-0.772644,-4.207688,2.506104,1.589143,...,-2.396844,-10.540129,-5.532811,-1.231203,0.000119,4.298572,-1.558235,0.924673,0.111668,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 62 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      25000 non-null  int64  
 1   f01     25000 non-null  float64
 2   f02     25000 non-null  float64
 3   f03     25000 non-null  float64
 4   f04     25000 non-null  float64
 5   f05     25000 non-null  float64
 6   f06     25000 non-null  float64
 7   f07     25000 non-null  float64
 8   f08     25000 non-null  float64
 9   f09     25000 non-null  float64
 10  f10     25000 non-null  float64
 11  f11     25000 non-null  float64
 12  f12     25000 non-null  float64
 13  f13     25000 non-null  float64
 14  f14     25000 non-null  float64
 15  f15     25000 non-null  float64
 16  f16     25000 non-null  float64
 17  f17     25000 non-null  float64
 18  f18     25000 non-null  float64
 19  f19     25000 non-null  float64
 20  f20     25000 non-null  float64
 21  f21     25000 non-null  float64
 22

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,25000.0,12500.500000,7217.022701,1.000000,6250.750000,12500.500000,18750.250000,25000.000000
f01,25000.0,-0.000386,1.001623,-4.370993,-0.680165,0.001859,0.679702,4.208888
f02,25000.0,-0.004872,0.995606,-4.087073,-0.675100,-0.000247,0.659523,3.984564
f03,25000.0,0.003202,1.004367,-4.103875,-0.675426,0.013272,0.683437,3.793442
f04,25000.0,0.335329,3.207537,-13.249937,-1.750048,0.403483,2.486453,15.288250
...,...,...,...,...,...,...,...,...
f57,25000.0,0.893365,2.445185,-9.508509,-0.735473,0.888535,2.516790,11.880651
f58,25000.0,-0.909479,1.962618,-7.919287,-2.226959,-0.923354,0.395648,6.778980
f59,25000.0,0.000570,0.994320,-4.038312,-0.666367,0.004381,0.666474,3.834922
f60,25000.0,-0.000754,0.997167,-3.812255,-0.665861,0.002420,0.665918,4.012639


target
0    23770
1     1230
Name: count, dtype: int64

target
0    0.9508
1    0.0492
Name: proportion, dtype: float64

id     0
f46    0
f33    0
f34    0
f35    0
f36    0
f37    0
f38    0
f39    0
f40    0
f41    0
f42    0
f43    0
f44    0
f45    0
f47    0
f01    0
f48    0
f49    0
f50    0
dtype: int64

In [7]:
X = df.drop(columns=["target", "id"])
y = df["target"]

X.shape, y.shape

((25000, 60), (25000,))

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y
)

y_train.value_counts(normalize=True), y_test.value_counts(normalize=True)

(target
 0    0.9508
 1    0.0492
 Name: proportion, dtype: float64,
 target
 0    0.9508
 1    0.0492
 Name: proportion, dtype: float64)

In [9]:
def eval_binary(clf, X_eval, y_eval):
    y_pred = clf.predict(X_eval)

    out = {
        "accuracy": float(accuracy_score(y_eval, y_pred)),
        "f1": float(f1_score(y_eval, y_pred)),
    }

    if hasattr(clf, "predict_proba"):
        y_score = clf.predict_proba(X_eval)[:, 1]
        out["roc_auc"] = float(roc_auc_score(y_eval, y_score))
        out["average_precision"] = float(average_precision_score(y_eval, y_score))  # AP [web:24]
    elif hasattr(clf, "decision_function"):
        y_score = clf.decision_function(X_eval)
        out["roc_auc"] = float(roc_auc_score(y_eval, y_score))
        out["average_precision"] = float(average_precision_score(y_eval, y_score))  # AP [web:24]

    return out

In [10]:
dummy = DummyClassifier(strategy="most_frequent", random_state=RANDOM_STATE)  # [web:32]
dummy.fit(X_train, y_train)

metrics_dummy = eval_binary(dummy, X_test, y_test)
metrics_dummy

{'accuracy': 0.9508, 'f1': 0.0, 'roc_auc': 0.5, 'average_precision': 0.0492}

In [11]:
logreg = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter=2000, random_state=RANDOM_STATE))
])
logreg.fit(X_train, y_train)

metrics_logreg = eval_binary(logreg, X_test, y_test)
metrics_logreg

{'accuracy': 0.9632,
 'f1': 0.42857142857142855,
 'roc_auc': 0.8339874679773299,
 'average_precision': 0.508831200802871}

In [13]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier

# CV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)  # reproducible folds [web:73]

# несколько метрик, refit по AP (average_precision) — удобно для дисбаланса
scoring = {
    "ap": "average_precision",
    "roc_auc": "roc_auc",
    "f1": "f1",
    "accuracy": "accuracy",
}

searches = {}

# -------------------------
# 1) Decision Tree
# -------------------------
tree = DecisionTreeClassifier(random_state=RANDOM_STATE)
tree_params = {
    "max_depth": [3, 5, None],
    "min_samples_leaf": [5, 20, 50],
    "ccp_alpha": [0.0, 1e-3, 1e-2],
}
tree_gs = GridSearchCV(
    estimator=tree,
    param_grid=tree_params,
    scoring=scoring,
    refit="ap",
    cv=cv,
    n_jobs=-1,
    verbose=2,
)  # [web:25]
tree_gs.fit(X_train, y_train)
searches["DecisionTree"] = tree_gs


# -------------------------
# 2) Random Forest (ускоренная версия)
# -------------------------
rf = RandomForestClassifier(
    n_estimators=150,          # было 400 — это сильно замедляет
    random_state=RANDOM_STATE,
    n_jobs=-1
)

rf_params = {
    "max_depth": [None, 10],
    "min_samples_leaf": [5, 20],
    "max_features": ["sqrt", 0.5],
}

rf_gs = GridSearchCV(
    estimator=rf,
    param_grid=rf_params,
    scoring=scoring,
    refit="ap",
    cv=cv,
    n_jobs=-1,
    verbose=2,
)  # [web:25]
rf_gs.fit(X_train, y_train)
searches["RandomForest"] = rf_gs


# -------------------------
# 3) Boosting: HistGradientBoosting (небольшая сетка)
# -------------------------
hgb = HistGradientBoostingClassifier(random_state=RANDOM_STATE)

hgb_params = {
    "max_depth": [3, None],
    "learning_rate": [0.03, 0.1],
    "max_leaf_nodes": [31, 63],
}

hgb_gs = GridSearchCV(
    estimator=hgb,
    param_grid=hgb_params,
    scoring=scoring,
    refit="ap",
    cv=cv,
    n_jobs=-1,
    verbose=2,
)  # [web:25]
hgb_gs.fit(X_train, y_train)
searches["HistGB"] = hgb_gs


# Лучшие параметры по каждой модели
best_params = {k: v.best_params_ for k, v in searches.items()}
best_params

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] END .....ccp_alpha=0.0, max_depth=3, min_samples_leaf=5; total time=   0.4s
[CV] END ....ccp_alpha=0.0, max_depth=3, min_samples_leaf=20; total time=   0.4s
[CV] END .....ccp_alpha=0.0, max_depth=3, min_samples_leaf=5; total time=   0.5s
[CV] END ....ccp_alpha=0.0, max_depth=3, min_samples_leaf=20; total time=   0.5s
[CV] END .....ccp_alpha=0.0, max_depth=3, min_samples_leaf=5; total time=   0.5s
[CV] END .....ccp_alpha=0.0, max_depth=3, min_samples_leaf=5; total time=   0.5s
[CV] END .....ccp_alpha=0.0, max_depth=3, min_samples_leaf=5; total time=   0.5s
[CV] END ....ccp_alpha=0.0, max_depth=3, min_samples_leaf=20; total time=   0.5s
[CV] END ....ccp_alpha=0.0, max_depth=3, min_samples_leaf=50; total time=   0.4s
[CV] END ....ccp_alpha=0.0, max_depth=3, min_samples_leaf=20; total time=   0.4s
[CV] END ....ccp_alpha=0.0, max_depth=3, min_samples_leaf=50; total time=   0.4s
[CV] END ....ccp_alpha=0.0, max_depth=3, min_sa

{'DecisionTree': {'ccp_alpha': 0.0, 'max_depth': None, 'min_samples_leaf': 20},
 'RandomForest': {'max_depth': None,
  'max_features': 'sqrt',
  'min_samples_leaf': 5},
 'HistGB': {'learning_rate': 0.1, 'max_depth': None, 'max_leaf_nodes': 31}}

In [14]:
metrics_test = {
    "Dummy_most_frequent": metrics_dummy,
    "LogReg": metrics_logreg,
}

for name, gs in searches.items():
    best_est = gs.best_estimator_
    best_est.fit(X_train, y_train)
    metrics_test[name] = eval_binary(best_est, X_test, y_test)

pd.DataFrame(metrics_test).T.sort_values("average_precision", ascending=False)

Unnamed: 0,accuracy,f1,roc_auc,average_precision
HistGB,0.9792,0.737374,0.886511,0.782744
RandomForest,0.9716,0.594286,0.901571,0.775024
DecisionTree,0.9672,0.59,0.80102,0.543722
LogReg,0.9632,0.428571,0.833987,0.508831
Dummy_most_frequent,0.9508,0.0,0.5,0.0492


In [15]:
best_name = pd.DataFrame(metrics_test).T["average_precision"].idxmax()
best_model = searches[best_name].best_estimator_ if best_name in searches else logreg
best_model.fit(X_train, y_train)

y_score = best_model.predict_proba(X_test)[:, 1]

# PR curve [web:37]
PrecisionRecallDisplay.from_predictions(y_test, y_score)  # [web:37]
plt.title(f"PR curve: {best_name}")
plt.tight_layout()
plt.savefig(FIG_DIR / f"pr_curve_{best_name}.png", dpi=150)
plt.close()

# ROC curve
RocCurveDisplay.from_predictions(y_test, y_score)
plt.title(f"ROC curve: {best_name}")
plt.tight_layout()
plt.savefig(FIG_DIR / f"roc_curve_{best_name}.png", dpi=150)
plt.close()

# Confusion matrix при пороге 0.5 [web:41]
y_pred = (y_score >= 0.5).astype(int)
ConfusionMatrixDisplay.from_predictions(y_test, y_pred)  # [web:41]
plt.title(f"Confusion matrix (thr=0.5): {best_name}")
plt.tight_layout()
plt.savefig(FIG_DIR / f"cm_{best_name}.png", dpi=150)
plt.close()

In [16]:
pi = permutation_importance(
    best_model, X_test, y_test,
    scoring="average_precision",
    n_repeats=20,
    random_state=RANDOM_STATE,
    n_jobs=-1
)  # [web:16]

imp = pd.DataFrame({
    "feature": X_test.columns,
    "importance_mean": pi.importances_mean,
    "importance_std": pi.importances_std,
}).sort_values("importance_mean", ascending=False)

imp.head(10)

Unnamed: 0,feature,importance_mean,importance_std
24,f25,0.152062,0.014139
57,f58,0.147448,0.01103
53,f54,0.105599,0.010028
37,f38,0.097966,0.010274
46,f47,0.07363,0.010474
52,f53,0.06333,0.005821
3,f04,0.052558,0.005498
32,f33,0.042,0.005886
12,f13,0.032844,0.004468
10,f11,0.030603,0.003985


In [22]:
# metrics_test.json
with open(ART_DIR / "metrics_test.json", "w", encoding="utf-8") as f:
    json.dump(metrics_test, f, ensure_ascii=False, indent=2)

# search_summaries.json
search_summaries = {
    name: {
        "best_params": gs.best_params_,
        "best_cv_ap": float(gs.best_score_),
    }
    for name, gs in searches.items()
}
with open(ART_DIR / "search_summaries.json", "w", encoding="utf-8") as f:
    json.dump(search_summaries, f, ensure_ascii=False, indent=2)

# best_model.joblib + meta
joblib.dump(best_model, ART_DIR / "best_model.joblib")

best_meta = {
    "best_model_name": best_name,
    "refit_metric": "average_precision",
    "test_metrics": metrics_test[best_name],
}
with open(ART_DIR / "best_model_meta.json", "w", encoding="utf-8") as f:
    json.dump(best_meta, f, ensure_ascii=False, indent=2)

# permutation importance
imp.to_csv(ART_DIR / "permutation_importance.csv", index=False)

In [29]:
import os
from pathlib import Path

print("cwd =", os.getcwd())
print("artifacts exists =", Path("artifacts").exists())
print("artifacts abs path =", Path("artifacts").resolve())

cwd = /Users/lejlakacaeva/Desktop/aie-leyla-2025/homeworks/HW06
artifacts exists = True
artifacts abs path = /Users/lejlakacaeva/Desktop/aie-leyla-2025/homeworks/HW06/artifacts


In [27]:
from pathlib import Path
import os

print("cwd =", os.getcwd())
print("notebook folder guess =", Path().resolve())

cwd = /Users/lejlakacaeva/Desktop/aie-leyla-2025/homeworks/HW06
notebook folder guess = /Users/lejlakacaeva/Desktop/aie-leyla-2025/homeworks/HW06


In [28]:
import os
from pathlib import Path
print("cwd =", os.getcwd())
print("HW06 folder exists =", Path(".").resolve())
print("artifacts exists =", Path("artifacts").exists())
print("figures exists =", Path("artifacts/figures").exists())


cwd = /Users/lejlakacaeva/Desktop/aie-leyla-2025/homeworks/HW06
HW06 folder exists = /Users/lejlakacaeva/Desktop/aie-leyla-2025/homeworks/HW06
artifacts exists = True
figures exists = True
