In [20]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    roc_auc_score,
    roc_curve,
    confusion_matrix,
    ConfusionMatrixDisplay
)
from sklearn.inspection import permutation_importance


# Settings: a fixed seed shared by every estimator/split below, and the
# directories where figures and other artifacts are written.
RANDOM_STATE = 42

ARTIFACTS_DIR = Path("artifacts")
FIGURES_DIR = ARTIFACTS_DIR.joinpath("figures")
# parents=True also creates ARTIFACTS_DIR; exist_ok=True makes re-runs safe.
FIGURES_DIR.mkdir(exist_ok=True, parents=True)

# Load the dataset and print a quick overview: first rows, column summary,
# and the relative class frequencies of the "target" column.
print("Загрузка датасета...")
df = pd.read_csv("S06-hw-dataset-02.csv")

print("Первые строки датасета:")
print(df.head())

print("\nИнформация о данных:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()

print("\nРаспределение целевой переменной:")
print(df["target"].value_counts(normalize=True))


# Build the target vector y and the feature matrix X.
# X is every column of df except "target" and "id".
y = df["target"]
X = df.drop(columns=["target", "id"])


# Train / test split: 25% of the rows are held out, stratified by y.

print("\nРазделение данных на train и test...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=RANDOM_STATE
)


# Baseline 1: DummyClassifier with the "most_frequent" strategy.

print("\nОбучение DummyClassifier...")
dummy = DummyClassifier(strategy="most_frequent")
dummy.fit(X_train, y_train)
dummy_pred = dummy.predict(X_test)

# Only accuracy and F1 are recorded for this baseline (no ROC-AUC entry).
dummy_metrics = dict(
    accuracy=accuracy_score(y_test, dummy_pred),
    f1=f1_score(y_test, dummy_pred),
)

print("Метрики DummyClassifier:", dummy_metrics)


# Baseline 2: logistic regression preceded by feature standardization,
# wrapped in a single Pipeline so the scaler is fit on the training data only.

print("\nОбучение LogisticRegression...")
logreg = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)),
    ]
)
logreg.fit(X_train, y_train)

# Hard labels feed accuracy/F1; column 1 of predict_proba feeds ROC-AUC.
logreg_pred = logreg.predict(X_test)
logreg_proba = logreg.predict_proba(X_test)[:, 1]

logreg_metrics = dict(
    accuracy=accuracy_score(y_test, logreg_pred),
    f1=f1_score(y_test, logreg_pred),
    roc_auc=roc_auc_score(y_test, logreg_proba),
)

print("Метрики LogisticRegression:", logreg_metrics)


# Decision tree: grid search over max_depth and min_samples_leaf
# (cv=5, scored by ROC-AUC), then evaluation of the best estimator on the
# test set.

print("\nПодбор гиперпараметров для Decision Tree...")
dt = DecisionTreeClassifier(random_state=RANDOM_STATE)
dt_grid = {"max_depth": [3, 5, 10, None], "min_samples_leaf": [1, 5, 10]}

dt_search = GridSearchCV(
    estimator=dt,
    param_grid=dt_grid,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
)
dt_search.fit(X_train, y_train)

dt_best = dt_search.best_estimator_
dt_pred = dt_best.predict(X_test)
dt_proba = dt_best.predict_proba(X_test)[:, 1]

dt_metrics = dict(
    accuracy=accuracy_score(y_test, dt_pred),
    f1=f1_score(y_test, dt_pred),
    roc_auc=roc_auc_score(y_test, dt_proba),
)

print("Лучшие параметры Decision Tree:", dt_search.best_params_)
print("Метрики Decision Tree:", dt_metrics)


# Random forest: grid search (cv=5, ROC-AUC) over depth, leaf size and
# max_features with a fixed number of trees, then test-set evaluation.

print("\nПодбор гиперпараметров для Random Forest...")
rf = RandomForestClassifier(random_state=RANDOM_STATE)
rf_grid = {
    "n_estimators": [200],
    "max_depth": [None, 10],
    "min_samples_leaf": [1, 5],
    "max_features": ["sqrt", "log2"],
}

rf_search = GridSearchCV(
    estimator=rf,
    param_grid=rf_grid,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
)
rf_search.fit(X_train, y_train)

rf_best = rf_search.best_estimator_
rf_pred = rf_best.predict(X_test)
rf_proba = rf_best.predict_proba(X_test)[:, 1]

rf_metrics = dict(
    accuracy=accuracy_score(y_test, rf_pred),
    f1=f1_score(y_test, rf_pred),
    roc_auc=roc_auc_score(y_test, rf_proba),
)

print("Лучшие параметры Random Forest:", rf_search.best_params_)
print("Метрики Random Forest:", rf_metrics)


# Gradient boosting: grid search (cv=5, ROC-AUC) over the number of trees
# and the learning rate at a fixed depth of 3, then test-set evaluation.

print("\nПодбор гиперпараметров для Gradient Boosting...")
gb = GradientBoostingClassifier(random_state=RANDOM_STATE)
gb_grid = {
    "n_estimators": [100, 200],
    "learning_rate": [0.05, 0.1],
    "max_depth": [3],
}

gb_search = GridSearchCV(
    estimator=gb,
    param_grid=gb_grid,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
)
gb_search.fit(X_train, y_train)

gb_best = gb_search.best_estimator_
gb_pred = gb_best.predict(X_test)
gb_proba = gb_best.predict_proba(X_test)[:, 1]

gb_metrics = dict(
    accuracy=accuracy_score(y_test, gb_pred),
    f1=f1_score(y_test, gb_pred),
    roc_auc=roc_auc_score(y_test, gb_proba),
)

print("Лучшие параметры Gradient Boosting:", gb_search.best_params_)
print("Метрики Gradient Boosting:", gb_metrics)


# Collect the test-set metrics of every model under a stable name.

all_metrics = {
    "dummy": dummy_metrics,
    "logistic_regression": logreg_metrics,
    "decision_tree": dt_metrics,
    "random_forest": rf_metrics,
    "gradient_boosting": gb_metrics
}


# Pick the best model by ROC-AUC.
#
# Only models that have a fitted estimator to hand back are candidates. The
# dummy baseline records no "roc_auc" and has no estimator entry, so letting
# it compete with an implicit score of 0 could only ever end in a KeyError
# on the name -> estimator lookup.
#
# NOTE(review): the metrics compared here were computed on the test split, so
# the winner's reported test score is optimistically biased. Comparing
# cross-validated scores on the training split would avoid that.

candidate_models = {
    "logistic_regression": logreg,
    "decision_tree": dt_best,
    "random_forest": rf_best,
    "gradient_boosting": gb_best
}

best_model_name = max(
    candidate_models,
    key=lambda name: all_metrics[name]["roc_auc"]
)

print("\nЛучшая модель по ROC-AUC:", best_model_name)

best_model = candidate_models[best_model_name]


# ROC curve of the selected model on the test set, saved as a PNG.

print("Построение ROC-кривой...")
proba = best_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, proba)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label="ROC-кривая")
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
ax.plot([0, 1], [0, 1], linestyle="--")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC-кривая")
ax.legend()
fig.savefig(FIGURES_DIR / "roc_curve.png")
plt.close(fig)


# Confusion matrix of the selected model on the test set, saved as a PNG.

print("Построение матрицы ошибок...")
cm = confusion_matrix(y_test, best_model.predict(X_test))
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
# Title and save through the axes/figure that the display object created.
disp.ax_.set_title("Матрица ошибок")
disp.figure_.savefig(FIGURES_DIR / "confusion_matrix.png")
plt.close(disp.figure_)


# Permutation importance of the selected model, measured on the test set.

print("Расчёт permutation importance...")
perm = permutation_importance(
    best_model,
    X_test,
    y_test,
    # Score with ROC-AUC, the metric already used by the grid searches and by
    # the best-model selection. With scoring left unset the estimator's
    # default scorer would be used instead, so the importances would be
    # measured against a different objective than the one the model was
    # chosen by.
    scoring="roc_auc",
    n_repeats=10,
    random_state=RANDOM_STATE,
    n_jobs=-1
)

# Label the mean importances with the columns of the frame that was actually
# permuted, then rank them from most to least important.
importances = pd.Series(
    perm.importances_mean,
    index=X_test.columns
).sort_values(ascending=False)

print("Топ-10 наиболее важных признаков:")
print(importances.head(10))

# Persist artifacts: per-model test metrics, best hyperparameters of each
# grid search, the selected model itself, and a short summary of that model.

print("Сохранение артефактов...")

search_summaries = {
    "decision_tree": dt_search.best_params_,
    "random_forest": rf_search.best_params_,
    "gradient_boosting": gb_search.best_params_,
}

best_model_meta = {
    "best_model": best_model_name,
    "metrics": all_metrics[best_model_name],
}

# ensure_ascii=False keeps any non-ASCII text readable in the UTF-8 files.
(ARTIFACTS_DIR / "metrics_test.json").write_text(
    json.dumps(all_metrics, indent=2, ensure_ascii=False),
    encoding="utf-8",
)

(ARTIFACTS_DIR / "search_summaries.json").write_text(
    json.dumps(search_summaries, indent=2, ensure_ascii=False),
    encoding="utf-8",
)

joblib.dump(best_model, ARTIFACTS_DIR / "best_model.joblib")

(ARTIFACTS_DIR / "best_model_meta.json").write_text(
    json.dumps(best_model_meta, indent=2, ensure_ascii=False),
    encoding="utf-8",
)




Загрузка датасета...
Первые строки датасета:
   id       f01       f02       f03        f04       f05       f06       f07  \
0   1 -0.149235 -2.826966 -0.522901  -4.198449  1.364943  0.815043 -1.195518   
1   2 -1.966180 -4.877542  0.268367  -9.607791  0.097149  1.347185 -3.872575   
2   3 -0.555964 -0.999920  0.209673 -14.119498 -1.808950 -0.006222 -4.651108   
3   4 -2.049199 -5.600713 -1.664677  -6.263893 -5.224455  0.848351  1.407210   
4   5 -0.220556  4.889479 -2.235840   6.450046  0.774389 -2.382625  2.584816   

        f08       f09  ...       f29       f30       f31       f32       f33  \
0 -1.932232  2.396353  ... -0.159323  0.448015  0.572745  0.149916  0.878392   
1 -0.395117  1.710068  ... -0.389212  1.383794  0.169876  0.043969 -0.963545   
2  0.911944 -0.289037  ... -1.383970  3.044321 -0.182864  1.425649 -8.418598   
3 -0.542080  0.119102  ... -2.713080  2.762637 -0.520796 -0.142455  1.668338   
4  4.211856 -0.317889  ... -1.302872  2.478862  1.528610  1.098131  3.5470