In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    roc_curve,
    precision_score,
    recall_score,
    f1_score,
)
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


# 1. Load the data
DATA_PATH = "S05-hw-dataset.csv"  # assumes the file sits next to HW05.ipynb

df = pd.read_csv(DATA_PATH)

print("Первые строки датасета:")
display(df.head())

print("\nИнформация о датасете:")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in display() would only render a stray "None" after the report.
df.info()

print("\nОписательные статистики:")
display(df.describe())

# Relative class frequencies of the target column.
print("\nРаспределение таргета default:")
print(df["default"].value_counts(normalize=True))


# 2. Build the feature matrix and the target
# Every column except `default` (the target) and `client_id` is a feature.
non_feature_cols = ("default", "client_id")
feature_cols = [name for name in df.columns if name not in non_feature_cols]

X = df.loc[:, feature_cols]
y = df["default"]

print("\nРазмеры матрицы признаков и таргета:", X.shape, y.shape)


# 3. Train/test split
# 20% of the rows are held out for testing; the split is stratified on y and
# uses a fixed seed so that it is reproducible between runs.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)


# 4. Baseline: DummyClassifier
# The "most_frequent" strategy always predicts the majority class seen in fit.
dummy = DummyClassifier(strategy="most_frequent", random_state=42).fit(
    X_train, y_train
)

y_pred_dummy = dummy.predict(X_test)
# Column 1 is used as the positive-class score (assumes a binary target).
y_proba_dummy = dummy.predict_proba(X_test)[:, 1]

acc_dummy, roc_auc_dummy = (
    accuracy_score(y_test, y_pred_dummy),
    roc_auc_score(y_test, y_proba_dummy),
)

baseline_report = (
    "\n=== Бейзлайн: DummyClassifier ===",
    f"accuracy: {acc_dummy:.3f}",
    f"ROC-AUC:  {roc_auc_dummy:.3f}",
)
for line in baseline_report:
    print(line)


# 5. Logistic regression in a Pipeline + selection of C
# Scaling lives inside the pipeline, so it is re-fitted on whatever data the
# pipeline is fitted on (each CV training fold below).
pipe = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("logreg", LogisticRegression(max_iter=1000, random_state=42)),
    ]
)

C_grid = [0.01, 0.1, 1.0, 10.0]

# Each candidate C is scored with 5-fold cross-validation on the TRAINING
# split only. Picking C by test-set metrics would leak the test data into
# model selection and make the final test scores optimistically biased.
results = []

for C in C_grid:
    pipe.set_params(logreg__C=C)
    cv_scores = cross_validate(
        pipe,
        X_train,
        y_train,
        cv=5,
        scoring=("accuracy", "roc_auc"),
    )

    # The "test_*" keys hold the scores on the held-out CV folds,
    # not on X_test; the mean over the folds is reported.
    results.append(
        {
            "C": C,
            "accuracy": cv_scores["test_accuracy"].mean(),
            "roc_auc": cv_scores["test_roc_auc"].mean(),
        }
    )

results_df = pd.DataFrame(results)
print("\nРезультаты логистической регрессии для разных C:")
display(results_df)

# idxmax returns the first row with the highest score, so a tie is resolved
# deterministically in favour of the earlier (smaller) C in the grid.
best_row = results_df.loc[results_df["roc_auc"].idxmax()]
best_C = float(best_row["C"])
print(f"\nЛучшее значение C по ROC-AUC: {best_C}")


# 6. Refit the pipeline with the selected C and evaluate it on the test split
pipe.set_params(logreg__C=best_C).fit(X_train, y_train)

y_pred_lr = pipe.predict(X_test)
# Column 1 is used as the positive-class score (assumes a binary target).
y_proba_lr = pipe.predict_proba(X_test)[:, 1]

# ROC-AUC is computed from the scores; the other metrics from hard labels.
roc_auc_lr = roc_auc_score(y_test, y_proba_lr)
acc_lr = accuracy_score(y_test, y_pred_lr)
prec_lr = precision_score(y_test, y_pred_lr)
recall_lr = recall_score(y_test, y_pred_lr)
f1_lr = f1_score(y_test, y_pred_lr)

logreg_report = (
    "\n=== Логистическая регрессия (лучшая модель) ===",
    f"accuracy:  {acc_lr:.3f}",
    f"ROC-AUC:   {roc_auc_lr:.3f}",
    f"precision: {prec_lr:.3f}",
    f"recall:    {recall_lr:.3f}",
    f"F1:        {f1_lr:.3f}",
)
for line in logreg_report:
    print(line)


# 7. ROC curves and saving the figure
# Make sure the output directory exists before the figure is written.
os.makedirs("figures", exist_ok=True)

fpr_dummy, tpr_dummy, _ = roc_curve(y_test, y_proba_dummy)
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_proba_lr)

fig, ax = plt.subplots(figsize=(6, 6))
ax.plot(fpr_dummy, tpr_dummy, label=f"Dummy (AUC = {roc_auc_dummy:.3f})")
ax.plot(fpr_lr, tpr_lr, label=f"LogReg (AUC = {roc_auc_lr:.3f})")
# Diagonal reference line for a random classifier.
ax.plot([0, 1], [0, 1], "k--", label="Random")

ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC-кривые на тестовой выборке")
ax.legend()
ax.grid(True)

fig.tight_layout()
# Save before show(), while the figure is still available.
fig.savefig("figures/roc_curves.png")
plt.show()


# 8. Summary table comparing the models
# One row per model: the baseline first, then the logistic regression.
compare_df = pd.DataFrame(
    {
        "model": ["Dummy(most_frequent)", f"LogReg(C={best_C})"],
        "accuracy": [acc_dummy, acc_lr],
        "roc_auc": [roc_auc_dummy, roc_auc_lr],
    }
)

print("\nСравнение моделей:")
display(compare_df)


ModuleNotFoundError: No module named 'sklearn'