# 03. Эксперименты с моделями

## Сравнение нескольких моделей (пример структуры)
Этот пример можно расширить подбором гиперпараметров и кросс-валидацией.

In [None]:

import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier

# Location of the training CSV, relative to the notebook; adjust to your own layout.
DATA_PATH = Path("../data/GiveMeSomeCredit-training.csv")
# Name of the binary label column that the models below predict.
TARGET = "SeriousDlqin2yrs"

df = pd.read_csv(DATA_PATH)

# Remove every column whose name starts with "unnamed" (case-insensitive).
unnamed_cols = [col for col in df.columns if col.lower().startswith("unnamed")]
df = df.drop(columns=unnamed_cols)

# Keep only rows with 18 <= age <= 120 (both bounds inclusive); rows with a
# missing age fail the check and are dropped as well.
df = df[df["age"].between(18, 120)].copy()

# Hold out 20% of the rows for evaluation. The split is stratified on the
# target column and seeded so that it is reproducible.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df[TARGET],
    random_state=42,
)

# Separate the feature matrix from the label (cast to int) for each part.
X_train, y_train = train_df.drop(columns=[TARGET]), train_df[TARGET].astype(int)
X_test, y_test = test_df.drop(columns=[TARGET]), test_df[TARGET].astype(int)

def _median_imputed(*steps):
    """Return a Pipeline that median-imputes missing values, then runs *steps*.

    A fresh ``SimpleImputer`` is created on every call so that the pipelines
    never share a fitted transformer.
    """
    return Pipeline([("imp", SimpleImputer(strategy="median")), *steps])


# Candidate classifiers keyed by the display name used in the results table.
# Every estimator that accepts a seed gets random_state=42.
models = {
    # Linear baseline: features are standardised before the regression and
    # class_weight="balanced" reweights the classes.
    "Логистическая регрессия": _median_imputed(
        ("sc", StandardScaler()),
        ("m", LogisticRegression(max_iter=1000, class_weight="balanced", random_state=42)),
    ),
    # 300-tree random forest trained on all available cores (n_jobs=-1).
    "Случайный лес": _median_imputed(
        ("m", RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1)),
    ),
    # Histogram-based gradient boosting with default hyperparameters.
    "Градиентный бустинг (sklearn)": _median_imputed(
        ("m", HistGradientBoostingClassifier(random_state=42)),
    ),
}

# Fit every candidate on the training split and score it on the hold-out split.
rows = []
for label, estimator in models.items():
    estimator.fit(X_train, y_train)
    # Second column of predict_proba: the probability of the second class
    # (class 1 for a 0/1 target).
    positive_scores = estimator.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, positive_scores)
    rows.append({"модель": label, "ROC-AUC": auc})

# Results table, best ROC-AUC first. It is left as the final bare expression
# so that the notebook renders it as the cell output.
pd.DataFrame(rows).sort_values(by="ROC-AUC", ascending=False)
