# 02. Базовая модель

## Обучение логистической регрессии и расчет метрик

In [None]:

import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score, brier_score_loss

# Если загружал full в прошлом ноутбуке - прочитай из файла/БД. Здесь ожидаем full в памяти.
# Для автономности:
from pathlib import Path
DATA_PATH = Path("../data/GiveMeSomeCredit-training.csv")  # adjust to your local setup
TARGET = "SeriousDlqin2yrs"

df = pd.read_csv(DATA_PATH)

# Drop every column whose name starts with "unnamed" (case-insensitive),
# e.g. an index column that was written out without a header.
unnamed_cols = [name for name in df.columns if name.lower().startswith("unnamed")]
df = df.drop(columns=unnamed_cols)

# Keep only rows with an age inside [18, 120] (both ends inclusive).
df = df[df["age"].between(18, 120)].copy()

# Bucket age into right-closed intervals: (17, 24], (24, 34], ..., (64, 200].
age_edges = [17, 24, 34, 44, 54, 64, 200]
age_labels = ["18-24", "25-34", "35-44", "45-54", "55-64", "65+"]
df["age_group"] = pd.cut(df["age"], bins=age_edges, labels=age_labels, right=True)

# Add 0/1 indicator columns marking where the original value was missing.
for col in ("MonthlyIncome", "NumberOfDependents"):
    df[f"is_missing_{col}"] = df[col].isna().astype(int)

from sklearn.model_selection import train_test_split
# First hold out 30% of the rows, then halve that hold-out into validation and
# test parts. Both splits are stratified on the target column.
train_df, temp_df = train_test_split(
    df,
    test_size=0.30,
    stratify=df[TARGET],
    random_state=42,
)
valid_df, test_df = train_test_split(
    temp_df,
    test_size=0.50,
    stratify=temp_df[TARGET],
    random_state=42,
)


def _split_features_target(frame):
    """Return ``(X, y)`` for *frame*.

    X is the frame without the target and ``age_group`` columns (a column
    that is absent is silently ignored); y is the target column cast to int.
    """
    features = frame.drop(columns=[TARGET, "age_group"], errors="ignore")
    target = frame[TARGET].astype(int)
    return features, target


X_train, y_train = _split_features_target(train_df)
X_valid, y_valid = _split_features_target(valid_df)
X_test, y_test = _split_features_target(test_df)

# Baseline pipeline: median imputation -> standardisation -> logistic regression.
pipeline_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    (
        "clf",
        LogisticRegression(max_iter=1000, class_weight="balanced", random_state=42),
    ),
]
model = Pipeline(steps=pipeline_steps)

model.fit(X_train, y_train)

# Column 1 of predict_proba holds the probability of the positive class.
proba_valid = model.predict_proba(X_valid)[:, 1]
proba_test = model.predict_proba(X_test)[:, 1]

# Each metric is computed right before it is printed, in the listed order.
for label, metric, y_true, scores in (
    ("ROC-AUC (valid):", roc_auc_score, y_valid, proba_valid),
    ("ROC-AUC (test):", roc_auc_score, y_test, proba_test),
    ("PR-AUC  (test):", average_precision_score, y_test, proba_test),
    ("Brier  (test):", brier_score_loss, y_test, proba_test),
):
    print(label, metric(y_true, scores))


## Выбор порога по условию полноты (например, ≥ 0.80)

In [None]:

from sklearn.metrics import precision_recall_curve
import numpy as np

# Pick the decision threshold on the validation set so that recall stays at or
# above `target_recall`.
#
# precision_recall_curve returns `thresholds` in increasing order, and
# recall[i] is the recall obtained when predicting positive for
# score >= thresholds[i]. Recall is therefore non-increasing and recall[0] is
# its maximum, so the first index with recall >= target is always 0. What we
# need is the LAST such index: the highest threshold (and hence the best
# precision) that still meets the recall target.
precision, recall, thresholds = precision_recall_curve(y_valid, proba_valid)
target_recall = 0.80

# `recall` has one extra trailing element (recall = 0) with no matching
# threshold, so it is excluded to keep indices aligned with `thresholds`.
eligible = np.flatnonzero(recall[:-1] >= target_recall)
if eligible.size:
    idx = int(eligible[-1])
else:
    # Target recall is unreachable: fall back to the lowest threshold, which
    # gives the maximum achievable recall.
    idx = 0

THRESHOLD_FINAL = float(thresholds[idx])
print("THRESHOLD_FINAL =", THRESHOLD_FINAL)


## Сохранение модели

In [None]:

import joblib
from pathlib import Path

# Persist the fitted pipeline, creating the target directory first if needed.
model_file = "../models/logreg_baseline.joblib"
Path(model_file).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, model_file)
print("Сохранено: models/logreg_baseline.joblib")
