# 01. Подготовка данных

## Загрузка данных и первичная подготовка
Датасет: Give Me Some Credit (`GiveMeSomeCredit-training.csv`).

In [None]:

import pandas as pd
from pathlib import Path

# Path to the raw training CSV; adjust it to your local layout.
DATA_PATH = Path("../data/GiveMeSomeCredit-training.csv")
df = pd.read_csv(DATA_PATH)

# Drop every column whose name starts with "unnamed" (case-insensitive), if any.
unnamed_cols = [name for name in df.columns if name.lower().startswith("unnamed")]
if unnamed_cols:
    df = df.drop(columns=unnamed_cols)

TARGET = "SeriousDlqin2yrs"

# Keep only rows with age in [18, 120]; rows with a missing age fail the
# comparison and are dropped as well.
age_in_range = df["age"].between(18, 120)
df = df[age_in_range].copy()

# Right-closed age bands: (17, 24], (24, 34], ..., (64, 200].
age_edges = [17, 24, 34, 44, 54, 64, 200]
age_labels = ["18-24", "25-34", "35-44", "45-54", "55-64", "65+"]
df["age_group"] = pd.cut(df["age"], bins=age_edges, labels=age_labels, right=True)

# Missing-value indicators (1 = value absent), added only for columns that exist.
for feature in ("MonthlyIncome", "NumberOfDependents"):
    if feature not in df.columns:
        continue
    df[f"is_missing_{feature}"] = df[feature].isna().astype(int)

df.head()


## Разбиение на обучающую/проверочную/тестовую выборки (70/15/15)

In [None]:

from sklearn.model_selection import train_test_split

# Stage 1: hold out 30% of the rows, stratified on the target; 70% stay for training.
train_df, temp_df = train_test_split(
    df,
    test_size=0.30,
    stratify=df[TARGET],
    random_state=42,
)
# Stage 2: halve the hold-out, stratified again, into validation and test
# (15% / 15% of all rows).
valid_df, test_df = train_test_split(
    temp_df,
    test_size=0.50,
    stratify=temp_df[TARGET],
    random_state=42,
)

# Tag each part with the name of its split before stacking them back together.
for part, tag in ((train_df, "train"), (valid_df, "valid"), (test_df, "test")):
    part["split"] = tag

full = pd.concat([train_df, valid_df, test_df], ignore_index=True)
full["split"].value_counts()


## Экспорт в PostgreSQL (опционально)
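Заполни строку подключения и выгрузи данные в таблицу `raw.credit` (чтобы писать сразу в `features.credit`, замени `schema="raw"` на `schema="features"`). Учти: `if_exists="replace"` перезаписывает существующую таблицу.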

In [None]:

# from sqlalchemy import create_engine
# engine = create_engine("postgresql://USER:PASSWORD@HOST:PORT/DBNAME")
# full.to_sql("credit", engine, schema="raw", if_exists="replace", index=False)
# print("Загружено в raw.credit")
