In [None]:
import pandas as pd
import numpy as np

In [None]:
# Path to the raw training CSV, relative to the current working directory.
DATASET = "../data/cs-training.csv"
# Name of the target column; main() reports its mean as the "Target rate".
COLUMN_TARGET = "SeriousDlqin2yrs"

# Load the raw data and preview its first rows (displayed when this is the
# last expression of a notebook cell).
df = pd.read_csv(DATASET)
df.head()

In [None]:
def cap_series(s: pd.Series, p_high=0.99):
    """Return a copy of *s* with values above its ``p_high`` quantile capped.

    Args:
        s: Series to cap. It is not modified.
        p_high: Quantile (between 0 and 1) used as the upper bound.

    Returns:
        A new Series in which every value greater than the ``p_high``
        quantile of *s* is replaced by that quantile.
    """
    return s.clip(upper=s.quantile(p_high))

def main():
    """Clean the raw training CSV and write the result to ``train_clean.csv``.

    Reads the file at ``DATASET`` and applies, in order:

    1. Drop the ``"Unnamed: 0"`` column if it is present.
    2. Keep only rows whose ``age`` is greater than 0.
    3. Add 0/1 missing-value flags for ``MonthlyIncome`` and
       ``NumberOfDependents``, then impute them (median and 0 respectively).
    4. For each past-due counter column, flag values >= 90 in a new
       ``<column>_anom90`` column and replace them with the median of the
       remaining values.
    5. Cap selected numeric columns at their 99th percentile via
       ``cap_series``.
    6. Print the final shape, the mean of ``COLUMN_TARGET`` and the ten
       columns with the most remaining missing values.
    7. Write the cleaned frame to ``train_clean.csv`` in the current working
       directory.

    Returns:
        None. The results are the printed summary and the written CSV file.
    """
    df = pd.read_csv(DATASET)

    # 1) Drop the junk column (leftover index from a previous export).
    if "Unnamed: 0" in df.columns:
        df = df.drop(columns=["Unnamed: 0"])

    # 2) Remove rows with an invalid age (keep only age > 0).
    df = df[df["age"] > 0].copy()

    # 3) Missing-value flags, created BEFORE imputing so the information
    #    that a value was absent is preserved.
    df["MonthlyIncome_missing"] = df["MonthlyIncome"].isna().astype(int)
    df["Dependents_missing"] = df["NumberOfDependents"].isna().astype(int)

    df["MonthlyIncome"] = df["MonthlyIncome"].fillna(df["MonthlyIncome"].median())
    df["NumberOfDependents"] = df["NumberOfDependents"].fillna(0)

    # 4) Handle anomalous values (>= 90) in the past-due counter columns.
    late_cols = [
        "NumberOfTime30-59DaysPastDueNotWorse",
        "NumberOfTime60-89DaysPastDueNotWorse",
        "NumberOfTimes90DaysLate",
    ]

    for c in late_cols:
        if c not in df.columns:
            continue
        is_anomalous = df[c] >= 90
        # Keep the "signal" that the original value was anomalous.
        df[f"{c}_anom90"] = is_anomalous.astype(int)
        # Blank out the absurd values. Series.mask builds a new Series, so an
        # integer column is upcast without the in-place
        # `df.loc[mask, c] = np.nan` assignment, which pandas >= 2.1
        # deprecates for integer dtypes.
        cleaned = df[c].mask(is_anomalous)
        # Impute with the median of the remaining values (per the original
        # note, this median is expected to be 0).
        df[c] = cleaned.fillna(cleaned.median())

    # 5) Cap outliers at the 99th percentile.
    cap_cols = [
        "RevolvingUtilizationOfUnsecuredLines",
        "DebtRatio",
        "MonthlyIncome",
        "NumberOfOpenCreditLinesAndLoans",
        "NumberRealEstateLoansOrLines",
        "NumberOfDependents",
        *late_cols,
    ]

    for c in cap_cols:
        if c in df.columns:
            df[c] = cap_series(df[c], p_high=0.99)

    # 6) Final sanity check.
    print("Shape final:", df.shape)
    print("Target rate:", df[COLUMN_TARGET].mean())
    print("Missing restantes (top 10):")
    print(df.isna().sum().sort_values(ascending=False).head(10))

    # 7) Save (careful: do NOT push this file to GitHub).
    df.to_csv("train_clean.csv", index=False)
    print("Salvo: train_clean.csv")

# Run the cleaning pipeline only when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    main()
