In [1]:
# Cell 1 Environment (DNBR / SINASC)
!pip -q install numpy pandas scikit-learn xgboost==2.0.3 torch matplotlib pyyaml scipy joblib pyarrow dbfread

import os, math, json, gc, random, warnings
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
from dataclasses import dataclass

import torch, joblib
from pathlib import Path
warnings.filterwarnings("ignore")

@dataclass
class Config:
    """Run-level settings for this notebook: RNG seed, input file, output folder and optional row cap.

    The `int | None` annotation below is evaluated when the class is created, so this
    cell needs Python 3.10 or newer.
    """
    # Seed handed to seed_everything() and to the stratified split.
    SEED: int = 42
    # DBF file path (DNBR 2024).
    # NOTE: despite the CSV_ prefix this is a .dbf file; Cell 2 opens it with dbfread.DBF.
    CSV_PATH: str = r"C:\Users\Nicee\Desktop\DNBR2024.dbf"
    # Output dir for this project; the "figs" and "interim" sub-folders are created under it.
    OUT_DIR: str = r"C:\Users\Nicee\Desktop\kenkyu\gnamboost_dnbr_outputs"
    # Keep only the first ROW_LIMIT records when set; None means use the whole file.
    ROW_LIMIT: int | None = None  # e.g. 100000 for debug

# Single shared settings instance read by the cells below.
CFG = Config()

def seed_everything(seed: int):
    """Seed every random number generator this notebook relies on.

    Covers Python's `random`, NumPy's global RNG and PyTorch (CPU, plus all CUDA
    devices when CUDA is available), and exports PYTHONHASHSEED.

    Args:
        seed: integer seed applied to all generators.

    Returns:
        None.
    """
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting it
    # here presumably only matters for child processes — confirm if hash order is relied on.
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# Seed all RNGs before any stochastic step runs.
seed_everything(CFG.SEED)

# Output layout: <OUT_DIR>/figs for figures, <OUT_DIR>/interim for intermediate artifacts.
OUT_DIR = Path(CFG.OUT_DIR)
FIG_DIR, INT_DIR = OUT_DIR / "figs", OUT_DIR / "interim"
for p in (FIG_DIR, INT_DIR):
    # parents=True also creates OUT_DIR itself; exist_ok=True makes re-running the cell safe.
    p.mkdir(parents=True, exist_ok=True)

# Short module-level aliases used by the loading cell.
CSV_PATH, row_limit = CFG.CSV_PATH, CFG.ROW_LIMIT

In [2]:
# Cell 2 Read DNBR2024.dbf
import pandas as pd, numpy as np
from dbfread import DBF

# Columns kept for modelling; every other DBF field is discarded after loading.
# The names are grouped into rows of related fields and flattened in order.
_BASE_COL_GROUPS = (
    ("ORIGEM", "CODESTAB", "CODMUNNASC", "LOCNASC"),
    ("IDADEMAE", "ESTCIVMAE", "ESCMAE", "ESCMAE2010", "SERIESCMAE"),
    ("QTDFILVIVO", "QTDFILMORT"),
    ("CODMUNRES", "CODPAISRES"),
    ("GESTACAO", "GRAVIDEZ", "PARTO", "CONSULTAS"),
    ("DTNASC", "HORANASC"),
    ("SEXO", "RACACOR", "PESO"),
    ("APGAR1", "APGAR5"),
    ("IDANOMAL", "CODANOMAL"),
    ("QTDGESTANT", "QTDPARTNOR", "QTDPARTCES"),
    ("IDADEPAI",),
    ("SEMAGESTAC", "TPMETESTIM"),
    ("CONSPRENAT", "MESPRENAT"),
    ("TPAPRESENT", "STTRABPART", "STCESPARTO"),
    ("PARIDADE", "KOTELCHUCK"),
)
BASE_COLS = [name for group in _BASE_COL_GROUPS for name in group]

# Order-preserving de-duplication, in case a name is listed twice above.
USE_COLS = sorted(set(BASE_COLS), key=BASE_COLS.index)

from itertools import islice

# Read the DBF into a DataFrame.
# Records are streamed from disk (dbfread's default) rather than opened with load=True,
# so the file is not held in memory twice (once inside `table`, once in the record list),
# and the optional debug row limit stops the read early instead of trimming afterwards.
table = DBF(CSV_PATH)  # encoding auto
records = iter(table)
if row_limit is not None:
    records = islice(records, row_limit)
df = pd.DataFrame(list(records))

# keep only existing columns
usecols = [c for c in USE_COLS if c in df.columns]
df = df[usecols].copy()

# derive state from residence municipality (first two characters of the code)
# NOTE(review): a null CODMUNRES is stringified before slicing, so it yields a
# two-character fragment of "None"/"nan" rather than a missing value — confirm this is acceptable.
if "CODMUNRES" in df.columns:
    df["UF_RES"] = df["CODMUNRES"].astype(str).str.slice(0, 2)

# numeric casts for key vars; unparseable entries become NaN
for c in ["IDADEMAE","IDADEPAI","PESO","SEMAGESTAC",
          "APGAR1","APGAR5",
          "QTDFILVIVO","QTDFILMORT",
          "QTDGESTANT","QTDPARTNOR","QTDPARTCES"]:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")

# outcome: any congenital anomaly = IDANOMAL equals 1 OR a non-blank CODANOMAL.
# Both parts are index-aligned boolean Series even when a source column is absent,
# and a null code is treated as "no code" (stringifying it first would turn it into
# the non-empty text "None"/"nan" and flag the row as an anomaly).
if "IDANOMAL" in df.columns:
    id_col = pd.to_numeric(df["IDANOMAL"], errors="coerce")
else:
    id_col = pd.Series(np.nan, index=df.index)
cod_col = df.get("CODANOMAL")
if cod_col is not None:
    has_code = cod_col.notna() & cod_col.astype(str).str.strip().ne("")
else:
    has_code = pd.Series(False, index=df.index)
df["Y_ANOM"] = (id_col.eq(1) | has_code).astype(int)

print("N:", len(df), "| anomaly rate:", df["Y_ANOM"].mean().round(4))
df.head()


N: 2384438 | anomaly rate: 0.0114


Unnamed: 0,ORIGEM,CODESTAB,CODMUNNASC,LOCNASC,IDADEMAE,ESTCIVMAE,ESCMAE,ESCMAE2010,SERIESCMAE,QTDFILVIVO,...,TPMETESTIM,CONSPRENAT,MESPRENAT,TPAPRESENT,STTRABPART,STCESPARTO,PARIDADE,KOTELCHUCK,UF_RES,Y_ANOM
0,1,2516500,110001,1,24.0,1,3,2,5.0,1.0,...,8,2,3,1,2,2,1,2,11,0
1,1,2516500,110001,1,29.0,2,5,5,,0.0,...,8,8,5,1,2,1,0,2,11,0
2,1,2516500,110001,1,30.0,5,4,3,2.0,2.0,...,8,8,2,1,2,1,1,5,11,0
3,1,2516500,110001,1,14.0,5,4,2,8.0,0.0,...,8,10,3,1,2,2,0,5,11,0
4,1,2516500,110001,1,24.0,2,4,3,1.0,1.0,...,8,11,2,1,2,2,1,5,11,0


In [3]:
# Cell 3 Assemble Feature Matrix (DNBR)
import numpy as np, pandas as pd

# Rebind `df` to a fresh copy of the frame built by the loading cell before it is used below.
df = df.copy()

# Numeric feature candidates (filtered later to the columns actually present in `df`).
NUM_KEEP = [
    "IDADEMAE",
    "IDADEPAI",
    "PESO",
    "SEMAGESTAC",
    "QTDFILVIVO",
    "QTDFILMORT",
    "QTDGESTANT",
    "QTDPARTNOR",
    "QTDPARTCES",
    "APGAR1",
    "APGAR5",
]

# Categorical feature candidates (same presence filter applies).
CAT_KEEP = [
    "SEXO",
    "RACACOR",
    "RACACORMAE",  # only if present
    "ESCMAE2010",
    "ESTCIVMAE",
    "GESTACAO",
    "GRAVIDEZ",
    "PARTO",
    "CONSPRENAT",
    "MESPRENAT",
    "TPAPRESENT",
    "STTRABPART",
    "STCESPARTO",
    "PARIDADE",
    "KOTELCHUCK",
    "UF_RES",
    "LOCNASC",
    "CODESTAB",
]

def _exist(cols, df):
    return [c for c in cols if c in df.columns]

# base columns always preferred in splits
base_cols = _exist(["IDADEMAE","SEXO","UF_RES","RACACOR"], df)

# Candidate order: base columns first, then numeric, then categorical candidates.
feature_cols = base_cols + _exist(NUM_KEEP, df) + _exist(CAT_KEEP, df)
# A column can be named in more than one list; keep its first occurrence only.
seen = set(feature_cols)
feature_cols = list(dict.fromkeys(feature_cols))

# Feature frame and integer outcome vector, row-aligned with `df`.
X = df.loc[:, feature_cols].copy()
y = df["Y_ANOM"].astype(int).to_numpy()

print("X shape:", X.shape, "| anomaly rate:", float(y.mean()))
print("n_features:", len(feature_cols))


X shape: (2384438, 28) | anomaly rate: 0.011385072708956995
n_features: 28


In [4]:
# Cell 4 Stratified split (DNBR)
import numpy as np, pandas as pd

# NOTE(review): `rng` is not read anywhere in this cell; stratified_split builds its own RNG from `seed`.
rng = np.random.RandomState(CFG.SEED)
dfS = pd.DataFrame(index=X.index)


def _strata_label(values, missing):
    """Stringify `values`, writing `missing` wherever the original entry was null.

    The null test runs on the original values: calling fillna() after astype(str)
    is a no-op, because nulls have already become ordinary text by then.
    """
    return values.astype(str).mask(values.isna(), missing)


# age bins (quartiles of maternal age; duplicate bin edges are merged)
dfS["AGE_BIN"] = pd.qcut(
    pd.to_numeric(df["IDADEMAE"], errors="coerce"),
    q=4, duplicates="drop"
).astype(str)

# state from residence
state_col = "UF_RES"
if state_col not in X.columns:
    raise KeyError("UF_RES col not found")
dfS["STATE"] = _strata_label(X[state_col], "UNK")

# parity / gravidity bin: 0, 1, 2+ (values outside (-1, 99] fall into no bin)
if "QTDGESTANT" in X.columns:
    g = pd.to_numeric(X["QTDGESTANT"], errors="coerce")
    dfS["PAR_BIN"] = pd.cut(g, bins=[-1,0,1,99], labels=["0","1","2+"])
else:
    dfS["PAR_BIN"] = "NA"

# maternal education bin (missing/unparseable -> "-1")
edu = pd.to_numeric(df.get("ESCMAE2010"), errors="coerce") if "ESCMAE2010" in df.columns else pd.Series(np.nan, index=X.index)
dfS["EDU_BIN"] = edu.fillna(-1).astype(int).astype(str)

# race (prefer maternal); code "9" is folded into the MISSING level
eth_col = "RACACORMAE" if "RACACORMAE" in X.columns else ("RACACOR" if "RACACOR" in X.columns else None)
if eth_col is not None:
    dfS["ETH"] = _strata_label(X[eth_col], "MISSING").replace({"9":"MISSING"})
else:
    dfS["ETH"] = "NA"

# sex of newborn; code "9" is folded into the MISSING level
if "SEXO" in X.columns:
    dfS["SEX"] = _strata_label(X["SEXO"], "MISSING").replace({"9":"MISSING"})
else:
    dfS["SEX"] = "NA"

# combine strata: "AGE|STATE|PAR|EDU|ETH|SEX".
# Vectorised str.cat builds the same strings as a row-wise "|".join without a Python call per row.
strata_parts = dfS[["AGE_BIN","STATE","PAR_BIN","EDU_BIN","ETH","SEX"]].astype(str)
dfS["STRATA_RAW"] = strata_parts.iloc[:, 0].str.cat(strata_parts.iloc[:, 1:], sep="|", na_rep="nan")
ct = dfS["STRATA_RAW"].value_counts()

# Strata with fewer than MIN_PER rows are pooled into a single "RARE" stratum.
MIN_PER = 25
rare = set(ct[ct < MIN_PER].index)
dfS["STRATA"] = np.where(dfS["STRATA_RAW"].isin(rare), "RARE", dfS["STRATA_RAW"])

def stratified_split(X, y, strata, test_size=0.2, valid_size=0.2, seed=42):
    """Split rows into train/validation/test, sampling independently inside every stratum.

    Within each stratum the row positions are shuffled, the first
    round(n * test_size) go to test, the next round((n - n_test) * valid_size)
    go to validation and the remainder to train. Note that `valid_size` is a
    fraction of what is left after the test rows are removed.

    Args:
        X: DataFrame of features; rows are selected positionally.
        y: targets aligned positionally with X (NumPy array, pandas object or sequence).
        strata: one label per row of X, matched by position (any index is ignored).
        test_size: fraction of each stratum sent to test, in [0, 1].
        valid_size: fraction of each stratum's non-test rows sent to validation, in [0, 1].
        seed: seed for the NumPy RandomState used for shuffling.

    Returns:
        (X_train, X_valid, X_test, y_train, y_valid, y_test).

    Raises:
        ValueError: if a fraction is outside [0, 1], lengths disagree, or `strata`
            contains nulls (groupby would silently drop those rows from all splits).
    """
    for label, frac in (("test_size", test_size), ("valid_size", valid_size)):
        if not 0.0 <= frac <= 1.0:
            raise ValueError(f"{label} must be in [0, 1], got {frac!r}")

    n_rows = len(X)
    # Positional alignment: discard whatever index `strata` carries.
    strata = pd.Series(strata).reset_index(drop=True)
    if len(strata) != n_rows or len(y) != n_rows:
        raise ValueError(
            f"X, y and strata must have equal length, got {n_rows}, {len(y)}, {len(strata)}"
        )
    if strata.isna().any():
        raise ValueError("strata contains missing labels; fill them before splitting")

    rng = np.random.RandomState(seed)
    df_idx = pd.DataFrame({"idx": np.arange(n_rows), "strata": strata})
    test_idx, valid_idx, train_idx = [], [], []
    for _, sub in df_idx.groupby("strata"):
        # Copy first so the shuffle never writes into the frame's buffer, which may be
        # a read-only view when pandas Copy-on-Write is active.
        ids = sub["idx"].to_numpy().copy()
        rng.shuffle(ids)
        n = len(ids)
        n_test = int(round(n * test_size))
        n_valid = int(round((n - n_test) * valid_size))
        test_idx.extend(ids[:n_test])
        valid_idx.extend(ids[n_test:n_test + n_valid])
        train_idx.extend(ids[n_test + n_valid:])

    train_idx = np.asarray(train_idx, dtype=np.intp)
    valid_idx = np.asarray(valid_idx, dtype=np.intp)
    test_idx = np.asarray(test_idx, dtype=np.intp)

    def _take(idx):
        # pandas targets are indexed by position, not by label.
        return y.iloc[idx] if hasattr(y, "iloc") else np.asarray(y)[idx]

    return (X.iloc[train_idx], X.iloc[valid_idx], X.iloc[test_idx],
            _take(train_idx), _take(valid_idx), _take(test_idx))

# One split with the configured seed: 20% of each stratum to test,
# then 20% of the remaining rows to validation.
Xtr, Xva, Xte, ytr, yva, yte = stratified_split(
    X, y, dfS["STRATA"],
    test_size=0.20, valid_size=0.20, seed=CFG.SEED,
)

# Sanity check: partition sizes and outcome prevalence per partition.
print("train:", len(Xtr), "val:", len(Xva), "test:", len(Xte))
print("anomaly rate:", *(part.mean().round(4) for part in (ytr, yva, yte)))


train: 1526106 val: 381469 test: 476863
anomaly rate: 0.0113 0.0114 0.0117


In [5]:
# Cell 5 Feature engineering (DNBR)
import pandas as pd, numpy as np

# Rebind each split to its own copy before columns are overwritten and added below
# (presumably to keep pandas from flagging those writes as writes to a slice — confirm).
Xtr, Xva, Xte = Xtr.copy(), Xva.copy(), Xte.copy()

# Categorical candidates; filtered below to present columns with at most 80 levels in train.
CAT_CAND = [
    "SEXO",
    "RACACOR",
    "RACACORMAE",
    "ESCMAE2010",
    "ESTCIVMAE",
    "GESTACAO",
    "GRAVIDEZ",
    "PARTO",
    "CONSPRENAT",
    "MESPRENAT",
    "TPAPRESENT",
    "STTRABPART",
    "STCESPARTO",
    "PARIDADE",
    "KOTELCHUCK",
    "UF_RES",
    "LOCNASC",
    "CODESTAB",
]

# Numeric candidates; coerced with pd.to_numeric on every split.
NUM_CAND = [
    "IDADEMAE",
    "IDADEPAI",
    "PESO",
    "SEMAGESTAC",
    "QTDFILVIVO",
    "QTDFILMORT",
    "QTDGESTANT",
    "QTDPARTNOR",
    "QTDPARTCES",
    "APGAR1",
    "APGAR5",
]

# Names of the derived 0/1 flags created by _mk_binaries.
BIN_EXTRA = ["LBW", "PRETERM", "MA35PLUS"]

def _exist(cols, df):
    return [c for c in cols if c in df.columns]

def _mk_binaries(df):
    # low birth weight
    if "PESO" in df.columns:
        w = pd.to_numeric(df["PESO"], errors="coerce")
        df["LBW"] = (w < 2500).astype(float)
    # preterm
    if "SEMAGESTAC" in df.columns:
        g = pd.to_numeric(df["SEMAGESTAC"], errors="coerce")
        df["PRETERM"] = (g < 37).astype(float)
    # maternal age ≥35
    if "IDADEMAE" in df.columns:
        a = pd.to_numeric(df["IDADEMAE"], errors="coerce")
        df["MA35PLUS"] = (a >= 35).astype(float)

# Coerce the numeric candidates and add the derived flags on each split separately.
for d in (Xtr, Xva, Xte):
    for c in _exist(NUM_CAND, d):
        d[c] = pd.to_numeric(d[c], errors="coerce")
    _mk_binaries(d)

# Column groups are decided from the training split only.
num_cols, bin_cols = _exist(NUM_CAND, Xtr), _exist(BIN_EXTRA, Xtr)
# Categoricals with more than 80 distinct non-null training values are left out.
cat_cols = [c for c in _exist(CAT_CAND, Xtr) if not Xtr[c].nunique(dropna=True) > 80]

# Final model input order: numeric, derived flags, categorical (first occurrence wins).
feature_cols = [*dict.fromkeys([*num_cols, *bin_cols, *cat_cols])]
print(f"[FE] num={len(num_cols)}, bin={len(bin_cols)}, cat={len(cat_cols)}, total={len(feature_cols)}")


[FE] num=11, bin=3, cat=16, total=30


In [6]:
# Cell 6 Preprocessing (Imputation, Scaling, OHE) for DNBR
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Numeric branch: median imputation with missing-indicator columns, then standardisation.
# The scaler runs after the imputer, so the indicator columns it appends are scaled as well.
num_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median", add_indicator=True)),
    ("scaler", StandardScaler()),
])

# Derived 0/1 flags: fill gaps with the most frequent value, no scaling.
bin_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
])

# Dense one-hot encoding that ignores unseen levels at transform time.
# The first constructor call uses the `sparse_output` / `min_frequency` keywords; if the
# installed scikit-learn rejects them (TypeError) the fallback uses `sparse` instead and
# therefore applies no `min_frequency` setting.
_ohe_common = dict(handle_unknown="ignore")
try:
    ohe = OneHotEncoder(sparse_output=False, min_frequency=10, **_ohe_common)
except TypeError:
    ohe = OneHotEncoder(sparse=False, **_ohe_common)

# Categorical branch: most-frequent imputation, then the encoder above.
cat_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", ohe),
])

# Route each column group to its branch; columns not listed are dropped.
preproc = ColumnTransformer(
    transformers=[
        ("num", num_pipe, num_cols),
        ("bin", bin_pipe, bin_cols),
        ("cat", cat_pipe, cat_cols),
    ],
    remainder="drop",
)

# Fit on the training split only, then apply the fitted transformer to validation and test.
Xtr_t = preproc.fit_transform(Xtr[feature_cols])
Xva_t = preproc.transform(Xva[feature_cols])
Xte_t = preproc.transform(Xte[feature_cols])

# Recover the one-hot output names for later reporting. This stays best-effort
# (an empty list on failure), but the reason is now printed instead of being swallowed,
# so an empty `cat_names` in the saved metadata can be traced.
try:
    cat_names = list(preproc.named_transformers_["cat"].named_steps["ohe"].get_feature_names_out(cat_cols))
except Exception as exc:
    cat_names = []
    print(f"[WARN] one-hot feature names unavailable ({type(exc).__name__}: {exc}); cat_names set to []")

print("Train transformed:", Xtr_t.shape, "Val:", Xva_t.shape, "Test:", Xte_t.shape)


Train transformed: (1526106, 177) Val: (381469, 177) Test: (476863, 177)


In [7]:
# Cell 7 Quick check and persist
def coverage(df_in, cols):
    """Summarise how well each requested column is populated.

    Args:
        df_in: DataFrame to inspect.
        cols: column names to report on; names missing from `df_in` still get a row.

    Returns:
        DataFrame with columns feature, non_null_rate, n_unique and mean_like,
        sorted by ascending non_null_rate. `mean_like` is the NaN-ignoring mean of the
        column after numeric coercion (NaN when the column has no non-null values);
        a name absent from `df_in` is reported as (0.0, 0, NaN).
    """
    def _describe(name):
        # Absent column: report it as completely empty.
        if name not in df_in.columns:
            return (name, 0.0, 0, np.nan)
        col = df_in[name]
        if col.dropna().empty:
            mean_like = np.nan
        else:
            mean_like = float(np.nanmean(pd.to_numeric(col, errors="coerce")))
        return (name, col.notna().mean(), col.nunique(dropna=True), mean_like)

    summary = pd.DataFrame(
        [_describe(name) for name in cols],
        columns=["feature","non_null_rate","n_unique","mean_like"],
    )
    return summary.sort_values("non_null_rate")

# Coverage report for the key numeric variables, one table per split.
key_cols = ["IDADEMAE","SEMAGESTAC","PESO","QTDGESTANT","QTDFILVIVO","QTDFILMORT","APGAR1","APGAR5"]
for banner, split in (("Train coverage:", Xtr), ("Valid coverage:", Xva), ("Test  coverage:", Xte)):
    print(banner); display(coverage(split, key_cols))

# Column bookkeeping stored next to the arrays.
meta = dict(
    feature_cols=feature_cols,
    num_cols=num_cols,
    bin_cols=bin_cols,
    cat_cols=cat_cols,
    cat_names=cat_names,
    seed=CFG.SEED,
)

# Everything is written with joblib.dump, so the ".npy" files are joblib pickles
# and should be read back with joblib.load.
for fname, payload in (
    ("Xtr.pkl", Xtr), ("Xva.pkl", Xva), ("Xte.pkl", Xte),
    ("ytr.npy", ytr), ("yva.npy", yva), ("yte.npy", yte),
    ("Xtr_t.npy", Xtr_t), ("Xva_t.npy", Xva_t), ("Xte_t.npy", Xte_t),
    ("preproc.joblib", preproc),
):
    joblib.dump(payload, INT_DIR/fname)

with open(INT_DIR/"meta.json","w",encoding="utf-8") as f:
    f.write(json.dumps(meta, ensure_ascii=False, indent=2))

# Snapshot of the working frame (selected columns plus UF_RES and Y_ANOM).
df_min = df.copy()
df_min.to_parquet(INT_DIR/"df_filtered.parquet", index=False)
print("Artifacts saved to:", INT_DIR)


Train coverage:


Unnamed: 0,feature,non_null_rate,n_unique,mean_like
5,QTDFILMORT,0.978206,25,0.282094
3,QTDGESTANT,0.98282,39,1.283872
4,QTDFILVIVO,0.985773,22,1.018133
6,APGAR1,0.990079,12,8.380118
7,APGAR5,0.990099,12,9.335105
1,SEMAGESTAC,0.991259,27,38.246168
2,PESO,0.999915,4518,3149.106196
0,IDADEMAE,0.999989,57,27.761518


Valid coverage:


Unnamed: 0,feature,non_null_rate,n_unique,mean_like
5,QTDFILMORT,0.978205,16,0.280532
3,QTDGESTANT,0.982657,26,1.282183
4,QTDFILVIVO,0.985894,20,1.01848
6,APGAR1,0.990185,12,8.383618
7,APGAR5,0.99024,12,9.333285
1,SEMAGESTAC,0.991205,27,38.248028
2,PESO,0.999916,3658,3150.913975
0,IDADEMAE,0.999997,52,27.76603


Test  coverage:


Unnamed: 0,feature,non_null_rate,n_unique,mean_like
5,QTDFILMORT,0.978052,20,0.280814
3,QTDGESTANT,0.982775,31,1.281496
4,QTDFILVIVO,0.985698,18,1.016645
6,APGAR1,0.990064,12,8.385381
7,APGAR5,0.990125,12,9.340046
1,SEMAGESTAC,0.991203,27,38.252247
2,PESO,0.999901,3814,3151.921699
0,IDADEMAE,0.999994,54,27.759789


Artifacts saved to: C:\Users\Nicee\Desktop\kenkyu\gnamboost_dnbr_outputs\interim


In [8]:
# Cell 8: Save artifacts for modeling notebook

import json, joblib
from pathlib import Path

# Take the output location from the shared config instead of repeating the absolute path,
# so changing CFG.OUT_DIR cannot leave this cell writing to a stale folder.
OUT_DIR = Path(CFG.OUT_DIR)
INT_DIR = OUT_DIR / "interim"
OUT_DIR.mkdir(parents=True, exist_ok=True)
INT_DIR.mkdir(parents=True, exist_ok=True)

# X / y, transformed X and the fitted preprocessing object.
# All files are written with joblib.dump, so the ".npy" files are joblib pickles
# and should be read back with joblib.load.
for fname, payload in (
    ("Xtr.pkl", Xtr), ("Xva.pkl", Xva), ("Xte.pkl", Xte),
    ("ytr.npy", ytr), ("yva.npy", yva), ("yte.npy", yte),
    ("Xtr_t.npy", Xtr_t), ("Xva_t.npy", Xva_t), ("Xte_t.npy", Xte_t),
    ("preproc.joblib", preproc),
):
    joblib.dump(payload, INT_DIR / fname)

# meta info
# "cat_names" is included so that rewriting meta.json here does not drop the key
# that the Cell 7 version of the file contains.
meta = {
    "feature_cols": feature_cols,
    "num_cols": num_cols,
    "bin_cols": bin_cols,
    "cat_cols": cat_cols,
    "cat_names": cat_names,
    "seed": CFG.SEED,
}
with open(INT_DIR / "meta.json", "w", encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)

# store the working frame (selected columns plus UF_RES and Y_ANOM)
df_min = df.copy()
df_min.to_parquet(INT_DIR / "df_filtered.parquet", index=False)

print("Artifacts saved to:", INT_DIR)


Artifacts saved to: C:\Users\Nicee\Desktop\kenkyu\gnamboost_dnbr_outputs\interim
