In [1]:
# Cell 1 Environment
!pip -q install numpy pandas scikit-learn xgboost==2.0.3 torch matplotlib pyyaml scipy joblib pyarrow

import os, math, json, gc, random, warnings
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
from dataclasses import dataclass

import torch, joblib
from pathlib import Path
warnings.filterwarnings("ignore")

@dataclass
class Config:
    SEED: int = 42
    CSV_PATH: str = r"C:\Users\Nicee\Desktop\kenkyu\INFLUD20-26-06-2025.csv"
    OUT_DIR: str = r"C:\Users\Nicee\Desktop\kenkyu\gnamboost_outputs"
    ROW_LIMIT: int | None = None

CFG = Config()

def seed_everything(seed: int):
    random.seed(seed); np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

seed_everything(CFG.SEED)

OUT_DIR = Path(CFG.OUT_DIR)
FIG_DIR = OUT_DIR / "figs"
INT_DIR = OUT_DIR / "interim"
for p in [FIG_DIR, INT_DIR]:
    p.mkdir(parents=True, exist_ok=True)

CSV_PATH = CFG.CSV_PATH
row_limit = CFG.ROW_LIMIT


In [2]:
# Cell 2 Read
import pandas as pd, numpy as np

SYMPT  = ["FEBRE","TOSSE","GARGANTA","DISPNEIA","DESC_RESP","DIARREIA","VOMITO","DOR_ABD","FADIGA","PERD_OLFT","PERD_PALA","OUTRO_SIN"]
COMORB = ["CARDIOPATI","HEMATOLOGI","HEPATICA","ASMA","DIABETES","NEUROLOGIC","PNEUMOPATI","IMUNODEPRE","RENAL","OBESIDADE","TABAG"]
EXTRA_KEYS = ["CS_ESCOL_N","CO_UNI_NOT","SEM_NOT","DT_SIN_PRI","DT_NASC","CO_MUN_RES","CO_MU_INTE"]

USE_COLS = ["DT_INTERNA","EVOLUCAO","CLASSI_FIN","PCR_SARS2","HOSPITAL","NU_IDADE_N","CS_SEXO","CS_RACA","SG_UF_NOT","SG_UF_RES"] + SYMPT + COMORB + EXTRA_KEYS

peek = pd.read_csv(CSV_PATH, sep=';', nrows=5, low_memory=False)
usecols = [c for c in USE_COLS if c in peek.columns]
df = pd.read_csv(CSV_PATH, sep=';', usecols=usecols, low_memory=False, nrows=row_limit)

df["UF"] = np.where(df.get("SG_UF_RES").notna() if "SG_UF_RES" in df.columns else False,
                    df.get("SG_UF_RES"), df.get("SG_UF_NOT"))
df["DT_INTERNA"] = pd.to_datetime(df["DT_INTERNA"], errors="coerce")


In [3]:
# Cell 3 Build Overload
import numpy as np, pandas as pd

def ensure_epi_week(df0):
    df = df0.copy()
    if "SEM_NOT" in df.columns:
        df["EPI_WEEK"] = pd.to_numeric(df["SEM_NOT"], errors="coerce")
    elif "DT_SIN_PRI" in df.columns:
        tmp = pd.to_datetime(df["DT_SIN_PRI"], errors="coerce", dayfirst=True)
        df["EPI_WEEK"] = tmp.dt.isocalendar().week.astype("Int64")
    else:
        df["EPI_WEEK"] = df["DT_INTERNA"].dt.isocalendar().week.astype("Int64")
    return df

def build_overload(df0):
    df = ensure_epi_week(df0)
    key = ["CO_UNI_NOT","EPI_WEEK"] if set(["CO_UNI_NOT","EPI_WEEK"]).issubset(df.columns) else ["UF","EPI_WEEK"]
    g = df.groupby(key).size().reset_index(name="Times").sort_values(key)
    g["baseline"] = g.groupby(key[0])["Times"].transform(lambda s: s.rolling(8, min_periods=3).median().shift(1))
    g["Overload"] = (g["Times"] / g["baseline"].replace(0, np.nan)).clip(0.2, 5.0).fillna(1.0)
    df = df.merge(g[key + ["Overload"]], on=key, how="left")
    df["Overload"] = df["Overload"].fillna(1.0)
    return df

df = build_overload(df)


In [4]:
# Cell 4 Cohort Filters
import numpy as np, pandas as pd

df = df.copy()
start, end = pd.Timestamp("2020-02-25"), pd.Timestamp("2020-09-21")
df = df[(df["DT_INTERNA"] >= start) & (df["DT_INTERNA"] <= end)]
if "PCR_SARS2" in df.columns: df = df[df["PCR_SARS2"] == 1]
if "HOSPITAL" in df.columns: df = df[df["HOSPITAL"] == 1]
df = df[df["EVOLUCAO"].isin([1,2])]
df["NU_IDADE_N"] = pd.to_numeric(df["NU_IDADE_N"], errors="coerce")
df = df[(df["NU_IDADE_N"] >= 0) & (df["NU_IDADE_N"] <= 110)]
y = (df["EVOLUCAO"] == 2).astype(int).values

print("N:", len(df), "| mortality rate:", y.mean().round(4))


N: 307075 | mortality rate: 0.3451


In [5]:
# Cell 5 Alias & Context Features (leakage-safe)
import numpy as np, pandas as pd

df = df.copy()
if "SEM_NOT" in df.columns:
    df["EPI_WEEK"] = pd.to_numeric(df["SEM_NOT"], errors="coerce")
else:
    df["EPI_WEEK"] = df["DT_INTERNA"].dt.isocalendar().week.astype("Int64")

df["_y"] = (df["EVOLUCAO"] == 2).astype(int)

def roll_feats(tbl, key_name):
    g = (tbl.groupby([key_name, "EPI_WEEK"])
           .agg(adm=("_y", "size"), death=("_y", "sum"))
           .reset_index()
           .sort_values([key_name, "EPI_WEEK"]))
    g["adm_4w"]   = g.groupby(key_name)["adm"].transform(lambda s: s.rolling(4, min_periods=1).sum().shift(1))
    g["death_4w"] = g.groupby(key_name)["death"].transform(lambda s: s.rolling(4, min_periods=1).sum().shift(1))
    g["cfr_4w"]   = (g["death_4w"] / g["adm_4w"].replace(0, np.nan)).fillna(0.0).clip(0, 0.6)
    g["adm_4w"]   = g["adm_4w"].fillna(0.0).clip(0, 500)
    return g[[key_name, "EPI_WEEK", "cfr_4w", "adm_4w"]]

has_hosp = ("CO_UNI_NOT" in df.columns) and df["CO_UNI_NOT"].notna().any()
if has_hosp:
    g_h = roll_feats(df, "CO_UNI_NOT").rename(columns={"cfr_4w": "HOSP_CFR_4w", "adm_4w": "HOSP_ADM_4w"})
    df = df.merge(g_h, on=["CO_UNI_NOT", "EPI_WEEK"], how="left")

g_s = roll_feats(df, "UF").rename(columns={"cfr_4w": "UF_CFR_4w", "adm_4w": "UF_ADM_4w"})
df = df.merge(g_s, on=["UF", "EPI_WEEK"], how="left")

def _safe_series(df_, col, default=np.nan):
    return df_[col] if col in df_.columns else pd.Series(default, index=df_.index)

h_cfr = _safe_series(df, "HOSP_CFR_4w", np.nan)
u_cfr = _safe_series(df, "UF_CFR_4w", np.nan)
df["CTX_CFR_4w"] = h_cfr.combine_first(u_cfr).fillna(0.0).clip(0, 0.6)

h_adm = _safe_series(df, "HOSP_ADM_4w", np.nan)
u_adm = _safe_series(df, "UF_ADM_4w", np.nan)
df["CTX_ADM_4w"] = h_adm.combine_first(u_adm).fillna(0.0).clip(0, 500)


In [6]:
# Cell 6 Assemble Feature Matrix
SYMPT  = ["FEBRE","TOSSE","GARGANTA","DISPNEIA","DESC_RESP","DIARREIA","VOMITO","DOR_ABD","FADIGA","PERD_OLFT","PERD_PALA","OUTRO_SIN"]
COMORB = ["CARDIOPATI","HEMATOLOGI","HEPATICA","ASMA","DIABETES","NEUROLOGIC","PNEUMOPATI","IMUNODEPRE","RENAL","OBESIDADE","TABAG"]
BIN_EXTRA = ["PUERPERA","CS_GESTANT"]

NUM_KEEP = ["NU_IDADE_N","AGE_Y","SATURACAO","OBES_IMC","SYMPT_CT","COMORB_CT","DAYS_ONSET_ADM","EPI_WEEK",
            "Overload","EDU_ORD","AGE_X_EDU","CTX_CFR_4w","CTX_ADM_4w"]
CAT_KEEP = ["CS_SEXO","CS_RACA","CS_ESCOL_N","UF","CO_UNI_NOT"]

def _exist(cols, df): return [c for c in cols if c in df.columns]

base_cols = _exist(["NU_IDADE_N","CS_SEXO","UF","CS_RACA"], df)
feature_cols = base_cols + _exist(SYMPT, df) + _exist(COMORB, df) + _exist(BIN_EXTRA, df) + _exist(NUM_KEEP, df) + _exist(CAT_KEEP, df)
seen=set(); feature_cols=[c for c in feature_cols if not (c in seen or seen.add(c))]

X = df[feature_cols].copy()
y = (df["EVOLUCAO"] == 2).astype(int).values

print("Rebuilt X before split:", X.shape, " | y rate:", float(y.mean()))


Rebuilt X before split: (307075, 32)  | y rate: 0.3451274118700643


In [7]:
# Cell 7 Manual Stratified Split on 6 Key Features
import numpy as np, pandas as pd

rng = np.random.RandomState(CFG.SEED)
dfS = pd.DataFrame(index=X.index)

dfS["AGE_BIN"] = pd.qcut(pd.to_numeric(X["NU_IDADE_N"], errors="coerce"), q=4, duplicates="drop").astype(str)
state_col = "UF"
if state_col not in X.columns: raise KeyError("UF col not found")
dfS["STATE"] = X[state_col].astype(str).fillna("UNK")

comorb_cols = ["CARDIOPATI","HEMATOLOGI","HEPATICA","ASMA","DIABETES","NEUROLOGIC","PNEUMOPATI","IMUNODEPRE","RENAL","OBESIDADE","TABAG"]
if "COMORB_CT" in X.columns:
    com_ct = pd.to_numeric(X["COMORB_CT"], errors="coerce")
else:
    cc = [c for c in comorb_cols if c in X.columns]
    B = X[cc].apply(pd.to_numeric, errors="coerce").replace({2:0,9:np.nan})
    com_ct = B.sum(axis=1)
dfS["COMORB_BIN"] = pd.cut(com_ct, bins=[-1,0,1,100], labels=["0","1","2+"])

edu = pd.to_numeric(df.get("CS_ESCOL_N"), errors="coerce") if "CS_ESCOL_N" in df.columns else pd.Series(np.nan, index=X.index)
dfS["EDU_BIN"] = edu.replace({9:np.nan}).fillna(-1).astype(int).astype(str)

eth_col = "CS_RACA" if "CS_RACA" in X.columns else None
dfS["ETH"] = X[eth_col].astype(str).replace({"9":"MISSING"}).fillna("MISSING") if eth_col else "NA"

dfS["SEX"] = X["CS_SEXO"].astype(str).replace({"9":"MISSING"}).fillna("MISSING") if "CS_SEXO" in X.columns else "NA"

dfS["STRATA_RAW"] = dfS[["AGE_BIN","STATE","COMORB_BIN","EDU_BIN","ETH","SEX"]].astype(str).agg("|".join, axis=1)
ct = dfS["STRATA_RAW"].value_counts()
MIN_PER = 25
rare = set(ct[ct < MIN_PER].index)
dfS["STRATA"] = np.where(dfS["STRATA_RAW"].isin(rare), "RARE", dfS["STRATA_RAW"])

def stratified_split(X, y, strata, test_size=0.2, valid_size=0.2, seed=42):
    rng = np.random.RandomState(seed)
    df_idx = pd.DataFrame({"idx": np.arange(len(X)), "strata": strata})
    test_idx, valid_idx, train_idx = [], [], []
    for s, sub in df_idx.groupby("strata"):
        ids = sub["idx"].values
        rng.shuffle(ids)
        n = len(ids)
        n_test  = int(round(n * test_size))
        n_valid = int(round((n - n_test) * valid_size))
        test_idx.extend(ids[:n_test])
        valid_idx.extend(ids[n_test:n_test+n_valid])
        train_idx.extend(ids[n_test+n_valid:])
    return (X.iloc[train_idx], X.iloc[valid_idx], X.iloc[test_idx],
            y[train_idx], y[valid_idx], y[test_idx])

Xtr, Xva, Xte, ytr, yva, yte = stratified_split(X, y, dfS["STRATA"], test_size=0.20, valid_size=0.20, seed=CFG.SEED)

print("training set:", len(Xtr), "val set:", len(Xva), "test set:", len(Xte))
print("mortality rate:", ytr.mean().round(4), yva.mean().round(4), yte.mean().round(4))

def smd_num(a,b):
    a = pd.to_numeric(pd.Series(a), errors="coerce").dropna().to_numpy()
    b = pd.to_numeric(pd.Series(b), errors="coerce").dropna().to_numpy()
    if len(a)<2 or len(b)<2: return np.nan
    m1,m2=a.mean(),b.mean(); s1,s2=a.std(ddof=1),b.std(ddof=1); sp=np.sqrt((s1**2+s2**2)/2)
    return np.nan if sp==0 else (m1-m2)/sp
def smd_cat(s1,s2):
    v1=pd.Series(s1).astype(str).fillna("MISSING"); v2=pd.Series(s2).astype(str).fillna("MISSING")
    levels=set(v1.unique()).union(v2.unique()); worst=0.0
    for lev in levels:
        p1=(v1==lev).mean(); p2=(v2==lev).mean(); p=(p1+p2)/2; denom=np.sqrt(p*(1-p)) if 0<p<1 else np.nan
        smd=abs(p1-p2)/denom if denom and not np.isnan(denom) else 0.0; worst=max(worst,smd)
    return worst
def max_smd(A,B):
    vals=[abs(smd_num(A["NU_IDADE_N"],B["NU_IDADE_N"])), smd_cat(A[state_col],B[state_col])]
    if "CS_SEXO" in A.columns: vals.append(smd_cat(A["CS_SEXO"],B["CS_SEXO"]))
    if "CS_RACA" in A.columns: vals.append(smd_cat(A["CS_RACA"],B["CS_RACA"]))
    return np.nanmax([v for v in vals if pd.notna(v)])
print("max SMD(Train vs Test) =", round(max_smd(Xtr,Xte),3), " | (Train vs Valid) =", round(max_smd(Xtr,Xva),3))


training set: 196576 val set: 49109 test set: 61390
mortality rate: 0.3468 0.3427 0.3416
max SMD(Train vs Test) = 0.006  | (Train vs Valid) = 0.007


In [8]:
# Cell 8 Feature Engineering
import pandas as pd, numpy as np

Xtr, Xva, Xte = Xtr.copy(), Xva.copy(), Xte.copy()
SYMPT  = ["FEBRE","TOSSE","GARGANTA","DISPNEIA","DESC_RESP","DIARREIA","VOMITO","DOR_ABD","FADIGA","PERD_OLFT","PERD_PALA","OUTRO_SIN"]
COMORB = ["CARDIOPATI","HEMATOLOGI","HEPATICA","ASMA","DIABETES","NEUROLOGIC","PNEUMOPATI","IMUNODEPRE","RENAL","OBESIDADE","TABAG"]
CAT_CAND = ["CS_SEXO","CS_RACA","CS_ESCOL_N","UF","CO_UNI_NOT"]
NUM_CAND = ["AGE_Y","NU_IDADE_N","SATURACAO","OBES_IMC","SYMPT_CT","COMORB_CT","DAYS_ONSET_ADM","EPI_WEEK","Overload","EDU_ORD","AGE_X_EDU","CTX_CFR_4w","CTX_ADM_4w"]
BIN_EXTRA = ["VAX_ANY","VAX_BOOST","PUERPERA","CS_GESTANT"]

def _to_dt(df, cols):
    for c in cols:
        if c in df.columns: df[c]=pd.to_datetime(df[c], errors="coerce", dayfirst=True)
def _yesno(df, cols):
    for c in cols:
        if c in df.columns:
            s=df[c]; s=s.where(~s.isin([9,"9",0,"0","",None]), np.nan)
            s=s.where(~s.isin([2,"2"]), 0); s=s.where(~s.isin([1,"1"]), 1)
            df[c]=pd.to_numeric(s, errors="coerce")
def _exist(cols, df): return [c for c in cols if c in df.columns]
def _mk_age(df):
    if "AGE_Y" not in df.columns:
        if "NU_IDADE_N" in df.columns:
            df["AGE_Y"]=pd.to_numeric(df["NU_IDADE_N"], errors="coerce")
def _mk_counts(df):
    sc=_exist(SYMPT,df); cc=_exist(COMORB,df)
    if sc: df["SYMPT_CT"]=(df[sc]==1).sum(axis=1)
    if cc: df["COMORB_CT"]=(df[cc]==1).sum(axis=1)
def _mk_onset(df):
    if {"DT_SIN_PRI","DT_INTERNA"}.issubset(df.columns):
        d=(df["DT_INTERNA"]-df["DT_SIN_PRI"]).dt.days; df["DAYS_ONSET_ADM"]=d.clip(-1,60)
def _mk_epi(df):
    if "EPI_WEEK" not in df.columns:
        df["EPI_WEEK"]=pd.to_numeric(df.get("EPI_WEEK"), errors="coerce")
def _mk_edu(df):
    if "CS_ESCOL_N" in df.columns:
        df["EDU_ORD"]=pd.to_numeric(df["CS_ESCOL_N"], errors="coerce")
        if "AGE_Y" in df.columns: df["AGE_X_EDU"]=df["AGE_Y"]*df["EDU_ORD"]

for d in (Xtr,Xva,Xte):
    _to_dt(d, ["DT_SIN_PRI","DT_INTERNA","DT_NASC"])
    for c in ["SATURACAO","OBES_IMC","NU_IDADE_N","Overload","CTX_CFR_4w","CTX_ADM_4w"]:
        if c in d.columns: d[c]=pd.to_numeric(d[c], errors="coerce")
for d in (Xtr,Xva,Xte):
    _yesno(d, SYMPT+COMORB+["PUERPERA","CS_GESTANT"])
    _mk_age(d); _mk_counts(d); _mk_onset(d); _mk_epi(d); _mk_edu(d)

num_cols = _exist(NUM_CAND, Xtr)
bin_cols = _exist(SYMPT+COMORB+BIN_EXTRA, Xtr)
cat_cols = _exist(CAT_CAND, Xtr)
cat_cols = [c for c in cat_cols if Xtr[c].nunique(dropna=True) <= 80]

feature_cols = list(dict.fromkeys(num_cols + bin_cols + cat_cols)) 
print(f"[FE] num={len(num_cols)}, bin={len(bin_cols)}, cat={len(cat_cols)}, total={len(feature_cols)}")


[FE] num=10, bin=23, cat=4, total=37


In [9]:
# Cell 9 Preprocessing (Imputation, Scaling, OHE)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

num_pipe = Pipeline([("imputer", SimpleImputer(strategy="median", add_indicator=True)), ("scaler", StandardScaler())])
bin_pipe = Pipeline([("imputer", SimpleImputer(strategy="most_frequent"))])

try:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False, min_frequency=10)
except TypeError:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

cat_pipe = Pipeline([("imputer", SimpleImputer(strategy="most_frequent")), ("ohe", ohe)])

preproc = ColumnTransformer([
    ("num", num_pipe, num_cols),
    ("bin", bin_pipe, bin_cols),
    ("cat", cat_pipe, cat_cols),
], remainder="drop")

Xtr_t = preproc.fit_transform(Xtr[feature_cols])
Xva_t = preproc.transform(Xva[feature_cols])
Xte_t = preproc.transform(Xte[feature_cols])

try:
    cat_names = list(preproc.named_transformers_["cat"].named_steps["ohe"].get_feature_names_out(cat_cols))
except Exception:
    cat_names = []


In [10]:
# Cell 10 Quick Check
def coverage(df_in, cols):
    rows=[]
    for c in cols:
        if c in df_in.columns:
            s=df_in[c]; nn=s.notna().mean(); uniq=s.nunique(dropna=True)
            rows.append([c, nn, uniq, np.nan if s.dropna().empty else float(np.nanmean(pd.to_numeric(s, errors="coerce")))])
        else:
            rows.append([c, 0.0, 0, np.nan])
    return pd.DataFrame(rows, columns=["feature","non_null_rate","n_unique","mean_like"]).sort_values("non_null_rate")

key_cols = ["Overload","EDU_ORD","AGE_X_EDU","SYMPT_CT","COMORB_CT","CTX_CFR_4w","CTX_ADM_4w","EPI_WEEK"]
print("Train coverage:"); display(coverage(Xtr, key_cols))
print("Valid coverage:"); display(coverage(Xva, key_cols))
print("Test  coverage:"); display(coverage(Xte, key_cols))


Train coverage:


Unnamed: 0,feature,non_null_rate,n_unique,mean_like
1,EDU_ORD,0.72049,7,5.682301
2,AGE_X_EDU,0.72049,325,335.722285
0,Overload,1.0,858,1.684532
3,SYMPT_CT,1.0,13,3.57097
4,COMORB_CT,1.0,11,0.876897
5,CTX_CFR_4w,1.0,747,0.343252
6,CTX_ADM_4w,1.0,278,477.566366
7,EPI_WEEK,1.0,53,26.446407


Valid coverage:


Unnamed: 0,feature,non_null_rate,n_unique,mean_like
1,EDU_ORD,0.717404,7,5.66609
2,AGE_X_EDU,0.717404,311,334.476654
0,Overload,1.0,733,1.682816
3,SYMPT_CT,1.0,13,3.570893
4,COMORB_CT,1.0,11,0.87766
5,CTX_CFR_4w,1.0,696,0.343145
6,CTX_ADM_4w,1.0,263,477.924657
7,EPI_WEEK,1.0,52,26.462461


Test  coverage:


Unnamed: 0,feature,non_null_rate,n_unique,mean_like
1,EDU_ORD,0.722658,7,5.691056
2,AGE_X_EDU,0.722658,313,336.282887
0,Overload,1.0,777,1.680717
3,SYMPT_CT,1.0,13,3.559912
4,COMORB_CT,1.0,11,0.877211
5,CTX_CFR_4w,1.0,709,0.343058
6,CTX_ADM_4w,1.0,262,477.162437
7,EPI_WEEK,1.0,53,26.461443


In [11]:
# Persist Artifacts
meta = {
    "feature_cols": feature_cols,
    "num_cols": num_cols,
    "bin_cols": bin_cols,
    "cat_cols": cat_cols,
    "cat_names": cat_names,
    "seed": CFG.SEED,
}

joblib.dump(Xtr, INT_DIR/"Xtr.pkl"); joblib.dump(Xva, INT_DIR/"Xva.pkl"); joblib.dump(Xte, INT_DIR/"Xte.pkl")
joblib.dump(ytr, INT_DIR/"ytr.npy"); joblib.dump(yva, INT_DIR/"yva.npy"); joblib.dump(yte, INT_DIR/"yte.npy")

joblib.dump(Xtr_t, INT_DIR/"Xtr_t.npy"); joblib.dump(Xva_t, INT_DIR/"Xva_t.npy"); joblib.dump(Xte_t, INT_DIR/"Xte_t.npy")

joblib.dump(preproc, INT_DIR/"preproc.joblib")
with open(INT_DIR/"meta.json","w",encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)

df_min = df.copy()
df_min.to_parquet(INT_DIR/"df_filtered.parquet", index=False)
print("Artifacts saved to:", INT_DIR)


Artifacts saved to: C:\Users\Nicee\Desktop\kenkyu\gnamboost_outputs\interim
