# Train Test Validation Split


### Importing the full (large) dataset

In [1]:
import pandas as pd
# CSV of the linked birth / infant death records used throughout this notebook.
NVSS_CSV_URL = "https://media.githubusercontent.com/media/JuneWayne/Newborn-Baby-Health-Monitoring/refs/heads/main/Data/linkco2013us_den.csv"

# low_memory=False asks pandas to parse the file in one pass rather than in
# chunks, so each column gets a single consistent dtype.
df = pd.read_csv(NVSS_CSV_URL, low_memory=False)

# Bare last expression: renders a truncated preview of the raw frame.
df

Unnamed: 0,revision,laterec,idnumber,dob_yy,dob_mm,dob_wk,ostate,ocntyfips,ocntypop,bfacil,...,aged,ager5,ager22,manner,dispo,autopsy,place,ucod,ucod130,recwt
0,A,0,,2013,1,3,,,,1.0,...,,,,,,,,,,1.0
1,A,0,,2013,1,3,,,,1.0,...,,,,,,,,,,1.0
2,A,0,,2013,1,4,,,,1.0,...,,,,,,,,,,1.0
3,A,0,,2013,1,3,,,,1.0,...,,,,,,,,,,1.0
4,A,0,,2013,1,3,,,,1.0,...,,,,,,,,,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3940759,A,0,,2013,10,6,,,,1.0,...,,,,,,,,,,1.0
3940760,A,0,,2013,12,2,,,,1.0,...,,,,,,,,,,1.0
3940761,A,0,,2013,12,6,,,,1.0,...,,,,,,,,,,1.0
3940762,A,0,,2013,10,3,,,,1.0,...,,,,,,,,,,1.0


### Variables used from NVSS linked birth–infant death data

| Variable   | Description |
|-----------|-------------|
| `matchs`      | Match status between birth and infant death records; used to derive the binary outcome (1 = linked to infant death <1 year, 2 = no linked infant death). |
| `mager14`     | Mother’s age at delivery, recoded into 14 ordered age groups (captures non-linear age-related risk). |
| `mar`         | Mother’s marital status at the time of birth (e.g., married vs unmarried). |
| `meduc`       | Mother’s highest level of education reported on the birth record. |
| `mbrace`      | Mother’s bridged race category (harmonized race classification across reporting systems). |
| `umhisp`      | Mother’s Hispanic origin (whether the mother is of Hispanic/Latina ethnicity). |
| `lbo`         | Live birth order of the infant (first, second, third live birth, etc.). |
| `tbo`         | Total birth order, counting all previous live births and fetal deaths. |
| `precare_rec` | Recode of the month in pregnancy when prenatal care began (e.g., first trimester vs later or none). |
| `previs_rec`  | Recode of the total number of prenatal care visits during pregnancy. |
| `wtgain_rec`  | Recode of maternal gestational weight gain (e.g., inadequate, adequate, excessive). |
| `urf_diab`    | Indicator that the mother had diabetes (pre-existing or gestational) during this pregnancy. |
| `urf_chyper`  | Indicator that the mother had chronic hypertension prior to or early in pregnancy. |
| `urf_phyper`  | Indicator of pregnancy-associated (gestational) hypertension. |
| `urf_eclam`   | Indicator that the mother experienced eclampsia during this pregnancy. |
| `rf_ppterm`   | History of previous preterm birth in earlier pregnancies. |
| `rf_ppoutc`   | History of previous poor pregnancy outcome (e.g., fetal or neonatal death, very small infant). |
| `rf_cesar`    | Indicator that the mother has had at least one prior cesarean delivery. |
| `rf_ncesar`   | Number of previous cesarean deliveries. |
| `tobuse`      | Indicator of any tobacco use during the current pregnancy. |
| `cig_rec6`    | Six-category recode of number of cigarettes smoked per day during pregnancy (intensity of smoking). |
| `sex`         | Infant’s sex (male or female). |
| `combgest`    | Gestational age in completed weeks, combining clinical estimate and last menstrual period information. |
| `gestrec10`   | Ten-category recode of gestational age (e.g., extremely preterm, very preterm, term, post-term). |
| `dbwt`        | Infant birth weight in grams (continuous measure). |
| `bwtr14`      | Fourteen-category recode of birth weight (captures low, very low, and high birth weight). |
| `apgar5r`     | Recode of the 5-minute Apgar score summarizing newborn condition shortly after birth. |
| `dplural`     | Plurality of the pregnancy (singleton, twin, triplet, etc.). |
| `ab_vent`     | Indicator that the infant received any assisted ventilation after birth. |
| `ab_vent6`    | Indicator that assisted ventilation lasted more than six hours. |
| `ab_nicu`     | Indicator that the infant was admitted to a neonatal intensive care unit (NICU). |
| `ca_anen`     | Indicator that the infant had anencephaly (severe neural tube defect). |
| `ca_menin`    | Indicator of meningomyelocele or spina bifida. |
| `ca_heart`    | Indicator of cyanotic congenital heart disease. |
| `ca_ompha`    | Indicator of omphalocele (abdominal wall defect with herniated organs). |
| `ca_gastro`   | Indicator of gastroschisis (abdominal wall defect with exposed intestines). |
| `ca_limb`     | Indicator of limb reduction defect (missing or underdeveloped limbs). |
| `ca_cleftlp`  | Indicator of cleft lip with or without cleft palate. |
| `ca_cleft`    | Indicator of cleft palate alone (without cleft lip). |
| `ca_downs`    | Indicator that the infant had Down syndrome (trisomy 21). |
| `ca_chrom`    | Indicator of other suspected chromosomal disorder (excluding classic Down syndrome). |


In [2]:
import pandas as pd

# Candidate features, grouped by theme. The flat `usecols` list below is built
# from these groups in declaration order.
feature_groups = {
    "outcome / linkage": [
        "matchs",        # match status (1=death<1yr, 2=alive at 1yr)
    ],
    "maternal demographics": [
        "mager14",       # mother's age (14-category recode)
        "mar",           # mother's marital status
        "meduc",         # mother's education (detail)
        "mbrace",        # mother's bridged race
        "umhisp",        # mother's Hispanic origin
    ],
    "obstetric history & prenatal care": [
        "lbo",           # live birth order
        "tbo",           # total birth order
        "precare_rec",   # month prenatal care began (recode)
        "previs_rec",    # number of prenatal visits (recode)
        "wtgain_rec",    # gestational weight gain (recode)
    ],
    "maternal health risks": [
        "urf_diab",      # diabetes (unrevised)
        "urf_chyper",    # chronic hypertension
        "urf_phyper",    # pregnancy-associated hypertension
        "urf_eclam",     # eclampsia
        "rf_ppterm",     # history of preterm birth
        "rf_ppoutc",     # history of poor pregnancy outcome
        "rf_cesar",      # any previous cesarean
        "rf_ncesar",     # number of previous cesareans
    ],
    "smoking & tobacco": [
        "tobuse",        # any tobacco use during pregnancy
        "cig_rec6",      # cigarettes per day (6-level recode)
    ],
    "infant characteristics at birth": [
        "sex",           # infant sex
        "combgest",      # gestational age in weeks (combined)
        "gestrec10",     # gestational age (10-category recode)
        "dbwt",          # birth weight in grams
        "bwtr14",        # birth weight (14-category recode)
        "apgar5r",       # 5-minute Apgar score (recode)
        "dplural",       # plurality (singleton, twin, etc.)
    ],
    "neonatal interventions / complications": [
        "ab_vent",       # any assisted ventilation
        "ab_vent6",      # assisted ventilation > 6 hours
        "ab_nicu",       # admission to NICU
    ],
    "congenital anomalies": [
        "ca_anen",       # anencephaly
        "ca_menin",      # meningomyelocele / spina bifida
        "ca_heart",      # cyanotic congenital heart disease
        "ca_ompha",      # omphalocele
        "ca_gastro",     # gastroschisis
        "ca_limb",       # limb reduction defect
        "ca_cleftlp",    # cleft lip with/without cleft palate
        "ca_cleft",      # cleft palate alone
        "ca_downs",      # Down syndrome
        "ca_chrom",      # other suspected chromosomal disorder
    ],
}

# Flatten the groups into one ordered list of column names.
usecols = [name for group in feature_groups.values() for name in group]

# Keep only the requested columns that are actually present in the raw frame;
# absent ones are skipped silently.
available = set(df.columns)
existing_cols = [name for name in usecols if name in available]

# Work on an independent copy so later edits never touch `df`.
df_filtered = df.loc[:, existing_cols].copy()
df_filtered.head()

Unnamed: 0,matchs,mager14,mar,meduc,mbrace,umhisp,lbo,tbo,precare_rec,previs_rec,...,ca_anen,ca_menin,ca_heart,ca_ompha,ca_gastro,ca_limb,ca_cleftlp,ca_cleft,ca_downs,ca_chrom
0,2,8,1,3.0,1.0,1,2,9,1.0,5,...,N,N,N,N,N,N,N,N,N,N
1,2,9,1,4.0,21.0,0,4,5,1.0,5,...,N,N,N,N,N,N,N,N,N,N
2,2,9,1,6.0,1.0,0,1,1,1.0,7,...,N,N,N,N,N,N,N,N,N,N
3,2,10,1,6.0,1.0,0,3,5,2.0,5,...,N,N,N,N,N,N,N,N,N,N
4,2,8,2,2.0,3.0,0,1,1,1.0,9,...,N,N,N,N,N,N,N,N,N,N


In [3]:
# Coerce the match-status codes to numbers; values that cannot be parsed become NaN.
df_filtered["matchs"] = pd.to_numeric(df_filtered["matchs"], errors="coerce")

# Inspect the column that was just coerced (not the raw `df`), keeping NaN in
# the tally so any coercion failures are visible.
print(df_filtered["matchs"].value_counts(dropna=False))

matchs
2    3917605
1      23159
Name: count, dtype: int64


### Dropping features that may lead to a label leak (e.g. cause of death)
- We don't want the model to learn from features that are tied to infant mortality with certainty.

In [5]:
# Keep only rows whose match status is one of the two valid codes. The mask is
# built from df_filtered's own numeric column, so it does not depend on the raw
# frame's dtype or on index alignment with `df`.
has_valid_match = df_filtered["matchs"].isin([1, 2])
# .copy() makes the filtered frame independent, so the column assignment below
# is unambiguous and does not raise SettingWithCopyWarning.
df_filtered = df_filtered.loc[has_valid_match].copy()

# Binary target: 1 when the birth record is linked to an infant death, else 0.
df_filtered["infant_death"] = (df_filtered["matchs"] == 1).astype(int)

# Just in case some of the columns identified below still exist in our dataset
must_drop_columns = [
    # death timing / age
    "aged",
    "ager5",
    "ager22",
    # circumstances of death
    "manner",
    "dispo",
    "autopsy",
    "place",
    # cause of death codes
    "ucod",
    "ucod130",
    # infant id
    "idnumber",
    "recwt",
]

# Only drop the leak-prone columns that are actually present (drop() would
# raise on missing labels otherwise).
leaky_present = [c for c in must_drop_columns if c in df_filtered.columns]
df_filtered = df_filtered.drop(columns=leaky_present)

# Drop columns with more than 80% missing data
MAX_MISSING_FRAC = 0.8
n_rows = len(df_filtered)  # row count after filtering; not used further in this cell
missing_frac = df_filtered.isnull().mean()  # per-column share of missing values
missing_data_columns = missing_frac[missing_frac > MAX_MISSING_FRAC].index
df_filtered = df_filtered.drop(columns=missing_data_columns)
display(df_filtered)

# Report how many columns were removed for being too sparse (this count does not
# include the leak-prone columns dropped above).
print(f"Dropped {len(missing_data_columns)} columns")

Unnamed: 0,matchs,mager14,mar,meduc,mbrace,umhisp,lbo,tbo,precare_rec,previs_rec,...,ca_menin,ca_heart,ca_ompha,ca_gastro,ca_limb,ca_cleftlp,ca_cleft,ca_downs,ca_chrom,infant_death
0,2,8,1,3.0,1.0,1,2,9,1.0,5,...,N,N,N,N,N,N,N,N,N,0
1,2,9,1,4.0,21.0,0,4,5,1.0,5,...,N,N,N,N,N,N,N,N,N,0
2,2,9,1,6.0,1.0,0,1,1,1.0,7,...,N,N,N,N,N,N,N,N,N,0
3,2,10,1,6.0,1.0,0,3,5,2.0,5,...,N,N,N,N,N,N,N,N,N,0
4,2,8,2,2.0,3.0,0,1,1,1.0,9,...,N,N,N,N,N,N,N,N,N,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3940759,2,9,2,6.0,2.0,0,1,3,3.0,5,...,N,N,N,N,N,N,N,N,N,0
3940760,2,9,1,3.0,2.0,2,1,1,1.0,6,...,N,N,N,N,N,N,N,N,N,0
3940761,2,11,1,4.0,10.0,0,1,1,2.0,6,...,N,N,N,N,N,N,N,N,N,0
3940762,2,10,2,6.0,1.0,0,1,1,1.0,7,...,N,N,N,N,N,N,N,N,N,0


Dropped 2 columns


### Drop all NaN values (imputation would likely undermine data quality)

In [6]:
# Remove every row that still contains a missing value, then renumber rows from 0.
df_filtered = df_filtered.dropna().reset_index(drop=True)

# Show the resulting (rows, columns). A bare `df_filtered.shape` in the middle
# of a cell is never rendered, so it is printed explicitly.
print(df_filtered.shape)
# Sanity check: total count of missing cells, which should be 0 after dropna().
print(df_filtered.isnull().sum().sum())
display(df_filtered)

0


Unnamed: 0,matchs,mager14,mar,meduc,mbrace,umhisp,lbo,tbo,precare_rec,previs_rec,...,ca_menin,ca_heart,ca_ompha,ca_gastro,ca_limb,ca_cleftlp,ca_cleft,ca_downs,ca_chrom,infant_death
0,2,8,1,3.0,1.0,1,2,9,1.0,5,...,N,N,N,N,N,N,N,N,N,0
1,2,9,1,4.0,21.0,0,4,5,1.0,5,...,N,N,N,N,N,N,N,N,N,0
2,2,9,1,6.0,1.0,0,1,1,1.0,7,...,N,N,N,N,N,N,N,N,N,0
3,2,10,1,6.0,1.0,0,3,5,2.0,5,...,N,N,N,N,N,N,N,N,N,0
4,2,8,2,2.0,3.0,0,1,1,1.0,9,...,N,N,N,N,N,N,N,N,N,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3564412,2,9,2,6.0,2.0,0,1,3,3.0,5,...,N,N,N,N,N,N,N,N,N,0
3564413,2,9,1,3.0,2.0,2,1,1,1.0,6,...,N,N,N,N,N,N,N,N,N,0
3564414,2,11,1,4.0,10.0,0,1,1,2.0,6,...,N,N,N,N,N,N,N,N,N,0
3564415,2,10,2,6.0,1.0,0,1,1,1.0,7,...,N,N,N,N,N,N,N,N,N,0


### Prepare features and target variable

In [9]:
# Target vector: the binary infant_death flag.
y = df_filtered["infant_death"]

# Feature matrix: everything except the target itself and `matchs`, the status
# column the target was derived from.
label_columns = ["infant_death", "matchs"]
X = df_filtered.drop(labels=label_columns, axis="columns")

### Train / validation / test split

In [10]:
from sklearn.model_selection import train_test_split

# Stage 1: hold out 20% of all rows as the test set; the other 80% stays in the
# "main" pool. Stratifying on y keeps the class ratio the same in both parts.
X_main, X_test, y_main, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Stage 2: carve 25% of the main pool off as validation (0.25 * 0.8 = 20% of all
# rows), leaving 60% of all rows for training. Stratified again on the pool labels.
X_train, X_val, y_train, y_val = train_test_split(
    X_main,
    y_main,
    test_size=0.25,
    random_state=42,
    stratify=y_main,
)


### Output as CSV

In [11]:
# Re-attach the label to each feature split. assign() returns a new frame, so
# the X_* frames are left untouched; .values attaches the labels by position.
train_df = X_train.assign(infant_death=y_train.values)
val_df = X_val.assign(infant_death=y_val.values)
test_df = X_test.assign(infant_death=y_test.values)

# Full cleaned dataset with the label column (re)written from y. Unlike the
# split files, this frame comes from df_filtered and therefore still carries
# the raw `matchs` column.
aggregated_df = df_filtered.assign(infant_death=y.values)

# Write each frame to its CSV file without the index column.
for frame, csv_path in (
    (train_df, "nvss_train.csv"),
    (val_df, "nvss_val.csv"),
    (test_df, "nvss_test.csv"),
    (aggregated_df, "nvss_aggregated.csv"),
):
    frame.to_csv(csv_path, index=False)
