## Mortality Data Cleaning

Process CDC mortality data (2003-2015) and merge with population estimates.

### Import Libraries

In [18]:
import requests
import zipfile
import io
import pandas as pd
import numpy as np

pd.set_option("mode.copy_on_write", True)

### Load Mortality Data

In [19]:
url = "https://www.dropbox.com/scl/fi/bnkoej224ve1tr35fhek8/US_VitalStatistics.zip?rlkey=oenpdsvsiovlqw7v7j1yhldye&dl=1"

resp = requests.get(url)
resp.raise_for_status()
zip_bytes = io.BytesIO(resp.content)

dfs = []

with zipfile.ZipFile(zip_bytes, "r") as zf:
    txt_files = [
        name
        for name in zf.namelist()
        if name.lower().endswith(".txt")
        and "__macosx" not in name.lower()
        and "/._" not in name
    ]

    for name in sorted(txt_files):
        print("Reading:", name)
        with zf.open(name) as f:
            df = pd.read_csv(f, sep="\t", encoding="latin1")
            dfs.append(df)

mortality_03_15 = pd.concat(dfs, ignore_index=True)

print("Number of files read:", len(dfs))
print("Final dataframe shape:", mortality_03_15.shape)
mortality_03_15.sample(20)

Reading: Underlying Cause of Death, 2003.txt
Reading: Underlying Cause of Death, 2004.txt
Reading: Underlying Cause of Death, 2005.txt
Reading: Underlying Cause of Death, 2006.txt
Reading: Underlying Cause of Death, 2007.txt
Reading: Underlying Cause of Death, 2008.txt
Reading: Underlying Cause of Death, 2009.txt
Reading: Underlying Cause of Death, 2010.txt
Reading: Underlying Cause of Death, 2011.txt
Reading: Underlying Cause of Death, 2012.txt
Reading: Underlying Cause of Death, 2013.txt
Reading: Underlying Cause of Death, 2014.txt
Reading: Underlying Cause of Death, 2015.txt
Number of files read: 13
Final dataframe shape: (57436, 8)


Unnamed: 0,Notes,County,County Code,Year,Year Code,Drug/Alcohol Induced Cause,Drug/Alcohol Induced Cause Code,Deaths
5038,,"Franklin County, IL",17055.0,2004.0,2004.0,All other non-drug and non-alcohol causes,O9,556.0
32584,,"Gaston County, NC",37071.0,2010.0,2010.0,All other alcohol-induced causes,A9,19.0
34266,,"Uinta County, WY",56041.0,2010.0,2010.0,All other non-drug and non-alcohol causes,O9,126.0
2234,,"Clark County, NV",32003.0,2003.0,2003.0,All other alcohol-induced causes,A9,121.0
10164,,"Polk County, MN",27119.0,2005.0,2005.0,All other non-drug and non-alcohol causes,O9,342.0
32254,,"Douglas County, NV",32005.0,2010.0,2010.0,Drug poisonings (overdose) Unintentional (X40-...,D1,11.0
43165,,"Hancock County, WV",54029.0,2012.0,2012.0,All other non-drug and non-alcohol causes,O9,427.0
41211,,"Fillmore County, NE",31059.0,2012.0,2012.0,All other non-drug and non-alcohol causes,O9,80.0
17316,,"Duval County, FL",12031.0,2007.0,2007.0,All other non-drug and non-alcohol causes,O9,6964.0
2590,,"Richmond County, NC",37153.0,2003.0,2003.0,All other non-drug and non-alcohol causes,O9,527.0


### Initial Data Exploration

In [20]:
mortality_03_15.columns

Index(['Notes', 'County', 'County Code', 'Year', 'Year Code',
       'Drug/Alcohol Induced Cause', 'Drug/Alcohol Induced Cause Code',
       'Deaths'],
      dtype='object')

In [21]:
mortality_03_15.isna().sum()

Notes                              57241
County                               195
County Code                          195
Year                                 195
Year Code                            195
Drug/Alcohol Induced Cause           195
Drug/Alcohol Induced Cause Code      195
Deaths                               195
dtype: int64

In [22]:
mortality_03_15.duplicated().sum()

np.int64(183)

In [23]:
mortality_03_15[mortality_03_15.duplicated()]

Unnamed: 0,Notes,County,County Code,Year,Year Code,Drug/Alcohol Induced Cause,Drug/Alcohol Induced Cause Code,Deaths
4094,---,,,,,,,
4096,---,,,,,,,
4101,---,,,,,,,
8237,---,,,,,,,
8238,"Dataset: Underlying Cause of Death, 1999-2017",,,,,,,
...,...,...,...,...,...,...,...,...
57431,Suggested Citation: Centers for Disease Contro...,,,,,,,
57432,"1999-2017 on CDC WONDER Online Database, relea...",,,,,,,
57433,compiled from data provided by the 57 vital st...,,,,,,,
57434,at http://wonder.cdc.gov/ucd-icd10.html on Oct...,,,,,,,


In [24]:
mortality_03_15_clean = mortality_03_15.copy()

### Handle Missing Values

In [25]:
mortality_03_15_clean.isna().sum()

Notes                              57241
County                               195
County Code                          195
Year                                 195
Year Code                            195
Drug/Alcohol Induced Cause           195
Drug/Alcohol Induced Cause Code      195
Deaths                               195
dtype: int64

In [26]:
for col in mortality_03_15_clean.columns:
    uniques = mortality_03_15_clean[col].astype(str).unique()
    unusual = [
        u
        for u in uniques
        if u.strip().lower()
        in ["missing", "n/a", "na", "none", ".", "null", "suppressed", ""]
    ]
    if unusual:
        print(f"{col}: {unusual}")

Deaths: ['Missing']


In [27]:
mortality_03_15_clean[mortality_03_15_clean["Deaths"] == "Missing"]

Unnamed: 0,Notes,County,County Code,Year,Year Code,Drug/Alcohol Induced Cause,Drug/Alcohol Induced Cause Code,Deaths
52756,,"Prince of Wales-Outer Ketchikan Census Area, AK",2201.0,2015.0,2015.0,Drug poisonings (overdose) Unintentional (X40-...,D1,Missing
52757,,"Prince of Wales-Outer Ketchikan Census Area, AK",2201.0,2015.0,2015.0,Drug poisonings (overdose) Suicide (X60-X64),D2,Missing
52758,,"Prince of Wales-Outer Ketchikan Census Area, AK",2201.0,2015.0,2015.0,Drug poisonings (overdose) Homicide (X85),D3,Missing
52759,,"Prince of Wales-Outer Ketchikan Census Area, AK",2201.0,2015.0,2015.0,Drug poisonings (overdose) Undetermined (Y10-Y14),D4,Missing
52760,,"Prince of Wales-Outer Ketchikan Census Area, AK",2201.0,2015.0,2015.0,All other drug-induced causes,D9,Missing
52761,,"Prince of Wales-Outer Ketchikan Census Area, AK",2201.0,2015.0,2015.0,"Alcohol poisonings (overdose) (X45, X65, Y15)",A1,Missing
52762,,"Prince of Wales-Outer Ketchikan Census Area, AK",2201.0,2015.0,2015.0,All other alcohol-induced causes,A9,Missing
52763,,"Prince of Wales-Outer Ketchikan Census Area, AK",2201.0,2015.0,2015.0,All other non-drug and non-alcohol causes,O9,Missing
52765,,"Skagway-Hoonah-Angoon Census Area, AK",2232.0,2015.0,2015.0,Drug poisonings (overdose) Unintentional (X40-...,D1,Missing
52766,,"Skagway-Hoonah-Angoon Census Area, AK",2232.0,2015.0,2015.0,Drug poisonings (overdose) Suicide (X60-X64),D2,Missing


In [28]:
mortality_03_15_clean["Deaths"] = mortality_03_15_clean["Deaths"].replace(
    "Missing", np.nan
)

### Load Population Data

In [29]:
population = pd.read_csv("../01_data/clean/population_2000_2024.csv")
print(f"Population data shape: {population.shape}")

Population data shape: (14976, 7)


### Standardize FIPS Codes

In [30]:
mortality_03_15_clean["County Code"] = (
    mortality_03_15_clean["County Code"].astype(str).str.zfill(5)
)
population["fips"] = population["fips"].astype(str).str.zfill(5)

### Filter Data

In [31]:
obsolete_fips = ["02201", "02232", "02280", "02270", "46113", "51515", "51560"]

mortality_03_15_clean = mortality_03_15_clean[
    ~mortality_03_15_clean["County Code"].isin(obsolete_fips)
]

mortality_03_15_clean = mortality_03_15_clean[
    mortality_03_15_clean["Drug/Alcohol Induced Cause Code"] == "D1"
]

print(f"Filtered mortality data shape: {mortality_03_15_clean.shape}")

Filtered mortality data shape: (7573, 8)


### Merge Datasets

In [None]:
mortality_03_15_clean["Year"] = mortality_03_15_clean["Year"].astype(int)
mortality_03_15_clean["County Code"] = mortality_03_15_clean["County Code"].str.replace(
    ".0", "", regex=False
)

pop_mortality_merged = pd.merge(
    mortality_03_15_clean,
    population,
    left_on=["County Code", "Year"],
    right_on=["fips", "year"],
    how="inner",
)

print(f"Merged data shape: {pop_mortality_merged.shape}")

Merged data shape: (1916, 15)


### Export Cleaned Data

In [36]:
pop_mortality_merged.to_csv(
    "../01_data/clean/merged_mortality_population.csv", index=False
)
print("\nExported to: ../01_data/clean/merged_mortality_population.csv")


Exported to: ../01_data/clean/merged_mortality_population.csv
