## Import Required Libraries

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

# Turn on pandas' "mode.copy_on_write" option for the whole session.
pd.set_option("mode.copy_on_write", True)

## Load and Preview Raw Data

In [2]:
# Folder (relative to the user's home directory) that holds the yearly exports
folder_path = "~/Desktop/US_VitalStatistics/"

# Read one tab-separated file per year from 2003 through 2015, keeping every
# column as a string, and stack the results under a fresh 0..n-1 index
raw_data = pd.concat(
    map(
        lambda year: pd.read_csv(
            f"{folder_path}Underlying Cause of Death, {year}.txt",
            sep="\t",
            dtype=str,
        ),
        range(2003, 2016),
    ),
    ignore_index=True,
)

raw_data.head()

Unnamed: 0,Notes,County,County Code,Year,Year Code,Drug/Alcohol Induced Cause,Drug/Alcohol Induced Cause Code,Deaths
0,,"Autauga County, AL",1001,2003.0,2003.0,All other non-drug and non-alcohol causes,O9,397.0
1,,"Baldwin County, AL",1003,2003.0,2003.0,Drug poisonings (overdose) Unintentional (X40-...,D1,10.0
2,,"Baldwin County, AL",1003,2003.0,2003.0,All other alcohol-induced causes,A9,14.0
3,,"Baldwin County, AL",1003,2003.0,2003.0,All other non-drug and non-alcohol causes,O9,1479.0
4,,"Barbour County, AL",1005,2003.0,2003.0,All other non-drug and non-alcohol causes,O9,287.0


## Clean and Rename Columns

In [3]:
# Drop the columns that are not used downstream and rename the remaining ones
# to snake_case.
# This is a single non-mutating chain: `raw_data` is left untouched, so the
# cell can be re-run safely (an in-place drop would raise KeyError the second
# time because the columns would already be gone).
cleaned_data = raw_data.drop(
    columns=["Notes", "Year Code", "Drug/Alcohol Induced Cause Code"]
).rename(
    columns={
        "County": "county",
        "County Code": "county_code",
        "Year": "year",
        "Drug/Alcohol Induced Cause": "cause",
        "Deaths": "deaths",
    }
)

cleaned_data.head()

Unnamed: 0,county,county_code,year,cause,deaths
0,"Autauga County, AL",1001,2003.0,All other non-drug and non-alcohol causes,397.0
1,"Baldwin County, AL",1003,2003.0,Drug poisonings (overdose) Unintentional (X40-...,10.0
2,"Baldwin County, AL",1003,2003.0,All other alcohol-induced causes,14.0
3,"Baldwin County, AL",1003,2003.0,All other non-drug and non-alcohol causes,1479.0
4,"Barbour County, AL",1005,2003.0,All other non-drug and non-alcohol causes,287.0


## Filter for Drug-Related Causes

In [4]:
# Collect the distinct labels found in the 'cause' column (missing values
# included) so the category of interest can be picked by its exact text
unique_causes = pd.unique(cleaned_data["cause"])
print(f"Unique values in 'cause':\n{unique_causes}")

Unique values in 'cause':
['All other non-drug and non-alcohol causes'
 'Drug poisonings (overdose) Unintentional (X40-X44)'
 'All other alcohol-induced causes' 'All other drug-induced causes'
 'Drug poisonings (overdose) Suicide (X60-X64)'
 'Drug poisonings (overdose) Undetermined (Y10-Y14)'
 'Alcohol poisonings (overdose) (X45, X65, Y15)' nan
 'Drug poisonings (overdose) Homicide (X85)']


In [5]:
# Keep only the rows whose cause is exactly the unintentional drug overdose
# category; rows with a missing cause never match and are excluded
filtered_data = cleaned_data.loc[
    cleaned_data["cause"].eq("Drug poisonings (overdose) Unintentional (X40-X44)")
]
filtered_data.head()

Unnamed: 0,county,county_code,year,cause,deaths
1,"Baldwin County, AL",1003,2003.0,Drug poisonings (overdose) Unintentional (X40-...,10.0
38,"Jefferson County, AL",1073,2003.0,Drug poisonings (overdose) Unintentional (X40-...,37.0
54,"Mobile County, AL",1097,2003.0,Drug poisonings (overdose) Unintentional (X40-...,26.0
78,"Anchorage Borough, AK",2020,2003.0,Drug poisonings (overdose) Unintentional (X40-...,31.0
91,"Matanuska-Susitna Borough, AK",2170,2003.0,Drug poisonings (overdose) Unintentional (X40-...,11.0


## Filter for Target States

In [6]:
# Take the text after the last ", " in the county label as the state
# abbreviation, then keep only the states under study
filtered_data = filtered_data.assign(
    state=filtered_data["county"].str.split(", ").str[-1]
).loc[
    lambda frame: frame["state"].isin(
        ["FL", "WA", "OR", "GA", "OK", "AL", "CO", "ME"]
    )
]

filtered_data.head()

Unnamed: 0,county,county_code,year,cause,deaths,state
1,"Baldwin County, AL",1003,2003.0,Drug poisonings (overdose) Unintentional (X40-...,10.0,AL
38,"Jefferson County, AL",1073,2003.0,Drug poisonings (overdose) Unintentional (X40-...,37.0,AL
54,"Mobile County, AL",1097,2003.0,Drug poisonings (overdose) Unintentional (X40-...,26.0,AL
378,"Adams County, CO",8001,2003.0,Drug poisonings (overdose) Unintentional (X40-...,32.0,CO
383,"Arapahoe County, CO",8005,2003.0,Drug poisonings (overdose) Unintentional (X40-...,33.0,CO


## Handle Data Types

In [7]:
# Inspect the column dtypes before conversion (the files were read with
# dtype=str, so the numeric-looking columns are still text at this point)
filtered_data.dtypes

county         object
county_code    object
year           object
cause          object
deaths         object
state          object
dtype: object

In [8]:
# Normalise the text columns and parse the numeric ones.
# NOTE(review): `.str.title()` also lower-cases the state suffix inside the
# county label ("Baldwin County, AL" -> "Baldwin County, Al"). Kept unchanged
# because consumers of the saved file may rely on the current labels; confirm
# before switching to a plain `.str.strip()`.
filtered_data = filtered_data.assign(
    # Left-pad county codes with zeros to 5 characters (FIPS format)
    county_code=filtered_data["county_code"].astype(str).str.zfill(5),
    county=filtered_data["county"].str.strip().str.title(),
    state=filtered_data["state"].str.strip().str.upper(),
    # Values that cannot be parsed become NaN here and are handled below
    year=pd.to_numeric(filtered_data["year"], errors="coerce"),
    deaths=pd.to_numeric(filtered_data["deaths"], errors="coerce"),
)

# Casting NaN to an integer dtype raises, so rows whose year or deaths could
# not be parsed are removed (and reported) before the integer cast.
invalid_rows = filtered_data[["year", "deaths"]].isna().any(axis=1)
if invalid_rows.any():
    print(f"Dropping {invalid_rows.sum()} rows with non-numeric year or deaths")
filtered_data = filtered_data.loc[~invalid_rows].astype(
    {"year": "int", "deaths": int}
)

# Check new data types
filtered_data.dtypes

county         object
county_code    object
year            int64
cause          object
deaths          int64
state          object
dtype: object

## Aggregate Annual Deaths by County

In [9]:
# Total deaths for every (county_code, county, state, year) combination,
# with the grouping keys returned as ordinary columns
grouped_data = (
    filtered_data.groupby(["county_code", "county", "state", "year"])["deaths"]
    .sum()
    .reset_index()
)

# Sanity check: count fully identical rows in the aggregated table
print(f"Number of duplicate rows: {grouped_data.duplicated().sum()}")

grouped_data.head()

Number of duplicate rows: 0


Unnamed: 0,county_code,county,state,year,deaths
0,1003,"Baldwin County, Al",AL,2003,10
1,1003,"Baldwin County, Al",AL,2004,18
2,1003,"Baldwin County, Al",AL,2005,14
3,1003,"Baldwin County, Al",AL,2006,11
4,1003,"Baldwin County, Al",AL,2007,24


## Save Data to Parquet

In [10]:
# Persist the aggregated table as Parquet without the row index.
# The path is relative to the notebook's working directory; the parent folder
# is created first so the write does not fail when "data/" is missing.
output_path = Path("data/mortality_data.parquet")
output_path.parent.mkdir(parents=True, exist_ok=True)
grouped_data.to_parquet(output_path, index=False)