***
## Data Cleaning  - US Vital Statistics
***

In [13]:
import vd_lib as vd
import pandas as pd
import numpy as np

# Load data
datasets = vd.read_data()

datasets[0].head()

Unnamed: 0,Notes,County,County Code,Year,Year Code,Drug/Alcohol Induced Cause,Drug/Alcohol Induced Cause Code,Deaths
0,,"Autauga County, AL",1001,2003.0,2003.0,All other non-drug and non-alcohol causes,O9,397.0
1,,"Baldwin County, AL",1003,2003.0,2003.0,Drug poisonings (overdose) Unintentional (X40-...,D1,10.0
2,,"Baldwin County, AL",1003,2003.0,2003.0,All other alcohol-induced causes,A9,14.0
3,,"Baldwin County, AL",1003,2003.0,2003.0,All other non-drug and non-alcohol causes,O9,1479.0
4,,"Barbour County, AL",1005,2003.0,2003.0,All other non-drug and non-alcohol causes,O9,287.0


In [14]:
# Apply the function to the 'Drug/Alcohol Induced Cause' column
for i in range(len(datasets)):
    datasets[i]["Cause_of_Death"] = datasets[i]["Drug/Alcohol Induced Cause"].apply(
        vd.categorize_causes
    )

datasets[0][
    ["Drug/Alcohol Induced Cause", "Cause_of_Death"]
]  # Show the original and new categorized columns for comparison

Unnamed: 0,Drug/Alcohol Induced Cause,Cause_of_Death
0,All other non-drug and non-alcohol causes,Other
1,Drug poisonings (overdose) Unintentional (X40-...,Drug Overdose
2,All other alcohol-induced causes,Other
3,All other non-drug and non-alcohol causes,Other
4,All other non-drug and non-alcohol causes,Other
...,...,...
4082,All other non-drug and non-alcohol causes,Other
4083,All other non-drug and non-alcohol causes,Other
4084,All other non-drug and non-alcohol causes,Other
4085,All other non-drug and non-alcohol causes,Other


In [15]:
# Extract the state prefix, strip the comma and space, and concatenate with 'County Code'
for i in range(len(datasets)):
    datasets[i]["County_Code"] = (
        datasets[i]["County"].str.extract(r", (\w\w)")[0]
        + "-"
        + datasets[i]["County Code"].astype(str)
    )
    # Strip off the comma and state prefix from the 'County' column
    datasets[i]["County"] = datasets[i]["County"].str.replace(r", \w\w", "", regex=True)
    #cast year to int
    datasets[i]["Year"] = datasets[i]["Year"].astype(int)
   

datasets[0].head(5)

Unnamed: 0,Notes,County,County Code,Year,Year Code,Drug/Alcohol Induced Cause,Drug/Alcohol Induced Cause Code,Deaths,Cause_of_Death,County_Code
0,,Autauga County,1001,2003,2003.0,All other non-drug and non-alcohol causes,O9,397.0,Other,AL-1001
1,,Baldwin County,1003,2003,2003.0,Drug poisonings (overdose) Unintentional (X40-...,D1,10.0,Drug Overdose,AL-1003
2,,Baldwin County,1003,2003,2003.0,All other alcohol-induced causes,A9,14.0,Other,AL-1003
3,,Baldwin County,1003,2003,2003.0,All other non-drug and non-alcohol causes,O9,1479.0,Other,AL-1003
4,,Barbour County,1005,2003,2003.0,All other non-drug and non-alcohol causes,O9,287.0,Other,AL-1005


In [16]:
# Let’s drop the columns we don’t need to make them easier to work with.
for i in range(len(datasets)):
    datasets[i] = datasets[i].drop(
        [
            "Notes",
            "County Code",
            "Drug/Alcohol Induced Cause Code",
            "Year Code",
            "Drug/Alcohol Induced Cause",
        ],
        axis=1,
    )
datasets[0].head(5)

Unnamed: 0,County,Year,Deaths,Cause_of_Death,County_Code
0,Autauga County,2003,397.0,Other,AL-1001
1,Baldwin County,2003,10.0,Drug Overdose,AL-1003
2,Baldwin County,2003,14.0,Other,AL-1003
3,Baldwin County,2003,1479.0,Other,AL-1003
4,Barbour County,2003,287.0,Other,AL-1005


In [17]:
# Let's Subset for only Drug overdose deaths
# And drop Alaska in the process
for i in range(len(datasets)):
    datasets[i] = datasets[i][datasets[i]["Cause_of_Death"].str.contains("Overdose")]

    datasets[i] = datasets[i][~datasets[i]["County_Code"].str.startswith("AL-")]

datasets[0].head(5)

Unnamed: 0,County,Year,Deaths,Cause_of_Death,County_Code
78,Anchorage Borough,2003,31.0,Drug Overdose,AK-2020
91,Matanuska-Susitna Borough,2003,11.0,Drug Overdose,AK-2170
106,Cochise County,2003,11.0,Drug Overdose,AZ-4003
109,Coconino County,2003,11.0,Drug Overdose,AZ-4005
116,Maricopa County,2003,273.0,Drug Overdose,AZ-4013


## Cleaning Continues ...