To aggregate the deaths dataset

In [19]:
import pandas as pd

In [20]:
dataf = pd.read_csv("Underlying Cause of Death, 2003.txt", sep="\t", header=None)

In [21]:
dataf = dataf[dataf[2].notnull()]
dataf[5].value_counts()

5
All other non-drug and non-alcohol causes             3107
All other alcohol-induced causes                       402
Drug poisonings (overdose) Unintentional (X40-X44)     375
Drug poisonings (overdose) Suicide (X60-X64)            89
Drug poisonings (overdose) Undetermined (Y10-Y14)       57
All other drug-induced causes                           56
Drug/Alcohol Induced Cause                               1
Alcohol poisonings (overdose) (X45, X65, Y15)            1
Name: count, dtype: int64

In [22]:
required = (
    "Drug poisonings (overdose) Unintentional (X40-X44)",
    "Drug poisonings (overdose) Suicide (X60-X64)",
    "Drug poisonings (overdose) Undetermined (Y10-Y14)",
    "All other drug-induced causes",
    "Drug/Alcohol Induced Cause",
)

In [23]:
# i want 5 to only have required causes of death

dataf = dataf[dataf[5].isin(required)]

In [24]:
dataf.columns = dataf.iloc[0]
dataf = dataf[1:]

In [25]:
df_grouped = dataf.groupby(["County", "Year"])["Deaths"].sum().reset_index()

df_grouped.value_counts()

County                 Year    Deaths   
Acadia Parish, LA      2003.0  11.0         1
Osceola County, FL     2003.0  12.0         1
Pima County, AZ        2003.0  145.014.0    1
Pike County, KY        2003.0  19.0         1
Pierce County, WA      2003.0  77.010.0     1
                                           ..
Guilford County, NC    2003.0  19.010.0     1
Greenville County, SC  2003.0  32.0         1
Greene County, OH      2003.0  15.0         1
Greene County, MO      2003.0  40.0         1
York County, SC        2003.0  10.0         1
Name: count, Length: 409, dtype: int64

In [26]:
# change year and deaths column to int
dataf["Deaths"] = pd.to_numeric(dataf["Deaths"], errors="coerce")
dataf = dataf.dropna(subset=["Deaths"])
dataf["Deaths"] = dataf["Deaths"].astype(int)

In [27]:
def process_file(file_name):
    # read the file into a DataFrame
    df = pd.read_csv(file_name, sep="\t", header=None)

    # perform the necessary transformations

    df = df[df[5].isin(required)]
    df.columns = df.iloc[0]
    df = df[1:]
    df["Year"] = pd.to_numeric(df["Year"], errors="coerce")
    df = df.dropna(subset=["Year"])
    df["Year"] = df["Year"].astype(float).astype(int)
    df["Deaths"] = pd.to_numeric(df["Deaths"], errors="coerce")
    df = df.dropna(subset=["Deaths"])
    df["Deaths"] = df["Deaths"].astype(int)
    df_grouped = df.groupby(["County", "Year"])["Deaths"].sum().reset_index()

    return df_grouped


# list of file names
file_names = [
    "Underlying Cause of Death, 2003.txt",
    "Underlying Cause of Death, 2004.txt",
    "Underlying Cause of Death, 2005.txt",
    "Underlying Cause of Death, 2006.txt",
    "Underlying Cause of Death, 2007.txt",
    "Underlying Cause of Death, 2008.txt",
    "Underlying Cause of Death, 2009.txt",
    "Underlying Cause of Death, 2010.txt",
    "Underlying Cause of Death, 2011.txt",
    "Underlying Cause of Death, 2012.txt",
    "Underlying Cause of Death, 2013.txt",
    "Underlying Cause of Death, 2014.txt",
    "Underlying Cause of Death, 2015.txt",
]

# apply the function to each file name
dfs = [process_file(file_name) for file_name in file_names]

# concatenate all the DataFrames into a single DataFrame
df_all = pd.concat(dfs)

In [28]:
df_all

Unnamed: 0,County,Year,Deaths
0,"Acadia Parish, LA",2003,11
1,"Ada County, ID",2003,17
2,"Adams County, CO",2003,42
3,"Aiken County, SC",2003,10
4,"Alachua County, FL",2003,11
...,...,...,...
789,"Yolo County, CA",2015,26
790,"York County, ME",2015,39
791,"York County, PA",2015,84
792,"York County, SC",2015,38


In [29]:
df_all

Unnamed: 0,County,Year,Deaths
0,"Acadia Parish, LA",2003,11
1,"Ada County, ID",2003,17
2,"Adams County, CO",2003,42
3,"Aiken County, SC",2003,10
4,"Alachua County, FL",2003,11
...,...,...,...
789,"Yolo County, CA",2015,26
790,"York County, ME",2015,39
791,"York County, PA",2015,84
792,"York County, SC",2015,38


In [31]:
df_all[["County", "State"]] = df_all["County"].str.split(",", expand=True)
df_all["State"] = df_all["State"].str.strip()

Unnamed: 0,County,Year,Deaths,State
0,Acadia Parish,2003,11,LA
1,Ada County,2003,17,ID
2,Adams County,2003,42,CO
3,Aiken County,2003,10,SC
4,Alachua County,2003,11,FL
...,...,...,...,...
789,Yolo County,2015,26,CA
790,York County,2015,39,ME
791,York County,2015,84,PA
792,York County,2015,38,SC


In [None]:
# save df_all to csv

df_all.to_csv("mortality.csv", index=False)

In [None]:
df_all["County"].value_counts()

County
St. Tammany Parish, LA    13
Multnomah County, OR      13
Norfolk County, MA        13
New York County, NY       13
New London County, CT     13
                          ..
Levy County, FL            1
Cooke County, TX           1
Midland County, MI         1
Okmulgee County, OK        1
Windsor County, VT         1
Name: count, Length: 1043, dtype: int64