In [1]:
# Importing required packages
import pandas as pd
import numpy as np

# Turn on the pandas "mode.copy_on_write" option for the rest of the notebook
pd.options.mode.copy_on_write = True

In [2]:
# Read the processed mortality table and preview five random rows
df = pd.read_parquet(path="../../Data/processed/mortality.parquet")
df.sample(n=5)

Unnamed: 0,State,County,County_Code,Year,Cause,Deaths
8599,DE,NEW CASTLE,10003,2014,Drug poisonings (overdose) Suicide (X60-X64),16
6549,TX,DENTON,48121,2011,Drug poisonings (overdose) Unintentional (X40-...,59
3651,KY,JEFFERSON,21111,2008,Drug poisonings (overdose) Unintentional (X40-...,78
9216,SC,GREENWOOD,45047,2014,Drug poisonings (overdose) Unintentional (X40-...,10
1779,TX,BEXAR,48029,2005,Drug poisonings (overdose) Unintentional (X40-...,140


In [3]:
# Read the processed population table and preview five random rows
population = pd.read_parquet(path="../../Data/processed/population.parquet")
population.sample(n=5)

Unnamed: 0,State,State_Code,County,County_Code,Year,Population
6591,Georgia,13,WASHINGTON,13303,2003,21110
37160,Virginia,51,PATRICK,51141,2009,18562
5646,Georgia,13,IRWIN,13155,2007,9493
33288,Texas,48,EASTLAND,48133,2011,18583
25527,North Dakota,38,BENSON,38005,2011,6688


In [4]:
# Keep every row whose state code is not Alaska ("AK")
df = df.loc[df["State"].ne("AK")]

In [5]:
# Number of rows recorded for each cause of death
df.loc[:, "Cause"].value_counts()

Cause
Drug poisonings (overdose) Unintentional (X40-X44)    7538
Drug poisonings (overdose) Suicide (X60-X64)          1461
Drug poisonings (overdose) Undetermined (Y10-Y14)      757
All other drug-induced causes                          625
Drug poisonings (overdose) Homicide (X85)                2
Name: count, dtype: int64

In [6]:
# Remove the homicide category: it appears in only 2 rows
df = df.loc[df["Cause"].ne("Drug poisonings (overdose) Homicide (X85)")]

In [7]:
# Missing-value count for every column
df.isnull().sum(axis=0)

State          0
County         0
County_Code    0
Year           0
Cause          0
Deaths         8
dtype: int64

In [8]:
# Show the rows whose death count is missing
df.loc[df["Deaths"].isnull()]

Unnamed: 0,State,County,County_Code,Year,Cause,Deaths
10345,VA,BEDFORD CITY,51515,2015,Drug poisonings (overdose) Unintentional (X40-...,
10346,VA,BEDFORD CITY,51515,2015,Drug poisonings (overdose) Suicide (X60-X64),
10348,VA,BEDFORD CITY,51515,2015,Drug poisonings (overdose) Undetermined (Y10-Y14),
10349,VA,BEDFORD CITY,51515,2015,All other drug-induced causes,
10351,VA,CLIFTON FORGE CITY,51560,2015,Drug poisonings (overdose) Unintentional (X40-...,
10352,VA,CLIFTON FORGE CITY,51560,2015,Drug poisonings (overdose) Suicide (X60-X64),
10354,VA,CLIFTON FORGE CITY,51560,2015,Drug poisonings (overdose) Undetermined (Y10-Y14),
10355,VA,CLIFTON FORGE CITY,51560,2015,All other drug-induced causes,


In [9]:
# Discard rows containing any missing value; the rows displayed above with a
# missing death count are all Virginia, 2015
df = df.dropna(axis=0, how="any")

In [10]:
# Summary statistics of the reported death counts
df.loc[:, "Deaths"].describe()

count      10373.0
mean      37.01475
std      53.134654
min           10.0
25%           13.0
50%           19.0
75%           36.0
max          705.0
Name: Deaths, dtype: Float64

In [11]:
# Summary statistics of the county populations
population.loc[:, "Population"].describe()

count    4.049500e+04
mean     9.803144e+04
std      3.135551e+05
min      0.000000e+00
25%      1.126200e+04
50%      2.577600e+04
75%      6.644000e+04
max      1.007726e+07
Name: Population, dtype: float64

In [12]:
# Attach the population of each county/year to every mortality row.
# validate="m:1" makes pandas raise if a (County_Code, Year) key is duplicated
# in `population`; the `_merge` indicator column records where each row matched.
combined = df.merge(
    population,
    how="left",
    on=["County_Code", "Year"],
    indicator=True,
    validate="m:1",
)
combined.sample(n=5)

Unnamed: 0,State_x,County_x,County_Code,Year,Cause,Deaths,State_y,State_Code,County_y,Population,_merge
643,CA,SAN JOAQUIN,6077,2004,Drug poisonings (overdose) Unintentional (X40-...,51,California,6,SAN JOAQUIN,642898,both
9602,GA,ROCKDALE,13247,2015,Drug poisonings (overdose) Unintentional (X40-...,10,Georgia,13,ROCKDALE,88424,both
9356,WI,JEFFERSON,55055,2014,Drug poisonings (overdose) Unintentional (X40-...,13,Wisconsin,55,JEFFERSON,84370,both
1744,SC,GREENVILLE,45045,2005,Drug poisonings (overdose) Suicide (X60-X64),16,South Carolina,45,GREENVILLE,405608,both
2710,CA,YOLO,6113,2007,Drug poisonings (overdose) Unintentional (X40-...,10,California,6,YOLO,194339,both


In [13]:
# Mortality rows that found no population record (an empty frame means all matched)
combined.loc[combined["_merge"].eq("left_only")]

Unnamed: 0,State_x,County_x,County_Code,Year,Cause,Deaths,State_y,State_Code,County_y,Population,_merge


In [14]:
# Summary of the merged population column; `count` below the row total would
# reveal counties without population data
combined.loc[:, "Population"].describe()

count    1.037300e+04
mean     5.888579e+05
std      9.580291e+05
min      1.028200e+04
25%      1.315990e+05
50%      2.834050e+05
75%      6.970800e+05
max      1.007726e+07
Name: Population, dtype: float64

In [15]:
# Summary of the death counts after the merge
combined.loc[:, "Deaths"].describe()

count      10373.0
mean      37.01475
std      53.134654
min           10.0
25%           13.0
50%           19.0
75%           36.0
max          705.0
Name: Deaths, dtype: Float64

In [16]:
# Preview five random merged rows
combined.sample(n=5)

Unnamed: 0,State_x,County_x,County_Code,Year,Cause,Deaths,State_y,State_Code,County_y,Population,_merge
6819,FL,NASSAU,12089,2012,Drug poisonings (overdose) Unintentional (X40-...,16,Florida,12,NASSAU,74546,both
7545,AZ,COCHISE,4003,2013,Drug poisonings (overdose) Unintentional (X40-...,11,Arizona,4,COCHISE,129664,both
624,CA,ORANGE,6059,2004,All other drug-induced causes,14,California,6,ORANGE,2941711,both
9226,TN,WILSON,47189,2014,Drug poisonings (overdose) Unintentional (X40-...,20,Tennessee,47,WILSON,125253,both
3188,PA,ERIE,42049,2007,Drug poisonings (overdose) Unintentional (X40-...,26,Pennsylvania,42,ERIE,278573,both


In [17]:
# Keep only the columns needed downstream; the merge helper columns
# (State_Code, County_y, _merge) are left behind
df2 = combined.loc[
    :,
    [
        "State_y",
        "State_x",
        "County_x",
        "County_Code",
        "Year",
        "Cause",
        "Deaths",
        "Population",
    ],
]

In [18]:
# Strip the merge suffixes: State_y becomes State, State_x becomes State_Code
# and County_x becomes County
df2 = df2.rename(
    {"State_y": "State", "State_x": "State_Code", "County_x": "County"},
    axis="columns",
)

In [19]:
# Preview five random rows of the cleaned frame
df2.sample(n=5)

Unnamed: 0,State,State_Code,County,County_Code,Year,Cause,Deaths,Population
3614,Indiana,IN,VANDERBURGH,18163,2008,Drug poisonings (overdose) Suicide (X60-X64),10,178053
2605,Wisconsin,WI,ROCK,55105,2006,Drug poisonings (overdose) Unintentional (X40-...,20,158538
1360,Florida,FL,PINELLAS,12103,2005,Drug poisonings (overdose) Unintentional (X40-...,145,929426
10253,Texas,TX,RANDALL,48381,2015,Drug poisonings (overdose) Unintentional (X40-...,11,130463
8716,Indiana,IN,LAKE,18089,2014,Drug poisonings (overdose) Unintentional (X40-...,39,491259


In [20]:
# Calculating the mortality rate (deaths per resident) for every row;
# `assign` returns a new frame, so df2 is left unchanged
df3 = df2.assign(Mortality_Rate=df2["Deaths"] / df2["Population"])

In [21]:
# Preview five random rows with the new rate column
df3.sample(n=5)

Unnamed: 0,State,State_Code,County,County_Code,Year,Cause,Deaths,Population,Mortality_Rate
413,Oregon,OR,CLACKAMAS,41005,2003,All other drug-induced causes,10,352032,2.8e-05
2862,Iowa,IA,POLK,19153,2007,Drug poisonings (overdose) Unintentional (X40-...,22,413024,5.3e-05
4808,Texas,TX,BEXAR,48029,2009,Drug poisonings (overdose) Suicide (X60-X64),25,1685628,1.5e-05
1382,Georgia,GA,DEKALB,13089,2005,Drug poisonings (overdose) Unintentional (X40-...,22,668998,3.3e-05
3889,Ohio,OH,CLERMONT,39025,2008,Drug poisonings (overdose) Unintentional (X40-...,39,195891,0.000199


In [22]:
# Total deaths and population per state, year and cause (inputs to the
# state-level mortality rate).
# NOTE(review): Population is summed over the county rows present in df3 for
# that cause, not over every county in the state - confirm this is intended.
df4 = (
    df3.groupby(["State", "State_Code", "Year", "Cause"])
    .agg(Deaths=("Deaths", "sum"), Population=("Population", "sum"))
    .reset_index()
)

In [23]:
# Preview five random state-level rows
df4.sample(n=5)

Unnamed: 0,State,State_Code,Year,Cause,Deaths,Population
453,Illinois,IL,2013,Drug poisonings (overdose) Suicide (X60-X64),97,6868534
877,Montana,MT,2014,All other drug-induced causes,12,65764
1048,New York,NY,2013,Drug poisonings (overdose) Undetermined (Y10-Y14),55,871124
332,Georgia,GA,2004,All other drug-induced causes,11,809481
337,Georgia,GA,2005,Drug poisonings (overdose) Unintentional (X40-...,359,4837328


In [24]:
# Rows per state in df4 (one row per state/year/cause); states with fewer rows
# are missing some year/cause combinations
df4["State"].value_counts()

State
California              52
Michigan                52
Arizona                 52
Texas                   51
New York                49
Pennsylvania            49
Oregon                  49
Indiana                 49
Florida                 47
Utah                    46
Maryland                45
Ohio                    45
Massachusetts           43
Washington              42
Illinois                42
Minnesota               38
Wisconsin               36
Tennessee               36
Hawaii                  35
Oklahoma                35
Kentucky                34
District of Columbia    34
Colorado                33
Alabama                 32
Georgia                 32
Connecticut             32
Missouri                31
Nevada                  31
Louisiana               31
New Jersey              31
New Mexico              28
Virginia                25
Kansas                  25
North Carolina          25
Idaho                   25
South Carolina          24
Delaware              

In [25]:
# How many states report 1, 2, 3 or 4 distinct causes: count distinct
# (State, Cause) pairs per state, then tabulate those counts
(
    df3.loc[:, ["State", "Cause"]]
    .drop_duplicates()
    .loc[:, "State"]
    .value_counts()
    .value_counts()
)

count
4    29
3    10
2     6
1     5
Name: count, dtype: int64

In [26]:
# State-level mortality rate: summed deaths divided by summed population
df4["State_Mortality_Rate"] = df4["Deaths"].div(df4["Population"])

In [27]:
# Every distinct state/county pair that appears in the data
st_county = (
    df3.loc[:, ["State", "State_Code", "County", "County_Code"]]
    .drop_duplicates()
)

In [28]:
# Pair each state/year/cause row with every county of that state
# (a left join on the state keys, so each state row fans out to its counties)
master = df4.merge(
    st_county,
    how="left",
    on=["State", "State_Code"],
    indicator=True,
)

In [29]:
# Preview five random rows of the expanded frame
master.sample(n=5)

Unnamed: 0,State,State_Code,Year,Cause,Deaths,Population,State_Mortality_Rate,County,County_Code,_merge
27670,Oregon,OR,2014,All other drug-induced causes,31,1137275,2.7e-05,CLATSOP,41007,both
19213,New Jersey,NJ,2005,Drug poisonings (overdose) Unintentional (X40-...,686,8264855,8.3e-05,HUNTERDON,34019,both
30822,South Carolina,SC,2015,Drug poisonings (overdose) Unintentional (X40-...,577,3606472,0.00016,SPARTANBURG,45083,both
2474,California,CA,2007,Drug poisonings (overdose) Unintentional (X40-...,2992,35482979,8.4e-05,MADERA,6039,both
36997,Washington,WA,2005,Drug poisonings (overdose) Unintentional (X40-...,600,5499086,0.000109,THURSTON,53067,both


In [30]:
# Keep the identifying keys plus the state rate; the state-level Deaths and
# Population totals and the merge indicator are left behind
master_2 = master.loc[
    :,
    [
        "State",
        "State_Code",
        "County",
        "County_Code",
        "Year",
        "Cause",
        "State_Mortality_Rate",
    ],
]

In [31]:
# Preview five random rows of the trimmed frame
master_2.sample(n=5)

Unnamed: 0,State,State_Code,County,County_Code,Year,Cause,State_Mortality_Rate
16421,Michigan,MI,INGHAM,26065,2010,Drug poisonings (overdose) Undetermined (Y10-Y14),2.8e-05
5540,Florida,FL,PALM BEACH,12099,2007,Drug poisonings (overdose) Unintentional (X40-...,0.00013
7938,Georgia,GA,MADISON,13195,2014,All other drug-induced causes,1.3e-05
30156,Pennsylvania,PA,CARBON,42025,2015,Drug poisonings (overdose) Undetermined (Y10-Y14),4.1e-05
32963,Texas,TX,MONTGOMERY,48339,2005,All other drug-induced causes,1.2e-05


In [32]:
# Bring the observed county-level figures back in. Rows flagged "left_only"
# in `_merge` are county/year/cause combinations with no observed record.
# The one-to-one validation makes pandas raise if the keys repeat on either side.
df5 = master_2.merge(
    df3,
    how="left",
    on=["State", "State_Code", "County", "County_Code", "Year", "Cause"],
    validate="one_to_one",
    indicator=True,
)

In [33]:
# Preview five random rows, including unmatched ones
df5.sample(n=5)

Unnamed: 0,State,State_Code,County,County_Code,Year,Cause,State_Mortality_Rate,Deaths,Population,Mortality_Rate,_merge
33670,Texas,TX,ELLIS,48139,2008,All other drug-induced causes,8e-06,,,,left_only
3624,California,CA,MADERA,6039,2014,All other drug-induced causes,1e-05,,,,left_only
4752,Florida,FL,HIGHLANDS,12055,2003,All other drug-induced causes,8e-06,,,,left_only
27944,Pennsylvania,PA,PHILADELPHIA,42101,2003,Drug poisonings (overdose) Unintentional (X40-...,0.000102,238.0,1493802.0,0.000159,both
26436,Oklahoma,OK,DELAWARE,40041,2006,Drug poisonings (overdose) Suicide (X60-X64),2.5e-05,,,,left_only


In [34]:
# Look up the county population again so rows without an observed record get
# one too; it arrives as Population_y, and the match status goes into `merge2`
df6 = df5.merge(
    population.loc[:, ["County_Code", "Year", "Population"]],
    how="left",
    on=["County_Code", "Year"],
    indicator="merge2",
    validate="many_to_one",
)

In [35]:
# Preview five random rows after the population lookup
df6.sample(n=5)

Unnamed: 0,State,State_Code,County,County_Code,Year,Cause,State_Mortality_Rate,Deaths,Population_x,Mortality_Rate,_merge,Population_y,merge2
9417,Illinois,IL,MARION,17121,2014,Drug poisonings (overdose) Unintentional (X40-...,0.000115,,,,left_only,38573,both
2442,California,CA,SAN LUIS OBISPO,6079,2007,Drug poisonings (overdose) Undetermined (Y10-Y14),6e-06,,,,left_only,262770,both
26841,Oklahoma,OK,DELAWARE,40041,2012,Drug poisonings (overdose) Undetermined (Y10-Y14),2.6e-05,,,,left_only,41733,both
34047,Texas,TX,ARANSAS,48007,2009,Drug poisonings (overdose) Unintentional (X40-...,8.5e-05,,,,left_only,23291,both
25849,Ohio,OH,RICHLAND,39139,2014,Drug poisonings (overdose) Unintentional (X40-...,0.000234,31.0,122030.0,0.000254,both,122030,both


In [36]:
# Match status of the population lookup (everything should be "both")
df6.loc[:, "merge2"].value_counts()

merge2
both          38783
left_only         0
right_only        0
Name: count, dtype: int64

In [37]:
def new_death(row, cap=9):
    """Return the death count for one row, estimating it when it is missing.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide "Deaths", "Population_y" and "State_Mortality_Rate".
    cap : int, default 9
        Upper bound applied to an estimated count. The default keeps the
        original behaviour; presumably it reflects that counts below 10 are
        absent from the mortality data (the observed minimum is 10) - confirm.

    Returns
    -------
    The value of row["Deaths"] unchanged when it is present; otherwise
    county population times the state mortality rate, truncated to an int
    and limited to at most `cap`.
    """
    # An observed count always wins over an estimate.
    if not pd.isna(row["Deaths"]):
        return row["Deaths"]

    # int() truncates toward zero, so the estimate is rounded down.
    estimate = int(row["Population_y"] * row["State_Mortality_Rate"])
    return min(estimate, cap)

In [38]:
# Fill in a death count for every row, one row at a time
df6["Deaths_2"] = df6.apply(new_death, axis="columns")

In [39]:
# Preview five random rows with the filled-in counts
df6.sample(n=5)

Unnamed: 0,State,State_Code,County,County_Code,Year,Cause,State_Mortality_Rate,Deaths,Population_x,Mortality_Rate,_merge,Population_y,merge2,Deaths_2
5379,Florida,FL,PUTNAM,12107,2006,Drug poisonings (overdose) Unintentional (X40-...,0.000122,,,,left_only,74663,both,9
5106,Florida,FL,MONROE,12087,2005,Drug poisonings (overdose) Suicide (X60-X64),2.1e-05,,,,left_only,75819,both,1
12152,Kentucky,KY,MCCRACKEN,21145,2007,Drug poisonings (overdose) Unintentional (X40-...,0.000156,,,,left_only,64925,both,9
17919,Missouri,MO,JEFFERSON,29099,2007,Drug poisonings (overdose) Unintentional (X40-...,0.000106,19.0,214948.0,8.8e-05,both,214948,both,19
33247,Texas,TX,TARRANT,48439,2006,Drug poisonings (overdose) Suicide (X60-X64),1.3e-05,12.0,1662005.0,7e-06,both,1662005,both,12


In [40]:
# Keep the keys plus the filled-in deaths and the looked-up population,
# then give those two columns their plain names
df7 = df6.loc[
    :,
    [
        "State",
        "State_Code",
        "County",
        "County_Code",
        "Year",
        "Cause",
        "Deaths_2",
        "Population_y",
    ],
].rename(columns={"Population_y": "Population", "Deaths_2": "Deaths"})

In [41]:
# Collapse the causes: one row per county and year. Deaths are summed across
# causes; Population is averaged over the cause rows rather than summed, so it
# is not counted once per cause.
df8 = (
    df7.groupby(["State", "State_Code", "County", "County_Code", "Year"])
    .agg(Deaths=("Deaths", "sum"), Population=("Population", "mean"))
    .reset_index()
)

In [42]:
# Preview five random county/year rows
df8.sample(n=5)

Unnamed: 0,State,State_Code,County,County_Code,Year,Deaths,Population
13090,West Virginia,WV,WAYNE,54099,2012,10,41933.0
10755,Tennessee,TN,HARDIN,47071,2006,5,25863.0
9263,Oklahoma,OK,ROGERS,40131,2006,13,82234.0
6170,Missouri,MO,WARREN,29219,2011,4,32615.0
3034,Illinois,IL,TAZEWELL,17179,2008,18,133893.0


In [43]:
# County-level mortality rate across all causes (deaths per resident)
df8["Mortality_Rate"] = df8["Deaths"].div(df8["Population"])

In [44]:
# Save the corrected county/year table as parquet, without the index
df8.to_parquet(
    path="../../Data/processed/mortality_corrected.parquet",
    index=False,
)