In [1]:
# Importing required packages
import pandas as pd
import numpy as np

# Enable pandas copy-on-write globally so that derived frames never
# alias (and accidentally mutate) the frame they were sliced from.
pd.options.mode.copy_on_write = True

In [2]:
# Load the processed mortality table (one row per county / year / cause,
# judging by the preview) and show a few random rows.
# The sample is unseeded, so the displayed rows change on every run.
df = pd.read_parquet(path="../../Data/processed/mortality.parquet")
df.sample(n=5)

Unnamed: 0,State,County,County_Code,Year,Cause,Deaths
7014,LA,TERREBONNE,22109,2012,Drug poisonings (overdose) Unintentional (X40-...,18
7492,VA,RICHMOND CITY,51760,2012,Drug poisonings (overdose) Unintentional (X40-...,25
3731,MI,MACOMB,26099,2008,All other drug-induced causes,17
9722,IA,POLK,19153,2015,Drug poisonings (overdose) Suicide (X60-X64),11
9925,NJ,BERGEN,34003,2015,Drug poisonings (overdose) Unintentional (X40-...,82


In [3]:
# Load the processed population table and preview a few random rows
# (unseeded sample: the rows shown differ between runs).
population = pd.read_parquet(path="../../Data/processed/population.parquet")
population.sample(n=5)

Unnamed: 0,State,State_Code,County,County_Code,Year,Population
2432,California,6,PLACER,6061,2004,302841
8466,Illinois,17,SCOTT,17171,2006,5392
10946,Iowa,19,SHELBY,19165,2003,12819
28054,Oklahoma,40,OSAGE,40113,2003,45680
18573,Mississippi,28,PIKE,28113,2012,40091


In [4]:
# Exclude Alaska ("AK") from the mortality data.
df = df.loc[df["State"].ne("AK")]

In [5]:
# Number of rows recorded under each cause of death.
df.loc[:, "Cause"].value_counts()

Cause
Drug poisonings (overdose) Unintentional (X40-X44)    7538
Drug poisonings (overdose) Suicide (X60-X64)          1461
Drug poisonings (overdose) Undetermined (Y10-Y14)      757
All other drug-induced causes                          625
Drug poisonings (overdose) Homicide (X85)                2
Name: count, dtype: int64

In [6]:
# Drop the Homicide (X85) cause: it appears in only 2 rows, too few to analyse.
df = df.loc[df["Cause"].ne("Drug poisonings (overdose) Homicide (X85)")]

In [7]:
# Count missing values per column.
df.isna().sum(axis=0)

State          0
County         0
County_Code    0
Year           0
Cause          0
Deaths         8
dtype: int64

In [8]:
# Inspect the rows whose death count is missing.
df.loc[df["Deaths"].isna()]

Unnamed: 0,State,County,County_Code,Year,Cause,Deaths
10345,VA,BEDFORD CITY,51515,2015,Drug poisonings (overdose) Unintentional (X40-...,
10346,VA,BEDFORD CITY,51515,2015,Drug poisonings (overdose) Suicide (X60-X64),
10348,VA,BEDFORD CITY,51515,2015,Drug poisonings (overdose) Undetermined (Y10-Y14),
10349,VA,BEDFORD CITY,51515,2015,All other drug-induced causes,
10351,VA,CLIFTON FORGE CITY,51560,2015,Drug poisonings (overdose) Unintentional (X40-...,
10352,VA,CLIFTON FORGE CITY,51560,2015,Drug poisonings (overdose) Suicide (X60-X64),
10354,VA,CLIFTON FORGE CITY,51560,2015,Drug poisonings (overdose) Undetermined (Y10-Y14),
10355,VA,CLIFTON FORGE CITY,51560,2015,All other drug-induced causes,


In [9]:
# Drop the rows with a missing death count for now; they are all in VA and
# in 2015. Restricting the drop to "Deaths" keeps the step from silently
# discarding rows if some other column ever contains a missing value.
df = df.dropna(subset=["Deaths"])

In [10]:
# Summary statistics of the per-row death counts after cleaning.
df.loc[:, "Deaths"].describe()

count      10373.0
mean      37.01475
std      53.134654
min           10.0
25%           13.0
50%           19.0
75%           36.0
max          705.0
Name: Deaths, dtype: Float64

In [11]:
# Summary statistics of the population values in the population table.
population.loc[:, "Population"].describe()

count    4.049500e+04
mean     9.803144e+04
std      3.135551e+05
min      0.000000e+00
25%      1.126200e+04
50%      2.577600e+04
75%      6.644000e+04
max      1.007726e+07
Name: Population, dtype: float64

In [12]:
# Attach population to every mortality row by county code and year.
# - how="left" keeps every mortality row, matched or not;
# - validate="m:1" raises if population has duplicate (County_Code, Year) keys;
# - indicator=True adds a "_merge" column recording whether a match was found.
# Columns present in both frames (State, County) get _x (mortality) and
# _y (population) suffixes.
combined = df.merge(
    right=population,
    how="left",
    on=["County_Code", "Year"],
    indicator=True,
    validate="m:1",
)
combined.sample(n=5)

Unnamed: 0,State_x,County_x,County_Code,Year,Cause,Deaths,State_y,State_Code,County_y,Population,_merge
2904,ME,OXFORD,23017,2007,Drug poisonings (overdose) Unintentional (X40-...,10,Maine,23,OXFORD,57966,both
9112,OR,LINCOLN,41041,2014,Drug poisonings (overdose) Unintentional (X40-...,12,Oregon,41,LINCOLN,46383,both
5161,IL,SAINT CLAIR,17163,2010,Drug poisonings (overdose) Unintentional (X40-...,24,Illinois,17,SAINT CLAIR,270368,both
5816,CA,LOS ANGELES,6037,2011,Drug poisonings (overdose) Undetermined (Y10-Y14),17,California,6,LOS ANGELES,9873700,both
1861,WV,RALEIGH,54081,2005,Drug poisonings (overdose) Unintentional (X40-...,10,West Virginia,54,RALEIGH,78341,both


In [13]:
# Show mortality rows that found no population match in the left merge;
# an empty frame means every row was matched.
combined.loc[combined["_merge"].eq("left_only")]

Unnamed: 0,State_x,County_x,County_Code,Year,Cause,Deaths,State_y,State_Code,County_y,Population,_merge


In [14]:
# Summary of the merged population values. describe() counts only non-missing
# entries, so a count below the number of rows would reveal counties that
# have no population data.
combined.loc[:, "Population"].describe()

count    1.037300e+04
mean     5.888579e+05
std      9.580291e+05
min      1.028200e+04
25%      1.315990e+05
50%      2.834050e+05
75%      6.970800e+05
max      1.007726e+07
Name: Population, dtype: float64

In [15]:
# Summary of death counts after the merge, for comparison with the pre-merge
# summary (a left many-to-one merge should leave it unchanged).
combined.loc[:, "Deaths"].describe()

count      10373.0
mean      37.01475
std      53.134654
min           10.0
25%           13.0
50%           19.0
75%           36.0
max          705.0
Name: Deaths, dtype: Float64

In [16]:
# Preview a few random merged rows (unseeded: rows differ between runs).
combined.sample(n=5)

Unnamed: 0,State_x,County_x,County_Code,Year,Cause,Deaths,State_y,State_Code,County_y,Population,_merge
10186,TN,DAVIDSON,47037,2015,All other drug-induced causes,11,Tennessee,47,DAVIDSON,680397,both
8345,VT,CHITTENDEN,50007,2013,Drug poisonings (overdose) Unintentional (X40-...,14,Vermont,50,CHITTENDEN,159578,both
1546,MO,GREENE,29077,2005,Drug poisonings (overdose) Unintentional (X40-...,59,Missouri,29,GREENE,257282,both
3166,OR,MARION,41047,2007,Drug poisonings (overdose) Unintentional (X40-...,29,Oregon,41,MARION,306964,both
3393,AR,CRAIGHEAD,5031,2008,Drug poisonings (overdose) Unintentional (X40-...,15,Arkansas,5,CRAIGHEAD,93316,both


In [17]:
# Keep only the columns needed downstream, in their final order.
# State_y comes from the population table and State_x / County_x from the
# mortality table; County_y, the population table's State_Code and the
# "_merge" indicator are left behind.
df2 = combined.loc[
    :,
    [
        "State_y",
        "State_x",
        "County_x",
        "County_Code",
        "Year",
        "Cause",
        "Deaths",
        "Population",
    ],
]

In [18]:
# Strip the merge suffixes. Note that "State_Code" here is the State_x column
# from the mortality table (two-letter values such as "TX" in the preview),
# not the numeric State_Code column of the population table.
df2 = df2.rename(
    {"State_y": "State", "State_x": "State_Code", "County_x": "County"},
    axis="columns",
)

In [19]:
# Preview the cleaned frame (unseeded: rows differ between runs).
df2.sample(n=5)

Unnamed: 0,State,State_Code,County,County_Code,Year,Cause,Deaths,Population
2538,Texas,TX,TOM GREEN,48451,2006,Drug poisonings (overdose) Unintentional (X40-...,10,106125
5702,Virginia,VA,CHESAPEAKE CITY,51550,2010,Drug poisonings (overdose) Unintentional (X40-...,12,223525
1194,Wisconsin,WI,MILWAUKEE,55079,2004,Drug poisonings (overdose) Unintentional (X40-...,130,936914
4264,Connecticut,CT,NEW HAVEN,9009,2009,Drug poisonings (overdose) Suicide (X60-X64),12,860025
1468,Louisiana,LA,SAINT TAMMANY,22103,2005,Drug poisonings (overdose) Unintentional (X40-...,55,217358


In [20]:
# Calculating the mortality rate per county / year / cause row.
# The rate is the raw fraction Deaths / Population (deaths per person);
# it is not scaled to a per-100,000 figure.
df3 = df2.assign(Mortality_Rate=df2["Deaths"] / df2["Population"])

In [21]:
# Preview the frame with the new Mortality_Rate column (unseeded sample).
df3.sample(n=5)

Unnamed: 0,State,State_Code,County,County_Code,Year,Cause,Deaths,Population,Mortality_Rate
2904,Maine,ME,OXFORD,23017,2007,Drug poisonings (overdose) Unintentional (X40-...,10,57966,0.000173
8813,Massachusetts,MA,BERKSHIRE,25003,2014,Drug poisonings (overdose) Unintentional (X40-...,24,128928,0.000186
4876,Virginia,VA,RICHMOND CITY,51760,2009,Drug poisonings (overdose) Unintentional (X40-...,19,203678,9.3e-05
8919,New Jersey,NJ,BERGEN,34003,2014,Drug poisonings (overdose) Unintentional (X40-...,79,923475,8.6e-05
4461,Maryland,MD,BALTIMORE,24005,2009,Drug poisonings (overdose) Unintentional (X40-...,14,801808,1.7e-05


In [22]:
# Aggregate deaths and population to the state level for each year and cause.
# Both columns are summed over the county rows present in df3 for that
# (State, Year, Cause) group, so "Population" covers only the counties that
# have a row for that cause. No rate is computed in this cell.
df4 = df3.groupby(["State", "Year", "Cause"], as_index=False)[
    ["Deaths", "Population"]
].sum()

In [23]:
# Preview the state-level aggregate (unseeded: rows differ between runs).
df4.sample(n=5)

Unnamed: 0,State,Year,Cause,Deaths,Population
1032,New York,2009,Drug poisonings (overdose) Undetermined (Y10-Y14),21,463883
747,Michigan,2008,All other drug-induced causes,298,4216091
706,Massachusetts,2009,Drug poisonings (overdose) Unintentional (X40-...,689,6419623
241,Delaware,2012,Drug poisonings (overdose) Unintentional (X40-...,111,915518
268,District of Columbia,2009,Drug poisonings (overdose) Unintentional (X40-...,14,592228


In [24]:
# Coverage check: number of rows per state in the state / year / cause table.
# Each row is one (Year, Cause) combination, so this counts combinations,
# not distinct years; states with lower counts are missing some of them.
df4["State"].value_counts()

State
California              52
Michigan                52
Arizona                 52
Texas                   51
New York                49
Pennsylvania            49
Oregon                  49
Indiana                 49
Florida                 47
Utah                    46
Maryland                45
Ohio                    45
Massachusetts           43
Washington              42
Illinois                42
Minnesota               38
Wisconsin               36
Tennessee               36
Hawaii                  35
Oklahoma                35
Kentucky                34
District of Columbia    34
Colorado                33
Alabama                 32
Georgia                 32
Connecticut             32
Missouri                31
Nevada                  31
Louisiana               31
New Jersey              31
New Mexico              28
Virginia                25
Kansas                  25
North Carolina          25
Idaho                   25
South Carolina          24
Delaware              

In [33]:
# Distribution of how many distinct causes each state has: the inner
# value_counts gives causes per state, the outer one counts how many states
# share each of those totals.
df3.drop_duplicates(subset=["State", "Cause"])["State"].value_counts().value_counts()

count
4    29
3    10
2     6
1     5
Name: count, dtype: int64