In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load merged_data.csv into a DataFrame and preview its first ten rows.
overdoses = pd.read_csv("merged_data.csv")
overdoses.head(n=10)

Unnamed: 0,Notes,County,County Code,Year,Year Code,Drug/Alcohol Induced Cause,Drug/Alcohol Induced Cause Code,Deaths
0,,"Autauga County, AL",1001.0,2003,2003.0,All other non-drug and non-alcohol causes,O9,397.0
1,,"Baldwin County, AL",1003.0,2003,2003.0,Drug poisonings (overdose) Unintentional (X40-...,D1,10.0
2,,"Baldwin County, AL",1003.0,2003,2003.0,All other alcohol-induced causes,A9,14.0
3,,"Baldwin County, AL",1003.0,2003,2003.0,All other non-drug and non-alcohol causes,O9,1479.0
4,,"Barbour County, AL",1005.0,2003,2003.0,All other non-drug and non-alcohol causes,O9,287.0
5,,"Bibb County, AL",1007.0,2003,2003.0,All other non-drug and non-alcohol causes,O9,226.0
6,,"Blount County, AL",1009.0,2003,2003.0,All other non-drug and non-alcohol causes,O9,552.0
7,,"Bullock County, AL",1011.0,2003,2003.0,All other non-drug and non-alcohol causes,O9,121.0
8,,"Butler County, AL",1013.0,2003,2003.0,All other non-drug and non-alcohol causes,O9,308.0
9,,"Calhoun County, AL",1015.0,2003,2003.0,All other non-drug and non-alcohol causes,O9,1286.0


In [3]:
# (rows, columns) of the full dataset as loaded from the CSV.
overdoses.shape

(57436, 8)

In [4]:
# Peek at the County column to see how the county name and state are written.
overdoses.loc[:, "County"].head(n=10)

0    Autauga County, AL
1    Baldwin County, AL
2    Baldwin County, AL
3    Baldwin County, AL
4    Barbour County, AL
5       Bibb County, AL
6     Blount County, AL
7    Bullock County, AL
8     Butler County, AL
9    Calhoun County, AL
Name: County, dtype: object

In [7]:
# Restrict the data to the policy states; the control states still have to be
# added to this list.
selected_states = ["FL", "TX", "WA"]

# County values look like "<name> County, <ST>", so a suffix match on the state
# abbreviation picks out a state's counties. na=False excludes rows whose
# County is missing in the same step.
in_selected_state = overdoses["County"].str.endswith(
    tuple(selected_states), na=False
)
filtered_df = overdoses[in_selected_state]

filtered_df.head(10)

Unnamed: 0,Notes,County,County Code,Year,Year Code,Drug/Alcohol Induced Cause,Drug/Alcohol Induced Cause Code,Deaths
500,,"Alachua County, FL",12001.0,2003,2003.0,Drug poisonings (overdose) Unintentional (X40-...,D1,11.0
501,,"Alachua County, FL",12001.0,2003,2003.0,All other alcohol-induced causes,A9,11.0
502,,"Alachua County, FL",12001.0,2003,2003.0,All other non-drug and non-alcohol causes,O9,1568.0
503,,"Baker County, FL",12003.0,2003,2003.0,All other non-drug and non-alcohol causes,O9,200.0
504,,"Bay County, FL",12005.0,2003,2003.0,Drug poisonings (overdose) Unintentional (X40-...,D1,21.0
505,,"Bay County, FL",12005.0,2003,2003.0,All other alcohol-induced causes,A9,16.0
506,,"Bay County, FL",12005.0,2003,2003.0,All other non-drug and non-alcohol causes,O9,1405.0
507,,"Bradford County, FL",12007.0,2003,2003.0,All other non-drug and non-alcohol causes,O9,243.0
508,,"Brevard County, FL",12009.0,2003,2003.0,Drug poisonings (overdose) Unintentional (X40-...,D1,83.0
509,,"Brevard County, FL",12009.0,2003,2003.0,Drug poisonings (overdose) Suicide (X60-X64),D2,14.0


In [8]:
# Sanity check: after keeping only the selected states, the row count should be
# smaller than that of the full dataset.
filtered_df.shape

(7216, 8)

In [11]:
# Collect the distinct cause-of-death labels (returned as a NumPy array) so
# they can be reused for building combinations and for filtering.
cause_labels = filtered_df["Drug/Alcohol Induced Cause"]
causes_of_death = cause_labels.unique()
causes_of_death

array(['Drug poisonings (overdose) Unintentional (X40-X44)',
       'All other alcohol-induced causes',
       'All other non-drug and non-alcohol causes',
       'Drug poisonings (overdose) Suicide (X60-X64)',
       'All other drug-induced causes',
       'Drug poisonings (overdose) Undetermined (Y10-Y14)',
       'Alcohol poisonings (overdose) (X45, X65, Y15)'], dtype=object)

In [25]:
# Build the full County x Year x Cause grid so every county has one row for
# each cause in causes_of_death in every year. Combinations that are absent
# from filtered_df are the "missing deaths" we still need to account for.
key_columns = ["County", "Year", "Drug/Alcohol Induced Cause"]
all_combinations = pd.MultiIndex.from_product(
    [
        filtered_df["County"].unique(),
        filtered_df["Year"].unique(),
        causes_of_death,
    ],
    names=key_columns,
).to_frame(index=False)

# Left-merge the reported rows onto the grid: combinations present in
# filtered_df keep their values, the rest get NaN in every non-key column
# (including Deaths). The merge already marks missing data as NaN, so no
# fillna(np.nan) is needed afterwards -- it would not change any value.
merged_data = pd.merge(
    all_combinations,
    filtered_df,
    how="left",
    on=key_columns,
)

merged_data.head(10)

Unnamed: 0,County,Year,Drug/Alcohol Induced Cause,Notes,County Code,Year Code,Drug/Alcohol Induced Cause Code,Deaths
0,"Alachua County, FL",2003,Drug poisonings (overdose) Unintentional (X40-...,,12001.0,2003.0,D1,11.0
1,"Alachua County, FL",2003,All other alcohol-induced causes,,12001.0,2003.0,A9,11.0
2,"Alachua County, FL",2003,All other non-drug and non-alcohol causes,,12001.0,2003.0,O9,1568.0
3,"Alachua County, FL",2003,Drug poisonings (overdose) Suicide (X60-X64),,,,,
4,"Alachua County, FL",2003,All other drug-induced causes,,,,,
5,"Alachua County, FL",2003,Drug poisonings (overdose) Undetermined (Y10-Y14),,,,,
6,"Alachua County, FL",2003,"Alcohol poisonings (overdose) (X45, X65, Y15)",,,,,
7,"Alachua County, FL",2004,Drug poisonings (overdose) Unintentional (X40-...,,,,,
8,"Alachua County, FL",2004,All other alcohol-induced causes,,12001.0,2004.0,A9,19.0
9,"Alachua County, FL",2004,All other non-drug and non-alcohol causes,,12001.0,2004.0,O9,1511.0


In [23]:
# Rows actually reported for one example county, before any missing
# year/cause combinations are added.
filtered_df.loc[filtered_df["County"].eq("Alachua County, FL")].shape

(35, 8)

In [24]:
# Each county should now have 13 years * 7 causes of death = 91 rows;
# confirm that for one example county.
merged_data.loc[merged_data["County"].eq("Alachua County, FL")].shape

(91, 8)

In [27]:
# Keep only the three drug-poisoning (overdose) categories.
# TODO: check the Drug/Alcohol Induced Cause Codes to see whether this can be
# narrowed further to opioids only.
only_overdoses = [
    "Drug poisonings (overdose) Undetermined (Y10-Y14)",
    "Drug poisonings (overdose) Suicide (X60-X64)",
    "Drug poisonings (overdose) Unintentional (X40-X44)",
]

is_overdose_row = merged_data["Drug/Alcohol Induced Cause"].isin(only_overdoses)
final_temp_df = merged_data.loc[is_overdose_row]
# 39 rows is presumably one county's worth: 13 years * 3 overdose categories.
final_temp_df.head(39)

Unnamed: 0,County,Year,Drug/Alcohol Induced Cause,Notes,County Code,Year Code,Drug/Alcohol Induced Cause Code,Deaths
0,"Alachua County, FL",2003,Drug poisonings (overdose) Unintentional (X40-...,,12001.0,2003.0,D1,11.0
3,"Alachua County, FL",2003,Drug poisonings (overdose) Suicide (X60-X64),,,,,
5,"Alachua County, FL",2003,Drug poisonings (overdose) Undetermined (Y10-Y14),,,,,
7,"Alachua County, FL",2004,Drug poisonings (overdose) Unintentional (X40-...,,,,,
10,"Alachua County, FL",2004,Drug poisonings (overdose) Suicide (X60-X64),,,,,
12,"Alachua County, FL",2004,Drug poisonings (overdose) Undetermined (Y10-Y14),,,,,
14,"Alachua County, FL",2005,Drug poisonings (overdose) Unintentional (X40-...,,,,,
17,"Alachua County, FL",2005,Drug poisonings (overdose) Suicide (X60-X64),,,,,
19,"Alachua County, FL",2005,Drug poisonings (overdose) Undetermined (Y10-Y14),,,,,
21,"Alachua County, FL",2006,Drug poisonings (overdose) Unintentional (X40-...,,,,,


### To-Do & Problems

1. We need to figure out a method to fill in all the NaNs with accurate counts of deaths for these other categories. 

2. If we find total deaths for counties, it seems like there are still a lot of categories that are unaccounted for, so it will be hard to divide these totals between the different categories.

3. Also need to add the control states.