In [19]:
import pandas as pd
import numpy as np

In [20]:
# Load the merged mortality extract into a DataFrame and preview
# its first ten rows.
csv_path = "merged_data.csv"
overdoses = pd.read_csv(csv_path)
overdoses.head(n=10)

Unnamed: 0,Notes,County,County Code,Year,Year Code,Drug/Alcohol Induced Cause,Drug/Alcohol Induced Cause Code,Deaths
0,,"Autauga County, AL",1001.0,2003,2003.0,All other non-drug and non-alcohol causes,O9,397.0
1,,"Baldwin County, AL",1003.0,2003,2003.0,Drug poisonings (overdose) Unintentional (X40-...,D1,10.0
2,,"Baldwin County, AL",1003.0,2003,2003.0,All other alcohol-induced causes,A9,14.0
3,,"Baldwin County, AL",1003.0,2003,2003.0,All other non-drug and non-alcohol causes,O9,1479.0
4,,"Barbour County, AL",1005.0,2003,2003.0,All other non-drug and non-alcohol causes,O9,287.0
5,,"Bibb County, AL",1007.0,2003,2003.0,All other non-drug and non-alcohol causes,O9,226.0
6,,"Blount County, AL",1009.0,2003,2003.0,All other non-drug and non-alcohol causes,O9,552.0
7,,"Bullock County, AL",1011.0,2003,2003.0,All other non-drug and non-alcohol causes,O9,121.0
8,,"Butler County, AL",1013.0,2003,2003.0,All other non-drug and non-alcohol causes,O9,308.0
9,,"Calhoun County, AL",1015.0,2003,2003.0,All other non-drug and non-alcohol causes,O9,1286.0


In [21]:
# Dimensions of the raw data as (rows, columns)
overdoses.shape

(57436, 8)

In [22]:
# Preview the County column to see how county and state are encoded
# (the state filter in a later cell matches on the end of these strings)
overdoses["County"].head(10)

0    Autauga County, AL
1    Baldwin County, AL
2    Baldwin County, AL
3    Baldwin County, AL
4    Barbour County, AL
5       Bibb County, AL
6     Blount County, AL
7    Bullock County, AL
8     Butler County, AL
9    Calhoun County, AL
Name: County, dtype: object

In [23]:
# Restrict the data to the policy states. The control states still
# need to be added to this list.
selected_states = ["FL", "TX", "WA", "WV", "TN", "OH", "NE", "WY", "KS", "OR", "IL"]
state_suffixes = tuple(selected_states)

# Rows without a County value are removed first so the string match below
# only sees real strings.
has_county = overdoses["County"].notna()
filtered_df = overdoses[has_county]

# Keep rows whose County string ends with one of the state abbreviations.
in_selected_state = filtered_df["County"].str.endswith(state_suffixes)
filtered_df = filtered_df[in_selected_state]

filtered_df.head(10)

Unnamed: 0,Notes,County,County Code,Year,Year Code,Drug/Alcohol Induced Cause,Drug/Alcohol Induced Cause Code,Deaths
500,,"Alachua County, FL",12001.0,2003,2003.0,Drug poisonings (overdose) Unintentional (X40-...,D1,11.0
501,,"Alachua County, FL",12001.0,2003,2003.0,All other alcohol-induced causes,A9,11.0
502,,"Alachua County, FL",12001.0,2003,2003.0,All other non-drug and non-alcohol causes,O9,1568.0
503,,"Baker County, FL",12003.0,2003,2003.0,All other non-drug and non-alcohol causes,O9,200.0
504,,"Bay County, FL",12005.0,2003,2003.0,Drug poisonings (overdose) Unintentional (X40-...,D1,21.0
505,,"Bay County, FL",12005.0,2003,2003.0,All other alcohol-induced causes,A9,16.0
506,,"Bay County, FL",12005.0,2003,2003.0,All other non-drug and non-alcohol causes,O9,1405.0
507,,"Bradford County, FL",12007.0,2003,2003.0,All other non-drug and non-alcohol causes,O9,243.0
508,,"Brevard County, FL",12009.0,2003,2003.0,Drug poisonings (overdose) Unintentional (X40-...,D1,83.0
509,,"Brevard County, FL",12009.0,2003,2003.0,Drug poisonings (overdose) Suicide (X60-X64),D2,14.0


In [24]:
# Check that the state filter reduced the row count (compare with overdoses.shape)
filtered_df.shape

(17217, 8)

In [25]:
# Number of distinct County values left after the state filter,
# shown as a one-element shape tuple
filtered_df["County"].unique().shape

(951,)

In [26]:
# Collect the distinct cause-of-death categories (as an array) so they can
# be used for further filtering.
cause_column = "Drug/Alcohol Induced Cause"
causes_of_death = pd.unique(filtered_df[cause_column])
causes_of_death

array(['Drug poisonings (overdose) Unintentional (X40-X44)',
       'All other alcohol-induced causes',
       'All other non-drug and non-alcohol causes',
       'Drug poisonings (overdose) Suicide (X60-X64)',
       'All other drug-induced causes',
       'Drug poisonings (overdose) Undetermined (Y10-Y14)',
       'Alcohol poisonings (overdose) (X45, X65, Y15)'], dtype=object)

In [27]:
# Drop the Notes column and the cause-code column; the cause text column
# is kept.
unused_columns = ["Notes", "Drug/Alcohol Induced Cause Code"]
filtered_df = filtered_df.drop(unused_columns, axis="columns")

In [28]:
# Confirm the column count (second element) went down after the drop
filtered_df.shape

(17217, 6)

In [29]:
# Pike County, OH is an example of a county that has drug-death rows in some
# years but not in others.
# The cell ends on the bare expression (instead of print) so Jupyter renders
# the frame as a table rather than wrapping its columns as plain text.
filtered_df[filtered_df["County"] == "Pike County, OH"]

                County  County Code  Year  Year Code  \
2767   Pike County, OH      39131.0  2003     2003.0   
6883   Pike County, OH      39131.0  2004     2004.0   
11111  Pike County, OH      39131.0  2005     2005.0   
15352  Pike County, OH      39131.0  2006     2006.0   
19669  Pike County, OH      39131.0  2007     2007.0   
24042  Pike County, OH      39131.0  2008     2008.0   
28421  Pike County, OH      39131.0  2009     2009.0   
32839  Pike County, OH      39131.0  2010     2010.0   
37332  Pike County, OH      39131.0  2011     2011.0   
37333  Pike County, OH      39131.0  2011     2011.0   
41892  Pike County, OH      39131.0  2012     2012.0   
46476  Pike County, OH      39131.0  2013     2013.0   
51112  Pike County, OH      39131.0  2014     2014.0   
55851  Pike County, OH      39131.0  2015     2015.0   
55852  Pike County, OH      39131.0  2015     2015.0   

                              Drug/Alcohol Induced Cause Deaths  
2767           All other non-drug and

In [30]:
# Collapse the three drug-poisoning categories and the residual drug-induced
# category into one "Drug Causes" label. All other categories are untouched.
drug_cause_labels = [
    "Drug poisonings (overdose) Unintentional (X40-X44)",
    "Drug poisonings (overdose) Suicide (X60-X64)",
    "Drug poisonings (overdose) Undetermined (Y10-Y14)",
    "All other drug-induced causes",
]
cause_column = "Drug/Alcohol Induced Cause"
relabel_map = dict.fromkeys(drug_cause_labels, "Drug Causes")
filtered_df[cause_column] = filtered_df[cause_column].replace(relabel_map)

# Spot-check the relabeling on a single county.
is_broward = filtered_df["County"] == "Broward County, FL"
filtered_df[is_broward]

Unnamed: 0,County,County Code,Year,Year Code,Drug/Alcohol Induced Cause,Deaths
512,"Broward County, FL",12011.0,2003,2003.0,Drug Causes,170.0
513,"Broward County, FL",12011.0,2003,2003.0,Drug Causes,60.0
514,"Broward County, FL",12011.0,2003,2003.0,All other alcohol-induced causes,93.0
515,"Broward County, FL",12011.0,2003,2003.0,All other non-drug and non-alcohol causes,15414.0
4618,"Broward County, FL",12011.0,2004,2004.0,Drug Causes,190.0
...,...,...,...,...,...,...
53245,"Broward County, FL",12011.0,2015,2015.0,Drug Causes,252
53246,"Broward County, FL",12011.0,2015,2015.0,Drug Causes,27
53247,"Broward County, FL",12011.0,2015,2015.0,"Alcohol poisonings (overdose) (X45, X65, Y15)",10
53248,"Broward County, FL",12011.0,2015,2015.0,All other alcohol-induced causes,157


In [31]:
# Coerce Deaths to numeric; entries that cannot be parsed become NaN.
filtered_df["Deaths"] = pd.to_numeric(filtered_df["Deaths"], errors="coerce")

# Filter rows where the cause is 'Drug Causes'
drug_causes_df = filtered_df[filtered_df["Drug/Alcohol Induced Cause"] == "Drug Causes"]

# Total the drug deaths per county and year. County Code is part of the
# grouping key so that it stays available as a column in the result.
# min_count=1 leaves the total as NaN when every Deaths value in a group was
# coerced to NaN; a plain sum would report 0.0 for such a group, which would
# look like a real count of zero and hide the gap from later isna() checks.
result_df = (
    drug_causes_df.groupby(["County", "Year", "County Code"])["Deaths"]
    .sum(min_count=1)
    .reset_index()
)
result_df[result_df["County"] == "Broward County, FL"]

Unnamed: 0,County,Year,County Code,Deaths
168,"Broward County, FL",2003,12011.0,230.0
169,"Broward County, FL",2004,12011.0,287.0
170,"Broward County, FL",2005,12011.0,320.0
171,"Broward County, FL",2006,12011.0,282.0
172,"Broward County, FL",2007,12011.0,282.0
173,"Broward County, FL",2008,12011.0,326.0
174,"Broward County, FL",2009,12011.0,305.0
175,"Broward County, FL",2010,12011.0,262.0
176,"Broward County, FL",2011,12011.0,233.0
177,"Broward County, FL",2012,12011.0,228.0


In [32]:
# Number of distinct counties in the selected states
# (the recorded run of this cell shows 951)
distinct_counties = filtered_df["County"].unique()
len(distinct_counties)

951

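`result_df` has one row for every county-year with a drug-death entry (1,934 rows for 595 counties when this note was first written). A complete panel would have one row per county per year. The range 2003–2015 is inclusive, so it spans 13 years, not 12: 595 × 13 = 7,735 expected observations for drug deaths. Note that the cell above now reports 951 counties, which would give 951 × 13 = 12,363; the counts in this note need to be re-checked against the current list of states.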

In [33]:
import os

# Every (county, year) combination present in the filtered data, whatever the
# cause of death. County Code must be selected here because it is one of the
# merge keys below; without it pd.merge raises KeyError: 'County Code'.
all_counties_years = filtered_df[["County", "Year", "County Code"]].drop_duplicates()

# Left-join the drug-death totals onto the full county-year set, so that
# county-years without a drug-death total end up with Deaths = NaN.
merged_df = pd.merge(
    all_counties_years, result_df, on=["County", "Year", "County Code"], how="left"
)

# Identify rows where there is no drug death entry
no_drug_death_entries = merged_df[merged_df["Deaths"].isna()]

# Print the result
print("Years with no drug death entry for each county:")
print(no_drug_death_entries[["County", "Year"]])

output_directory = "state_text_files"

# Create the directory if it doesn't exist
os.makedirs(output_directory, exist_ok=True)

# Build the path from output_directory so the folder name is defined once.
output_file_path = os.path.join(output_directory, "counties_with_missingDeaths.txt")
no_drug_death_entries.to_csv(output_file_path, sep="\t", index=False)

Years with no drug death entry for each county:
                      County  Year  County Code
1           Baker County, FL  2003          NaN
3        Bradford County, FL  2003          NaN
6         Calhoun County, FL  2003          NaN
7       Charlotte County, FL  2003          NaN
11       Columbia County, FL  2003          NaN
...                      ...   ...          ...
12202  Sweetwater County, WY  2015          NaN
12203       Teton County, WY  2015          NaN
12204       Uinta County, WY  2015          NaN
12205    Washakie County, WY  2015          NaN
12206      Weston County, WY  2015          NaN

[9869 rows x 3 columns]


In [34]:
# Save the per-county, per-year drug-death totals as a tab-separated text
# file, then preview the first ten rows.
output_file_path1 = os.path.join("state_text_files", "drug_deaths.txt")
result_df.to_csv(
    output_file_path1,
    index=False,
    sep="\t",
)
result_df.head(n=10)

Unnamed: 0,County,Year,County Code,Deaths
0,"Adams County, IL",2015,17001.0,14.0
1,"Adams County, OH",2012,39001.0,10.0
2,"Adams County, OH",2014,39001.0,10.0
3,"Adams County, OH",2015,39001.0,13.0
4,"Alachua County, FL",2003,12001.0,11.0
5,"Alachua County, FL",2007,12001.0,17.0
6,"Alachua County, FL",2009,12001.0,15.0
7,"Alachua County, FL",2010,12001.0,15.0
8,"Alachua County, FL",2011,12001.0,14.0
9,"Alachua County, FL",2012,12001.0,14.0


In [35]:
# Write the distinct county names found in merged_df to a one-column,
# tab-separated text file.
unique_counties = merged_df["County"].unique()
unique_counties_df = pd.DataFrame(unique_counties, columns=["County"])

output_file_path2 = os.path.join("state_text_files", "all_counties.txt")
unique_counties_df.to_csv(
    output_file_path2,
    index=False,
    sep="\t",
)

### To-Do & Problems

> Still need to fill in or otherwise handle the missing data, depending on what we decide

> Convert this notebook into a .py file once it is complete