In [1]:
# Importing Libraries
import pandas as pd
import numpy as np

In [2]:
#######################################################

In [3]:
# Population Data
pop_data_path = "https://github.com/MIDS-at-Duke/opioid-2023-group-8-final-opioid/raw/population_cleaning/20_Intermediate_Files/Cleaned_Population_Data.csv"
pop_data = pd.read_csv(pop_data_path)
pop_data["Year"] = pop_data["Year"].str[:4]
pop_data["State_CD"] = pop_data["County"].str[-2:]
pop_data["County Code"] = pop_data["County Code"].astype(str)
pop_data.shape
# (97519, 7)
pop_data.head()

Unnamed: 0,Year,State,State Code,County,County Code,Population,Population_filled,State_CD
0,1990,Alabama,1,"Autauga County, AL",1001,34353.0,34353.0,AL
1,1990,Ohio,39,"Lorain County, OH",39093,271711.0,271711.0,OH
2,1990,Ohio,39,"Lucas County, OH",39095,462634.0,462634.0,OH
3,1990,Ohio,39,"Madison County, OH",39097,37111.0,37111.0,OH
4,1990,Ohio,39,"Mahoning County, OH",39099,265095.0,265095.0,OH


In [4]:
# Deaths Data
dea_data_path = "https://github.com/MIDS-at-Duke/opioid-2023-group-8-final-opioid/raw/death_cleaning/20_Intermediate_Files/Deaths.csv"
dea_data = pd.read_csv(dea_data_path)
dea_data["Year"] = dea_data["Year"].astype(str)
dea_data["County Code"] = dea_data["County Code"].astype(str)
dea_data.shape
# (7888, 5)
dea_data.head()

Unnamed: 0,Year,County Code,County,State,Deaths
0,2003,10001,"Kent County, DE",DE,10.0
1,2003,10003,"New Castle County, DE",DE,54.0
2,2003,1003,"Baldwin County, AL",AL,10.0
3,2003,1073,"Jefferson County, AL",AL,69.0
4,2003,1097,"Mobile County, AL",AL,26.0


In [5]:
# Dosages Data
dos_data_path = "https://github.com/MIDS-at-Duke/opioid-2023-group-8-final-opioid/raw/data_merging/20_Intermediate_Files/Dosage_with_CountyCode.csv"
dos_data = pd.read_csv(dos_data_path)
dos_data["transaction_year"] = dos_data["transaction_year"].astype(str)
dos_data["fips"] = dos_data["fips"].astype(str)
dos_data.shape
# (13092, 5)
dos_data.head()

Unnamed: 0,state,name,MME,transaction_year,fips
0,OR,washington,151346100.0,2015,41067
1,OR,multnomah,278541200.0,2015,41051
2,OR,marion,117917400.0,2015,41047
3,OR,lane,197906600.0,2015,41039
4,OR,tillamook,19689170.0,2015,41057


In [6]:
#######################################################

In [7]:
fl_pop_data_test_and_control = pop_data[
    pop_data["State"].isin(["Florida", "Georgia", "Alabama", "Tennessee"])
]
# Checks

# fl_pop_data_test_and_control["State"].unique()
# array(['Alabama', 'Tennessee', 'Georgia', 'Florida'], dtype=object)

# fl_pop_data_test_and_control["State"].value_counts()
# State
# Georgia      4929
# Tennessee    2945
# Alabama      2077
# Florida      2077
# Name: count, dtype: int64

In [8]:
fl_dea_data_test_and_control = dea_data[
    dea_data["State"].isin(["FL", "GA", "AL", "TN"])
]
# Checks

# fl_dea_data_test_and_control["State"].unique()
# array(['AL', 'FL', 'GA', 'TN'], dtype=object)

# fl_dea_data_test_and_control["State"].value_counts()
# State
# FL    458
# GA    308
# TN    273
# AL    166
# Name: count, dtype: int64

In [9]:
fl_dos_data_test_and_control = dos_data[
    dos_data["state"].isin(["FL", "GA", "TN", "AL"])
]
# Checks

# fl_dos_data_test_and_control["state"].unique()
# array(['ID', 'OK', 'OR', 'WA'], dtype=object)

# fl_dos_data_test_and_control["state"].value_counts()
# state
# GA    2122
# TN    1326
# AL     938
# FL     935
# Name: count, dtype: int64

In [10]:
# Merging Population and Deaths Data
fl_pop_dea_merged = fl_pop_data_test_and_control.merge(
    fl_dea_data_test_and_control,
    how="left",
    left_on=["State_CD", "County Code", "Year"],
    right_on=["State", "County Code", "Year"],
)
# Checks

fl_pop_data_test_and_control.shape
# (12028, 8)
fl_pop_dea_merged.shape
# (12028, 11)
fl_dea_data_test_and_control["Deaths"].sum()
# 52033.0
fl_pop_dea_merged["Deaths"].sum()
# 52033.0

52033.0

In [11]:
# Merging Population + Deaths and Dosages Data

fl_pop_dea_dos_merged = fl_pop_dea_merged.merge(
    fl_dos_data_test_and_control,
    how="left",
    left_on=["State_CD", "County Code", "Year"],
    right_on=["state", "fips", "transaction_year"],
)

# Checks
fl_pop_dea_merged.shape
# (12028, 11)
fl_pop_dea_dos_merged.shape
# (12028, 16)
fl_dos_data_test_and_control["MME"].sum()
# 264345521881.9433
fl_pop_dea_dos_merged["MME"].sum()
# 264345521881.94333

264345521881.94333

In [12]:
# Filling the Dosage Null Values with 0
fl_pop_dea_dos_merged["MME"] = fl_pop_dea_dos_merged["MME"].fillna(0)
# For the Deaths data, we will impute the mortality rate for each county by taking the average of the mortality rate for the state in that year for counties with available data
fl_pop_dea_dos = fl_pop_dea_dos_merged[
    [
        "Year",
        "State_CD",
        "County_x",
        "County Code",
        "Population_filled",
        "Deaths",
        "MME",
    ]
]
fl_pop_dea_dos.rename(
    columns={
        "County_x": "County",
        "State_CD": "State Code",
        "Population_filled": "Population",
        "MME": "Dosage",
    },
    inplace=True,
)
# fl_pop_dea_dos.head()
# Filter rows where Dosage > 0
filtered_df = fl_pop_dea_dos[
    (fl_pop_dea_dos["Dosage"] > 0) & (fl_pop_dea_dos["Deaths"] > 0)
]

# Calculate average deaths per population for each state per year
filtered_df["Avg_Deaths_Per_Population"] = filtered_df.groupby(["State Code", "Year"])[
    "Deaths"
].transform("sum") / filtered_df.groupby(["State Code", "Year"])[
    "Population"
].transform(
    "sum"
)
# Merge the new column back to the original DataFrame
fl_pop_dea_dos = fl_pop_dea_dos.merge(
    filtered_df[["State Code", "Year", "County Code", "Avg_Deaths_Per_Population"]],
    on=["State Code", "Year"],
    how="left",
)
fl_pop_dea_dos["Deaths_Per_Population_Times_Population"] = (
    (fl_pop_dea_dos["Avg_Deaths_Per_Population"] * fl_pop_dea_dos["Population"]).round(
        0
    )
).clip(upper=9)
# Check
fl_pop_dea_dos[
    (
        fl_pop_dea_dos["Deaths_Per_Population_Times_Population"].notnull()
        & fl_pop_dea_dos["Deaths"].isnull()
    )
]["Deaths_Per_Population_Times_Population"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fl_pop_dea_dos.rename(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["Avg_Deaths_Per_Population"] = filtered_df.groupby(["State Code", "Year"])[


Deaths_Per_Population_Times_Population
2.0    16204
1.0    14640
3.0    11420
9.0     6628
4.0     6609
5.0     4505
6.0     3705
7.0     2623
0.0     1965
8.0     1737
Name: count, dtype: int64

In [13]:
fl_pop_dea_dos["Deaths"] = fl_pop_dea_dos["Deaths"].fillna(
    fl_pop_dea_dos["Deaths_Per_Population_Times_Population"]
)
fl_pop_dea_dos_final = fl_pop_dea_dos[
    ["Year", "State Code", "County", "County Code_x", "Population", "Deaths", "Dosage"]
]
fl_pop_dea_dos_final.rename(
    columns={
        "County Code_x": "County",
    },
    inplace=True,
)
fl_pop_dea_dos_final.to_csv("../20_Intermediate_Files/Florida_Merged.csv", index=False)
fl_pop_dea_dos_final.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fl_pop_dea_dos_final.rename(


Unnamed: 0,Year,State Code,County,County.1,Population,Deaths,Dosage
0,1990,AL,"Autauga County, AL",1001,34353.0,,0.0
1,1990,TN,"Madison County, TN",47113,78345.0,,0.0
2,1990,TN,"Marion County, TN",47115,24712.0,,0.0
3,1990,TN,"Marshall County, TN",47117,21741.0,,0.0
4,1990,TN,"Maury County, TN",47119,55262.0,,0.0


In [14]:
#######################################################

In [15]:
wa_pop_data_test_and_control = pop_data[
    pop_data["State"].isin(["Washington", "Oregon", "Idaho", "Oklahoma"])
]

# Checks
# wa_pop_data_test_and_control["State"].unique()
# array(['Oklahoma', 'Texas', 'Arkansas', 'Louisiana'], dtype=object)

# wa_pop_data_test_and_control["State"].value_counts()
# State
# Oklahoma      2387
# Idaho         1364
# Washington    1209
# Oregon        1116
# Name: count, dtype: int64

In [16]:
wa_dea_data_test_and_control = dea_data[
    dea_data["State"].isin(["WA", "OK", "ID", "OR"])
]
# Checks

# wa_dea_data_test_and_control["State"].unique()
# array(['ID', 'OK', 'OR', 'WA'], dtype=object)

# wa_dea_data_test_and_control["State"].value_counts()
# State
# WA    198
# OK    141
# OR    102
# ID     38
# Name: count, dtype: int64

In [17]:
wa_dos_data_test_and_control = dos_data[
    dos_data["state"].isin(["WA", "OK", "ID", "OR"])
]
# Checks

# wa_dos_data_test_and_control["state"].unique()
# array(['OR', 'ID', 'OK', 'WA'], dtype=object)

# wa_dos_data_test_and_control["state"].value_counts()
# state
# OK    1078
# ID     571
# WA     546
# OR     484
# Name: count, dtype: int64

In [18]:
# Merging Population and Deaths Data
wa_pop_dea_merged = wa_pop_data_test_and_control.merge(
    wa_dea_data_test_and_control,
    how="left",
    left_on=["State_CD", "County Code", "Year"],
    right_on=["State", "County Code", "Year"],
)
# Checks

wa_pop_data_test_and_control.shape
# (6076, 8)
wa_pop_dea_merged.shape
# (6076, 11)
wa_dea_data_test_and_control["Deaths"].sum()
# 20362.0
wa_pop_dea_merged["Deaths"].sum()
# 20362.0

20362.0

In [19]:
# Merging Population + Deaths and Dosages Data

wa_pop_dea_dos_merged = wa_pop_dea_merged.merge(
    wa_dos_data_test_and_control,
    how="left",
    left_on=["State_CD", "County Code", "Year"],
    right_on=["state", "fips", "transaction_year"],
)

# Checks
wa_pop_dea_merged.shape
# (6076, 11)
wa_pop_dea_dos_merged.shape
# (6076, 16)
wa_dos_data_test_and_control["MME"].sum()
# 87252759791.51248
wa_pop_dea_dos_merged["MME"].sum()
# 87252759791.51248

87252759791.51248

In [20]:
# Filling the Dosage Null Values with 0
wa_pop_dea_dos_merged["MME"] = wa_pop_dea_dos_merged["MME"].fillna(0)
# For the Deaths data, we will impute the mortality rate for each county by taking the average of the mortality rate for the state in that year for counties with available data
wa_pop_dea_dos = wa_pop_dea_dos_merged[
    [
        "Year",
        "State_CD",
        "County_x",
        "County Code",
        "Population_filled",
        "Deaths",
        "MME",
    ]
]
wa_pop_dea_dos.rename(
    columns={
        "County_x": "County",
        "State_CD": "State Code",
        "Population_filled": "Population",
        "MME": "Dosage",
    },
    inplace=True,
)
# wa_pop_dea_dos.head()
# Filter rows where Dosage > 0
filtered_df = wa_pop_dea_dos[
    (wa_pop_dea_dos["Dosage"] > 0) & (wa_pop_dea_dos["Deaths"] > 0)
]

# Calculate average deaths per population for each state per year
filtered_df["Avg_Deaths_Per_Population"] = filtered_df.groupby(["State Code", "Year"])[
    "Deaths"
].transform("sum") / filtered_df.groupby(["State Code", "Year"])[
    "Population"
].transform(
    "sum"
)
# Merge the new column back to the original DataFrame
wa_pop_dea_dos = wa_pop_dea_dos.merge(
    filtered_df[["State Code", "Year", "County Code", "Avg_Deaths_Per_Population"]],
    on=["State Code", "Year"],
    how="left",
)
wa_pop_dea_dos["Deaths_Per_Population_Times_Population"] = (
    (wa_pop_dea_dos["Avg_Deaths_Per_Population"] * wa_pop_dea_dos["Population"]).round(
        0
    )
).clip(upper=9)
# Check
wa_pop_dea_dos[
    (
        wa_pop_dea_dos["Deaths_Per_Population_Times_Population"].notnull()
        & wa_pop_dea_dos["Deaths"].isnull()
    )
]["Deaths_Per_Population_Times_Population"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wa_pop_dea_dos.rename(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["Avg_Deaths_Per_Population"] = filtered_df.groupby(["State Code", "Year"])[


Deaths_Per_Population_Times_Population
1.0    3221
2.0    2665
9.0    2260
3.0    2185
6.0    1048
4.0     919
5.0     893
8.0     857
0.0     756
7.0     547
Name: count, dtype: int64

In [21]:
wa_pop_dea_dos["Deaths"] = wa_pop_dea_dos["Deaths"].fillna(
    wa_pop_dea_dos["Deaths_Per_Population_Times_Population"]
)
wa_pop_dea_dos_final = wa_pop_dea_dos[
    ["Year", "State Code", "County", "County Code_x", "Population", "Deaths", "Dosage"]
]
wa_pop_dea_dos_final.rename(
    columns={
        "County Code_x": "County",
    },
    inplace=True,
)
wa_pop_dea_dos_final.to_csv(
    "../20_Intermediate_Files/Washington_Merged.csv", index=False
)
wa_pop_dea_dos_final.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wa_pop_dea_dos_final.rename(


Unnamed: 0,Year,State Code,County,County.1,Population,Deaths,Dosage
0,1990,OK,"Caddo County, OK",40015,29421.0,,0.0
1,1990,OK,"Canadian County, OK",40017,74662.0,,0.0
2,1990,OK,"Carter County, OK",40019,42899.0,,0.0
3,1990,OK,"Cherokee County, OK",40021,34170.0,,0.0
4,1990,OK,"Choctaw County, OK",40023,15317.0,,0.0


In [22]:
#######################################################

In [23]:
tx_pop_data_test_and_control = pop_data[
    pop_data["State"].isin(["Texas", "Oklahoma", "Arkansas", "Louisiana"])
]

# Checks

# tx_pop_data_test_and_control["State"].unique()
# array(['Oklahoma', 'Texas', 'Arkansas', 'Louisiana'], dtype=object)

# tx_pop_data_test_and_control["State"].value_counts()
# State
# Texas        7874
# Oklahoma     2387
# Arkansas     2325
# Louisiana    1984
# Name: count, dtype: int64

In [24]:
tx_dea_data_test_and_control = dea_data[
    dea_data["State"].isin(["TX", "OK", "AR", "LA"])
]

# Checks
# tx_dea_data_test_and_control["State"].unique()
# array(['LA', 'OK', 'TX', 'AR'], dtype=object)

# tx_dea_data_test_and_control["State"].value_counts()
# State
# TX    432
# LA    203
# OK    141
# AR     71
# Name: count, dtype: int64

In [25]:
tx_dos_data_test_and_control = dos_data[
    dos_data["state"].isin(["TX", "OK", "AR", "LA"])
]
# Checks

# tx_dos_data_test_and_control["state"].unique()
# array(['ID', 'OK', 'OR', 'WA'], dtype=object)

# tx_dos_data_test_and_control["state"].value_counts()
# state
# TX    3160
# OK    1078
# AR    1046
# LA     886
# Name: count, dtype: int64

In [26]:
# Merging Population and Deaths Data
tx_pop_dea_merged = tx_pop_data_test_and_control.merge(
    tx_dea_data_test_and_control,
    how="left",
    left_on=["State_CD", "County Code", "Year"],
    right_on=["State", "County Code", "Year"],
)
# Checks

tx_pop_data_test_and_control.shape
# (14570, 8)
tx_pop_dea_merged.shape
# (14570, 11)
tx_dea_data_test_and_control["Deaths"].sum()
# 35067.0
tx_pop_dea_merged["Deaths"].sum()
# 35067.0

35067.0

In [27]:
# Merging Population + Deaths and Dosages Data

tx_pop_dea_dos_merged = tx_pop_dea_merged.merge(
    tx_dos_data_test_and_control,
    how="left",
    left_on=["State_CD", "County Code", "Year"],
    right_on=["state", "fips", "transaction_year"],
)

# Checks
tx_pop_dea_merged.shape
# (14570, 11)
tx_pop_dea_dos_merged.shape
# (14570, 16)
tx_dos_data_test_and_control["MME"].sum()
# 135540889564.20512
tx_pop_dea_dos_merged["MME"].sum()
# 135540889564.20511

135540889564.20511

In [28]:
# Filling the Dosage Null Values with 0
tx_pop_dea_dos_merged["MME"] = tx_pop_dea_dos_merged["MME"].fillna(0)
# For the Deaths data, we will impute the mortality rate for each county by taking the average of the mortality rate for the state in that year for counties with available data
tx_pop_dea_dos = tx_pop_dea_dos_merged[
    [
        "Year",
        "State_CD",
        "County_x",
        "County Code",
        "Population_filled",
        "Deaths",
        "MME",
    ]
]
tx_pop_dea_dos.rename(
    columns={
        "County_x": "County",
        "State_CD": "State Code",
        "Population_filled": "Population",
        "MME": "Dosage",
    },
    inplace=True,
)
# tx_pop_dea_dos.head()
# Filter rows where Dosage > 0
filtered_df = tx_pop_dea_dos[
    (tx_pop_dea_dos["Dosage"] > 0) & (tx_pop_dea_dos["Deaths"] > 0)
]

# Calculate average deaths per population for each state per year
filtered_df["Avg_Deaths_Per_Population"] = filtered_df.groupby(["State Code", "Year"])[
    "Deaths"
].transform("sum") / filtered_df.groupby(["State Code", "Year"])[
    "Population"
].transform(
    "sum"
)
# Merge the new column back to the original DataFrame
tx_pop_dea_dos = tx_pop_dea_dos.merge(
    filtered_df[["State Code", "Year", "County Code", "Avg_Deaths_Per_Population"]],
    on=["State Code", "Year"],
    how="left",
)
tx_pop_dea_dos["Deaths_Per_Population_Times_Population"] = (
    (tx_pop_dea_dos["Avg_Deaths_Per_Population"] * tx_pop_dea_dos["Population"]).round(
        0
    )
).clip(upper=9)
# Check
tx_pop_dea_dos[
    (
        tx_pop_dea_dos["Deaths_Per_Population_Times_Population"].notnull()
        & tx_pop_dea_dos["Deaths"].isnull()
    )
]["Deaths_Per_Population_Times_Population"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tx_pop_dea_dos.rename(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["Avg_Deaths_Per_Population"] = filtered_df.groupby(["State Code", "Year"])[


Deaths_Per_Population_Times_Population
1.0    27180
0.0    18966
2.0    17755
3.0     9284
4.0     7282
9.0     5244
5.0     4458
6.0     2716
7.0     2365
8.0     2043
Name: count, dtype: int64

In [29]:
tx_pop_dea_dos["Deaths"] = tx_pop_dea_dos["Deaths"].fillna(
    tx_pop_dea_dos["Deaths_Per_Population_Times_Population"]
)
tx_pop_dea_dos_final = tx_pop_dea_dos[
    ["Year", "State Code", "County", "County Code_x", "Population", "Deaths", "Dosage"]
]
tx_pop_dea_dos_final.rename(
    columns={
        "County Code_x": "County",
    },
    inplace=True,
)
tx_pop_dea_dos_final.to_csv("../20_Intermediate_Files/Texas_Merged.csv", index=False)
tx_pop_dea_dos_final.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tx_pop_dea_dos_final.rename(


Unnamed: 0,Year,State Code,County,County.1,Population,Deaths,Dosage
0,1990,OK,"Caddo County, OK",40015,29421.0,,0.0
1,1990,OK,"Canadian County, OK",40017,74662.0,,0.0
2,1990,OK,"Carter County, OK",40019,42899.0,,0.0
3,1990,OK,"Cherokee County, OK",40021,34170.0,,0.0
4,1990,OK,"Choctaw County, OK",40023,15317.0,,0.0
