In [1]:
# Importing Libraries
import pandas as pd
import numpy as np

In [2]:
#######################################################

In [3]:
# Population data: one row per county and year.
pop_data_path = "https://github.com/MIDS-at-Duke/opioid-2023-group-8-final-opioid/raw/population_cleaning/20_Intermediate_Files/Cleaned_Population_Data.csv"
pop_data = pd.read_csv(pop_data_path)
pop_data = pop_data.assign(
    **{
        # Keep only the first four characters of the year label.
        "Year": pop_data["Year"].str[:4],
        # The last two characters of the county name are the state abbreviation
        # (e.g. "Autauga County, AL" -> "AL").
        "State_CD": pop_data["County"].str[-2:],
        # County codes are compared as strings in the merges further down.
        "County Code": pop_data["County Code"].astype(str),
    }
)
# Recorded pop_data.shape from an earlier run: (97519, 7)
# NOTE(review): later subsets of this frame report 8 columns, so this figure
# was presumably recorded before State_CD was added - confirm.
pop_data.head()

Unnamed: 0,Year,State,State Code,County,County Code,Population,Population_filled,State_CD
0,1990,Alabama,1,"Autauga County, AL",1001,34353.0,34353.0,AL
1,1990,Ohio,39,"Lorain County, OH",39093,271711.0,271711.0,OH
2,1990,Ohio,39,"Lucas County, OH",39095,462634.0,462634.0,OH
3,1990,Ohio,39,"Madison County, OH",39097,37111.0,37111.0,OH
4,1990,Ohio,39,"Mahoning County, OH",39099,265095.0,265095.0,OH


In [4]:
# Deaths data: county-level death counts per year.
dea_data_path = "https://github.com/MIDS-at-Duke/opioid-2023-group-8-final-opioid/raw/death_cleaning/20_Intermediate_Files/Deaths.csv"
# Year and County Code are cast to strings so they can serve as merge keys
# against the population frame.
dea_data = pd.read_csv(dea_data_path).astype({"Year": str, "County Code": str})
# Recorded dea_data.shape from an earlier run: (7888, 5)
dea_data.head()

Unnamed: 0,Year,County Code,County,State,Deaths
0,2003,10001,"Kent County, DE",DE,10.0
1,2003,10003,"New Castle County, DE",DE,54.0
2,2003,1003,"Baldwin County, AL",AL,10.0
3,2003,1073,"Jefferson County, AL",AL,69.0
4,2003,1097,"Mobile County, AL",AL,26.0


In [5]:
# Dosage data: MME per county (identified by fips) and year.
dos_data_path = "https://github.com/MIDS-at-Duke/opioid-2023-group-8-final-opioid/raw/county_code_to_dosage_data/20_Intermediate_Files/Dosage_with_CountyCode.csv"
dos_data = pd.read_csv(dos_data_path)
dos_data = dos_data.assign(
    Year=dos_data["Year"].astype(str),
    # Missing fips become the string "0"; the int round-trip strips any
    # decimal part before the value is turned into a string merge key.
    fips=dos_data["fips"].fillna(0).astype(int).astype(str),
)
# Recorded dos_data.shape from an earlier run: (43152, 5)
dos_data.head()

Unnamed: 0,name,state,Year,MME,fips
0,abbeville,SC,2006,3136215.0,45001
1,abbeville,SC,2007,3232603.0,45001
2,abbeville,SC,2008,3070698.0,45001
3,abbeville,SC,2009,3827607.0,45001
4,abbeville,SC,2010,4612935.0,45001


In [6]:
# Test and control states for each analysis.
# Each list starts with the test state, followed by its control states.

# Florida (FL) vs. Kentucky (KY), Tennessee (TN), Oregon (OR)
fl_states = ["FL"] + ["KY", "TN", "OR"]
# Washington (WA) vs. Ohio (OH), Minnesota (MN), Arkansas (AR)
wa_states = ["WA"] + ["OH", "MN", "AR"]
# Texas (TX) vs. Ohio (OH), Kentucky (KY), Maine (ME)
tx_states = ["TX"] + ["OH", "KY", "ME"]

# Florida

In [7]:
#######################################################

In [8]:
# Population rows for Florida and its control states.
fl_pop_data_test_and_control = pop_data.loc[pop_data["State_CD"].isin(fl_states)]

# Sanity checks recorded from an earlier run:
#   ["State"].unique()       -> ['Oregon', 'Tennessee', 'Florida', 'Kentucky']
#   ["State"].value_counts() -> Kentucky 3720, Tennessee 2945, Florida 2077, Oregon 1116
#   .shape                   -> (9858, 8)

In [9]:
# Death records for Florida and its control states.
fl_dea_data_test_and_control = dea_data.loc[dea_data["State"].isin(fl_states)]

# Sanity checks recorded from an earlier run:
#   ["State"].unique()       -> ['FL', 'KY', 'OR', 'TN']
#   ["State"].value_counts() -> FL 458, TN 273, KY 252, OR 102
#   .shape                   -> (1085, 5)

In [10]:
# Dosage records for Florida and its control states.
fl_dos_data_test_and_control = dos_data.loc[dos_data["state"].isin(fl_states)]

# Sanity checks recorded from an earlier run:
#   ["state"].unique()       -> ['KY', 'FL', 'TN', 'OR']
#   ["state"].value_counts() -> KY 1666, TN 1326, FL 935, OR 484
#   .shape                   -> (4411, 5)

In [11]:
# Attach death counts to every county-year population row.
# A left join keeps all population rows, including those with no death record.
fl_pop_dea_merged = pd.merge(
    fl_pop_data_test_and_control,
    fl_dea_data_test_and_control,
    left_on=["State_CD", "County Code", "Year"],
    right_on=["State", "County Code", "Year"],
    how="left",
)

# Sanity checks recorded from an earlier run:
#   fl_pop_data_test_and_control.shape            -> (9858, 8)
#   fl_pop_dea_merged.shape                       -> (9858, 11)  (row count unchanged)
#   fl_dea_data_test_and_control["Deaths"].sum()  -> 50859.0
#   fl_pop_dea_merged["Deaths"].sum()             -> 50859.0     (no deaths lost)

In [12]:
# Attach dosage (MME) to the population + deaths frame.
# The dosage file identifies counties by "fips" and states by "state".
fl_pop_dea_dos_merged = pd.merge(
    fl_pop_dea_merged,
    fl_dos_data_test_and_control,
    left_on=["State_CD", "County Code", "Year"],
    right_on=["state", "fips", "Year"],
    how="left",
)

# Sanity checks recorded from an earlier run:
#   fl_pop_dea_merged.shape                    -> (9858, 11)
#   fl_pop_dea_dos_merged.shape                -> (9858, 15)  (row count unchanged)
#   fl_dos_data_test_and_control["MME"].sum()  -> 239764150784.0797
#   fl_pop_dea_dos_merged["MME"].sum()         -> 239764150784.0797  (no dosage lost)

In [13]:
# County-years without a dosage record get a dosage of 0.
fl_pop_dea_dos_merged["MME"] = fl_pop_dea_dos_merged["MME"].fillna(0)

# Keep the analysis columns only and give them their final names.
fl_pop_dea_dos = fl_pop_dea_dos_merged[
    ["Year", "State_CD", "County_x", "County Code", "Population_filled", "Deaths", "MME"]
].rename(
    columns={
        "County_x": "County",
        "State_CD": "State Code",
        "Population_filled": "Population",
        "MME": "Dosage",
    }
)

# Missing death counts are imputed from a state-year mortality rate:
# total deaths / total population over the counties that report BOTH
# a positive dosage and a positive death count.
has_dosage = fl_pop_dea_dos["Dosage"].gt(0)
has_deaths = fl_pop_dea_dos["Deaths"].gt(0)
filtered_df = fl_pop_dea_dos.loc[has_dosage & has_deaths].copy()

state_year_keys = ["State Code", "Year"]
state_year_totals = filtered_df.groupby(state_year_keys)[
    ["Deaths", "Population"]
].transform("sum")
filtered_df["Avg_Deaths_Per_Population"] = (
    state_year_totals["Deaths"] / state_year_totals["Population"]
)

# The rate is constant within a state-year, so max() just collapses each
# group to that single value.
state_year_df = (
    filtered_df.groupby(state_year_keys)["Avg_Deaths_Per_Population"]
    .max()
    .reset_index()
)

# Bring the state-year rate back onto every county-year row.
fl_pop_dea_dos = pd.merge(
    fl_pop_dea_dos,
    state_year_df[["State Code", "Year", "Avg_Deaths_Per_Population"]],
    how="left",
    on=state_year_keys,
)

# Expected deaths = rate * county population, rounded and capped at 9.
# NOTE(review): the cap presumably reflects that unreported counts are small
# (below 10) - confirm against the deaths data source.
expected_deaths = fl_pop_dea_dos["Avg_Deaths_Per_Population"].mul(
    fl_pop_dea_dos["Population"]
)
fl_pop_dea_dos["Deaths_Per_Population_Times_Population"] = expected_deaths.round(
    0
).clip(upper=9)

# Check: distribution of the estimates that will fill missing death counts.
to_impute = (
    fl_pop_dea_dos["Deaths_Per_Population_Times_Population"].notna()
    & fl_pop_dea_dos["Deaths"].isna()
)
fl_pop_dea_dos.loc[
    to_impute, "Deaths_Per_Population_Times_Population"
].value_counts()

Deaths_Per_Population_Times_Population
3.0    432
2.0    387
9.0    320
4.0    296
1.0    229
5.0    220
6.0    158
7.0    119
8.0     93
0.0     34
Name: count, dtype: int64

In [14]:
# Fill missing death counts with the capped state-year estimate held in
# "Deaths_Per_Population_Times_Population". Rows whose estimate is itself
# null stay null (e.g. the 1990 rows in the output below, which have no dosage).
fl_pop_dea_dos["Deaths"] = fl_pop_dea_dos["Deaths"].fillna(
    fl_pop_dea_dos["Deaths_Per_Population_Times_Population"]
)

# Keep only the final analysis columns; the helper rate/estimate columns are
# dropped here. The former rename of "County Code_x" was removed: that column
# never exists in this selection, so the call was a silent no-op (and would
# have produced a duplicate "County" column had it ever matched).
fl_pop_dea_dos_final = fl_pop_dea_dos[
    ["Year", "State Code", "County", "County Code", "Population", "Deaths", "Dosage"]
].copy()
fl_pop_dea_dos_final.head()

Unnamed: 0,Year,State Code,County,County Code,Population,Deaths,Dosage
0,1990,OR,"Baker County, OR",41001,15433.0,,0.0
1,1990,OR,"Benton County, OR",41003,71059.0,,0.0
2,1990,OR,"Clackamas County, OR",41005,280862.0,,0.0
3,1990,OR,"Tillamook County, OR",41057,21638.0,,0.0
4,1990,OR,"Umatilla County, OR",41059,59433.0,,0.0


In [15]:
# Duplicate check on the working frame (before the helper columns are dropped):
# flag every row that shares its (Year, State Code, County) key with another row.
duplicates = fl_pop_dea_dos.duplicated(["Year", "State Code", "County"], keep=False)
fl_pop_dea_dos.loc[duplicates]
# An empty result means every county-year appears exactly once.

Unnamed: 0,Year,State Code,County,County Code,Population,Deaths,Dosage,Avg_Deaths_Per_Population,Deaths_Per_Population_Times_Population


In [16]:
fl_pop_dea_dos_final.shape
# Recorded values from an earlier run (the commented expressions were run by hand):
# shape -> (9858, 7)
# fl_pop_dea_dos_final["Deaths"].sum()
# 60847.0   (higher than the pre-imputation total because imputed counts are included)
# fl_pop_dea_dos_final["Dosage"].sum()
# 239764150784.07974
# fl_pop_dea_dos_final["Population"].sum()
# 968254762.0

(9858, 7)

In [17]:
# Same duplicate check, now on the final Florida frame.
duplicates = fl_pop_dea_dos_final.duplicated(
    ["Year", "State Code", "County"], keep=False
)
fl_pop_dea_dos_final.loc[duplicates]
# An empty result means there are no duplicated county-years.

Unnamed: 0,Year,State Code,County,County Code,Population,Deaths,Dosage


In [18]:
# Keep data up to and including 2015, since deaths data is only available up to 2015.
# "Year" holds strings; the comparison is lexicographic, which matches numeric
# order for four-digit years. Rows are dropped only when Year compares greater
# than "2015".
fl_pop_dea_dos_final = fl_pop_dea_dos_final.loc[
    ~fl_pop_dea_dos_final["Year"].gt("2015")
]

In [19]:
# Imputation check: rows with a positive dosage but still no death count.
deaths_missing = fl_pop_dea_dos_final["Deaths"].isna()
dosage_positive = (
    fl_pop_dea_dos_final["Dosage"].notna() & fl_pop_dea_dos_final["Dosage"].gt(0)
)
fl_pop_dea_dos_final.loc[deaths_missing & dosage_positive]
# An empty result means every row with dosage data has a death count,
# i.e. the imputation covered all of them.

Unnamed: 0,Year,State Code,County,County Code,Population,Deaths,Dosage


In [20]:
# Persist the merged Florida dataset (row index not written).
fl_pop_dea_dos_final.to_csv(
    path_or_buf="../20_Intermediate_Files/Florida_Merged.csv",
    index=False,
)

# Washington

In [21]:
#######################################################

In [22]:
# Population rows for Washington and its control states.
wa_pop_data_test_and_control = pop_data.loc[pop_data["State_CD"].isin(wa_states)]

# Sanity checks recorded from an earlier run:
#   ["State"].unique()       -> ['Ohio', 'Washington', 'Arkansas', 'Minnesota']
#   ["State"].value_counts() -> Ohio 2728, Minnesota 2697, Arkansas 2325, Washington 1209
#   .shape                   -> (8959, 8)
# (The shape was previously noted as (8969, 8); the counts above sum to 8959,
# which also matches the shape recorded for the merge of this frame.)

In [23]:
# Death records for Washington and its control states.
wa_dea_data_test_and_control = dea_data.loc[dea_data["State"].isin(wa_states)]

# Sanity checks recorded from an earlier run:
#   ["State"].unique()       -> ['MN', 'OH', 'AR', 'WA']
#   ["State"].value_counts() -> OH 412, WA 198, MN 77, AR 71
#   .shape                   -> (758, 5)

In [24]:
# Dosage records for Washington and its control states.
wa_dos_data_test_and_control = dos_data.loc[dos_data["state"].isin(wa_states)]

# Sanity checks recorded from an earlier run:
#   ["state"].unique()       -> ['OH', 'WA', 'MN', 'AR']
#   ["state"].value_counts() -> OH 1232, MN 1218, AR 1046, WA 546
#   .shape                   -> (4042, 5)

In [25]:
# Attach death counts to every county-year population row.
# A left join keeps all population rows, including those with no death record.
wa_pop_dea_merged = pd.merge(
    wa_pop_data_test_and_control,
    wa_dea_data_test_and_control,
    left_on=["State_CD", "County Code", "Year"],
    right_on=["State", "County Code", "Year"],
    how="left",
)

# Sanity checks recorded from an earlier run:
#   wa_pop_data_test_and_control.shape            -> (8959, 8)
#   wa_pop_dea_merged.shape                       -> (8959, 11)  (row count unchanged)
#   wa_dea_data_test_and_control["Deaths"].sum()  -> 33870.0
#   wa_pop_dea_merged["Deaths"].sum()             -> 33870.0     (no deaths lost)

In [26]:
# Attach dosage (MME) to the population + deaths frame.
# The dosage file identifies counties by "fips" and states by "state".
wa_pop_dea_dos_merged = pd.merge(
    wa_pop_dea_merged,
    wa_dos_data_test_and_control,
    left_on=["State_CD", "County Code", "Year"],
    right_on=["state", "fips", "Year"],
    how="left",
)

# Sanity checks recorded from an earlier run:
#   wa_pop_dea_merged.shape                    -> (8959, 11)
#   wa_pop_dea_dos_merged.shape                -> (8959, 15)  (row count unchanged)
#   wa_dos_data_test_and_control["MME"].sum()  -> 119025022592.69534
#   wa_pop_dea_dos_merged["MME"].sum()         -> 119025022592.69534  (no dosage lost)

In [27]:
# County-years without a dosage record get a dosage of 0.
wa_pop_dea_dos_merged["MME"] = wa_pop_dea_dos_merged["MME"].fillna(0)

# Keep the analysis columns only and give them their final names.
wa_pop_dea_dos = wa_pop_dea_dos_merged[
    ["Year", "State_CD", "County_x", "County Code", "Population_filled", "Deaths", "MME"]
].rename(
    columns={
        "County_x": "County",
        "State_CD": "State Code",
        "Population_filled": "Population",
        "MME": "Dosage",
    }
)

# Missing death counts are imputed from a state-year mortality rate:
# total deaths / total population over the counties that report BOTH
# a positive dosage and a positive death count.
has_dosage = wa_pop_dea_dos["Dosage"].gt(0)
has_deaths = wa_pop_dea_dos["Deaths"].gt(0)
filtered_df = wa_pop_dea_dos.loc[has_dosage & has_deaths].copy()

state_year_keys = ["State Code", "Year"]
state_year_totals = filtered_df.groupby(state_year_keys)[
    ["Deaths", "Population"]
].transform("sum")
filtered_df["Avg_Deaths_Per_Population"] = (
    state_year_totals["Deaths"] / state_year_totals["Population"]
)

# The rate is constant within a state-year, so max() just collapses each
# group to that single value.
state_year_df = (
    filtered_df.groupby(state_year_keys)["Avg_Deaths_Per_Population"]
    .max()
    .reset_index()
)

# Bring the state-year rate back onto every county-year row.
wa_pop_dea_dos = pd.merge(
    wa_pop_dea_dos,
    state_year_df[["State Code", "Year", "Avg_Deaths_Per_Population"]],
    how="left",
    on=state_year_keys,
)

# Expected deaths = rate * county population, rounded and capped at 9.
# NOTE(review): the cap presumably reflects that unreported counts are small
# (below 10) - confirm against the deaths data source.
expected_deaths = wa_pop_dea_dos["Avg_Deaths_Per_Population"].mul(
    wa_pop_dea_dos["Population"]
)
wa_pop_dea_dos["Deaths_Per_Population_Times_Population"] = expected_deaths.round(
    0
).clip(upper=9)

# Check: distribution of the estimates that will fill missing death counts.
to_impute = (
    wa_pop_dea_dos["Deaths_Per_Population_Times_Population"].notna()
    & wa_pop_dea_dos["Deaths"].isna()
)
wa_pop_dea_dos.loc[
    to_impute, "Deaths_Per_Population_Times_Population"
].value_counts()

Deaths_Per_Population_Times_Population
1.0    562
2.0    446
9.0    303
3.0    253
4.0    174
6.0    134
5.0    130
7.0     94
0.0     81
8.0     71
Name: count, dtype: int64

In [28]:
# Fill missing death counts with the capped state-year estimate held in
# "Deaths_Per_Population_Times_Population". Rows whose estimate is itself
# null stay null (e.g. the 1990 rows in the output below, which have no dosage).
wa_pop_dea_dos["Deaths"] = wa_pop_dea_dos["Deaths"].fillna(
    wa_pop_dea_dos["Deaths_Per_Population_Times_Population"]
)

# Keep only the final analysis columns; the helper rate/estimate columns are
# dropped here. The former rename of "County Code_x" was removed: that column
# never exists in this selection, so the call was a silent no-op (and would
# have produced a duplicate "County" column had it ever matched).
wa_pop_dea_dos_final = wa_pop_dea_dos[
    ["Year", "State Code", "County", "County Code", "Population", "Deaths", "Dosage"]
].copy()
wa_pop_dea_dos_final.head()

Unnamed: 0,Year,State Code,County,County Code,Population,Deaths,Dosage
0,1990,OH,"Lorain County, OH",39093,271711.0,,0.0
1,1990,OH,"Lucas County, OH",39095,462634.0,,0.0
2,1990,OH,"Madison County, OH",39097,37111.0,,0.0
3,1990,OH,"Mahoning County, OH",39099,265095.0,,0.0
4,1990,OH,"Marion County, OH",39101,64289.0,,0.0


In [29]:
# Duplicate check on the working frame (before the helper columns are dropped):
# flag every row that shares its (Year, State Code, County) key with another row.
duplicates = wa_pop_dea_dos.duplicated(["Year", "State Code", "County"], keep=False)
wa_pop_dea_dos.loc[duplicates]
# An empty result means every county-year appears exactly once.

Unnamed: 0,Year,State Code,County,County Code,Population,Deaths,Dosage,Avg_Deaths_Per_Population,Deaths_Per_Population_Times_Population


In [30]:
wa_pop_dea_dos_final.shape
# Recorded values from an earlier run (the commented expressions were run by hand):
# shape -> (8959, 7)
# wa_pop_dea_dos_final["Deaths"].sum()
# 42186.0   (higher than the pre-imputation total because imputed counts are included)
# wa_pop_dea_dos_final["Dosage"].sum()
# 119025022592.69534
# wa_pop_dea_dos_final["Population"].sum()
# 793122268.0

(8959, 7)

In [31]:
# Same duplicate check, now on the final Washington frame.
duplicates = wa_pop_dea_dos_final.duplicated(
    ["Year", "State Code", "County"], keep=False
)
wa_pop_dea_dos_final.loc[duplicates]
# An empty result means there are no duplicated county-years.

Unnamed: 0,Year,State Code,County,County Code,Population,Deaths,Dosage


In [32]:
# Keep data up to and including 2015, since deaths data is only available up to 2015.
# "Year" holds strings; the comparison is lexicographic, which matches numeric
# order for four-digit years. Rows are dropped only when Year compares greater
# than "2015".
wa_pop_dea_dos_final = wa_pop_dea_dos_final.loc[
    ~wa_pop_dea_dos_final["Year"].gt("2015")
]

In [33]:
# Imputation check: rows with a positive dosage but still no death count.
deaths_missing = wa_pop_dea_dos_final["Deaths"].isna()
dosage_positive = (
    wa_pop_dea_dos_final["Dosage"].notna() & wa_pop_dea_dos_final["Dosage"].gt(0)
)
wa_pop_dea_dos_final.loc[deaths_missing & dosage_positive]
# An empty result means every row with dosage data has a death count,
# i.e. the imputation covered all of them.

Unnamed: 0,Year,State Code,County,County Code,Population,Deaths,Dosage


In [34]:
# Persist the merged Washington dataset (row index not written).
wa_pop_dea_dos_final.to_csv(
    path_or_buf="../20_Intermediate_Files/Washington_Merged.csv",
    index=False,
)

# Texas

In [35]:
#######################################################

In [36]:
# Population rows for Texas and its control states.
tx_pop_data_test_and_control = pop_data.loc[pop_data["State_CD"].isin(tx_states)]

# Sanity checks recorded from an earlier run:
#   ["State"].unique()       -> ['Ohio', 'Texas', 'Maine', 'Kentucky']
#   ["State"].value_counts() -> Texas 7874, Kentucky 3720, Ohio 2728, Maine 496
#   .shape                   -> (14818, 8)

In [37]:
# Death records for Texas and its control states.
tx_dea_data_test_and_control = dea_data.loc[dea_data["State"].isin(tx_states)]

# Sanity checks recorded from an earlier run:
#   ["State"].unique()       -> ['KY', 'ME', 'OH', 'TX']
#   ["State"].value_counts() -> TX 432, OH 412, KY 252, ME 58
#   .shape                   -> (1154, 5)

In [38]:
# Dosage records for Texas and its control states.
tx_dos_data_test_and_control = dos_data.loc[dos_data["state"].isin(tx_states)]

# Sanity checks recorded from an earlier run:
#   ["state"].unique()       -> ['KY', 'OH', 'TX', 'ME']
#   ["state"].value_counts() -> TX 3160, KY 1666, OH 1232, ME 224
#   .shape                   -> (6282, 5)

In [39]:
# Attach death counts to every county-year population row.
# A left join keeps all population rows, including those with no death record.
tx_pop_dea_merged = pd.merge(
    tx_pop_data_test_and_control,
    tx_dea_data_test_and_control,
    left_on=["State_CD", "County Code", "Year"],
    right_on=["State", "County Code", "Year"],
    how="left",
)

# Sanity checks recorded from an earlier run:
#   tx_pop_data_test_and_control.shape            -> (14818, 8)
#   tx_pop_dea_merged.shape                       -> (14818, 11)  (row count unchanged)
#   tx_dea_data_test_and_control["Deaths"].sum()  -> 49373.0
#   tx_pop_dea_merged["Deaths"].sum()             -> 49373.0      (no deaths lost)

In [40]:
# Attach dosage (MME) to the population + deaths frame.
# The dosage file identifies counties by "fips" and states by "state".
tx_pop_dea_dos_merged = pd.merge(
    tx_pop_dea_merged,
    tx_dos_data_test_and_control,
    left_on=["State_CD", "County Code", "Year"],
    right_on=["state", "fips", "Year"],
    how="left",
)

# Sanity checks recorded from an earlier run:
#   tx_pop_dea_merged.shape                    -> (14818, 11)
#   tx_pop_dea_dos_merged.shape                -> (14818, 15)  (row count unchanged)
#   tx_dos_data_test_and_control["MME"].sum()  -> 160148299865.21362
#   tx_pop_dea_dos_merged["MME"].sum()         -> 160148299865.21362  (no dosage lost)

In [41]:
# County-years without a dosage record get a dosage of 0.
tx_pop_dea_dos_merged["MME"] = tx_pop_dea_dos_merged["MME"].fillna(0)

# Keep the analysis columns only and give them their final names.
tx_pop_dea_dos = tx_pop_dea_dos_merged[
    ["Year", "State_CD", "County_x", "County Code", "Population_filled", "Deaths", "MME"]
].rename(
    columns={
        "County_x": "County",
        "State_CD": "State Code",
        "Population_filled": "Population",
        "MME": "Dosage",
    }
)

# Missing death counts are imputed from a state-year mortality rate:
# total deaths / total population over the counties that report BOTH
# a positive dosage and a positive death count.
has_dosage = tx_pop_dea_dos["Dosage"].gt(0)
has_deaths = tx_pop_dea_dos["Deaths"].gt(0)
filtered_df = tx_pop_dea_dos.loc[has_dosage & has_deaths].copy()

state_year_keys = ["State Code", "Year"]
state_year_totals = filtered_df.groupby(state_year_keys)[
    ["Deaths", "Population"]
].transform("sum")
filtered_df["Avg_Deaths_Per_Population"] = (
    state_year_totals["Deaths"] / state_year_totals["Population"]
)

# The rate is constant within a state-year, so max() just collapses each
# group to that single value.
state_year_df = (
    filtered_df.groupby(state_year_keys)["Avg_Deaths_Per_Population"]
    .max()
    .reset_index()
)

# Bring the state-year rate back onto every county-year row.
tx_pop_dea_dos = pd.merge(
    tx_pop_dea_dos,
    state_year_df[["State Code", "Year", "Avg_Deaths_Per_Population"]],
    how="left",
    on=state_year_keys,
)

# Expected deaths = rate * county population, rounded and capped at 9.
# NOTE(review): the cap presumably reflects that unreported counts are small
# (below 10) - confirm against the deaths data source.
expected_deaths = tx_pop_dea_dos["Avg_Deaths_Per_Population"].mul(
    tx_pop_dea_dos["Population"]
)
tx_pop_dea_dos["Deaths_Per_Population_Times_Population"] = expected_deaths.round(
    0
).clip(upper=9)

# Check: distribution of the estimates that will fill missing death counts.
to_impute = (
    tx_pop_dea_dos["Deaths_Per_Population_Times_Population"].notna()
    & tx_pop_dea_dos["Deaths"].isna()
)
tx_pop_dea_dos.loc[
    to_impute, "Deaths_Per_Population_Times_Population"
].value_counts()

Deaths_Per_Population_Times_Population
1.0    730
2.0    591
0.0    537
9.0    425
3.0    420
4.0    361
5.0    268
6.0    192
7.0    160
8.0    126
Name: count, dtype: int64

In [42]:
# Fill missing death counts with the capped state-year estimate held in
# "Deaths_Per_Population_Times_Population". Rows whose estimate is itself
# null stay null (e.g. the 1990 rows in the output below, which have no dosage).
tx_pop_dea_dos["Deaths"] = tx_pop_dea_dos["Deaths"].fillna(
    tx_pop_dea_dos["Deaths_Per_Population_Times_Population"]
)

# Keep only the final analysis columns; the helper rate/estimate columns are
# dropped here. The former rename of "County Code_x" was removed: that column
# never exists in this selection, so the call was a silent no-op (and would
# have produced a duplicate "County" column had it ever matched).
tx_pop_dea_dos_final = tx_pop_dea_dos[
    ["Year", "State Code", "County", "County Code", "Population", "Deaths", "Dosage"]
].copy()
tx_pop_dea_dos_final.head()

Unnamed: 0,Year,State Code,County,County Code,Population,Deaths,Dosage
0,1990,OH,"Lorain County, OH",39093,271711.0,,0.0
1,1990,OH,"Lucas County, OH",39095,462634.0,,0.0
2,1990,OH,"Madison County, OH",39097,37111.0,,0.0
3,1990,OH,"Mahoning County, OH",39099,265095.0,,0.0
4,1990,OH,"Marion County, OH",39101,64289.0,,0.0


In [43]:
# Duplicate check on the working frame (before the helper columns are dropped):
# flag every row that shares its (Year, State Code, County) key with another row.
duplicates = tx_pop_dea_dos.duplicated(["Year", "State Code", "County"], keep=False)
tx_pop_dea_dos.loc[duplicates]
# An empty result means every county-year appears exactly once.

Unnamed: 0,Year,State Code,County,County Code,Population,Deaths,Dosage,Avg_Deaths_Per_Population,Deaths_Per_Population_Times_Population


In [44]:
tx_pop_dea_dos_final.shape
# Recorded values from an earlier run (the commented expressions were run by hand):
# shape -> (14818, 7)
# tx_pop_dea_dos_final["Deaths"].sum()
# 62434.0   (higher than the pre-imputation total because imputed counts are included)
# tx_pop_dea_dos_final["Dosage"].sum()
# 160148299865.21362
# tx_pop_dea_dos_final["Population"].sum()
# 1239297690.0

(14818, 7)

In [45]:
# Same duplicate check, now on the final Texas frame.
duplicates = tx_pop_dea_dos_final.duplicated(
    ["Year", "State Code", "County"], keep=False
)
tx_pop_dea_dos_final.loc[duplicates]
# An empty result means there are no duplicated county-years.

Unnamed: 0,Year,State Code,County,County Code,Population,Deaths,Dosage


In [46]:
# Keep data up to and including 2015, since deaths data is only available up to 2015.
# "Year" holds strings; the comparison is lexicographic, which matches numeric
# order for four-digit years. Rows are dropped only when Year compares greater
# than "2015".
tx_pop_dea_dos_final = tx_pop_dea_dos_final.loc[
    ~tx_pop_dea_dos_final["Year"].gt("2015")
]

In [47]:
# Texas imputation check: rows that still lack a death count although they
# have a positive dosage.
# Fix: this cell previously inspected wa_pop_dea_dos_final (the Washington
# frame), a copy-paste slip inside the Texas section, so it said nothing
# about the Texas data.
tx_pop_dea_dos_final[
    (tx_pop_dea_dos_final["Deaths"].isnull())
    & (tx_pop_dea_dos_final["Dosage"].notnull())
    & (tx_pop_dea_dos_final["Dosage"] > 0)
]
# 0 rows means no null values in the Deaths column where there is data in the
# Dosage column - so the data imputation has been performed correctly

Unnamed: 0,Year,State Code,County,County Code,Population,Deaths,Dosage


In [48]:
# Imputation check: rows with a positive dosage but still no death count.
deaths_missing = tx_pop_dea_dos_final["Deaths"].isna()
dosage_positive = (
    tx_pop_dea_dos_final["Dosage"].notna() & tx_pop_dea_dos_final["Dosage"].gt(0)
)
tx_pop_dea_dos_final.loc[deaths_missing & dosage_positive]
# An empty result means every row with dosage data has a death count,
# i.e. the imputation covered all of them.

Unnamed: 0,Year,State Code,County,County Code,Population,Deaths,Dosage


In [49]:
# Persist the merged Texas dataset (row index not written).
tx_pop_dea_dos_final.to_csv(
    path_or_buf="../20_Intermediate_Files/Texas_Merged.csv",
    index=False,
)