## Merge ARCOS, Population, and Mortality Data

Combine datasets to create county-year panel:
- ARCOS prescription data (MME totals)
- Population estimates  
- Drug overdose deaths

### Import Libraries

In [1]:
# Imports and display settings
import pandas as pd
import numpy as np

pd.set_option("mode.copy_on_write", True)

### Load ARCOS Data

In [2]:
# Load ARCOS data and inspect structure
arcos = pd.read_csv("../01_data/clean/arcos_end_use_only.csv")
arcos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9948 entries, 0 to 9947
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   BUYER_STATE   9948 non-null   object 
 1   BUYER_COUNTY  9948 non-null   object 
 2   YEAR          9948 non-null   int64  
 3   TOTAL_MME     9948 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 311.0+ KB


In [3]:
# Check for missing values in ARCOS data
arcos.isna().sum()

BUYER_STATE     0
BUYER_COUNTY    0
YEAR            0
TOTAL_MME       0
dtype: int64

In [4]:
# Create merge key for ARCOS data
arcos["merge_on_arcos"] = (
    arcos["BUYER_COUNTY"].str.upper() + " COUNTY, " + arcos["BUYER_STATE"]
)

In [5]:
# Show sample merge keys from ARCOS data
arcos["merge_on_arcos"].sample(5)

1650     LIBERTY COUNTY, FL
4060      CUSTER COUNTY, ID
8058       HORRY COUNTY, SC
9849     SPOKANE COUNTY, WA
6101    TREASURE COUNTY, MT
Name: merge_on_arcos, dtype: object

### Load Population and Mortality Data

In [6]:
# Load and filter population data to ARCOS overlap years
pop = pd.read_csv("../01_data/clean/population_2000_2024.csv")
pop_overlap = pop[(pop["year"] >= 2006) & (pop["year"] <= 2015)].copy()
pop_overlap.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9306 entries, 5076 to 14381
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   STATE       9306 non-null   int64 
 1   COUNTY      9306 non-null   int64 
 2   STNAME      9306 non-null   object
 3   CTYNAME     9306 non-null   object
 4   year        9306 non-null   int64 
 5   population  9306 non-null   int64 
 6   fips        9306 non-null   int64 
dtypes: int64(5), object(2)
memory usage: 581.6+ KB


In [7]:
# Load and aggregate mortality data by county-year
mort = pd.read_csv("../01_data/clean/merged_mortality_population.csv")
mort_filtered = mort[mort["Drug/Alcohol Induced Cause Code"] == "D1"].copy()
mort_aggregated = mort_filtered.groupby(
    ["County", "County Code", "year"], as_index=False
).agg(
    {
        "Deaths": "sum",
        "Drug/Alcohol Induced Cause": lambda x: "; ".join(x.unique()),
        "Drug/Alcohol Induced Cause Code": lambda x: ", ".join(sorted(x.unique())),
    }
)
mort_aggregated.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2049 entries, 0 to 2048
Data columns (total 6 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   County                           2049 non-null   object 
 1   County Code                      2049 non-null   int64  
 2   year                             2049 non-null   int64  
 3   Deaths                           2049 non-null   float64
 4   Drug/Alcohol Induced Cause       2049 non-null   object 
 5   Drug/Alcohol Induced Cause Code  2049 non-null   object 
dtypes: float64(1), int64(2), object(3)
memory usage: 96.2+ KB


In [8]:
# Create merge keys for population and mortality data
state_abbrev = {
    "Florida": "FL",
    "Washington": "WA",
    "North Carolina": "NC",
    "Georgia": "GA",
    "Oregon": "OR",
    "South Carolina": "SC",
    "Idaho": "ID",
    "Montana": "MT",
    "Tennessee": "TN",
    "Mississippi": "MS",
    "Colorado": "CO",
    "California": "CA",
}

pop_overlap["STATE_ABBREV"] = pop_overlap["STNAME"].map(state_abbrev)
pop_overlap["merge_key"] = (
    pop_overlap["CTYNAME"].str.replace(" County", "").str.upper()
    + " COUNTY, "
    + pop_overlap["STATE_ABBREV"]
)
pop_overlap["County Code"] = pop_overlap["fips"].astype(str)

mort_aggregated["merge_key"] = mort_aggregated["County"].str.upper()
mort_aggregated["County Code"] = mort_aggregated["County Code"].astype(str)

In [9]:
# Merge population with mortality data
pop_with_mort = pop_overlap.merge(
    mort_aggregated[
        [
            "County Code",
            "year",
            "Deaths",
            "Drug/Alcohol Induced Cause",
            "Drug/Alcohol Induced Cause Code",
        ]
    ],
    on=["County Code", "year"],
    how="left",
)

print(f"Population observations (2006-2015): {len(pop_overlap)}")
print(f"After merging with mortality: {len(pop_with_mort)}")
print(
    f"Observations with mortality data: {pop_with_mort['Deaths'].notna().sum()} ({pop_with_mort['Deaths'].notna().mean()*100:.1f}%)"
)

Population observations (2006-2015): 9306
After merging with mortality: 9306
Observations with mortality data: 1849 (19.9%)


In [10]:
# Merge population+mortality with ARCOS data for overlap years
arcos_overlap = arcos[(arcos["YEAR"] >= 2006) & (arcos["YEAR"] <= 2015)].copy()
merged = pop_with_mort.merge(
    arcos_overlap[
        ["BUYER_STATE", "BUYER_COUNTY", "YEAR", "TOTAL_MME", "merge_on_arcos"]
    ],
    how="left",
    left_on=["merge_key", "year"],
    right_on=["merge_on_arcos", "YEAR"],
)

# Canonicalize year column and drop others
if "year" in merged.columns:
    merged["Year"] = merged["year"]
if "YEAR" in merged.columns:
    merged = merged.drop(columns=["YEAR"])
if "year" in merged.columns:
    merged = merged.drop(columns=["year"])

print(f"\nFinal merged observations: {len(merged)}")
print(
    f"Observations with ARCOS data: {merged['TOTAL_MME'].notna().sum()} ({merged['TOTAL_MME'].notna().mean()*100:.1f}%)"
)
print(
    f"Observations with mortality data: {merged['Deaths'].notna().sum()} ({merged['Deaths'].notna().mean()*100:.1f}%)"
)


Final merged observations: 9306
Observations with ARCOS data: 7774 (83.5%)
Observations with mortality data: 1849 (19.9%)


In [11]:
# Debug: Check merge keys and matches
print("Sample population merge keys:")
print(pop_with_mort["merge_key"].head(10).tolist())
print("\nSample ARCOS merge keys:")
print(arcos_overlap["merge_on_arcos"].head(10).tolist())

pop_keys = set(pop_with_mort["merge_key"].unique())
arcos_keys = set(arcos_overlap["merge_on_arcos"].unique())
print(f"\nPopulation unique keys: {len(pop_keys)}")
print(f"ARCOS unique keys: {len(arcos_keys)}")
print(f"Matching keys: {len(pop_keys & arcos_keys)}")

Sample population merge keys:
['ALAMEDA COUNTY, CA', 'ALPINE COUNTY, CA', 'AMADOR COUNTY, CA', 'BUTTE COUNTY, CA', 'CALAVERAS COUNTY, CA', 'COLUSA COUNTY, CA', 'CONTRA COSTA COUNTY, CA', 'DEL NORTE COUNTY, CA', 'EL DORADO COUNTY, CA', 'FRESNO COUNTY, CA']

Sample ARCOS merge keys:
['ALPINE COUNTY, CA', 'ALPINE COUNTY, CA', 'ALPINE COUNTY, CA', 'ALPINE COUNTY, CA', 'ALPINE COUNTY, CA', 'ALPINE COUNTY, CA', 'ALPINE COUNTY, CA', 'AMADOR COUNTY, CA', 'AMADOR COUNTY, CA', 'AMADOR COUNTY, CA']

Population unique keys: 846
ARCOS unique keys: 752
Matching keys: 748


In [12]:
# Show a sample of the merged dataset
merged.sample(5)

Unnamed: 0,STATE,COUNTY,STNAME,CTYNAME,population,fips,STATE_ABBREV,merge_key,County Code,Deaths,Drug/Alcohol Induced Cause,Drug/Alcohol Induced Cause Code,BUYER_STATE,BUYER_COUNTY,TOTAL_MME,merge_on_arcos,Year
1136,13,207,Georgia,Monroe County,25726,13207,GA,"MONROE COUNTY, GA",13207,,,,GA,MONROE,12696860.0,"MONROE COUNTY, GA",2007
4038,41,49,Oregon,Morrow County,11207,41049,OR,"MORROW COUNTY, OR",41049,,,,OR,MORROW,2425501.0,"MORROW COUNTY, OR",2010
4273,6,87,California,Santa Cruz County,263147,6087,CA,"SANTA CRUZ COUNTY, CA",6087,,,,CA,SANTA CRUZ,287541000.0,"SANTA CRUZ COUNTY, CA",2010
4976,47,69,Tennessee,Hardeman County,27169,47069,TN,"HARDEMAN COUNTY, TN",47069,,,,TN,HARDEMAN,12391560.0,"HARDEMAN COUNTY, TN",2010
7821,13,37,Georgia,Calhoun County,6506,13037,GA,"CALHOUN COUNTY, GA",13037,,,,GA,CALHOUN,1741602.0,"CALHOUN COUNTY, GA",2014


In [13]:
# Save the final merged dataset
merged.to_csv("../01_data/clean/final_merged.csv", index=False)
print(f"Saved final_merged.csv with {len(merged)} rows")
print(f"Years covered: {merged['Year'].min()}-{merged['Year'].max()}")
print(f"Counties: {merged['CTYNAME'].nunique()}")

Saved final_merged.csv with 9306 rows
Years covered: 2006-2015
Counties: 646
