## Merge ARCOS, Population, and Mortality Data

Combine datasets to create county-year panel:
- ARCOS prescription data (MME totals)
- Population estimates  
- Drug overdose deaths

### Import Libraries

In [14]:
import pandas as pd
import numpy as np

pd.set_option("mode.copy_on_write", True)

### Load ARCOS Data

In [15]:
arcos = pd.read_csv("../01_data/clean/arcos_end_use_only.csv")
arcos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6879 entries, 0 to 6878
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   BUYER_STATE   6879 non-null   object 
 1   BUYER_COUNTY  6879 non-null   object 
 2   YEAR          6879 non-null   int64  
 3   TOTAL_MME     6879 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 215.1+ KB


In [16]:
arcos.isna().sum()

BUYER_STATE     0
BUYER_COUNTY    0
YEAR            0
TOTAL_MME       0
dtype: int64

In [17]:
# Create merge key
arcos["merge_on_arcos"] = (
    arcos["BUYER_COUNTY"].str.upper() + " COUNTY, " + arcos["BUYER_STATE"]
)

In [18]:
arcos["merge_on_arcos"].sample(5)

4852     SCOTLAND COUNTY, NC
1054    CHATTOOGA COUNTY, GA
338       JACKSON COUNTY, FL
4331       GASTON COUNTY, NC
2173      SCREVEN COUNTY, GA
Name: merge_on_arcos, dtype: object

### Load Population and Mortality Data

In [19]:
# Load population data
pop = pd.read_csv("../01_data/clean/population_2000_2024.csv")

# Filter to 2006-2015 to match ARCOS overlap period
pop_overlap = pop[(pop["year"] >= 2006) & (pop["year"] <= 2015)].copy()

pop_overlap.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6336 entries, 3456 to 9791
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   STATE       6336 non-null   int64 
 1   COUNTY      6336 non-null   int64 
 2   STNAME      6336 non-null   object
 3   CTYNAME     6336 non-null   object
 4   year        6336 non-null   int64 
 5   population  6336 non-null   int64 
 6   fips        6336 non-null   int64 
dtypes: int64(5), object(2)
memory usage: 396.0+ KB


In [20]:
# Load mortality data separately
mort = pd.read_csv("../01_data/clean/merged_mortality_population.csv")
mort_filtered = mort[mort["Drug/Alcohol Induced Cause Code"] == "D1"].copy()

# Aggregate mortality by county-year
mort_aggregated = mort_filtered.groupby(
    ["County", "County Code", "year"], as_index=False
).agg(
    {
        "Deaths": "sum",
        "Drug/Alcohol Induced Cause": lambda x: "; ".join(x.unique()),
        "Drug/Alcohol Induced Cause Code": lambda x: ", ".join(sorted(x.unique())),
    }
)

mort_aggregated.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1774 entries, 0 to 1773
Data columns (total 6 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   County                           1774 non-null   object 
 1   County Code                      1774 non-null   int64  
 2   year                             1774 non-null   int64  
 3   Deaths                           1774 non-null   float64
 4   Drug/Alcohol Induced Cause       1774 non-null   object 
 5   Drug/Alcohol Induced Cause Code  1774 non-null   object 
dtypes: float64(1), int64(2), object(3)
memory usage: 83.3+ KB


In [21]:
# Create merge keys
# Map full state names to abbreviations to match ARCOS format
state_abbrev = {
    "Florida": "FL",
    "Georgia": "GA",
    "Idaho": "ID",
    "Montana": "MT",
    "North Carolina": "NC",
    "Oregon": "OR",
    "South Carolina": "SC",
    "Utah": "UT",
    "Washington": "WA",
}

pop_overlap["STATE_ABBREV"] = pop_overlap["STNAME"].map(state_abbrev)
pop_overlap["merge_key"] = (
    pop_overlap["CTYNAME"].str.replace(" County", "").str.upper()
    + " COUNTY, "
    + pop_overlap["STATE_ABBREV"]
)
pop_overlap["County Code"] = pop_overlap["fips"].astype(str)

mort_aggregated["merge_key"] = mort_aggregated["County"].str.upper()
mort_aggregated["County Code"] = mort_aggregated["County Code"].astype(str)

In [22]:
# Merge population with mortality (LEFT join to keep all population)
pop_with_mort = pop_overlap.merge(
    mort_aggregated[
        [
            "County Code",
            "year",
            "Deaths",
            "Drug/Alcohol Induced Cause",
            "Drug/Alcohol Induced Cause Code",
        ]
    ],
    on=["County Code", "year"],
    how="left",
)

print(f"Population observations (2006-2015): {len(pop_overlap)}")
print(f"After merging with mortality: {len(pop_with_mort)}")
print(
    f"Observations with mortality data: {pop_with_mort['Deaths'].notna().sum()} ({pop_with_mort['Deaths'].notna().mean()*100:.1f}%)"
)

Population observations (2006-2015): 6336
After merging with mortality: 6336
Observations with mortality data: 1589 (25.1%)


In [23]:
# Filter ARCOS to overlap years (2006-2015)
arcos_overlap = arcos[(arcos["YEAR"] >= 2006) & (arcos["YEAR"] <= 2015)].copy()

# Merge population+mortality with ARCOS (LEFT join on population to keep all population observations)
merged = pop_with_mort.merge(
    arcos_overlap[
        ["BUYER_STATE", "BUYER_COUNTY", "YEAR", "TOTAL_MME", "merge_on_arcos"]
    ],
    how="left",
    left_on=["merge_key", "year"],
    right_on=["merge_on_arcos", "YEAR"],
)

# Canonicalize year column and drop others
if "year" in merged.columns:
    merged["Year"] = merged["year"]
if "YEAR" in merged.columns:
    merged = merged.drop(columns=["YEAR"])
if "year" in merged.columns:
    merged = merged.drop(columns=["year"])

print(f"\nFinal merged observations: {len(merged)}")
print(
    f"Observations with ARCOS data: {merged['TOTAL_MME'].notna().sum()} ({merged['TOTAL_MME'].notna().mean()*100:.1f}%)"
)
print(
    f"Observations with mortality data: {merged['Deaths'].notna().sum()} ({merged['Deaths'].notna().mean()*100:.1f}%)"
)


Final merged observations: 6336
Observations with ARCOS data: 5363 (84.6%)
Observations with mortality data: 1589 (25.1%)


In [24]:
# Debug: Check merge keys
print("Sample population merge keys:")
print(pop_with_mort["merge_key"].head(10).tolist())
print("\nSample ARCOS merge keys:")
print(arcos_overlap["merge_on_arcos"].head(10).tolist())

# Check for matches
pop_keys = set(pop_with_mort["merge_key"].unique())
arcos_keys = set(arcos_overlap["merge_on_arcos"].unique())
print(f"\nPopulation unique keys: {len(pop_keys)}")
print(f"ARCOS unique keys: {len(arcos_keys)}")
print(f"Matching keys: {len(pop_keys & arcos_keys)}")

Sample population merge keys:
['ALACHUA COUNTY, FL', 'BAKER COUNTY, FL', 'BAY COUNTY, FL', 'BRADFORD COUNTY, FL', 'BREVARD COUNTY, FL', 'BROWARD COUNTY, FL', 'CALHOUN COUNTY, FL', 'CHARLOTTE COUNTY, FL', 'CITRUS COUNTY, FL', 'CLAY COUNTY, FL']

Sample ARCOS merge keys:
['ALACHUA COUNTY, FL', 'ALACHUA COUNTY, FL', 'ALACHUA COUNTY, FL', 'ALACHUA COUNTY, FL', 'ALACHUA COUNTY, FL', 'ALACHUA COUNTY, FL', 'ALACHUA COUNTY, FL', 'BAKER COUNTY, FL', 'BAKER COUNTY, FL', 'BAKER COUNTY, FL']

Population unique keys: 576
ARCOS unique keys: 518
Matching keys: 514


In [25]:
merged.sample(5)

Unnamed: 0,STATE,COUNTY,STNAME,CTYNAME,population,fips,STATE_ABBREV,merge_key,County Code,Deaths,Drug/Alcohol Induced Cause,Drug/Alcohol Induced Cause Code,BUYER_STATE,BUYER_COUNTY,TOTAL_MME,merge_on_arcos,Year
949,37,95,North Carolina,Hyde County,5639,37095,NC,"HYDE COUNTY, NC",37095,,,,,,,,2007
2397,13,55,Georgia,Chattooga County,26016,13055,GA,"CHATTOOGA COUNTY, GA",13055,,,,GA,CHATTOOGA,21515520.0,"CHATTOOGA COUNTY, GA",2010
4682,13,15,Georgia,Bartow County,100917,13015,GA,"BARTOW COUNTY, GA",13015,,,,GA,BARTOW,151591600.0,"BARTOW COUNTY, GA",2013
4156,13,117,Georgia,Forsyth County,186793,13117,GA,"FORSYTH COUNTY, GA",13117,13.0,Drug poisonings (overdose) Unintentional (X40-...,D1,,,,,2012
3748,30,45,Montana,Judith Basin County,2009,30045,MT,"JUDITH BASIN COUNTY, MT",30045,,,,MT,JUDITH BASIN,1056.165,"JUDITH BASIN COUNTY, MT",2011


In [26]:
# Save clean merged output
merged.to_csv("../01_data/clean/final_merged.csv", index=False)
print(f"Saved final_merged.csv with {len(merged)} rows")
print(f"Years covered: {merged['Year'].min()}-{merged['Year'].max()}")
print(f"Counties: {merged['CTYNAME'].nunique()}")

Saved final_merged.csv with 6336 rows
Years covered: 2006-2015
Counties: 473
