# Investigate Extreme Values in ARCOS Data

Identify and filter out non-patient-level transactions (manufacturers, distributors) that create extreme outliers in per-capita opioid metrics.

## Setup

In [1]:
# Imports and display settings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option("mode.copy_on_write", True)
pd.set_option("display.max_columns", None)

In [2]:
# Load ARCOS data and basic inspection
arcos = pd.read_csv("../01_data/clean/arcos_by_county_year.csv")

# Show basic info about the ARCOS dataset
print(f"ARCOS data shape: {arcos.shape}")
print(f"\nColumns: {arcos.columns.tolist()}")
print(f"\nYear range: {arcos['YEAR'].min()} to {arcos['YEAR'].max()}")
print(f"States: {sorted(arcos['BUYER_STATE'].unique())}")

arcos.head()

ARCOS data shape: (11567, 6)

Columns: ['BUYER_STATE', 'BUYER_COUNTY', 'YEAR', 'TOTAL_MME', 'BUYER_BUS_ACT', 'unique_buyers']

Year range: 2006 to 2019
States: ['CA', 'CO', 'FL', 'GA', 'ID', 'MS', 'MT', 'NC', 'OR', 'SC', 'TN', 'WA']


Unnamed: 0,BUYER_STATE,BUYER_COUNTY,YEAR,TOTAL_MME,BUYER_BUS_ACT,unique_buyers
0,CA,ALAMEDA,2006,22629610000.0,"ANALYTICAL LAB,CHAIN PHARMACY,DISTRIBUTOR,HOSP...",11603
1,CA,ALAMEDA,2007,24532530000.0,"ANALYTICAL LAB,CHAIN PHARMACY,DISTRIBUTOR,HOSP...",12060
2,CA,ALAMEDA,2008,17947300000.0,"ANALYTICAL LAB,CHAIN PHARMACY,DISTRIBUTOR,HOSP...",12055
3,CA,ALAMEDA,2009,25966890000.0,"ANALYTICAL LAB,CHAIN PHARMACY,DISTRIBUTOR,HOSP...",10702
4,CA,ALAMEDA,2010,20788340000.0,"ANALYTICAL LAB,CHAIN PHARMACY,DISTRIBUTOR,HOSP...",10520


## Load Population Data

In [3]:
# Load and clean population data
pop = pd.read_csv("../01_data/clean/population_2000_2024.csv")

# Map state names to abbreviations
pop["state_abbr"] = pop["STNAME"].map(
    {
        "Florida": "FL",
        "Washington": "WA",
        "North Carolina": "NC",
        "Georgia": "GA",
        "Oregon": "OR",
        "South Carolina": "SC",
        "Idaho": "ID",
        "Montana": "MT",
        "Tennessee": "TN",
        "Mississippi": "MS",
        "Colorado": "CO",
        "California": "CA",
    }
)

# Clean county names to match ARCOS format
pop["county_clean"] = (
    pop["CTYNAME"].str.upper().str.replace(" COUNTY", "").str.replace(" PARISH", "")
)

print(f"Population data shape: {pop.shape}")

Population data shape: (21996, 9)


## Calculate Per-Capita MME by County-Year

In [4]:
# Merge ARCOS and population data, then calculate per-capita MME
df = arcos.merge(
    pop[["state_abbr", "county_clean", "year", "population"]],
    left_on=["BUYER_STATE", "BUYER_COUNTY", "YEAR"],
    right_on=["state_abbr", "county_clean", "year"],
    how="left",
)

print(f"Merged data shape: {df.shape}")
print(f"Records without population match: {df['population'].isna().sum()}")

df["MME_per_capita"] = df["TOTAL_MME"] / df["population"]

print(f"\nPer-capita MME statistics:")
print(df["MME_per_capita"].describe())

Merged data shape: (12393, 10)
Records without population match: 46

Per-capita MME statistics:
count    1.234700e+04
mean     9.872001e+05
std      3.737649e+07
min      1.141389e-04
25%      5.592613e+02
50%      9.134193e+02
75%      1.488566e+03
max      2.534742e+09
Name: MME_per_capita, dtype: float64


## Identify Extreme Outliers

Counties with >100,000 mg MME per capita annually indicate data quality issues (likely manufacturer/distributor transactions).

In [5]:
# Identify extreme per-capita MME values and inspect them
df["is_extreme"] = df["MME_per_capita"] > 100000
extreme = df[df["is_extreme"]].copy()

print(f"Extreme observations: {len(extreme)} ({100*len(extreme)/len(df):.2f}%)")
print(f"\nTop 10 extreme values:")

extreme[
    ["BUYER_STATE", "BUYER_COUNTY", "YEAR", "TOTAL_MME", "population", "MME_per_capita"]
].sort_values("MME_per_capita", ascending=False).head(10)

Extreme observations: 119 (0.96%)

Top 10 extreme values:


Unnamed: 0,BUYER_STATE,BUYER_COUNTY,YEAR,TOTAL_MME,population,MME_per_capita
9122,NC,WILSON,2010,206059300000000.0,81294.0,2534742000.0
9121,NC,WILSON,2010,206059300000000.0,81359.0,2532717000.0
9127,NC,WILSON,2015,96750170000000.0,81240.0,1190918000.0
3744,GA,HALL,2009,197694900000000.0,178503.0,1107516000.0
8765,NC,PITT,2013,127358500000000.0,174380.0,730350300.0
3747,GA,HALL,2011,117879600000000.0,182269.0,646734000.0
1859,FL,BROWARD,2014,740037000000000.0,1860933.0,397669900.0
3748,GA,HALL,2012,62338490000000.0,184102.0,338608500.0
3746,GA,HALL,2010,57921740000000.0,180033.0,321728500.0
3745,GA,HALL,2010,57921740000000.0,180253.0,321335800.0


## Analyze Buyer Type Patterns

Compare buyer business activities between extreme and normal counties to identify root cause.

In [6]:
# Compare buyer patterns in extreme vs normal counties
print("Buyer patterns in extreme vs normal counties:\n")
print("Top 10 extreme counties:")
print(
    extreme.nlargest(10, "MME_per_capita")[
        ["BUYER_STATE", "BUYER_COUNTY", "YEAR", "MME_per_capita", "BUYER_BUS_ACT"]
    ]
)

print("\nExtreme buyer activity distribution:")
print(extreme["BUYER_BUS_ACT"].value_counts())

Buyer patterns in extreme vs normal counties:

Top 10 extreme counties:
     BUYER_STATE BUYER_COUNTY  YEAR  MME_per_capita  \
9122          NC       WILSON  2010    2.534742e+09   
9121          NC       WILSON  2010    2.532717e+09   
9127          NC       WILSON  2015    1.190918e+09   
3744          GA         HALL  2009    1.107516e+09   
8765          NC         PITT  2013    7.303503e+08   
3747          GA         HALL  2011    6.467340e+08   
1859          FL      BROWARD  2014    3.976699e+08   
3748          GA         HALL  2012    3.386085e+08   
3746          GA         HALL  2010    3.217285e+08   
3745          GA         HALL  2010    3.213358e+08   

                                          BUYER_BUS_ACT  
9122  ANALYTICAL LAB,CHAIN PHARMACY,DISTRIBUTOR,EXPO...  
9121  ANALYTICAL LAB,CHAIN PHARMACY,DISTRIBUTOR,EXPO...  
9127  ANALYTICAL LAB,CHAIN PHARMACY,DISTRIBUTOR,EXPO...  
3744  ANALYTICAL LAB,CHAIN PHARMACY,EXPORTER,HOSPITA...  
8765  ANALYTICAL LAB,CHAIN PHARM

## Verify Population Data Quality

Check if extreme values could be due to incorrect population figures.

In [7]:
# Compare population distributions and identify small-population extreme counties
print("Population comparison:")
print(f"\nExtreme counties population stats:")
print(extreme["population"].describe())

print(f"\nNormal counties population stats:")
print(df[~df["is_extreme"]]["population"].describe())

# Find extreme counties with smallest populations
print("\nExtreme counties with smallest populations:")
print(
    extreme.nsmallest(10, "population")[
        [
            "BUYER_STATE",
            "BUYER_COUNTY",
            "YEAR",
            "population",
            "TOTAL_MME",
            "MME_per_capita",
        ]
    ]
)

Population comparison:

Extreme counties population stats:
count    1.190000e+02
mean     5.946132e+05
std      6.114706e+05
min      4.562500e+04
25%      1.222245e+05
50%      2.289400e+05
75%      9.287385e+05
max      2.109712e+06
Name: population, dtype: float64

Normal counties population stats:
count    1.222800e+04
mean     1.285959e+05
std      4.474475e+05
min      6.490000e+02
25%      1.422175e+04
50%      2.944050e+04
75%      8.735900e+04
max      1.010571e+07
Name: population, dtype: float64

Extreme counties with smallest populations:
      BUYER_STATE BUYER_COUNTY  YEAR  population     TOTAL_MME  MME_per_capita
11309          TN       MONROE  2015     45625.0  6.661352e+09    1.460022e+05
11311          TN       MONROE  2017     46035.0  7.289157e+09    1.583395e+05
969            CO   BROOMFIELD  2006     48729.0  3.165725e+10    6.496593e+05
970            CO   BROOMFIELD  2007     51526.0  3.228403e+10    6.265580e+05
971            CO   BROOMFIELD  2008     53181.0

## Filter to Patient-Level Transactions Only

Remove upstream supply chain entities (manufacturers, distributors) to retain only patient-level dispensing.

In [8]:
# List patient-level buyer types
end_use_buyers = [
    "RETAIL PHARMACY",
    "CHAIN PHARMACY",
    "HOSPITAL/CLINIC",
    "HOSP/CLINIC-VA",
    "PRACTITIONER",
    "MLP-PHYSICIAN ASSISTANT",
    "MLP-NURSE PRACTITIONER",
    "MLP-CLINICAL NURSE SPECIALIST",
    "MLP-CERTIFIED NURSE MIDWIFE",
    "MLP-NURSE ANESTHETIST",
    "TEACHING INSTITUTION",
]

print("Patient-level buyer types:")
for b in end_use_buyers:
    print(f"  {b}")

Patient-level buyer types:
  RETAIL PHARMACY
  CHAIN PHARMACY
  HOSPITAL/CLINIC
  HOSP/CLINIC-VA
  PRACTITIONER
  MLP-PHYSICIAN ASSISTANT
  MLP-NURSE PRACTITIONER
  MLP-CLINICAL NURSE SPECIALIST
  MLP-CERTIFIED NURSE MIDWIFE
  MLP-NURSE ANESTHETIST
  TEACHING INSTITUTION


In [9]:
# Remove non-patient-level buyer types and keep essential columns
upstream_buyers = [
    "MANUFACTURER",
    "DISTRIBUTOR",
    "EXPORTER",
    "ANALYTICAL LAB",
    "CANINE HANDLER",
]

arcos_filtered = arcos[
    ~arcos["BUYER_BUS_ACT"].str.contains(
        "|".join(upstream_buyers), regex=True, na=False
    )
].copy()

arcos_filtered = arcos_filtered[["BUYER_STATE", "BUYER_COUNTY", "YEAR", "TOTAL_MME"]]

print(f"Before filtering: {len(arcos):,} county-years")
print(f"After filtering: {len(arcos_filtered):,} county-years")
print(
    f"Removed: {len(arcos) - len(arcos_filtered):,} ({100*(len(arcos) - len(arcos_filtered))/len(arcos):.1f}%)"
)

arcos_filtered.head(10)

Before filtering: 11,567 county-years
After filtering: 9,948 county-years
Removed: 1,619 (14.0%)


Unnamed: 0,BUYER_STATE,BUYER_COUNTY,YEAR,TOTAL_MME
14,CA,ALPINE,2006,454.05
15,CA,ALPINE,2008,605.4
16,CA,ALPINE,2009,11386.97
17,CA,ALPINE,2010,9570.769
18,CA,ALPINE,2011,1887.637
19,CA,ALPINE,2012,302.7
20,CA,ALPINE,2015,998.16
21,CA,ALPINE,2016,55.04
22,CA,AMADOR,2006,44730080.0
23,CA,AMADOR,2007,50602000.0


## Validate Filtering Results

Verify that extreme outliers are eliminated after removing upstream buyers.

In [10]:
# Merge filtered ARCOS with population and check for remaining extreme values
arcos_filtered_merged = arcos_filtered.merge(
    pop[["state_abbr", "county_clean", "year", "population"]],
    left_on=["BUYER_STATE", "BUYER_COUNTY", "YEAR"],
    right_on=["state_abbr", "county_clean", "year"],
    how="left",
)

arcos_filtered_merged["MME_per_capita"] = (
    arcos_filtered_merged["TOTAL_MME"] / arcos_filtered_merged["population"]
)

extreme_after = arcos_filtered_merged[arcos_filtered_merged["MME_per_capita"] > 100000]

print("Before filtering:")
print(f"  County-years: {len(df):,}")
print(f"  Extreme values: {len(extreme):,} ({100*len(extreme)/len(df):.2f}%)")
print(f"  Mean MME/capita: {df['MME_per_capita'].mean():,.0f}")
print(f"  Max MME/capita: {df['MME_per_capita'].max():,.0f}")

print("\nAfter filtering:")
print(f"  County-years: {len(arcos_filtered_merged):,}")
print(f"  Extreme values: {len(extreme_after):,}")
print(f"  Mean MME/capita: {arcos_filtered_merged['MME_per_capita'].mean():,.0f}")
print(f"  Max MME/capita: {arcos_filtered_merged['MME_per_capita'].max():,.0f}")

Before filtering:
  County-years: 12,393
  Extreme values: 119 (0.96%)
  Mean MME/capita: 987,200
  Max MME/capita: 2,534,741,636

After filtering:
  County-years: 10,660
  Extreme values: 3
  Mean MME/capita: 3,553
  Max MME/capita: 11,861,450


## Export Filtered Dataset

Save patient-level ARCOS data for use in subsequent merging and analysis steps.

In [11]:
# Export filtered ARCOS data after removing the most extreme outlier
import os

# Identify the single most extreme outlier data point (highest MME_per_capita before filtering)
most_extreme = df.sort_values("MME_per_capita", ascending=False).iloc[0]
# Filter out the most extreme data point from arcos_filtered before saving
arcos_clean = arcos_filtered.copy()
# Find the matching row in arcos_clean (by state, county, year, and TOTAL_MME) and drop it
mask = (
    (arcos_clean["BUYER_STATE"] == most_extreme["BUYER_STATE"])
    & (arcos_clean["BUYER_COUNTY"] == most_extreme["BUYER_COUNTY"])
    & (arcos_clean["YEAR"] == most_extreme["YEAR"])
    & (arcos_clean["TOTAL_MME"] == most_extreme["TOTAL_MME"])
)
arcos_clean = arcos_clean[~mask]

output_path = "../01_data/clean/arcos_end_use_only.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
arcos_clean.to_csv(output_path, index=False)

print(f"Exported: {output_path}")
print(f"Rows: {len(arcos_clean):,}")
print(f"Columns: {list(arcos_clean.columns)}")
print(
    "\nFiltered to patient-level dispensing only, with the single most extreme outlier removed:"
)
print("  - Retail/chain pharmacies")
print("  - Hospitals and clinics")
print("  - Practitioners and mid-level providers")

Exported: ../01_data/clean/arcos_end_use_only.csv
Rows: 9,948
Columns: ['BUYER_STATE', 'BUYER_COUNTY', 'YEAR', 'TOTAL_MME']

Filtered to patient-level dispensing only, with the single most extreme outlier removed:
  - Retail/chain pharmacies
  - Hospitals and clinics
  - Practitioners and mid-level providers
