# Investigate Extreme Values in ARCOS Data

Identify and filter out non-patient-level transactions (manufacturers, distributors) that create extreme outliers in per-capita opioid metrics.

## Setup

In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option("mode.copy_on_write", True)
pd.set_option("display.max_columns", None)

## Load Processed ARCOS Data

In [25]:
# Load aggregated ARCOS data from processing step
arcos = pd.read_csv("../01_data/clean/arcos_by_county_year.csv")

print(f"ARCOS data shape: {arcos.shape}")
print(f"\nColumns: {arcos.columns.tolist()}")
print(f"\nYear range: {arcos['YEAR'].min()} to {arcos['YEAR'].max()}")
print(f"States: {sorted(arcos['BUYER_STATE'].unique())}")

arcos.head()

ARCOS data shape: (7828, 6)

Columns: ['BUYER_STATE', 'BUYER_COUNTY', 'YEAR', 'TOTAL_MME', 'BUYER_BUS_ACT', 'unique_buyers']

Year range: 2006 to 2019
States: ['FL', 'GA', 'ID', 'MT', 'NC', 'OR', 'SC', 'UT', 'WA']


Unnamed: 0,BUYER_STATE,BUYER_COUNTY,YEAR,TOTAL_MME,BUYER_BUS_ACT,unique_buyers
0,FL,ALACHUA,2006,335318400.0,"ANALYTICAL LAB,CHAIN PHARMACY,HOSP/CLINIC-VA,H...",2373
1,FL,ALACHUA,2007,381327500.0,"CANINE HANDLER,CHAIN PHARMACY,DISTRIBUTOR,HOSP...",2529
2,FL,ALACHUA,2008,475070400.0,"CHAIN PHARMACY,HOSP/CLINIC-VA,HOSPITAL/CLINIC,...",2619
3,FL,ALACHUA,2009,496969600.0,"CHAIN PHARMACY,HOSP/CLINIC-VA,HOSPITAL/CLINIC,...",2757
4,FL,ALACHUA,2010,504114400.0,"CHAIN PHARMACY,HOSP/CLINIC-VA,HOSPITAL/CLINIC,...",2558


## Load Population Data

In [26]:
pop = pd.read_csv("../01_data/clean/population_2000_2024.csv")

# Map state names to abbreviations
pop["state_abbr"] = pop["STNAME"].map(
    {
        "Florida": "FL",
        "Washington": "WA",
        "North Carolina": "NC",
        "Georgia": "GA",
        "Oregon": "OR",
        "South Carolina": "SC",
        "Idaho": "ID",
        "Montana": "MT",
        "Utah": "UT",
    }
)

# Clean county names to match ARCOS format
pop["county_clean"] = (
    pop["CTYNAME"].str.upper().str.replace(" COUNTY", "").str.replace(" PARISH", "")
)

print(f"Population data shape: {pop.shape}")

Population data shape: (14976, 9)


## Calculate Per-Capita MME by County-Year

In [27]:
df = arcos.merge(
    pop[["state_abbr", "county_clean", "year", "population"]],
    left_on=["BUYER_STATE", "BUYER_COUNTY", "YEAR"],
    right_on=["state_abbr", "county_clean", "year"],
    how="left",
)

print(f"Merged data shape: {df.shape}")
print(f"Records without population match: {df['population'].isna().sum()}")

df["MME_per_capita"] = df["TOTAL_MME"] / df["population"]

print(f"\nPer-capita MME statistics:")
print(df["MME_per_capita"].describe())

Merged data shape: (8386, 10)
Records without population match: 46

Per-capita MME statistics:
count    8.340000e+03
mean     1.453856e+06
std      4.544136e+07
min      1.141389e-04
25%      5.736993e+02
50%      9.053719e+02
75%      1.399344e+03
max      2.534742e+09
Name: MME_per_capita, dtype: float64


## Identify Extreme Outliers

Counties with >100,000 mg MME per capita annually indicate data quality issues (likely manufacturer/distributor transactions).

In [28]:
df["is_extreme"] = df["MME_per_capita"] > 100000
extreme = df[df["is_extreme"]].copy()

print(f"Extreme observations: {len(extreme)} ({100*len(extreme)/len(df):.2f}%)")
print(f"\nTop 10 extreme values:")

extreme[
    ["BUYER_STATE", "BUYER_COUNTY", "YEAR", "TOTAL_MME", "population", "MME_per_capita"]
].sort_values("MME_per_capita", ascending=False).head(10)

Extreme observations: 89 (1.06%)

Top 10 extreme values:


Unnamed: 0,BUYER_STATE,BUYER_COUNTY,YEAR,TOTAL_MME,population,MME_per_capita
6132,NC,WILSON,2010,206059300000000.0,81294.0,2534742000.0
6131,NC,WILSON,2010,206059300000000.0,81359.0,2532717000.0
6137,NC,WILSON,2015,96750170000000.0,81240.0,1190918000.0
1969,GA,HALL,2009,197694900000000.0,178503.0,1107516000.0
5775,NC,PITT,2013,127358500000000.0,174380.0,730350300.0
1972,GA,HALL,2011,117879600000000.0,182269.0,646734000.0
84,FL,BROWARD,2014,740037000000000.0,1860933.0,397669900.0
1973,GA,HALL,2012,62338490000000.0,184102.0,338608500.0
1971,GA,HALL,2010,57921740000000.0,180033.0,321728500.0
1970,GA,HALL,2010,57921740000000.0,180253.0,321335800.0


## Analyze Buyer Type Patterns

Compare buyer business activities between extreme and normal counties to identify root cause.

In [29]:
print("Buyer patterns in extreme vs normal counties:\n")
print("Top 10 extreme counties:")
print(
    extreme.nlargest(10, "MME_per_capita")[
        ["BUYER_STATE", "BUYER_COUNTY", "YEAR", "MME_per_capita", "BUYER_BUS_ACT"]
    ]
)

print("\nExtreme buyer activity distribution:")
print(extreme["BUYER_BUS_ACT"].value_counts().head(10))

print("\nNormal buyer activity distribution:")
print(df[~df["is_extreme"]]["BUYER_BUS_ACT"].value_counts().head(10))

Buyer patterns in extreme vs normal counties:

Top 10 extreme counties:
     BUYER_STATE BUYER_COUNTY  YEAR  MME_per_capita  \
6132          NC       WILSON  2010    2.534742e+09   
6131          NC       WILSON  2010    2.532717e+09   
6137          NC       WILSON  2015    1.190918e+09   
1969          GA         HALL  2009    1.107516e+09   
5775          NC         PITT  2013    7.303503e+08   
1972          GA         HALL  2011    6.467340e+08   
84            FL      BROWARD  2014    3.976699e+08   
1973          GA         HALL  2012    3.386085e+08   
1971          GA         HALL  2010    3.217285e+08   
1970          GA         HALL  2010    3.213358e+08   

                                          BUYER_BUS_ACT  
6132  ANALYTICAL LAB,CHAIN PHARMACY,DISTRIBUTOR,EXPO...  
6131  ANALYTICAL LAB,CHAIN PHARMACY,DISTRIBUTOR,EXPO...  
6137  ANALYTICAL LAB,CHAIN PHARMACY,DISTRIBUTOR,EXPO...  
1969  ANALYTICAL LAB,CHAIN PHARMACY,EXPORTER,HOSPITA...  
5775  ANALYTICAL LAB,CHAIN PHARM

## Verify Population Data Quality

Check if extreme values could be due to incorrect population figures.

In [30]:
# Compare population distributions between extreme and normal counties
print("Population comparison:")
print(f"\nExtreme counties population stats:")
print(extreme["population"].describe())

print(f"\nNormal counties population stats:")
print(df[~df["is_extreme"]]["population"].describe())

# Identify extreme counties with smallest populations
print("\nExtreme counties with smallest populations:")
print(
    extreme.nsmallest(10, "population")[
        [
            "BUYER_STATE",
            "BUYER_COUNTY",
            "YEAR",
            "population",
            "TOTAL_MME",
            "MME_per_capita",
        ]
    ]
)

Population comparison:

Extreme counties population stats:
count    8.900000e+01
mean     6.169746e+05
std      6.165656e+05
min      7.737900e+04
25%      1.198910e+05
50%      2.204810e+05
75%      1.047610e+06
max      1.912583e+06
Name: population, dtype: float64

Normal counties population stats:
count    8.251000e+03
mean     1.020643e+05
std      2.209483e+05
min      6.730000e+02
25%      1.341350e+04
50%      2.999200e+04
75%      9.129800e+04
max      2.716940e+06
Name: population, dtype: float64

Extreme counties with smallest populations:
     BUYER_STATE BUYER_COUNTY  YEAR  population     TOTAL_MME  MME_per_capita
6127          NC       WILSON  2006     77379.0  3.951586e+10    5.106793e+05
6128          NC       WILSON  2007     78474.0  3.218629e+10    4.101523e+05
6129          NC       WILSON  2008     79790.0  9.145911e+10    1.146248e+06
6130          NC       WILSON  2009     80664.0  1.276553e+11    1.582556e+06
6136          NC       WILSON  2014     81007.0  9.60

## Filter to Patient-Level Transactions Only

Remove upstream supply chain entities (manufacturers, distributors) to retain only patient-level dispensing.

In [31]:
end_use_buyers = [
    "RETAIL PHARMACY",
    "CHAIN PHARMACY",
    "HOSPITAL/CLINIC",
    "HOSP/CLINIC-VA",
    "PRACTITIONER",
    "MLP-PHYSICIAN ASSISTANT",
    "MLP-NURSE PRACTITIONER",
    "MLP-CLINICAL NURSE SPECIALIST",
    "MLP-CERTIFIED NURSE MIDWIFE",
    "MLP-NURSE ANESTHETIST",
    "TEACHING INSTITUTION",
]

print("Patient-level buyer types:")
for b in end_use_buyers:
    print(f"  {b}")

Patient-level buyer types:
  RETAIL PHARMACY
  CHAIN PHARMACY
  HOSPITAL/CLINIC
  HOSP/CLINIC-VA
  PRACTITIONER
  MLP-PHYSICIAN ASSISTANT
  MLP-NURSE PRACTITIONER
  MLP-CLINICAL NURSE SPECIALIST
  MLP-CERTIFIED NURSE MIDWIFE
  MLP-NURSE ANESTHETIST
  TEACHING INSTITUTION


In [32]:
# Remove non-patient-level buyer types
upstream_buyers = [
    "MANUFACTURER",
    "DISTRIBUTOR",
    "EXPORTER",
    "ANALYTICAL LAB",
    "CANINE HANDLER",
]

arcos_filtered = arcos[
    ~arcos["BUYER_BUS_ACT"].str.contains(
        "|".join(upstream_buyers), regex=True, na=False
    )
].copy()

# Keep only essential columns for merging
arcos_filtered = arcos_filtered[["BUYER_STATE", "BUYER_COUNTY", "YEAR", "TOTAL_MME"]]

print(f"Before filtering: {len(arcos):,} county-years")
print(f"After filtering: {len(arcos_filtered):,} county-years")
print(
    f"Removed: {len(arcos) - len(arcos_filtered):,} ({100*(len(arcos) - len(arcos_filtered))/len(arcos):.1f}%)"
)

arcos_filtered.head(10)

Before filtering: 7,828 county-years
After filtering: 6,879 county-years
Removed: 949 (12.1%)


Unnamed: 0,BUYER_STATE,BUYER_COUNTY,YEAR,TOTAL_MME
2,FL,ALACHUA,2008,475070400.0
3,FL,ALACHUA,2009,496969600.0
4,FL,ALACHUA,2010,504114400.0
5,FL,ALACHUA,2011,508708200.0
6,FL,ALACHUA,2012,491362900.0
8,FL,ALACHUA,2014,448034100.0
9,FL,ALACHUA,2015,487737400.0
10,FL,ALACHUA,2016,489106800.0
11,FL,ALACHUA,2017,508395700.0
12,FL,ALACHUA,2018,843077700.0


## Validate Filtering Results

Verify that extreme outliers are eliminated after removing upstream buyers.

In [33]:
arcos_filtered_merged = arcos_filtered.merge(
    pop[["state_abbr", "county_clean", "year", "population"]],
    left_on=["BUYER_STATE", "BUYER_COUNTY", "YEAR"],
    right_on=["state_abbr", "county_clean", "year"],
    how="left",
)

arcos_filtered_merged["MME_per_capita"] = (
    arcos_filtered_merged["TOTAL_MME"] / arcos_filtered_merged["population"]
)

extreme_after = arcos_filtered_merged[arcos_filtered_merged["MME_per_capita"] > 100000]

print("Before filtering:")
print(f"  County-years: {len(df):,}")
print(f"  Extreme values: {len(extreme):,} ({100*len(extreme)/len(df):.2f}%)")
print(f"  Mean MME/capita: {df['MME_per_capita'].mean():,.0f}")
print(f"  Max MME/capita: {df['MME_per_capita'].max():,.0f}")

print("\nAfter filtering:")
print(f"  County-years: {len(arcos_filtered_merged):,}")
print(f"  Extreme values: {len(extreme_after):,}")
print(f"  Mean MME/capita: {arcos_filtered_merged['MME_per_capita'].mean():,.0f}")
print(f"  Max MME/capita: {arcos_filtered_merged['MME_per_capita'].max():,.0f}")

Before filtering:
  County-years: 8,386
  Extreme values: 89 (1.06%)
  Mean MME/capita: 1,453,856
  Max MME/capita: 2,534,741,636

After filtering:
  County-years: 7,369
  Extreme values: 2
  Mean MME/capita: 4,221
  Max MME/capita: 11,861,450


## Export Filtered Dataset

Save patient-level ARCOS data for use in subsequent merging and analysis steps.

In [34]:
import os

# Identify the single most extreme outlier data point (highest MME_per_capita before filtering)
most_extreme = df.sort_values("MME_per_capita", ascending=False).iloc[0]
# Filter out the most extreme data point from arcos_filtered before saving
arcos_clean = arcos_filtered.copy()
# Find the matching row in arcos_clean (by state, county, year, and TOTAL_MME) and drop it
mask = (
    (arcos_clean["BUYER_STATE"] == most_extreme["BUYER_STATE"])
    & (arcos_clean["BUYER_COUNTY"] == most_extreme["BUYER_COUNTY"])
    & (arcos_clean["YEAR"] == most_extreme["YEAR"])
    & (arcos_clean["TOTAL_MME"] == most_extreme["TOTAL_MME"])
    # If there are duplicate rows, this will remove all matches; adjust as needed
)
arcos_clean = arcos_clean[~mask]

output_path = "../01_data/clean/arcos_end_use_only.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
arcos_clean.to_csv(output_path, index=False)

print(f"Exported: {output_path}")
print(f"Rows: {len(arcos_clean):,}")
print(f"Columns: {list(arcos_clean.columns)}")
print(
    "\nFiltered to patient-level dispensing only, with the single most extreme outlier removed:"
)
print("  - Retail/chain pharmacies")
print("  - Hospitals and clinics")
print("  - Practitioners and mid-level providers")

Exported: ../01_data/clean/arcos_end_use_only.csv
Rows: 6,879
Columns: ['BUYER_STATE', 'BUYER_COUNTY', 'YEAR', 'TOTAL_MME']

Filtered to patient-level dispensing only, with the single most extreme outlier removed:
  - Retail/chain pharmacies
  - Hospitals and clinics
  - Practitioners and mid-level providers
