# Importing relevant packages

In [1]:
import pandas as pd
import numpy as np
import warnings

pd.set_option("mode.copy_on_write", True)

# Reading and Loading (Takes 10 minutes to get relevant states)

In [2]:
# reading a small sample
all = pd.read_csv(
    "~/Desktop/arcos_all.zip",
    nrows=200,
    delimiter="\t",
    compression="zip",
)
all.head(5)

Unnamed: 0,REPORTER_DEA_NO,REPORTER_BUS_ACT,REPORTER_NAME,REPORTER_ADDL_CO_INFO,REPORTER_ADDRESS1,REPORTER_ADDRESS2,REPORTER_CITY,REPORTER_STATE,REPORTER_ZIP,REPORTER_COUNTY,...,DRUG_NAME,Measure,MME_Conversion_Factor,Dosage_Strength,TRANSACTION_DATE,Combined_Labeler_Name,Reporter_family,CALC_BASE_WT_IN_GM,DOSAGE_UNIT,MME
0,RM0220688,DISTRIBUTOR,MCKESSON CORPORATION,,DBA MCKESSON DRUG CO.,3000 KENSKILL AVE,WASHINGTON CT HOUSE,OH,43160,FAYETTE,...,CODEINE,TAB,0.15,30.0,2011-01-14,Teva,McKesson Corporation,2.2101,100.0,331.515
1,RM0220688,DISTRIBUTOR,MCKESSON CORPORATION,,DBA MCKESSON DRUG CO.,3000 KENSKILL AVE,WASHINGTON CT HOUSE,OH,43160,FAYETTE,...,OXYCODONE,TAB,1.5,10.0,2011-02-08,"Par Pharmaceutical, Inc.",McKesson Corporation,0.8965,100.0,1344.75
2,RM0220688,DISTRIBUTOR,MCKESSON CORPORATION,,DBA MCKESSON DRUG CO.,3000 KENSKILL AVE,WASHINGTON CT HOUSE,OH,43160,FAYETTE,...,HYDROCODONE,TAB,1.0,7.5,2011-03-07,SpecGx LLC,McKesson Corporation,0.45405,100.0,454.05
3,RM0220688,DISTRIBUTOR,MCKESSON CORPORATION,,DBA MCKESSON DRUG CO.,3000 KENSKILL AVE,WASHINGTON CT HOUSE,OH,43160,FAYETTE,...,METHADONE,TAB,4.0,10.0,2011-03-01,SpecGx LLC,McKesson Corporation,3.5776,400.0,14310.4
4,RM0220688,DISTRIBUTOR,MCKESSON CORPORATION,,DBA MCKESSON DRUG CO.,3000 KENSKILL AVE,WASHINGTON CT HOUSE,OH,43160,FAYETTE,...,OXYCODONE,TAB,1.5,10.0,2011-03-10,"Par Pharmaceutical, Inc.",McKesson Corporation,3.586,400.0,5379.0


In [3]:
# Columns to keep
variables_to_keep = [
    "BUYER_STATE",
    "BUYER_COUNTY",
    "DRUG_NAME",
    "TRANSACTION_DATE",
    "CALC_BASE_WT_IN_GM",
    "MME_Conversion_Factor",
]

# Chunking
chunk_size = 100000
chunk_list = []
states_to_keep = ["FL", "WA", "OR", "GA", "OK", "AL", "CO", "ME"]


# Read in chunks, filtering for Texas and relevant columns
for chunk in pd.read_csv(
    "~/Desktop/arcos_all.zip",
    delimiter="\t",
    chunksize=chunk_size,
    compression="zip",
    usecols=variables_to_keep,
):
    filtered_chunk = chunk[chunk["BUYER_STATE"].isin(states_to_keep)]
    chunk_list.append(filtered_chunk)
df = pd.concat(chunk_list, ignore_index=True)
df

Unnamed: 0,BUYER_STATE,BUYER_COUNTY,DRUG_NAME,MME_Conversion_Factor,TRANSACTION_DATE,CALC_BASE_WT_IN_GM
0,GA,PAULDING,HYDROCODONE,1.00,2019-05-17,0.286354
1,GA,GWINNETT,FENTANYL,130.00,2019-05-07,0.018000
2,GA,GWINNETT,BUPRENORPHINE,30.00,2019-05-07,1.440000
3,GA,DOUGLAS,CODEINE,0.15,2019-06-19,2.210100
4,GA,RICHMOND,OXYCODONE,1.50,2019-06-19,1.344750
...,...,...,...,...,...,...
146109581,ME,KENNEBEC,HYDROCODONE,1.00,2006-12-20,0.143231
146109582,ME,KENNEBEC,METHADONE,4.00,2007-02-09,1.341600
146109583,ME,KENNEBEC,MORPHINE,1.00,2007-02-15,13.536000
146109584,ME,KENNEBEC,METHADONE,3.00,2007-03-16,1.609920


In [4]:
df_with_na = df[df.isnull().any(axis=1)]
df_with_na

Unnamed: 0,BUYER_STATE,BUYER_COUNTY,DRUG_NAME,MME_Conversion_Factor,TRANSACTION_DATE,CALC_BASE_WT_IN_GM
36829300,GA,,HYDROCODONE,1.0,2006-12-01,0.572922
36829313,GA,,OXYCODONE,1.5,2007-04-24,94.122000
36831046,GA,,MORPHINE,1.0,2006-01-24,27.072000
36831047,GA,,OXYCODONE,1.5,2006-05-06,537.840000
36831048,GA,,HYDROCODONE,1.0,2006-06-05,0.859383
...,...,...,...,...,...,...
104094512,GA,,OXYCODONE,1.5,2006-01-25,0.448200
104094513,GA,,MORPHINE,1.0,2006-04-05,0.564000
104094514,GA,,MEPERIDINE,0.1,2006-06-08,2.178750
123272646,CO,,HYDROCODONE,1.0,2006-01-04,0.286461


# Calculating relevant data

In [5]:
# dropping NAs
all = df[
    df["BUYER_STATE"].isin(["FL", "WA", "OR", "GA", "OK", "AL", "CO", "ME"])
].dropna()
all.head(5)
all

Unnamed: 0,BUYER_STATE,BUYER_COUNTY,DRUG_NAME,MME_Conversion_Factor,TRANSACTION_DATE,CALC_BASE_WT_IN_GM
0,GA,PAULDING,HYDROCODONE,1.00,2019-05-17,0.286354
1,GA,GWINNETT,FENTANYL,130.00,2019-05-07,0.018000
2,GA,GWINNETT,BUPRENORPHINE,30.00,2019-05-07,1.440000
3,GA,DOUGLAS,CODEINE,0.15,2019-06-19,2.210100
4,GA,RICHMOND,OXYCODONE,1.50,2019-06-19,1.344750
...,...,...,...,...,...,...
146109581,ME,KENNEBEC,HYDROCODONE,1.00,2006-12-20,0.143231
146109582,ME,KENNEBEC,METHADONE,4.00,2007-02-09,1.341600
146109583,ME,KENNEBEC,MORPHINE,1.00,2007-02-15,13.536000
146109584,ME,KENNEBEC,METHADONE,3.00,2007-03-16,1.609920


In [6]:
# Getting the year
all["date"] = pd.to_datetime(
    all["TRANSACTION_DATE"], format="%Y-%m-%d", errors="coerce"
)
all["year"] = all["date"].dt.year

all["DRUG_NAME"] = all["DRUG_NAME"].isin(["OXYCODONE", "HYDROCODONE"])

# Make an estimate of total morphine equivalent shipments
all["MME_Conversion_Factor"] = pd.to_numeric(
    all["MME_Conversion_Factor"], errors="coerce"
)
all["morphine_equivalent_g"] = all["CALC_BASE_WT_IN_GM"] * all["MME_Conversion_Factor"]
# Group by year, state, and county, and sum the morphine equivalent shipments
df_grouped = (
    all.groupby(["year", "BUYER_STATE", "BUYER_COUNTY"])
    .morphine_equivalent_g.sum()
    .reset_index()
)
df_grouped

Unnamed: 0,year,BUYER_STATE,BUYER_COUNTY,morphine_equivalent_g
0,2006,AL,AUTAUGA,53760.377117
1,2006,AL,BALDWIN,172218.789268
2,2006,AL,BARBOUR,12266.765677
3,2006,AL,BIBB,16893.219968
4,2006,AL,BLOUNT,26243.545303
...,...,...,...,...
7190,2019,WA,WAHKIAKUM,2291.476749
7191,2019,WA,WALLA WALLA,65141.315710
7192,2019,WA,WHATCOM,297094.148983
7193,2019,WA,WHITMAN,20404.550742


In [7]:
# check that the relevant states are in the dataset
assert len(df_grouped["BUYER_STATE"].unique()) == 8
df_grouped.head(5)

Unnamed: 0,year,BUYER_STATE,BUYER_COUNTY,morphine_equivalent_g
0,2006,AL,AUTAUGA,53760.377117
1,2006,AL,BALDWIN,172218.789268
2,2006,AL,BARBOUR,12266.765677
3,2006,AL,BIBB,16893.219968
4,2006,AL,BLOUNT,26243.545303


# Checking Florida & Constants GA, AL, OK

In [8]:
florida_and_constants = all[all["BUYER_STATE"].isin(["FL", "AL", "GA", "OK"])]
# florida has 67 counties
assert (
    len(
        florida_and_constants[florida_and_constants["BUYER_STATE"] == "FL"][
            "BUYER_COUNTY"
        ].unique()
    )
    <= 67
)
floridacounty = len(
    florida_and_constants[florida_and_constants["BUYER_STATE"] == "FL"][
        "BUYER_COUNTY"
    ].unique()
)
# georgia has 159 counties
assert (
    len(
        florida_and_constants[florida_and_constants["BUYER_STATE"] == "GA"][
            "BUYER_COUNTY"
        ].unique()
    )
    <= 159
)
georgiacounty = len(
    florida_and_constants[florida_and_constants["BUYER_STATE"] == "GA"][
        "BUYER_COUNTY"
    ].unique()
)
# oklahoma has 77 counties
assert (
    len(
        florida_and_constants[florida_and_constants["BUYER_STATE"] == "OK"][
            "BUYER_COUNTY"
        ].unique()
    )
    <= 77
)
oklahomacounty = len(
    florida_and_constants[florida_and_constants["BUYER_STATE"] == "OK"][
        "BUYER_COUNTY"
    ].unique()
)
# alabama has 67 counties
assert (
    len(
        florida_and_constants[florida_and_constants["BUYER_STATE"] == "AL"][
            "BUYER_COUNTY"
        ].unique()
    )
    <= 67
)
alabamacounty = len(
    florida_and_constants[florida_and_constants["BUYER_STATE"] == "AL"][
        "BUYER_COUNTY"
    ].unique()
)
print(florida_and_constants["year"].unique())
print(
    f"Florida has {floridacounty} counties, Georgia has {georgiacounty} counties, Oklahoma has {oklahomacounty} counties, Alabama has {alabamacounty} counties"
)
florida_and_constants.head(5)

[2019 2015 2016 2017 2018 2009 2010 2011 2012 2013 2014 2007 2008 2006]
Florida has 67 counties, Georgia has 155 counties, Oklahoma has 77 counties, Alabama has 67 counties


Unnamed: 0,BUYER_STATE,BUYER_COUNTY,DRUG_NAME,MME_Conversion_Factor,TRANSACTION_DATE,CALC_BASE_WT_IN_GM,date,year,morphine_equivalent_g
0,GA,PAULDING,True,1.0,2019-05-17,0.286354,2019-05-17,2019,0.286354
1,GA,GWINNETT,False,130.0,2019-05-07,0.018,2019-05-07,2019,2.34
2,GA,GWINNETT,False,30.0,2019-05-07,1.44,2019-05-07,2019,43.2
3,GA,DOUGLAS,False,0.15,2019-06-19,2.2101,2019-06-19,2019,0.331515
4,GA,RICHMOND,True,1.5,2019-06-19,1.34475,2019-06-19,2019,2.017125


# Checking Washington & Constants CO, OR, MN

In [9]:
washington_and_constants = all[all["BUYER_STATE"].isin(["WA", "CO", "OR", "ME"])]
# washington has 39 counties
assert (
    len(
        washington_and_constants[washington_and_constants["BUYER_STATE"] == "WA"][
            "BUYER_COUNTY"
        ].unique()
    )
    <= 39
)
washingtoncounty = len(
    washington_and_constants[washington_and_constants["BUYER_STATE"] == "WA"][
        "BUYER_COUNTY"
    ].unique()
)
# colorado has 64 counties
assert (
    len(
        washington_and_constants[washington_and_constants["BUYER_STATE"] == "CO"][
            "BUYER_COUNTY"
        ].unique()
    )
    <= 64
)
coloradocounty = len(
    washington_and_constants[washington_and_constants["BUYER_STATE"] == "WA"][
        "BUYER_COUNTY"
    ].unique()
)
# oregon has 36 counties
assert (
    len(
        washington_and_constants[washington_and_constants["BUYER_STATE"] == "OR"][
            "BUYER_COUNTY"
        ].unique()
    )
    <= 36
)
# minnesota has 87 counties
assert (
    len(
        washington_and_constants[washington_and_constants["BUYER_STATE"] == "ME"][
            "BUYER_COUNTY"
        ].unique()
    )
    <= 16
)
print(washington_and_constants["year"].unique())
washington_and_constants.head(5)

[2019 2015 2016 2017 2018 2011 2012 2013 2014 2006 2007 2008 2009 2010]


Unnamed: 0,BUYER_STATE,BUYER_COUNTY,DRUG_NAME,MME_Conversion_Factor,TRANSACTION_DATE,CALC_BASE_WT_IN_GM,date,year,morphine_equivalent_g
32000000,WA,KING,True,1.0,2019-07-17,2.27025,2019-07-17,2019,2.27025
32000001,WA,KING,True,1.0,2019-07-17,2.27025,2019-07-17,2019,2.27025
32000002,WA,KING,True,1.0,2019-07-17,36.324,2019-07-17,2019,36.324
32000003,WA,PIERCE,False,100.0,2019-07-09,0.03,2019-07-09,2019,3.0
32000004,WA,PIERCE,False,4.0,2019-07-09,0.17732,2019-07-09,2019,0.70928


# SAVE TO PARQUET

In [10]:
# save to parquet
df_grouped.to_parquet("data/opioid_shipment_WA_FL_andconstants.parquet", index=False)