# Importing relevant packages

In [1]:
import pandas as pd
import numpy as np
import warnings

pd.set_option("mode.copy_on_write", True)

# Reading and Loading (Takes 10 minutes to get relevant states)

In [2]:
# reading a small sample
all = pd.read_csv(
    "~/Desktop/arcos_all_washpost.zip",
    nrows=200,
    delimiter="\t",
    compression="zip",
)
all.head(5)

Unnamed: 0,REPORTER_DEA_NO,REPORTER_BUS_ACT,REPORTER_NAME,REPORTER_ADDL_CO_INFO,REPORTER_ADDRESS1,REPORTER_ADDRESS2,REPORTER_CITY,REPORTER_STATE,REPORTER_ZIP,REPORTER_COUNTY,...,DRUG_NAME,Measure,MME_Conversion_Factor,Dosage_Strength,TRANSACTION_DATE,Combined_Labeler_Name,Reporter_family,CALC_BASE_WT_IN_GM,DOSAGE_UNIT,MME
0,RM0220688,DISTRIBUTOR,MCKESSON CORPORATION,,DBA MCKESSON DRUG CO.,3000 KENSKILL AVE,WASHINGTON CT HOUSE,OH,43160,FAYETTE,...,OXYCODONE,TAB,1.5,10.0,2011-02-08,"Par Pharmaceutical, Inc.",McKesson Corporation,0.8965,100.0,1344.75
1,RM0220688,DISTRIBUTOR,MCKESSON CORPORATION,,DBA MCKESSON DRUG CO.,3000 KENSKILL AVE,WASHINGTON CT HOUSE,OH,43160,FAYETTE,...,HYDROCODONE,TAB,1.0,7.5,2011-03-07,SpecGx LLC,McKesson Corporation,0.45405,100.0,454.05
2,RM0220688,DISTRIBUTOR,MCKESSON CORPORATION,,DBA MCKESSON DRUG CO.,3000 KENSKILL AVE,WASHINGTON CT HOUSE,OH,43160,FAYETTE,...,OXYCODONE,TAB,1.5,10.0,2011-03-10,"Par Pharmaceutical, Inc.",McKesson Corporation,3.586,400.0,5379.0
3,RM0220688,DISTRIBUTOR,MCKESSON CORPORATION,,DBA MCKESSON DRUG CO.,3000 KENSKILL AVE,WASHINGTON CT HOUSE,OH,43160,FAYETTE,...,HYDROCODONE,TAB,1.0,5.0,2011-04-05,SpecGx LLC,McKesson Corporation,1.5135,500.0,1513.5
4,RM0220688,DISTRIBUTOR,MCKESSON CORPORATION,,DBA MCKESSON DRUG CO.,3000 KENSKILL AVE,WASHINGTON CT HOUSE,OH,43160,FAYETTE,...,HYDROCODONE,TAB,1.0,5.0,2011-04-06,SpecGx LLC,McKesson Corporation,0.3027,100.0,302.7


In [3]:
# Columns to keep
variables_to_keep = [
    "BUYER_STATE",
    "BUYER_COUNTY",
    "DRUG_NAME",
    "TRANSACTION_DATE",
    "CALC_BASE_WT_IN_GM",
    "MME_Conversion_Factor",
]

# Chunking
chunk_size = 100000
chunk_list = []
states_to_keep = ["FL", "WA", "OR", "GA", "OK", "AL", "CO", "ME"]


# Read in chunks, filtering for Texas and relevant columns
for chunk in pd.read_csv(
    "~/Desktop/arcos_all_washpost.zip",
    delimiter="\t",
    chunksize=chunk_size,
    compression="zip",
    usecols=variables_to_keep,
):
    filtered_chunk = chunk[chunk["BUYER_STATE"].isin(states_to_keep)]
    chunk_list.append(filtered_chunk)
df = pd.concat(chunk_list, ignore_index=True)
df

Unnamed: 0,BUYER_STATE,BUYER_COUNTY,DRUG_NAME,MME_Conversion_Factor,TRANSACTION_DATE,CALC_BASE_WT_IN_GM
0,GA,RICHMOND,OXYCODONE,1.5,2019-06-19,1.344750
1,GA,COWETA,OXYCODONE,1.5,2019-06-21,3.361875
2,GA,CLARKE,HYDROCODONE,1.0,2019-06-21,2.270250
3,GA,WHITFIELD,HYDROCODONE,1.0,2019-06-21,2.270250
4,GA,COBB,HYDROCODONE,1.0,2019-06-21,2.270250
...,...,...,...,...,...,...
66662233,ME,KENNEBEC,OXYCODONE,1.5,2006-01-12,3.586000
66662234,ME,KENNEBEC,OXYCODONE,1.5,2006-03-01,11.206250
66662235,ME,KENNEBEC,OXYCODONE,1.5,2006-03-15,1.344750
66662236,ME,KENNEBEC,OXYCODONE,1.5,2006-05-24,10.758000


In [14]:
df_with_na = df[df.isnull().any(axis=1)]
df_with_na

Unnamed: 0,BUYER_STATE,BUYER_COUNTY,DRUG_NAME,MME_Conversion_Factor,TRANSACTION_DATE,CALC_BASE_WT_IN_GM
23064093,FL,,HYDROCODONE,1.0,2007-02-12,0.3027
23996263,FL,,HYDROCODONE,1.0,2006-10-12,0.3027
24532643,FL,,HYDROCODONE,1.0,2006-06-22,0.3027
24625178,FL,,HYDROCODONE,1.0,2007-01-22,0.3027
25095157,FL,,HYDROCODONE,1.0,2007-05-14,0.3027
25633942,FL,,HYDROCODONE,1.0,2007-04-02,0.3027
26435057,FL,,HYDROCODONE,1.0,2007-06-07,0.3027
26820801,FL,,HYDROCODONE,1.0,2006-09-07,0.3027
27705106,FL,,HYDROCODONE,1.0,2006-12-06,0.3027
28319707,FL,,HYDROCODONE,1.0,2006-03-02,0.3027


# Calculating relevant data

In [5]:
# dropping NAs
all = df[
    df["BUYER_STATE"].isin(["FL", "WA", "OR", "GA", "OK", "AL", "CO", "ME"])
].dropna()
all.head(5)
all

Unnamed: 0,BUYER_STATE,BUYER_COUNTY,DRUG_NAME,MME_Conversion_Factor,TRANSACTION_DATE,CALC_BASE_WT_IN_GM
0,GA,RICHMOND,OXYCODONE,1.5,2019-06-19,1.344750
1,GA,COWETA,OXYCODONE,1.5,2019-06-21,3.361875
2,GA,CLARKE,HYDROCODONE,1.0,2019-06-21,2.270250
3,GA,WHITFIELD,HYDROCODONE,1.0,2019-06-21,2.270250
4,GA,COBB,HYDROCODONE,1.0,2019-06-21,2.270250
...,...,...,...,...,...,...
66662233,ME,KENNEBEC,OXYCODONE,1.5,2006-01-12,3.586000
66662234,ME,KENNEBEC,OXYCODONE,1.5,2006-03-01,11.206250
66662235,ME,KENNEBEC,OXYCODONE,1.5,2006-03-15,1.344750
66662236,ME,KENNEBEC,OXYCODONE,1.5,2006-05-24,10.758000


In [6]:
# Getting the year
all["date"] = pd.to_datetime(
    all["TRANSACTION_DATE"], format="%Y-%m-%d", errors="coerce"
)
all["year"] = all["date"].dt.year

all["DRUG_NAME"] = all["DRUG_NAME"].isin(["OXYCODONE", "HYDROCODONE"])

# Make an estimate of total morphine equivalent shipments
all["MME_Conversion_Factor"] = pd.to_numeric(
    all["MME_Conversion_Factor"], errors="coerce"
)
all["morphine_equivalent_g"] = all["CALC_BASE_WT_IN_GM"] * all["MME_Conversion_Factor"]
# Group by year, state, and county, and sum the morphine equivalent shipments
df_grouped = (
    all.groupby(["year", "BUYER_STATE", "BUYER_COUNTY"])
    .morphine_equivalent_g.sum()
    .reset_index()
)
df_grouped

Unnamed: 0,year,BUYER_STATE,BUYER_COUNTY,morphine_equivalent_g
0,2006,AL,AUTAUGA,17100.542225
1,2006,AL,BALDWIN,51039.829050
2,2006,AL,BARBOUR,5764.019122
3,2006,AL,BIBB,6278.096539
4,2006,AL,BLOUNT,10418.423330
...,...,...,...,...
7123,2019,WA,WAHKIAKUM,932.537438
7124,2019,WA,WALLA WALLA,13420.082768
7125,2019,WA,WHATCOM,40160.700638
7126,2019,WA,WHITMAN,7147.307865


In [7]:
# check that the relevant states are in the dataset
assert len(df_grouped["BUYER_STATE"].unique()) == 8
df_grouped.head(5)

Unnamed: 0,year,BUYER_STATE,BUYER_COUNTY,morphine_equivalent_g
0,2006,AL,AUTAUGA,17100.542225
1,2006,AL,BALDWIN,51039.82905
2,2006,AL,BARBOUR,5764.019122
3,2006,AL,BIBB,6278.096539
4,2006,AL,BLOUNT,10418.42333


# Checking Florida & Constants GA, AL, OK

In [None]:
florida_and_constants = all[all["BUYER_STATE"].isin(["FL", "AL", "GA", "OK"])]
# florida has 67 counties
assert (
    len(
        florida_and_constants[florida_and_constants["BUYER_STATE"] == "FL"][
            "BUYER_COUNTY"
        ].unique()
    )
    <= 67
)
floridacounty = len(
    florida_and_constants[florida_and_constants["BUYER_STATE"] == "FL"][
        "BUYER_COUNTY"
    ].unique()
)
# georgia has 159 counties
assert (
    len(
        florida_and_constants[florida_and_constants["BUYER_STATE"] == "GA"][
            "BUYER_COUNTY"
        ].unique()
    )
    <= 159
)
georgiacounty = len(
    florida_and_constants[florida_and_constants["BUYER_STATE"] == "GA"][
        "BUYER_COUNTY"
    ].unique()
)
# oklahoma has 77 counties
assert (
    len(
        florida_and_constants[florida_and_constants["BUYER_STATE"] == "OK"][
            "BUYER_COUNTY"
        ].unique()
    )
    <= 77
)
oklahomacounty = len(
    florida_and_constants[florida_and_constants["BUYER_STATE"] == "OK"][
        "BUYER_COUNTY"
    ].unique()
)
# alabama has 67 counties
assert (
    len(
        florida_and_constants[florida_and_constants["BUYER_STATE"] == "AL"][
            "BUYER_COUNTY"
        ].unique()
    )
    <= 67
)
alabamacounty = len(
    florida_and_constants[florida_and_constants["BUYER_STATE"] == "AL"][
        "BUYER_COUNTY"
    ].unique()
)
print(florida_and_constants["year"].unique())
print(
    f"Florida has {floridacounty} counties, Georgia has {georgiacounty} counties, Oklahoma has {oklahomacounty} counties, Alabama has {alabamacounty} counties"
)
florida_and_constants.head(5)

[2019 2015 2016 2017 2018 2009 2010 2011 2012 2013 2014 2007 2008 2006]
Florida has 67 counties, Georgia has 155 counties, Oklahoma has 77 counties, Alabama has 67 counties


Unnamed: 0,BUYER_STATE,BUYER_COUNTY,DRUG_NAME,MME_Conversion_Factor,TRANSACTION_DATE,CALC_BASE_WT_IN_GM,date,year,morphine_equivalent_g
0,GA,RICHMOND,True,1.5,2019-06-19,1.34475,2019-06-19,2019,2.017125
1,GA,COWETA,True,1.5,2019-06-21,3.361875,2019-06-21,2019,5.042813
2,GA,CLARKE,True,1.0,2019-06-21,2.27025,2019-06-21,2019,2.27025
3,GA,WHITFIELD,True,1.0,2019-06-21,2.27025,2019-06-21,2019,2.27025
4,GA,COBB,True,1.0,2019-06-21,2.27025,2019-06-21,2019,2.27025


# Checking Washington & Constants CO, OR, MN

In [None]:
washington_and_constants = all[all["BUYER_STATE"].isin(["WA", "CO", "OR", "ME"])]
# washington has 39 counties
assert (
    len(
        washington_and_constants[washington_and_constants["BUYER_STATE"] == "WA"][
            "BUYER_COUNTY"
        ].unique()
    )
    <= 39
)
washingtoncounty = len(
    washington_and_constants[washington_and_constants["BUYER_STATE"] == "WA"][
        "BUYER_COUNTY"
    ].unique()
)
# colorado has 64 counties
assert (
    len(
        washington_and_constants[washington_and_constants["BUYER_STATE"] == "CO"][
            "BUYER_COUNTY"
        ].unique()
    )
    <= 64
)
coloradocounty = len(
    washington_and_constants[washington_and_constants["BUYER_STATE"] == "WA"][
        "BUYER_COUNTY"
    ].unique()
)
# oregon has 36 counties
assert (
    len(
        washington_and_constants[washington_and_constants["BUYER_STATE"] == "OR"][
            "BUYER_COUNTY"
        ].unique()
    )
    <= 36
)
# minnesota has 87 counties
assert (
    len(
        washington_and_constants[washington_and_constants["BUYER_STATE"] == "ME"][
            "BUYER_COUNTY"
        ].unique()
    )
    <= 16
)
print(washington_and_constants["year"].unique())
washington_and_constants.head(5)

[2019 2015 2016 2017 2018 2011 2012 2013 2014 2006 2007 2008 2009 2010]


Unnamed: 0,BUYER_STATE,BUYER_COUNTY,DRUG_NAME,MME_Conversion_Factor,TRANSACTION_DATE,CALC_BASE_WT_IN_GM,date,year,morphine_equivalent_g
13052308,WA,BENTON,True,1.5,2019-07-21,0.672375,2019-07-21,2019,1.008563
13052309,WA,BENTON,True,1.5,2019-07-21,5.379,2019-07-21,2019,8.0685
13052310,WA,BENTON,True,1.5,2019-07-21,1.793,2019-07-21,2019,2.6895
13052311,WA,BENTON,True,1.0,2019-07-21,1.5135,2019-07-21,2019,1.5135
13052312,WA,BENTON,True,1.0,2019-07-21,2.27025,2019-07-21,2019,2.27025


# SAVE TO PARQUET

In [10]:
# save to parquet
df_grouped.to_parquet("data/opioid_shipment_WA_FL_andconstants.parquet", index=False)