# Importing relevant packages

In [1]:
import warnings
from pathlib import Path

import numpy as np
import pandas as pd

# Turn on pandas copy-on-write mode for the rest of the notebook.
pd.options.mode.copy_on_write = True

# Reading and Loading (Takes xx minutes to get those states)

In [2]:
# Peek at the first 200 rows of the tab-separated ARCOS archive to inspect the
# available columns before doing the full chunked read.
# The sample gets its own name so it does not shadow the builtin `all`.
arcos_sample = pd.read_csv(
    "~/Desktop/arcos_all_washpost.zip",
    nrows=200,
    delimiter="\t",
    compression="zip",
)
arcos_sample.head(5)

Unnamed: 0,REPORTER_DEA_NO,REPORTER_BUS_ACT,REPORTER_NAME,REPORTER_ADDL_CO_INFO,REPORTER_ADDRESS1,REPORTER_ADDRESS2,REPORTER_CITY,REPORTER_STATE,REPORTER_ZIP,REPORTER_COUNTY,...,DRUG_NAME,Measure,MME_Conversion_Factor,Dosage_Strength,TRANSACTION_DATE,Combined_Labeler_Name,Reporter_family,CALC_BASE_WT_IN_GM,DOSAGE_UNIT,MME
0,RM0220688,DISTRIBUTOR,MCKESSON CORPORATION,,DBA MCKESSON DRUG CO.,3000 KENSKILL AVE,WASHINGTON CT HOUSE,OH,43160,FAYETTE,...,OXYCODONE,TAB,1.5,10.0,2011-02-08,"Par Pharmaceutical, Inc.",McKesson Corporation,0.8965,100.0,1344.75
1,RM0220688,DISTRIBUTOR,MCKESSON CORPORATION,,DBA MCKESSON DRUG CO.,3000 KENSKILL AVE,WASHINGTON CT HOUSE,OH,43160,FAYETTE,...,HYDROCODONE,TAB,1.0,7.5,2011-03-07,SpecGx LLC,McKesson Corporation,0.45405,100.0,454.05
2,RM0220688,DISTRIBUTOR,MCKESSON CORPORATION,,DBA MCKESSON DRUG CO.,3000 KENSKILL AVE,WASHINGTON CT HOUSE,OH,43160,FAYETTE,...,OXYCODONE,TAB,1.5,10.0,2011-03-10,"Par Pharmaceutical, Inc.",McKesson Corporation,3.586,400.0,5379.0
3,RM0220688,DISTRIBUTOR,MCKESSON CORPORATION,,DBA MCKESSON DRUG CO.,3000 KENSKILL AVE,WASHINGTON CT HOUSE,OH,43160,FAYETTE,...,HYDROCODONE,TAB,1.0,5.0,2011-04-05,SpecGx LLC,McKesson Corporation,1.5135,500.0,1513.5
4,RM0220688,DISTRIBUTOR,MCKESSON CORPORATION,,DBA MCKESSON DRUG CO.,3000 KENSKILL AVE,WASHINGTON CT HOUSE,OH,43160,FAYETTE,...,HYDROCODONE,TAB,1.0,5.0,2011-04-06,SpecGx LLC,McKesson Corporation,0.3027,100.0,302.7


In [4]:
# Only these columns are parsed from the archive
variables_to_keep = [
    "BUYER_STATE",
    "BUYER_COUNTY",
    "DRUG_NAME",
    "TRANSACTION_DATE",
    "CALC_BASE_WT_IN_GM",
    "MME_Conversion_Factor",
]

# Buyer states retained while streaming through the file
states_to_keep = ["FL", "WA", "OR", "GA", "OK", "AL", "CO", "MN"]

# Number of rows parsed per chunk
chunk_size = 100000

# Stream the archive chunk by chunk, keeping only rows whose buyer state is in
# `states_to_keep`, then stitch the filtered pieces into a single frame.
chunk_reader = pd.read_csv(
    "~/Desktop/arcos_all_washpost.zip",
    delimiter="\t",
    chunksize=chunk_size,
    compression="zip",
    usecols=variables_to_keep,
)
chunk_list = [
    chunk[chunk["BUYER_STATE"].isin(states_to_keep)] for chunk in chunk_reader
]
df = pd.concat(chunk_list, ignore_index=True)
df

Unnamed: 0,BUYER_STATE,BUYER_COUNTY,DRUG_NAME,MME_Conversion_Factor,TRANSACTION_DATE,CALC_BASE_WT_IN_GM
0,GA,RICHMOND,OXYCODONE,1.5,2019-06-19,1.344750
1,GA,COWETA,OXYCODONE,1.5,2019-06-21,3.361875
2,GA,CLARKE,HYDROCODONE,1.0,2019-06-21,2.270250
3,GA,WHITFIELD,HYDROCODONE,1.0,2019-06-21,2.270250
4,GA,COBB,HYDROCODONE,1.0,2019-06-21,2.270250
...,...,...,...,...,...,...
69654434,OR,MULTNOMAH,OXYCODONE,1.5,2010-12-13,5.379000
69654435,OR,MULTNOMAH,OXYCODONE,1.5,2011-01-03,29.584500
69654436,OR,MULTNOMAH,OXYCODONE,1.5,2011-02-15,8.068500
69654437,OR,MULTNOMAH,HYDROCODONE,1.0,2011-02-18,7.567500


# Calculating relevant data

In [5]:
# Restrict the shipments to the states of interest, reusing the list defined
# for the chunked read instead of repeating the literal here.
# NOTE(review): the name `all` shadows the Python builtin; it is kept because
# the cells below refer to it.
all = df[df["BUYER_STATE"].isin(states_to_keep)]
all.head(5)

Unnamed: 0,BUYER_STATE,BUYER_COUNTY,DRUG_NAME,MME_Conversion_Factor,TRANSACTION_DATE,CALC_BASE_WT_IN_GM
0,GA,RICHMOND,OXYCODONE,1.5,2019-06-19,1.34475
1,GA,COWETA,OXYCODONE,1.5,2019-06-21,3.361875
2,GA,CLARKE,HYDROCODONE,1.0,2019-06-21,2.27025
3,GA,WHITFIELD,HYDROCODONE,1.0,2019-06-21,2.27025
4,GA,COBB,HYDROCODONE,1.0,2019-06-21,2.27025


In [7]:
# Parse the transaction date (values that do not match the format are coerced
# to missing) and derive the calendar year from it
parsed_dates = pd.to_datetime(
    all["TRANSACTION_DATE"], format="%Y-%m-%d", errors="coerce"
)
all["date"] = parsed_dates
all["year"] = all["date"].dt.year

# Estimate morphine-equivalent grams: base weight times the conversion factor
# (non-numeric conversion factors are coerced to missing)
conversion_factor = pd.to_numeric(all["MME_Conversion_Factor"], errors="coerce")
all["MME_Conversion_Factor"] = conversion_factor
all["morphine_equivalent_g"] = all["CALC_BASE_WT_IN_GM"].mul(
    all["MME_Conversion_Factor"]
)

# Total morphine-equivalent grams per year, state and county
group_keys = ["year", "BUYER_STATE", "BUYER_COUNTY"]
df_grouped = all.groupby(group_keys, as_index=False)["morphine_equivalent_g"].sum()
df_grouped

Unnamed: 0,year,BUYER_STATE,BUYER_COUNTY,morphine_equivalent_g
0,2006,AL,AUTAUGA,17100.542225
1,2006,AL,BALDWIN,51039.829050
2,2006,AL,BARBOUR,5764.019122
3,2006,AL,BIBB,6278.096539
4,2006,AL,BLOUNT,10418.423330
...,...,...,...,...
8117,2019,WA,WAHKIAKUM,932.537438
8118,2019,WA,WALLA WALLA,13420.082768
8119,2019,WA,WHATCOM,40160.700638
8120,2019,WA,WHITMAN,7147.307865


In [8]:
# check that exactly the requested states are in the dataset
# (compares against `states_to_keep` rather than a hard-coded count)
states_found = set(df_grouped["BUYER_STATE"].unique())
assert states_found == set(states_to_keep), (
    f"state mismatch: {sorted(states_found ^ set(states_to_keep))}"
)

In [10]:
# save to parquet, creating the output directory first so the write does not
# fail when `data/` is missing
output_path = Path("data/opioid.parquet")
output_path.parent.mkdir(parents=True, exist_ok=True)
df_grouped.to_parquet(output_path, index=False)

# Florida & Constants GA, AL, OK

## Subsetting Opioid Shipments

In [None]:
# Florida together with the comparison states named in the section heading
# (GA, AL, OK). `isin` is a method and must be called with a list.
florida_and_constants = all[all["BUYER_STATE"].isin(["FL", "GA", "AL", "OK"])]

# Florida-only shipments, used by the cells below
florida = all[all["BUYER_STATE"] == "FL"]
# florida has 67 counties
assert len(florida["BUYER_COUNTY"].unique()) == 67
# georgia has 159 counties
# oklahoma has 77 counties
# alabama has 67 counties
print(florida["year"].unique())
florida.head(5)

Unnamed: 0,BUYER_STATE,BUYER_COUNTY,DRUG_NAME,MME_Conversion_Factor,TRANSACTION_DATE,CALC_BASE_WT_IN_GM,date,year,morphine_equivalent_g
0,FL,SAINT JOHNS,OXYCODONE,1.5,2016-08-10,0.8965,2016-08-10,2016,1.34475
1,FL,PALM BEACH,OXYCODONE,1.5,2016-08-10,0.8965,2016-08-10,2016,1.34475
2,FL,DUVAL,OXYCODONE,1.5,2016-08-10,0.8965,2016-08-10,2016,1.34475
3,FL,MARION,OXYCODONE,1.5,2016-08-10,0.8965,2016-08-10,2016,1.34475
4,FL,LEE,OXYCODONE,1.5,2016-08-22,3.586,2016-08-22,2016,5.379


## Narrowing it down to the relevant years: Dec 2007 - Dec 2009  and Jan 2010 - Jan 2013

In [None]:
# Pre-period bounds (years, inclusive)
pre_start_date_fl, pre_end_date_fl = 2007, 2009
# Post-period bounds (years, inclusive)
post_start_date_fl, post_end_date_fl = 2010, 2013

# Rows whose year falls inside the pre window
fl_pre_mask = florida["year"].between(pre_start_date_fl, pre_end_date_fl)
pre_florida = florida[fl_pre_mask]
pre_florida.head(5)

# Rows whose year falls inside the post window
fl_post_mask = florida["year"].between(post_start_date_fl, post_end_date_fl)
post_florida = florida[fl_post_mask]
post_florida.head(5)

# Washington

## Subsetting Washington Opioid Shipments

In [None]:
# Shipments whose buyer state is Washington
is_washington = all["BUYER_STATE"] == "WA"
washington = all[is_washington]
# washington has 39 counties
washington_counties = washington["BUYER_COUNTY"].unique()
assert len(washington_counties) == 39
print(washington["year"].unique())
washington.head(5)

## Narrowing it down to the relevant years: 2009 - 2011 and 2012 - 2015

In [None]:
# Pre-period bounds (years, inclusive)
pre_start_date_wa, pre_end_date_wa = 2009, 2011
# Post-period bounds (years, inclusive)
post_start_date_wa, post_end_date_wa = 2012, 2015

# Rows whose year falls inside the pre window
wa_pre_mask = washington["year"].between(pre_start_date_wa, pre_end_date_wa)
pre_washington = washington[wa_pre_mask]
pre_washington.head(5)

# Rows whose year falls inside the post window
wa_post_mask = washington["year"].between(post_start_date_wa, post_end_date_wa)
post_washington = washington[wa_post_mask]
post_washington.head(5)

# Oregon

## Subsetting Oregon Opioid Shipments

In [None]:
# Shipments whose buyer state is Oregon
is_oregon = all["BUYER_STATE"] == "OR"
oregon = all[is_oregon]
# oregon has 36 counties
oregon_counties = oregon["BUYER_COUNTY"].unique()
assert len(oregon_counties) == 36
print(oregon["year"].unique())
oregon.head(5)

## Narrowing it down to the relevant years: Dec 2007 - Dec 2009  and Jan 2010 - Jan 2013

In [None]:
# Oregon-specific period bounds and frames. Every name here carries the `_or`
# / `oregon` suffix so this cell no longer overwrites the Washington constants
# or the `pre_washington` / `post_washington` results.
# Pre time period
pre_start_date_or = 2007
pre_end_date_or = 2009
# Post time period
post_start_date_or = 2010
post_end_date_or = 2013

# Both sides of each mask come from `oregon`, so the boolean index lines up
# with the frame being filtered.
pre_oregon = oregon[
    (oregon["year"] >= pre_start_date_or) & (oregon["year"] <= pre_end_date_or)
]
pre_oregon.head(5)

post_oregon = oregon[
    (oregon["year"] >= post_start_date_or) & (oregon["year"] <= post_end_date_or)
]
post_oregon.head(5)

# Minnesota

## Subsetting Minnesota Opioid Shipments

In [None]:
washington = all[all["BUYER_STATE"] == "WA"]
# washington has 39 counties
assert len(washington["BUYER_COUNTY"].unique()) == 39
print(washington["year"].unique())
washington.head(5)

## Narrowing it down to the relevant years: Dec 2007 - Dec 2009  and Jan 2010 - Jan 2013

In [None]:
# Pre time period
pre_start_date_wa = 2009
pre_end_date_wa = 2011
# Post time period
post_start_date_wa = 2012
post_end_date_wa = 2015

pre_washington = washington[
    (washington["year"] >= pre_start_date_wa) & (washington["year"] <= pre_end_date_wa)
]
pre_washington.head(5)

post_washington = washington[
    (washington["year"] >= post_start_date_wa)
    & (washington["year"] <= post_end_date_wa)
]
post_washington.head(5)

# Colorado

## Subsetting Colorado Opioid Shipments

In [None]:
colorado = all[all["BUYER_STATE"] == "WA"]
# washington has 36 counties
assert len(washington["BUYER_COUNTY"].unique()) == 36
print(washington["year"].unique())
washington.head(5)

## Narrowing it down to the relevant years: Dec 2007 - Dec 2009  and Jan 2010 - Jan 2013

In [None]:
# Pre time period
pre_start_date_wa = 2009
pre_end_date_wa = 2011
# Post time period
post_start_date_wa = 2012
post_end_date_wa = 2015

pre_washington = washington[
    (washington["year"] >= pre_start_date_wa) & (washington["year"] <= pre_end_date_wa)
]
pre_washington.head(5)

post_washington = washington[
    (washington["year"] >= post_start_date_wa)
    & (washington["year"] <= post_end_date_wa)
]
post_washington.head(5)

# Georgia

## Subsetting Georgia Opioid Shipments

In [None]:
washington = all[all["BUYER_STATE"] == "WA"]
# washington has 39 counties
assert len(washington["BUYER_COUNTY"].unique()) == 39
print(washington["year"].unique())
washington.head(5)

## Narrowing it down to the relevant years: Dec 2007 - Dec 2009  and Jan 2010 - Jan 2013

In [None]:
# Pre time period
pre_start_date_wa = 2009
pre_end_date_wa = 2011
# Post time period
post_start_date_wa = 2012
post_end_date_wa = 2015

pre_washington = washington[
    (washington["year"] >= pre_start_date_wa) & (washington["year"] <= pre_end_date_wa)
]
pre_washington.head(5)

post_washington = washington[
    (washington["year"] >= post_start_date_wa)
    & (washington["year"] <= post_end_date_wa)
]
post_washington.head(5)

# Alabama

## Subsetting Alabama Opioid Shipments

In [None]:
washington = all[all["BUYER_STATE"] == "WA"]
# washington has 39 counties
assert len(washington["BUYER_COUNTY"].unique()) == 39
print(washington["year"].unique())
washington.head(5)

## Narrowing it down to the relevant years: Dec 2007 - Dec 2009  and Jan 2010 - Jan 2013

In [None]:
# Pre time period
pre_start_date_wa = 2009
pre_end_date_wa = 2011
# Post time period
post_start_date_wa = 2012
post_end_date_wa = 2015

pre_washington = washington[
    (washington["year"] >= pre_start_date_wa) & (washington["year"] <= pre_end_date_wa)
]
pre_washington.head(5)

post_washington = washington[
    (washington["year"] >= post_start_date_wa)
    & (washington["year"] <= post_end_date_wa)
]
post_washington.head(5)

# Oklahoma

## Subsetting Oklahoma Opioid Shipments

In [None]:
washington = all[all["BUYER_STATE"] == "WA"]
# washington has 39 counties
assert len(washington["BUYER_COUNTY"].unique()) == 39
print(washington["year"].unique())
washington.head(5)

## Narrowing it down to the relevant years: Dec 2007 - Dec 2009  and Jan 2010 - Jan 2013

In [None]:
# Pre time period
pre_start_date_wa = 2009
pre_end_date_wa = 2011
# Post time period
post_start_date_wa = 2012
post_end_date_wa = 2015

pre_washington = washington[
    (washington["year"] >= pre_start_date_wa) & (washington["year"] <= pre_end_date_wa)
]
pre_washington.head(5)

post_washington = washington[
    (washington["year"] >= post_start_date_wa)
    & (washington["year"] <= post_end_date_wa)
]
post_washington.head(5)