Step 1: Load the data

In [1]:
import pandas as pd

# Turn on the pandas "mode.copy_on_write" option for every cell that follows.
pd.set_option("mode.copy_on_write", True)

In [2]:
import glob

# Name of the parquet file this notebook writes in its final step. It is
# written to the same directory that is globbed here, so it must be excluded;
# otherwise a second "Restart & Run All" would load the notebook's own output
# as if it were raw input and concatenate it with the source files.
MERGED_OUTPUT_FILE = 'merged_transaction_population.parquet'

# glob returns matches in arbitrary (filesystem-dependent) order; sorting makes
# the load order, and therefore the row order after concatenation, reproducible.
parquet_files = sorted(
    path for path in glob.glob('*.parquet') if path != MERGED_OUTPUT_FILE
)

In [3]:
# Read each discovered parquet file into its own DataFrame (pyarrow engine).
df_list = [pd.read_parquet(path, engine='pyarrow') for path in parquet_files]

# Stack the per-file frames into a single table with a fresh 0..n-1 index.
transactions = pd.concat(df_list, ignore_index=True)

In [4]:
# Display the combined transactions frame to eyeball its columns and values.
transactions

Unnamed: 0,BUYER_STATE,BUYER_COUNTY,DRUG_NAME,TransactionYear,TransactionMonth,CALC_BASE_WT_IN_GM,DOSAGE_UNIT,FIPS
0,FL,ALACHUA,HYDROCODONE,2006,78,17216.993951,3700934.0,12001
1,FL,ALACHUA,HYDROCODONE,2007,78,19020.741100,4052600.0,12001
2,FL,ALACHUA,HYDROCODONE,2008,78,20507.995075,4314690.0,12001
3,FL,ALACHUA,HYDROCODONE,2009,78,21623.919360,4476380.0,12001
4,FL,ALACHUA,HYDROCODONE,2010,78,19718.816370,4075540.0,12001
...,...,...,...,...,...,...,...,...
5743,OH,WYANDOT,Opioid,2011,9910,2528.534028,511100.0,39175
5744,OH,WYANDOT,Opioid,2012,10515,2406.424058,472600.0,39175
5745,OH,WYANDOT,Opioid,2013,10945,2405.013300,472320.0,39175
5746,OH,WYANDOT,Opioid,2014,8731,2344.653603,470800.0,39175


In [5]:
# Collect the distinct BUYER_STATE values that occur in the transactions data.
buyer_state_column = transactions['BUYER_STATE']
states = buyer_state_column.unique()

In [6]:
# Load the population table from population.csv in the working directory.
population = pd.read_csv('population.csv')

In [7]:
# Keep only the population rows whose State also appears in the transactions.
in_transaction_states = population['State'].isin(states)
population_filtered = population.loc[in_transaction_states]

In [8]:
# Display the state-filtered population frame to check the filter result.
population_filtered

Unnamed: 0,FIPS,State,State_FIPS,County_FIPS,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
315,12001,FL,12,1,224614.0,227022.0,229867.0,233756.0,239506.0,242685.0,244888.0,246657.0,247624.0,249879.0,251596.0,252585.0,255606.0,259215.0,264127.0
316,12003,FL,12,3,23298.0,23555.0,24142.0,24832.0,25571.0,26212.0,26725.0,27124.0,27067.0,27055.0,27059.0,27008.0,27123.0,27357.0,27903.0
317,12005,FL,12,5,152741.0,155044.0,158804.0,162917.0,165644.0,165345.0,166267.0,167464.0,169209.0,169587.0,171818.0,174704.0,178435.0,181678.0,183765.0
318,12007,FL,12,7,26306.0,27035.0,27703.0,28098.0,28506.0,28825.0,28961.0,28979.0,28536.0,28430.0,27052.0,26804.0,26562.0,26759.0,26756.0
319,12009,FL,12,9,495425.0,504847.0,518722.0,529907.0,535138.0,539719.0,542378.0,542109.0,544000.0,544442.0,547119.0,550478.0,555838.0,566133.0,577380.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3132,56037,WY,56,37,37428.0,37450.0,38026.0,38739.0,39749.0,41470.0,42358.0,44133.0,43580.0,44000.0,45032.0,45189.0,44996.0,44780.0,44319.0
3133,56039,WY,56,39,18837.0,19066.0,19467.0,19632.0,20014.0,20472.0,20988.0,21232.0,21298.0,21422.0,21643.0,22335.0,22801.0,23083.0,23255.0
3134,56041,WY,56,41,19587.0,19480.0,19470.0,19494.0,19709.0,20171.0,20613.0,21054.0,21090.0,20901.0,21008.0,20969.0,20835.0,20777.0,20711.0
3135,56043,WY,56,43,7988.0,7976.0,7960.0,8022.0,7979.0,8169.0,8229.0,8423.0,8531.0,8451.0,8410.0,8417.0,8277.0,8282.0,8180.0


In [17]:
# Build the analysis-ready transactions table as a NEW frame.
#
# A plain `transactions_cleaned = transactions` only creates a second name for
# the same object, so assigning a column through it would also add that column
# to the raw `transactions` frame. Chaining assign/drop/rename returns new
# frames at every step and leaves `transactions` exactly as it was loaded.
#
# NOTE(review): the transactions preview shows more than one DRUG_NAME value
# (e.g. 'HYDROCODONE' and 'Opioid'). Once DRUG_NAME is dropped, rows for the
# same county and year can no longer be told apart -- confirm this is intended.
transactions_cleaned = (
    transactions
    # Copy CALC_BASE_WT_IN_GM under the name used by the rest of the notebook.
    # It is added (not renamed) so it ends up as the last column.
    .assign(Weighted_Dosage=lambda frame: frame["CALC_BASE_WT_IN_GM"])
    # Drop the columns that are not needed downstream.
    .drop(columns=['DRUG_NAME', 'TransactionMonth', 'CALC_BASE_WT_IN_GM', "DOSAGE_UNIT"])
    # Use the same 'Year' key name as the melted population table.
    .rename(columns={"TransactionYear": "Year"})
)


In [18]:
# Display the cleaned transactions frame to confirm the remaining columns.
transactions_cleaned

Unnamed: 0,BUYER_STATE,BUYER_COUNTY,Year,FIPS,Weighted_Dosage
0,FL,ALACHUA,2006,12001,17216.993951
1,FL,ALACHUA,2007,12001,19020.741100
2,FL,ALACHUA,2008,12001,20507.995075
3,FL,ALACHUA,2009,12001,21623.919360
4,FL,ALACHUA,2010,12001,19718.816370
...,...,...,...,...,...
5743,OH,WYANDOT,2011,39175,2528.534028
5744,OH,WYANDOT,2012,39175,2406.424058
5745,OH,WYANDOT,2013,39175,2405.013300
5746,OH,WYANDOT,2014,39175,2344.653603


In [11]:
# List the distinct years present in the transactions data.
transactions['TransactionYear'].unique()

array([2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015])

In [12]:
# Year columns of the population table to unpivot: '2006' through '2015'.
columns_to_melt = [str(year) for year in range(2006, 2016)]

# Reshape from one column per year to one row per (FIPS, Year), then cast the
# Year labels from strings to integers.
melted_df = pd.melt(
    population_filtered,
    id_vars=['FIPS'],
    value_vars=columns_to_melt,
    var_name='Year',
    value_name='Population',
).assign(Year=lambda frame: frame["Year"].astype(int))

In [26]:
# Display the long-format population frame (FIPS, Year, Population).
melted_df

Unnamed: 0,FIPS,Year,Population
0,12001,2006,239506.0
1,12003,2006,25571.0
2,12005,2006,165644.0
3,12007,2006,28506.0
4,12009,2006,535138.0
...,...,...,...
5145,56037,2015,44780.0
5146,56039,2015,23083.0
5147,56041,2015,20777.0
5148,56043,2015,8282.0


In [27]:
# Attach the matching population to each transaction row by (Year, FIPS).
# A left join keeps every transaction row even when no population row matches.
merged_df = transactions_cleaned.merge(
    melted_df,
    how='left',
    on=['Year', 'FIPS'],
)

In [33]:
# Scale Weighted_Dosage by population and express it per 100,000 people.
dosage_per_person = merged_df["Weighted_Dosage"] / merged_df["Population"]
merged_df["Dosage_Per_100k"] = dosage_per_person * 100_000

In [35]:
# Persist the merged transactions + population table as a parquet file
# (pyarrow engine) in the working directory.
# NOTE(review): this is the same directory the load step scans with
# glob('*.parquet'), so this output file matches that pattern on a re-run.
merged_df.to_parquet('merged_transaction_population.parquet', engine='pyarrow')