### Parse source data to parquet

Note: All files have been parsed and uploaded to the `20_Intermediate_Files` folder.

In [1]:
import pandas as pd
import numpy as np

# Turn on pandas' "mode.copy_on_write" option for the whole notebook session.
pd.options.mode.copy_on_write = True

### Washington

### Read in raw data

In [3]:
# The raw data file is too large for GitHub: download it locally first and
# point `path` at your own copy.
path = "../00_Source_Data/arcos_all_washpost.tsv"
# Tab-separated input, read in chunks of 50,000 rows rather than all at once.
csv_chunk = pd.read_csv(path, sep="\t", chunksize=50_000, low_memory=False)

In [4]:
# From every chunk keep only the rows whose BUYER_STATE is "WA", then join the
# filtered pieces into a single DataFrame.
wa_chunks = [chunk[chunk["BUYER_STATE"] == "WA"] for chunk in csv_chunk]
data_selected_Washington = pd.concat(wa_chunks)

### Texas

In [8]:
# The raw data file is too large for GitHub: download it locally first and
# point `path` at your own copy.
path = "../00_Source_Data/arcos_all_washpost.tsv"
# Tab-separated input, read in chunks of 50,000 rows rather than all at once.
csv_chunk = pd.read_csv(path, sep="\t", chunksize=50_000, low_memory=False)

In [9]:
# From every chunk keep only the rows whose BUYER_STATE is "TX", then join the
# filtered pieces into a single DataFrame.
tx_chunks = [chunk[chunk["BUYER_STATE"] == "TX"] for chunk in csv_chunk]
data_selected_Texas = pd.concat(tx_chunks)

### Florida

In [11]:
# The raw data file is too large for GitHub: download it locally first and
# point `path` at your own copy.
path = "../00_Source_Data/arcos_all_washpost.tsv"
# Tab-separated input, read in chunks of 50,000 rows rather than all at once.
csv_chunk = pd.read_csv(path, sep="\t", chunksize=50_000, low_memory=False)

In [12]:
# From every chunk keep only the rows whose BUYER_STATE is "FL", then join the
# filtered pieces into a single DataFrame.
fl_chunks = [chunk[chunk["BUYER_STATE"] == "FL"] for chunk in csv_chunk]
data_selected_Florida = pd.concat(fl_chunks)

### Concatenate the three state DataFrames

In [13]:
# Stack the Washington, Texas and Florida selections into one DataFrame.
state_frames = (
    data_selected_Washington,
    data_selected_Texas,
    data_selected_Florida,
)
df = pd.concat(state_frames)

#### Subset, calculate dosage, and transform data

In [14]:
# Columns kept for the dosage calculation: buyer location, transaction date,
# and the two quantities that are multiplied into DOSAGE_TOTAL later on.
needed_columns = [
    "BUYER_STATE",
    "BUYER_COUNTY",
    "TRANSACTION_DATE",
    "CALC_BASE_WT_IN_GM",
    "DOSAGE_UNIT",
]
subset_df = df.loc[:, needed_columns]

#### Create year variable

In [15]:
# Convert the 'transaction_date' column to datetime format
subset_df["TRANSACTION_DATE"] = pd.to_datetime(subset_df["TRANSACTION_DATE"])

# Extract the year and create a new column 'transaction_year'
subset_df["transaction_year"] = subset_df["TRANSACTION_DATE"].dt.year

In [16]:
# The full date is no longer needed once 'transaction_year' exists.
subset_df = subset_df.drop(columns=["TRANSACTION_DATE"])

#### Create dosage total

In [17]:
# Per-row dosage total: base weight column times dosage-unit column.
# NOTE(review): confirm that this product is the intended measure; if
# CALC_BASE_WT_IN_GM is already a per-transaction total, multiplying by
# DOSAGE_UNIT would overstate it.
base_weight = subset_df["CALC_BASE_WT_IN_GM"]
unit_count = subset_df["DOSAGE_UNIT"]
subset_df["DOSAGE_TOTAL"] = base_weight * unit_count

In [18]:
# Drop the two input columns now that DOSAGE_TOTAL has been derived from them.
dosage = subset_df.drop(columns=["CALC_BASE_WT_IN_GM", "DOSAGE_UNIT"])

In [19]:
# Overwrite each row's DOSAGE_TOTAL with the sum over its
# (state, county, year) group, so all rows in a group carry the same value.
group_keys = ["BUYER_STATE", "BUYER_COUNTY", "transaction_year"]
grouped_totals = dosage.groupby(group_keys)["DOSAGE_TOTAL"]
dosage["DOSAGE_TOTAL"] = grouped_totals.transform("sum")

In [20]:
# Keep only the first occurrence of each fully identical row.
is_repeat = dosage.duplicated()
dosage = dosage.loc[~is_repeat]

In [21]:
# Summary statistics of the aggregated dosage totals (sanity check).
dosage_totals = dosage["DOSAGE_TOTAL"]
dosage_totals.describe()

count    4.641000e+03
mean     7.604927e+07
std      4.286634e+08
min      1.059450e+00
25%      1.282726e+06
50%      5.744658e+06
75%      3.028602e+07
max      9.717061e+09
Name: DOSAGE_TOTAL, dtype: float64

In [22]:
# Number of rows per transaction year (sanity check).
transaction_years = dosage["transaction_year"]
transaction_years.value_counts()

transaction_year
2012    335
2006    335
2013    333
2007    333
2011    332
2014    332
2019    331
2016    331
2018    331
2009    331
2010    331
2015    330
2017    330
2008    328
Name: count, dtype: int64

#### Write to parquet file

In [23]:
# Persist the aggregated dosage table for the later pipeline steps.
output_path = "../20_Intermediate_Files/Dosage.parquet"
dosage.to_parquet(output_path)