In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Remove the column cap on DataFrame display so no columns are elided from printed frames
pd.set_option('display.max_columns', None)

In [2]:
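# Optional one-time step (left disabled): decompress a raw .csv.gz download into a plain CSV.
# NOTE: both paths below are hard-coded for Florida; adjust them for the state being processed before re-enabling.
# import gzip
# with gzip.open('01_gz_files/arcos-fl-statewide-itemized.csv.gz', 'rt', newline='') as csv_file:
#     csv_data = csv_file.read()
#     with open('florida.csv', 'wt') as out_file:
#          out_file.write(csv_data)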

In [3]:

# Notebook parameters: edit this pair to process a different state.
#   state_name -> lower-case name used when building file paths
#   state_abv  -> abbreviation matched against the BUYER_STATE column
state_name, state_abv = 'ohio', 'OH'

In [4]:

# Chunk CSV File
# Stream the state's raw CSV in fixed-size chunks, keep only the columns and rows
# needed downstream, derive TransactionYear / TransactionMonth from
# TRANSACTION_DATE, and write the combined result to a Parquet file.
csv_file_name = '02_raw_data/' + state_name + '.csv'
chunk_size = 25000
keep_cols = ['BUYER_COUNTY', 'BUYER_STATE', 'TRANSACTION_DATE', 'CALC_BASE_WT_IN_GM', 'MME_Conversion_Factor']
filtered_data = []
state = [state_abv]

# Inclusive range of transaction years to keep
first_year, last_year = 2003, 2015

# TRANSACTION_DATE is split as "<year>-<month>-<day>"; only year and month are kept.
date_pattern = r'^\s*(?P<TransactionYear>\d+)-(?P<TransactionMonth>\d+)-\d+\s*$'

# Rows whose date could not be parsed (reported after the loop instead of silently dropped)
unparsed_rows = 0

with open(csv_file_name, 'r') as csv_file:
    # Read the CSV file in chunks
    csv_reader = pd.read_csv(csv_file, chunksize=chunk_size, usecols=keep_cols, low_memory=False)

    # Iterate through chunks of the CSV file
    for chunk in csv_reader:
        # Filter data for specific states
        mod_chunk = chunk.loc[chunk['BUYER_STATE'].isin(state)]

        # Parse the date row by row: a malformed date yields NaN for that row only,
        # rather than invalidating every row of the chunk.
        date_parts = mod_chunk['TRANSACTION_DATE'].astype(str).str.extract(date_pattern)
        year = pd.to_numeric(date_parts['TransactionYear'], errors='coerce')
        month = pd.to_numeric(date_parts['TransactionMonth'], errors='coerce')
        unparsed_rows += int(year.isna().sum())

        # Keep data within the specified year range (NaN years never match)
        in_range = year.between(first_year, last_year)

        # Build a new frame instead of mutating a slice of `chunk`
        # (avoids SettingWithCopyWarning); TRANSACTION_DATE and the day are dropped.
        append_chunk = (
            mod_chunk.loc[in_range]
            .drop(columns=['TRANSACTION_DATE'])
            .assign(
                TransactionYear=year[in_range].astype(int),
                TransactionMonth=month[in_range].astype(int),
            )
        )

        # Append the processed chunk to the list
        filtered_data.append(append_chunk)

if unparsed_rows:
    print(f'Skipped {unparsed_rows} row(s) with an unparseable TRANSACTION_DATE')

# Concatenate all processed chunks into a single DataFrame
selected_data = pd.concat(filtered_data, ignore_index=True)

# Specify the path where you want to save the Parquet file
parquet_file_path = '03_chunked/' + state_name + '.parquet'

# Convert the Pandas DataFrame to a PyArrow Table
table = pa.Table.from_pandas(selected_data)

# Write the Table to a Parquet file
pq.write_table(table, parquet_file_path)

In [5]:
# Read opioid transaction data into pandas dataframe
file_path = '03_chunked/' + state_name + '.parquet'
data = pd.read_parquet(file_path)

# Morphine-equivalent weight = base weight (grams) * MME conversion factor
data['morphine_equivalent_g'] = data['CALC_BASE_WT_IN_GM']*data['MME_Conversion_Factor']

print(data.sample(5))

# Group the state's DataFrame by state / county / year and sum the numeric columns.
# The columns are selected explicitly: GroupBy.sum() takes `numeric_only` as its
# first positional argument, so passing a list of column names there does not
# select columns. TransactionMonth is left out because a summed month is
# meaningless. The output keeps the same columns as before; note that the
# summed MME_Conversion_Factor is retained only for schema compatibility and
# is not itself a meaningful quantity.
group_cols = ['BUYER_STATE', 'BUYER_COUNTY', 'TransactionYear']
wanted_sums = ('MME_Conversion_Factor', 'CALC_BASE_WT_IN_GM', 'morphine_equivalent_g')
# Preserve the column order of the input frame
sum_cols = [col for col in data.columns if col in wanted_sums]

data_grouped = data.groupby(group_cols, as_index=False)[sum_cols].sum()

print(data_grouped.sample(5))

# Save grouped data to parquet file
file_name = '04_MME_WOFIPS/' + state_name + '.parquet'
data_grouped.to_parquet(file_name)



         MME_Conversion_Factor  CALC_BASE_WT_IN_GM BUYER_STATE BUYER_COUNTY  \
9180444                    1.0             1.21080          OH         ERIE   
7525272                    1.0             1.51350          OH        STARK   
3353505                    1.5             2.24125          OH     HAMILTON   
3510340                    1.5             5.37900          OH        LUCAS   
9897016                    1.5             7.17200          OH      BELMONT   

         TransactionYear  TransactionMonth  morphine_equivalent_g  
9180444             2007                 8               1.210800  
7525272             2009                12               1.513500  
3353505             2012                 6               3.361875  
3510340             2009                12               8.068500  
9897016             2009                 5              10.758000  
    BUYER_STATE BUYER_COUNTY  TransactionYear  MME_Conversion_Factor  \
813          OH       VINTON             2009

In [6]:

# Sanity check: reload the aggregated output written to 04_MME_WOFIPS and
# count the distinct (non-null) county names it contains.
file_path = '04_MME_WOFIPS/' + state_name + '.parquet'
data = pd.read_parquet(file_path)
len(data['BUYER_COUNTY'].dropna().unique())


88

In [7]:
# Cross-check: count the distinct (non-null) county names in the file under
# old_data/ref_states for the same state, to compare with the count above.
# (presumably an earlier reference extract -- confirm its provenance)
file_path = 'old_data/ref_states/' + state_name + '.parquet'
data = pd.read_parquet(file_path)
len(data['BUYER_COUNTY'].dropna().unique())

88