In [None]:
import pandas as pd
import argparse
import json
import duckdb
import pyarrow.parquet as pq

In [None]:
# Notebook configuration: path prefixes and the reference year.

# Prefix of the intermediate parquet files; cells below append
# "_<condition>_<year>.parquet" to it.
input_prefix = "../data/intermediate/scratch/ccw_proxy"

# Prefix for output files (not referenced by the cells visible here).
output_prefix = "../data/output/ccw_proxy/ccw_proxy"

# JSON file whose top-level keys are used as the list of conditions.
ccw_json = "../data/input/remote_data/ccw.json"

# Year used when peeking at the intermediate COPD file.
ref_year = 2005

# Prefix of the beneficiary data-warehouse files (not referenced by the cells visible here).
dw_bene_prefix = "../data/input/local_data/data_warehouse/dw_bene_xu_sabath_00_16/bene"

# Output

* Merge intermediate files

In [None]:
# List the column names of the intermediate COPD file for ref_year.
# read_schema() reads only the Parquet schema, so the row data is never
# loaded just to inspect the names.
pq.read_schema(f"{input_prefix}_copd_{ref_year}.parquet").names

In [None]:
# Preview the intermediate AMI file for 2000 by letting DuckDB query the
# parquet file directly.
conn = duckdb.connect()

ami_sql = f"""
    SELECT *
    FROM '{input_prefix}_ami_2000.parquet'
"""
df = conn.execute(ami_sql).fetch_df()
df.head()

## Approach registering tables

In [None]:
# In-memory DuckDB database that holds one registered table per condition.
con_ccw = duckdb.connect()

# For each condition: read its 2000 parquet file into pandas, drop the
# rfrnc_yr column, move the index back into regular columns, and register
# the frame in DuckDB as temp_table_<position>.
for idx, condition in enumerate(['anemia', 'ami', 'asthma']):
    df = (
        pq.read_table(f"{input_prefix}_{condition}_2000.parquet")
        .to_pandas()
        .drop(columns="rfrnc_yr")
        .reset_index()
    )
    con_ccw.register(f'temp_table_{idx}', df)

# Sanity check: pull up to 100 rows back from every registered table and
# print the first few.
for idx, _ in enumerate(['anemia', 'ami', 'asthma']):
    df = con_ccw.execute(f'SELECT * FROM temp_table_{idx} LIMIT 100').fetch_df()
    print(df.head())

In [None]:
# Assemble the join SQL. temp_table_0 is the base table and gets a constant
# rfrnc_yr column of 2000; every remaining table is left-joined on bene_id.
select_clause = 'SELECT *, 2000 AS rfrnc_yr FROM temp_table_0'
join_clauses = [
    f'LEFT JOIN temp_table_{idx} USING(bene_id)'
    for idx in range(1, len(['anemia', 'ami', 'asthma']))
]
query = ' '.join([select_clause, *join_clauses])

print(query)

In [None]:
# Run the assembled join; fetchdf() materializes the result as a pandas
# DataFrame.
result = con_ccw.execute(query).fetchdf()

# Leave the frame as the cell's last expression so Jupyter renders it with
# its rich (truncated) table display instead of plain printed text.
result

In [None]:
# Rows of the joined result where the anemia column equals 1.
result.loc[result["anemia"] == 1]

In [None]:
# Close the DuckDB connection used for the registered-tables approach;
# a later cell opens a new connection under the same name.
con_ccw.close()

## Approach querying parquets directly

In [None]:
# Read the JSON file at ccw_json; its top-level keys are the condition names.
with open(ccw_json) as fh:
    ccw_dict = json.load(fh)

# Keep every condition except 'stroke' (remove() raises ValueError if the
# key is not present).
conditions_list = [*ccw_dict.keys()]
conditions_list.remove('stroke')
conditions_list

In [None]:
# Build the 2000 parquet path for the first two conditions only.
files_list = []
for c_ in conditions_list[:2]:
    files_list.append(f"{input_prefix}_{c_}_2000.parquet")
files_list

In [None]:
# Assemble the join SQL directly over the parquet paths: the first file is
# the base table (plus a constant rfrnc_yr column of 2000) and every other
# file is left-joined on bene_id.
query = " ".join(
    [f"SELECT *, 2000 AS rfrnc_yr FROM '{files_list[0]}'"]
    + [f"LEFT JOIN '{path_}' USING(bene_id)" for path_ in files_list[1:]]
)

print(query)

* With this approach it is harder to drop the `rfrnc_yr` column from each table before joining.

In [None]:
# Open a fresh in-memory DuckDB connection.
con_ccw = duckdb.connect()

# Run the join directly over the parquet files; fetchdf() materializes the
# result as a pandas DataFrame.
result = con_ccw.execute(query).fetchdf()

# Show only the first rows rather than the whole joined frame.
result.head()

## EDA of the output

In [None]:
# List the column names of the 2016 output file. read_schema() reads only
# the Parquet schema, so the row data is never loaded just to inspect the
# names.
pq.read_schema("../src/ccw_proxy_2016.parquet").names

In [None]:
# Count the rows of the 2005 output file per distinct value of the anemia
# column.
conn = duckdb.connect()

anemia_counts_sql = """
    SELECT 
        anemia, 
        count(*) as n
    FROM '../src/ccw_proxy_2005.parquet'
    GROUP BY anemia
"""
conn.execute(anemia_counts_sql).fetch_df()