In [2]:
import duckdb

In [12]:
import duckdb

# Inspect the database: list its tables, then show the schema of one table.
# The connection is opened through its context manager so that it is closed
# even when one of the queries raises; with a trailing con.close() a failed
# query would leave the connection open in the kernel.
with duckdb.connect("../db/synpuf.duckdb") as con:
    # List all tables in the DB
    tables = con.execute("SHOW TABLES").fetchdf()
    print(tables)

    # Schema (columns + types) of a single table, e.g. 'carrier'
    schema = con.execute("DESCRIBE carrier").fetchdf()
    print(schema)


                 name
0    beneficiary_2008
1    beneficiary_2009
2    beneficiary_2010
3     beneficiary_all
4             carrier
5         carrier_agg
6        chronic_2008
7        chronic_2009
8        chronic_2010
9   combined_features
10          inpatient
11         outpatient
12                pde
              column_name column_type null   key default extra
0             DESYNPUF_ID     VARCHAR  YES  None    None  None
1                  CLM_ID      BIGINT  YES  None    None  None
2             CLM_FROM_DT      BIGINT  YES  None    None  None
3             CLM_THRU_DT      BIGINT  YES  None    None  None
4          ICD9_DGNS_CD_1     VARCHAR  YES  None    None  None
..                    ...         ...  ...   ...     ...   ...
137   LINE_ICD9_DGNS_CD_9     VARCHAR  YES  None    None  None
138  LINE_ICD9_DGNS_CD_10     VARCHAR  YES  None    None  None
139  LINE_ICD9_DGNS_CD_11     VARCHAR  YES  None    None  None
140  LINE_ICD9_DGNS_CD_12     VARCHAR  YES  None    None  None

In [3]:
# Smoke test: run a constant one-row query through the module-level
# duckdb.sql helper and print the resulting relation.
hello_relation = duckdb.sql("SELECT 'Hello, World!' as message")
hello_relation.show()

┌───────────────┐
│    message    │
│    varchar    │
├───────────────┤
│ Hello, World! │
└───────────────┘



In [11]:
import duckdb

# Build the `combined_features` table and export it.
#
# Steps:
#   1. chronic_<year>   : per-beneficiary count of chronic-condition flags
#                         equal to 1, for 2008, 2009 and 2010.
#   2. carrier_agg      : per-beneficiary carrier-claim aggregates.
#   3. combined_features: beneficiary_2010 left-joined with the tables above.
#   4. Export combined_features to CSV and Parquet in the working directory.
#
# Every table is created with CREATE OR REPLACE, so re-running the cell
# rebuilds them from scratch.

# 1. Chronic disease count (1 = disease, 2 = no disease → so we count only '1')
chronic_cols = [
    "SP_ALZHDMTA","SP_CHF","SP_CHRNKIDN","SP_CNCR","SP_COPD",
    "SP_DEPRESSN","SP_DIABETES","SP_ISCHMCHT","SP_OSTEOPRS",
    "SP_RA_OA","SP_STRKETIA"
]

# One 0/1 term per flag column; the terms are added up into a single count.
chronic_expr = " + ".join([f"(CASE WHEN {c}=1 THEN 1 ELSE 0 END)" for c in chronic_cols])

# Carrier line payment columns LINE_NCH_PMT_AMT_1 .. LINE_NCH_PMT_AMT_13.
# The aggregate expressions are generated the same way as chronic_expr instead
# of being written out by hand. NULL amounts are treated as 0.
payment_cols = [f"LINE_NCH_PMT_AMT_{i}" for i in range(1, 14)]
total_amount_expr = " + ".join(f"SUM(COALESCE({c},0))" for c in payment_cols)
avg_amount_expr = " + ".join(f"AVG(COALESCE({c},0))" for c in payment_cols)

# The context manager closes the connection even if one of the statements
# below raises; with a trailing con.close() a failure part-way through would
# leave the connection open in the kernel.
with duckdb.connect("../db/synpuf.duckdb") as con:
    for year in ["2008","2009","2010"]:
        con.execute(f"""
        CREATE OR REPLACE TABLE chronic_{year} AS
        SELECT DESYNPUF_ID,
               {chronic_expr} AS chronic_count_{year}
        FROM beneficiary_{year}
        """)

    # 2. Aggregate carrier claims
    #    total_visits     : number of distinct CLM_ID values per beneficiary
    #    total_amount     : sum of all 13 line payment columns over the rows
    #    avg_claim_amount : sum of the per-column averages over the rows
    # NOTE(review): there is no date filter here, so every row of `carrier`
    # is aggregated even though the result is joined onto beneficiary_2010 —
    # confirm that all-years totals are intended.
    con.execute(f"""
    CREATE OR REPLACE TABLE carrier_agg AS
    SELECT DESYNPUF_ID,
           COUNT(DISTINCT CLM_ID) AS total_visits,
           ({total_amount_expr}) AS total_amount,
           ({avg_amount_expr}) AS avg_claim_amount
    FROM carrier
    GROUP BY DESYNPUF_ID
    """)

    # 3. Merge into 2010 beneficiary
    # LEFT JOINs keep every beneficiary_2010 row. Missing carrier aggregates
    # become 0 via COALESCE; the chronic_count_* columns are not coalesced and
    # stay NULL when a beneficiary has no row in that year's table.
    con.execute("""
    CREATE OR REPLACE TABLE combined_features AS
    SELECT s.*,
           c08.chronic_count_2008,
           c09.chronic_count_2009,
           c10.chronic_count_2010,
           COALESCE(cl.total_visits,0) AS total_visits,
           COALESCE(cl.total_amount,0) AS total_amount,
           COALESCE(cl.avg_claim_amount,0) AS avg_claim_amount
    FROM beneficiary_2010 s
    LEFT JOIN chronic_2008 c08 USING (DESYNPUF_ID)
    LEFT JOIN chronic_2009 c09 USING (DESYNPUF_ID)
    LEFT JOIN chronic_2010 c10 USING (DESYNPUF_ID)
    LEFT JOIN carrier_agg cl USING (DESYNPUF_ID)
    """)

    # 4. Save final file (paths are relative to the current working directory)
    con.execute("""
    COPY combined_features 
    TO 'combined_features_2010.csv' (HEADER, DELIMITER ',')
    """)

    con.execute("""
    COPY combined_features 
    TO 'combined_features_2010.parquet' (FORMAT PARQUET)
    """)

    print("✅ Final dataset created: combined_features_2010.csv & .parquet")


✅ Final dataset created: combined_features_2010.csv & .parquet


In [14]:
import duckdb

# Show the column metadata of the combined_features table.
# The context manager closes the connection even if the query raises, so a
# failed cell does not leave the connection open in the kernel.
with duckdb.connect("../db/synpuf.duckdb") as con:
    # Get column info
    cols = con.execute("PRAGMA table_info('combined_features')").df()

print(cols)


    cid                      name     type  notnull dflt_value     pk
0     0               DESYNPUF_ID  VARCHAR    False       None  False
1     1             BENE_BIRTH_DT   BIGINT    False       None  False
2     2             BENE_DEATH_DT   BIGINT    False       None  False
3     3         BENE_SEX_IDENT_CD   BIGINT    False       None  False
4     4              BENE_RACE_CD   BIGINT    False       None  False
5     5             BENE_ESRD_IND  VARCHAR    False       None  False
6     6             SP_STATE_CODE   BIGINT    False       None  False
7     7            BENE_COUNTY_CD   BIGINT    False       None  False
8     8   BENE_HI_CVRAGE_TOT_MONS   BIGINT    False       None  False
9     9  BENE_SMI_CVRAGE_TOT_MONS   BIGINT    False       None  False
10   10  BENE_HMO_CVRAGE_TOT_MONS   BIGINT    False       None  False
11   11         PLAN_CVRG_MOS_NUM   BIGINT    False       None  False
12   12               SP_ALZHDMTA   BIGINT    False       None  False
13   13             

In [15]:
import duckdb

# Count the rows of combined_features.
# The context manager closes the connection even if the query raises, so a
# failed cell does not leave the connection open in the kernel.
with duckdb.connect("../db/synpuf.duckdb") as con:
    # COUNT(*) yields a single row with a single column; take that value.
    row_count = con.execute("SELECT COUNT(*) FROM combined_features").fetchone()[0]

print("Total rows:", row_count)


Total rows: 112754
