In [1]:
import os

import polars as pl
from skimpy import clean_columns, skim

# Connection URI for ConnectorX + Polars.
# The DATABASE_URI environment variable, when set, overrides the local default.
# NOTE(review): the fallback literal still embeds a password; remove it (and
# rely on the environment variable only) before sharing or committing this
# notebook.
uri = os.environ.get(
    "DATABASE_URI",
    "postgresql://postgres:nttc4@localhost:5432/postgres",
)

# Tables to summarize
tables = ["items", "keywords", "merchants", "transaction_data", "transaction_items"]

for tbl in tables:
    # 1. Read the whole table into Polars via ConnectorX.
    #    `tbl` only ever comes from the fixed list above, so interpolating it
    #    into the SQL text does not expose the query to external input.
    df = pl.read_database_uri(uri=uri, query=f"SELECT * FROM {tbl};")

    # 2. Clean column names and print the skimpy summary
    clean_df = clean_columns(df)
    print(f"\n\n=== SKIMPY SUMMARY FOR TABLE: {tbl} ===\n")
    skim(clean_df)

    # 3. Exact-duplicate rows check: rows beyond the first occurrence of each
    #    distinct row are counted as duplicates.
    total_rows  = clean_df.height
    unique_rows = clean_df.unique().height
    dupes       = total_rows - unique_rows
    # Guard the ratio so an empty table reports 0.00% instead of raising
    # ZeroDivisionError and aborting the remaining tables.
    dupe_share  = dupes / total_rows if total_rows else 0.0
    print(f">>> Found {dupes} exact duplicate rows ({dupe_share:.2%} of {total_rows})\n")

    # 4. Show first 10 rows (of the frame as read, i.e. before column-name cleaning)
    print(f"=== FIRST 10 ROWS OF `{tbl}` ===")
    print(df.head(10))




=== SKIMPY SUMMARY FOR TABLE: items ===



>>> Found 0 exact duplicate rows (0.00% of 400)

=== FIRST 10 ROWS OF `items` ===
shape: (10, 5)
┌─────────┬───────────────┬────────────────────────────┬────────────┬─────────────┐
│ item_id ┆ cuisine_tag   ┆ item_name                  ┆ item_price ┆ merchant_id │
│ ---     ┆ ---           ┆ ---                        ┆ ---        ┆ ---         │
│ i32     ┆ str           ┆ str                        ┆ f64        ┆ str         │
╞═════════╪═══════════════╪════════════════════════════╪════════════╪═════════════╡
│ 1       ┆ Side          ┆ Fried Spring Rolls         ┆ 3.5        ┆ 1d4f2       │
│ 2       ┆ American      ┆ Lemon Pepper Wings (6 pcs) ┆ 7.75       ┆ 8d5f9       │
│ 3       ┆ Asian         ┆ General Tso’s Chicken      ┆ 9.5        ┆ 1a3f7       │
│ 4       ┆ American      ┆ Bacon Wrapped Dog          ┆ 5.25       ┆ d3b6a       │
│ 5       ┆ Dessert       ┆ Kulfi                      ┆ 3.75       ┆ a3e8b       │
│ 6       ┆ Seafood       ┆ Fried Fish Sandwich        ┆ 8.5   

>>> Found 0 exact duplicate rows (0.00% of 4477)

=== FIRST 10 ROWS OF `keywords` ===
shape: (10, 6)
┌─────┬─────────────────────┬───────┬───────┬──────────┬─────────────┐
│ id  ┆ keyword             ┆ view  ┆ menu  ┆ checkout ┆ order_count │
│ --- ┆ ---                 ┆ ---   ┆ ---   ┆ ---      ┆ ---         │
│ i32 ┆ str                 ┆ i32   ┆ i32   ┆ i32      ┆ i32         │
╞═════╪═════════════════════╪═══════╪═══════╪══════════╪═════════════╡
│ 0   ┆ fried spring rolls  ┆ 37816 ┆ 11185 ┆ 3882     ┆ 153         │
│ 1   ┆ spring rolls        ┆ 8977  ┆ 2903  ┆ 1453     ┆ 143         │
│ 2   ┆ crispy spring rolls ┆ 11922 ┆ 3604  ┆ 1627     ┆ 147         │
│ 3   ┆ egg rolls           ┆ 9587  ┆ 1879  ┆ 1130     ┆ 75          │
│ 4   ┆ fried lumpia        ┆ 32437 ┆ 9716  ┆ 3036     ┆ 233         │
│ 5   ┆ popiah goreng       ┆ 35553 ┆ 12272 ┆ 7363     ┆ 269         │
│ 6   ┆ lumpia              ┆ 25878 ┆ 5232  ┆ 3478     ┆ 338         │
│ 7   ┆ lumpia goreng       ┆ 29424 ┆ 6052  ┆ 3

>>> Found 0 exact duplicate rows (0.00% of 100)

=== FIRST 10 ROWS OF `merchants` ===
shape: (10, 4)
┌─────────────┬─────────────────┬─────────┬────────────┐
│ merchant_id ┆ merchant_name   ┆ city_id ┆ join_date  │
│ ---         ┆ ---             ┆ ---     ┆ ---        │
│ str         ┆ str             ┆ i32     ┆ date       │
╞═════════════╪═════════════════╪═════════╪════════════╡
│ 3e2b6       ┆ Bagel Bros      ┆ 8       ┆ 2015-12-14 │
│ 2b5d7       ┆ Bagel House     ┆ 4       ┆ 2017-08-16 │
│ e8c2b       ┆ BBQ Nation      ┆ 5       ┆ 2020-05-14 │
│ f8a3d       ┆ BBQ Spot        ┆ 2       ┆ 2019-03-17 │
│ 4e2d1       ┆ Burger Bliss    ┆ 5       ┆ 2015-05-26 │
│ 5c1f8       ┆ Burger Factory  ┆ 5       ┆ 2019-07-29 │
│ 0c5d8       ┆ Burger Joint    ┆ 8       ┆ 2018-05-12 │
│ 9f4a2       ┆ Burrito Express ┆ 8       ┆ 2013-04-22 │
│ 0e1b3       ┆ Chicken Shack   ┆ 2       ┆ 2016-09-27 │
│ 9d5f2       ┆ Chili Express   ┆ 2       ┆ 2018-05-23 │
└─────────────┴─────────────────┴─────────┴─

>>> Found 0 exact duplicate rows (0.00% of 999993)

=== FIRST 10 ROWS OF `transaction_data` ===
shape: (10, 9)
┌─────┬───────────┬────────────┬────────────┬───┬────────────┬────────────┬────────────┬───────────┐
│ id  ┆ order_id  ┆ order_time ┆ driver_arr ┆ … ┆ delivery_t ┆ order_valu ┆ eater_id   ┆ merchant_ │
│ --- ┆ ---       ┆ ---        ┆ ival_time  ┆   ┆ ime        ┆ e          ┆ ---        ┆ id        │
│ i32 ┆ str       ┆ datetime[μ ┆ ---        ┆   ┆ ---        ┆ ---        ┆ i64        ┆ ---       │
│     ┆           ┆ s]         ┆ datetime[μ ┆   ┆ datetime[μ ┆ f64        ┆            ┆ str       │
│     ┆           ┆            ┆ s]         ┆   ┆ s]         ┆            ┆            ┆           │
╞═════╪═══════════╪════════════╪════════════╪═══╪════════════╪════════════╪════════════╪═══════════╡
│ 0   ┆ 46975df1c ┆ 2023-11-07 ┆ 2023-11-07 ┆ … ┆ 2023-11-07 ┆ 10.46      ┆ 3411548704 ┆ 2a1c4     │
│     ┆           ┆ 09:49:00   ┆ 09:58:00   ┆   ┆ 10:34:00   ┆            ┆      

>>> Found 0 exact duplicate rows (0.00% of 2999773)

=== FIRST 10 ROWS OF `transaction_items` ===
shape: (10, 3)
┌─────┬───────────┬─────────┐
│ id  ┆ order_id  ┆ item_id │
│ --- ┆ ---       ┆ ---     │
│ i32 ┆ str       ┆ i32     │
╞═════╪═══════════╪═════════╡
│ 0   ┆ 46975df1c ┆ 10      │
│ 1   ┆ 7623ebe9d ┆ 101     │
│ 2   ┆ 7623ebe9d ┆ 56      │
│ 3   ┆ 7623ebe9d ┆ 44      │
│ 4   ┆ 01c4be3f6 ┆ 36      │
│ 5   ┆ 01c4be3f6 ┆ 173     │
│ 6   ┆ 01c4be3f6 ┆ 12      │
│ 7   ┆ 01c4be3f6 ┆ 89      │
│ 8   ┆ 01c4be3f6 ┆ 206     │
│ 9   ┆ dd02ff3da ┆ 230     │
└─────┴───────────┴─────────┘


In [2]:
# Build one row per order line item:
#   - new_transaction_items is joined to transaction_data on order_id,
#     to items on item_id, and to merchants through the item's merchant_id;
#   - subtotal is item_price * quantity cast to numeric(10,2);
#   - both the order's merchant_id and the item's merchant_id are selected
#     under separate aliases;
#   - rows are ordered by transaction_data.id.
combined_orders_sql = """
  SELECT
    ti.order_id,
    td.merchant_id           AS order_merchant_id,
    ti.item_id,
    i.item_name,
    i.item_price,
    ti.quantity,
    (i.item_price * ti.quantity)::numeric(10,2) AS subtotal,
    i.merchant_id            AS item_merchant_id,
    m.merchant_name,
    td.order_value,
    td.order_time,
    td.driver_arrival_time,
    td.driver_pickup_time,
    td.delivery_time
  FROM new_transaction_items ti
  JOIN transaction_data td ON ti.order_id = td.order_id
  JOIN items i             ON ti.item_id = i.item_id
  JOIN merchants m         ON i.merchant_id = m.merchant_id
  ORDER BY td.id
"""

# Run the query through ConnectorX using the connection URI defined in the
# first cell, then persist the joined result as a CSV (relative path).
df = pl.read_database_uri(query=combined_orders_sql, uri=uri)
df.write_csv("combined_orders.csv")
