In [1]:
import polars as pl

In [2]:
fact_sales = pl.read_parquet("Data/Parquet/star_schema_daily/fact_sales.parquet")
dim_calendar = pl.read_parquet("Data/Parquet/star_schema_daily/dim_calendar.parquet")
dim_store = pl.read_parquet("Data/Parquet/star_schema_daily/dim_store.parquet")
dim_item = pl.read_parquet("Data/Parquet/star_schema_daily/dim_item.parquet")
dim_state = pl.read_parquet("Data/Parquet/star_schema_daily/dim_state.parquet")


In [3]:
print("fact_sales: ",fact_sales.head())
print("dim_calendar: ",dim_calendar.head())
print("dim_store: ",dim_store.head())
print("dim_item: ",dim_item.head())
print("dim_state: ",dim_state.head())

fact_sales:  shape: (5, 8)
┌──────────┬────────────┬───────────────┬──────────┬──────────┬────────────┬───────────┬─────────┐
│ sales_id ┆ date_id    ┆ item_id       ┆ store_id ┆ state_id ┆ units_sold ┆ avg_price ┆ revenue │
│ ---      ┆ ---        ┆ ---           ┆ ---      ┆ ---      ┆ ---        ┆ ---       ┆ ---     │
│ i64      ┆ str        ┆ str           ┆ str      ┆ str      ┆ i64        ┆ f64       ┆ f64     │
╞══════════╪════════════╪═══════════════╪══════════╪══════════╪════════════╪═══════════╪═════════╡
│ 1        ┆ 2011-01-29 ┆ HOBBIES_1_001 ┆ CA_1     ┆ CA       ┆ 0          ┆ 8.29      ┆ 0.0     │
│ 2        ┆ 2011-01-29 ┆ HOBBIES_1_002 ┆ CA_1     ┆ CA       ┆ 0          ┆ 3.97      ┆ 0.0     │
│ 3        ┆ 2011-01-29 ┆ HOBBIES_1_003 ┆ CA_1     ┆ CA       ┆ 0          ┆ 2.97      ┆ 0.0     │
│ 4        ┆ 2011-01-29 ┆ HOBBIES_1_004 ┆ CA_1     ┆ CA       ┆ 0          ┆ 4.53      ┆ 0.0     │
│ 5        ┆ 2011-01-29 ┆ HOBBIES_1_005 ┆ CA_1     ┆ CA       ┆ 0          ┆ 2.94 

In [4]:
daily_level_q = (
    fact_sales
    .lazy()
    .join(dim_calendar.lazy(), on="date_id", how="left")
    .select([
        "date_id", "day_of_month", "day_name", "month", "year",
        "units_sold", "revenue", "item_id", "store_id"
    ])
    .group_by(["date_id", "day_of_month", "day_name", "month", "year"])
    .agg([
        pl.col("units_sold").sum().alias("total_units_sold"),
        pl.col("revenue").sum().alias("total_revenue"),
        pl.col("item_id").n_unique().alias("num_items_sold"),
        pl.col("store_id").n_unique().alias("num_stores"),
        (pl.col("revenue").sum() / pl.col("units_sold").sum()).alias("avg_price_per_unit"),
    ])
    .sort("date_id")
)

daily_level = daily_level_q.collect()
print(f"Daily Level: {daily_level.height:,} hàng | Memory: {daily_level.estimated_size() / (1024**2):.2f} MB")


Daily Level: 1,913 hàng | Memory: 0.13 MB


In [5]:
item_level_q = (
    fact_sales
    .lazy()
    .join(
        dim_item.lazy().select(["item_id", "item_name", "dept_id", "cat_id", "price", "currency"]),
        on="item_id",
        how="left"
    )
    .select([
        "item_id", "item_name", "dept_id", "cat_id", "price", "currency",
        "units_sold", "revenue", "store_id", "date_id"
    ])
    .group_by(["item_id", "item_name", "dept_id", "cat_id", "price", "currency"])
    .agg([
        pl.col("units_sold").sum().alias("total_units_sold"),
        pl.col("revenue").sum().alias("total_revenue"),
        pl.col("store_id").n_unique().alias("num_stores_sold"),
        pl.col("date_id").n_unique().alias("num_days_sold"),
        (pl.col("revenue").sum() / pl.col("units_sold").sum()).alias("avg_price"),
    ])
    .sort("total_revenue", descending=True)
)

item_level = item_level_q.collect()
print(f"Item Level: {item_level.height:,} hàng | Memory: {item_level.estimated_size() / (1024**2):.2f} MB")


Item Level: 3,049 hàng | Memory: 0.24 MB


In [6]:
store_level_q = (
    fact_sales
    .lazy()
    .join(
        dim_store.lazy().select(["store_id", "store_name", "state_id", "type", "size"]),
        on="store_id",
        how="left"
    )
    .join(
        dim_state.lazy().select(["state_id", "state_name", "region"]),
        on="state_id",
        how="left"
    )
    .select([
        "store_id", "store_name", "state_id", "state_name",
        "region", "type", "size",
        "units_sold", "revenue", "item_id", "date_id"
    ])
    .group_by(["store_id", "store_name", "state_id", "state_name", "region", "type", "size"])
    .agg([
        pl.col("units_sold").sum().alias("total_units_sold"),
        pl.col("revenue").sum().alias("total_revenue"),
        pl.col("item_id").n_unique().alias("num_items_sold"),
        pl.col("date_id").n_unique().alias("num_days_sold"),
        (pl.col("revenue").sum() / pl.col("units_sold").sum()).alias("avg_price"),
    ])
    .sort("total_revenue", descending=True)
)

store_level = store_level_q.collect()
print(f"Store Level: {store_level.height:,} hàng | Memory: {store_level.estimated_size() / (1024**2):.2f} MB")


Store Level: 10 hàng | Memory: 0.00 MB


In [7]:
print(daily_level.sort("total_revenue", descending=True).head(5))

shape: (5, 10)
┌────────────┬────────────┬──────────┬───────┬───┬────────────┬────────────┬───────────┬───────────┐
│ date_id    ┆ day_of_mon ┆ day_name ┆ month ┆ … ┆ total_reve ┆ num_items_ ┆ num_store ┆ avg_price │
│ ---        ┆ th         ┆ ---      ┆ ---   ┆   ┆ nue        ┆ sold       ┆ s         ┆ _per_unit │
│ str        ┆ ---        ┆ str      ┆ i64   ┆   ┆ ---        ┆ ---        ┆ ---       ┆ ---       │
│            ┆ str        ┆          ┆       ┆   ┆ f64        ┆ u32        ┆ u32       ┆ f64       │
╞════════════╪════════════╪══════════╪═══════╪═══╪════════════╪════════════╪═══════════╪═══════════╡
│ 2016-03-06 ┆ d_1864     ┆ Tuesday  ┆ 3     ┆ … ┆ 179871.47  ┆ 3049       ┆ 10        ┆ 3.143617  │
│ 2016-04-03 ┆ d_1892     ┆ Tuesday  ┆ 4     ┆ … ┆ 179257.94  ┆ 3049       ┆ 10        ┆ 3.176924  │
│ 2016-04-09 ┆ d_1898     ┆ Monday   ┆ 4     ┆ … ┆ 175901.83  ┆ 3049       ┆ 10        ┆ 3.122148  │
│ 2016-02-06 ┆ d_1835     ┆ Monday   ┆ 2     ┆ … ┆ 172905.08  ┆ 3049       ┆

In [8]:
item_level.head(5)

item_id,item_name,dept_id,cat_id,price,currency,total_units_sold,total_revenue,num_stores_sold,num_days_sold,avg_price
str,str,str,str,f64,str,i64,f64,u32,u32,f64
"""FOODS_3_586""","""FOODS_3_586""","""FOODS_3""","""FOODS""",1.6,"""USD""",920242,1472400.0,10,1913,1.6
"""FOODS_3_120""","""FOODS_3_120""","""FOODS_3""","""FOODS""",4.98,"""USD""",283104,1409700.0,10,1913,4.979281
"""FOODS_3_090""","""FOODS_3_090""","""FOODS_3""","""FOODS""",1.36,"""USD""",1002529,1369300.0,10,1913,1.365844
"""FOODS_3_202""","""FOODS_3_202""","""FOODS_3""","""FOODS""",4.26,"""USD""",295689,1255300.0,10,1913,4.245437
"""FOODS_3_587""","""FOODS_3_587""","""FOODS_3""","""FOODS""",2.52,"""USD""",396119,992207.52,10,1913,2.504822


In [9]:
store_level.head(5)

store_id,store_name,state_id,state_name,region,type,size,total_units_sold,total_revenue,num_items_sold,num_days_sold,avg_price
str,str,str,str,str,str,i64,i64,f64,u32,u32,f64
"""CA_3""","""CA_3""","""CA""","""California""","""West""","""regular""",10005,11188180,32068000.0,3049,1913,2.866249
"""CA_1""","""CA_1""","""CA""","""California""","""West""","""regular""",10002,7698216,22631000.0,3049,1913,2.939763
"""TX_2""","""TX_2""","""TX""","""Texas""","""South""","""regular""",10003,7214384,20626000.0,3049,1913,2.859013
"""TX_3""","""TX_3""","""TX""","""Texas""","""South""","""regular""",10008,6089330,17897000.0,3049,1913,2.939112
"""WI_2""","""WI_2""","""WI""","""Wisconsin""","""Midwest""","""regular""",10004,6544012,17767000.0,3049,1913,2.714969


In [10]:
print(f"  • Số ngày: {daily_level.height}")
print(f"  • Tổng units: {daily_level['total_units_sold'].sum():,.0f}")
print(f"  • Tổng doanh thu: ${daily_level['total_revenue'].sum():,.2f}")
print(f"  • Trung bình/ngày: ${daily_level['total_revenue'].mean():,.2f}")

  • Số ngày: 1913
  • Tổng units: 65,695,409
  • Tổng doanh thu: $188,416,761.55
  • Trung bình/ngày: $98,492.82


In [11]:
print(f"  • Số sản phẩm: {item_level.height}")
print(f"  • Tổng units: {item_level['total_units_sold'].sum():,.0f}")
print(f"  • Tổng doanh thu: ${item_level['total_revenue'].sum():,.2f}")
print(f"  • Trung bình/sản phẩm: ${item_level['total_revenue'].mean():,.2f}")


  • Số sản phẩm: 3049
  • Tổng units: 65,695,409
  • Tổng doanh thu: $188,416,761.55
  • Trung bình/sản phẩm: $61,796.25


In [12]:
print(f"  • Số cửa hàng: {store_level.height}")
print(f"  • Tổng units: {store_level['total_units_sold'].sum():,.0f}")
print(f"  • Tổng doanh thu: ${store_level['total_revenue'].sum():,.2f}")
print(f"  • Trung bình/cửa hàng: ${store_level['total_revenue'].mean():,.2f}")


  • Số cửa hàng: 10
  • Tổng units: 65,695,409
  • Tổng doanh thu: $188,416,761.55
  • Trung bình/cửa hàng: $18,841,676.16


In [13]:
output_dir = "Data/Parquet/test"

In [14]:
try:
    # Parquet
    daily_level.write_parquet(f"{output_dir}/agg_daily_level.parquet")
    item_level.write_parquet(f"{output_dir}/agg_item_level.parquet")
    store_level.write_parquet(f"{output_dir}/agg_store_level.parquet")

    # CSV
    daily_level.write_csv(f"{output_dir}/agg_daily_level.csv")
    item_level.write_csv(f"{output_dir}/agg_item_level.csv")
    store_level.write_csv(f"{output_dir}/agg_store_level.csv")

    print("Đã lưu Parquet & CSV")
except Exception as e:
    print(f"Lỗi lưu file: {e}")

Đã lưu Parquet & CSV
