In [10]:
# ==========================
# 1. Import & Kết nối DB
# ==========================
import os
from urllib.parse import quote

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine

# Load environment variables from the local .env file.
load_dotenv()

DB_USER   = os.getenv("DB_USER")
DB_PASS   = os.getenv("DB_PASS")
DB_HOST   = os.getenv("DB_HOST")
DB_PORT   = os.getenv("DB_PORT")
DB_SILVER = os.getenv("DB_SILVER")
DB_GOLD   = os.getenv("DB_GOLD")

# Fail fast with a readable message: an unset variable would otherwise be
# formatted into the connection URL as the literal text "None".
_missing_settings = [
    name
    for name in ("DB_USER", "DB_PASS", "DB_HOST", "DB_PORT", "DB_SILVER", "DB_GOLD")
    if os.getenv(name) is None
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(_missing_settings)
    )


def _mysql_url(database):
    """Return the mysql+pymysql connection URL for ``database``.

    The user name and password are percent-encoded so that characters such as
    ``@``, ``:``, ``/`` or ``%`` inside the credentials are not mistaken for
    URL delimiters. The values in .env must therefore be the raw credentials,
    not already URL-encoded.
    """
    user = quote(DB_USER, safe="")
    password = quote(DB_PASS, safe="")
    return f"mysql+pymysql://{user}:{password}@{DB_HOST}:{DB_PORT}/{database}"


# Engine for the Silver schema (fact_orders is read through it).
engine_silver = create_engine(_mysql_url(DB_SILVER))

# Engine for the Gold schema (gold_fact_orders is written through it).
engine_gold = create_engine(_mysql_url(DB_GOLD))


In [2]:
# ==========================
# 2. Load the fact_orders table (Silver)
# ==========================

# Pull every column of fact_orders through the Silver engine.
query = "SELECT * FROM fact_orders"
df_orders = pd.read_sql(sql=query, con=engine_silver)

# Row/column counts, then a preview of the first five rows.
print("Shape:", df_orders.shape)
df_orders.head()


Shape: (40236, 28)


Unnamed: 0,order_id,shop_id,page_id,customer_id,warehouse_id,status,status_name,inserted_at,updated_at,time_assign_seller,...,charged_by_card,charged_by_qrpay,exchange_payment,prepaid,money_to_collect,shipping_fee,partner_fee,surcharge,fee_marketplace,tax
0,40616,230361475,377626778776391,e906c5c7-ea19-42e5-a971-57cf09c94417,5f7aaa7d-ae3c-48b6-8f0e-04207afae074,0,new,2025-08-16 02:19:59,2025-08-16 02:19:59,NaT,...,0,0,0,0,0,0,0,0.0,0,0
1,40615,230361475,377626778776391,6c5b40d2-ae42-4e13-8019-f3f9127d75a0,5f7aaa7d-ae3c-48b6-8f0e-04207afae074,0,new,2025-08-15 12:56:51,2025-08-15 12:56:51,NaT,...,0,0,0,0,0,0,0,0.0,0,0
2,40614,230361475,377626778776391,765c531b-743f-4f30-b6b0-0d6f3579630f,5f7aaa7d-ae3c-48b6-8f0e-04207afae074,0,new,2025-08-15 12:45:17,2025-08-15 12:45:18,NaT,...,0,0,0,0,0,0,0,0.0,0,0
3,40613,230361475,377626778776391,f169b9a8-9a87-4c74-a296-ba9b505c63a6,5f7aaa7d-ae3c-48b6-8f0e-04207afae074,0,new,2025-08-15 12:04:39,2025-08-15 12:04:39,NaT,...,0,0,0,0,0,0,0,0.0,0,0
4,40612,230361475,377626778776391,dc9b8f02-48b1-448f-a9bd-7e5ee2bdfa06,5f7aaa7d-ae3c-48b6-8f0e-04207afae074,0,new,2025-08-15 11:43:06,2025-08-15 11:43:06,NaT,...,0,0,0,0,0,0,0,0.0,0,0


In [3]:
# ==========================
# 3. Quick data exploration
# ==========================
# Column dtypes and non-null counts (printed as text).
df_orders.info()
# Summary statistics for every column, numeric or not.
df_orders.describe(include="all")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40236 entries, 0 to 40235
Data columns (total 28 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   order_id                        40236 non-null  object        
 1   shop_id                         40236 non-null  int64         
 2   page_id                         40226 non-null  object        
 3   customer_id                     40236 non-null  object        
 4   warehouse_id                    40236 non-null  object        
 5   status                          40236 non-null  int64         
 6   status_name                     40236 non-null  object        
 7   inserted_at                     40236 non-null  datetime64[ns]
 8   updated_at                      40236 non-null  datetime64[ns]
 9   time_assign_seller              36574 non-null  datetime64[ns]
 10  total_quantity                  40236 non-null  int64         
 11  to

Unnamed: 0,order_id,shop_id,page_id,customer_id,warehouse_id,status,status_name,inserted_at,updated_at,time_assign_seller,...,charged_by_card,charged_by_qrpay,exchange_payment,prepaid,money_to_collect,shipping_fee,partner_fee,surcharge,fee_marketplace,tax
count,40236.0,40236.0,40226.0,40236,40236,40236.0,40236,40236,40236,36574,...,40236.0,40236.0,40236.0,40236.0,40236.0,40236.0,40236.0,40235.0,40236.0,40236.0
unique,40236.0,,73.0,36080,24,,7,,,,...,,,,,,,,,,
top,40616.0,,324518484071667.0,4c284a73-7139-44dc-8901-60aab7b46cc7,5f7aaa7d-ae3c-48b6-8f0e-04207afae074,,delivered,,,,...,,,,,,,,,,
freq,1.0,,5637.0,11,40128,,17276,,,,...,,,,,,,,,,
mean,,230361475.0,,,,3.822423,,2024-08-12 21:10:24.156153600,2024-08-27 13:58:51.059250432,2024-08-16 03:49:43.141822208,...,0.0,0.0,0.0,508.9721,411218.4,6387.617,13296.518491,0.0,0.0,0.0
min,,230361475.0,,,,0.0,,2021-12-30 03:13:15,2024-02-05 01:03:13,2024-02-05 08:53:54,...,0.0,0.0,0.0,0.0,-1788000.0,-100.0,0.0,0.0,0.0,0.0
25%,,230361475.0,,,,3.0,,2024-05-05 10:16:39.750000128,2024-05-15 02:33:27.750000128,2024-05-05 08:30:58.249999872,...,0.0,0.0,0.0,0.0,289000.0,0.0,15000.0,0.0,0.0,0.0
50%,,230361475.0,,,,3.0,,2024-07-19 03:08:48,2024-07-28 19:08:18.500000,2024-07-23 02:29:20,...,0.0,0.0,0.0,0.0,379000.0,0.0,16000.0,0.0,0.0,0.0
75%,,230361475.0,,,,5.0,,2024-12-04 22:28:58.750000128,2024-12-19 01:42:16,2024-12-07 14:40:38.750000128,...,0.0,0.0,0.0,0.0,609000.0,20000.0,17000.0,0.0,0.0,0.0
max,,230361475.0,,,,15.0,,2025-08-16 02:19:59,2025-08-18 07:32:36,2025-04-17 05:29:32,...,0.0,0.0,0.0,2000000.0,6800000.0,2220000.0,68500.0,0.0,0.0,0.0


In [8]:
# ==========================
# Transform fact_orders → gold_fact_orders
# ==========================
def transform_orders(df):
    """Build the Gold-layer order fact frame from Silver ``fact_orders`` rows.

    The input frame is never modified: derived columns are added to a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``order_id``, ``shop_id``, ``page_id``, ``customer_id``,
        ``warehouse_id``, ``status``, ``status_name``, ``total_quantity``,
        ``shipping_fee``, ``partner_fee``, ``inserted_at``, ``updated_at``,
        ``total_price_after_sub_discount`` and ``cod``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns listed in ``keep_cols``, in that order.

    Raises
    ------
    KeyError
        If any required column is missing from ``df``.

    Notes
    -----
    Rows with a missing ``inserted_at`` are not supported: the cast of
    ``order_date_id`` to ``int`` is expected to fail for them.
    """
    # Columns copied through unchanged from the Silver frame.
    passthrough_cols = [
        'order_id', 'shop_id', 'page_id', 'customer_id', 'warehouse_id',
        'status', 'status_name', 'total_quantity',
        'shipping_fee', 'partner_fee', 'inserted_at', 'updated_at',
    ]
    required_cols = passthrough_cols + ['total_price_after_sub_discount', 'cod']
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise KeyError(f"transform_orders: missing required columns {missing_cols}")

    # Work on a copy so the caller's DataFrame keeps its original columns.
    gold = df[passthrough_cols].copy()

    # Step 1. Date key in YYYYMMDD integer form, derived from inserted_at.
    gold['order_date_id'] = (
        pd.to_datetime(df['inserted_at']).dt.strftime('%Y%m%d').astype(int)
    )

    # Step 2. Gross revenue, and net revenue after shipping and partner fees.
    gross = df['total_price_after_sub_discount']
    gold['gross_revenue'] = gross
    gold['net_revenue'] = gross - df['shipping_fee'] - df['partner_fee']

    # Step 3. Payment method: 'COD' when cod is positive, otherwise 'Other'
    # (vectorised; a missing cod compares as not-positive).
    gold['payment_method'] = df['cod'].gt(0).map({True: 'COD', False: 'Other'})

    # Step 4. Final column selection and order for the Gold table.
    keep_cols = [
        'order_id', 'shop_id', 'page_id', 'customer_id', 'warehouse_id',
        'order_date_id', 'status', 'status_name',
        'total_quantity', 'gross_revenue', 'net_revenue',
        'shipping_fee', 'partner_fee', 'payment_method',
        'inserted_at', 'updated_at'
    ]
    return gold[keep_cols]


In [9]:
# Run the Silver -> Gold transformation, then report its shape and preview it.
df_gold_orders = transform_orders(df=df_orders)
print("Shape:", df_gold_orders.shape)
df_gold_orders.head()

Shape: (40236, 16)


Unnamed: 0,order_id,shop_id,page_id,customer_id,warehouse_id,order_date_id,status,status_name,total_quantity,gross_revenue,net_revenue,shipping_fee,partner_fee,payment_method,inserted_at,updated_at
0,40616,230361475,377626778776391,e906c5c7-ea19-42e5-a971-57cf09c94417,5f7aaa7d-ae3c-48b6-8f0e-04207afae074,20250816,0,new,0,0,0,0,0,Other,2025-08-16 02:19:59,2025-08-16 02:19:59
1,40615,230361475,377626778776391,6c5b40d2-ae42-4e13-8019-f3f9127d75a0,5f7aaa7d-ae3c-48b6-8f0e-04207afae074,20250815,0,new,0,0,0,0,0,Other,2025-08-15 12:56:51,2025-08-15 12:56:51
2,40614,230361475,377626778776391,765c531b-743f-4f30-b6b0-0d6f3579630f,5f7aaa7d-ae3c-48b6-8f0e-04207afae074,20250815,0,new,0,0,0,0,0,Other,2025-08-15 12:45:17,2025-08-15 12:45:18
3,40613,230361475,377626778776391,f169b9a8-9a87-4c74-a296-ba9b505c63a6,5f7aaa7d-ae3c-48b6-8f0e-04207afae074,20250815,0,new,0,0,0,0,0,Other,2025-08-15 12:04:39,2025-08-15 12:04:39
4,40612,230361475,377626778776391,dc9b8f02-48b1-448f-a9bd-7e5ee2bdfa06,5f7aaa7d-ae3c-48b6-8f0e-04207afae074,20250815,0,new,0,0,0,0,0,Other,2025-08-15 11:43:06,2025-08-15 11:43:06


In [11]:
# Write the Gold frame to the gold_fact_orders table via the Gold engine;
# if_exists="replace" replaces an existing table of that name.
df_gold_orders.to_sql(
    name="gold_fact_orders",
    con=engine_gold,
    index=False,
    if_exists="replace",
)
print("✅ Đã load gold_fact_orders vào schema Gold.")


✅ Đã load gold_fact_orders vào schema Gold.
