## 1. Import thư viện & Config

In [10]:
import json
import os
from urllib.parse import quote

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine

# Load environment variables from the .env file
load_dotenv()

API_KEY   = os.getenv("API_KEY")
DB_USER   = os.getenv("DB_USER")
DB_PASS   = os.getenv("DB_PASS")
DB_HOST   = os.getenv("DB_HOST")
DB_PORT   = os.getenv("DB_PORT")
DB_BRONZE = os.getenv("DB_BRONZE")
DB_SILVER = os.getenv("DB_SILVER")

# Fail fast with a readable message instead of building a URL that contains
# the literal text "None". An empty value (e.g. a blank password) is allowed;
# only variables that are not set at all are rejected.
_REQUIRED_DB_VARS = ("DB_USER", "DB_PASS", "DB_HOST", "DB_PORT", "DB_BRONZE", "DB_SILVER")
_missing_db_vars = [name for name in _REQUIRED_DB_VARS if os.getenv(name) is None]
if _missing_db_vars:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(_missing_db_vars)
    )


def _mysql_url(database: str) -> str:
    """Return the ``mysql+pymysql`` connection URL for ``database``.

    The user name and password are percent-encoded so that characters such as
    ``@``, ``:`` or ``/`` inside the credentials cannot be mistaken for URL
    delimiters. The values in ``.env`` must therefore be the raw, un-encoded
    credentials.
    """
    user = quote(DB_USER, safe="")
    password = quote(DB_PASS, safe="")
    return f"mysql+pymysql://{user}:{password}@{DB_HOST}:{DB_PORT}/{database}"


# Create the engines for the Bronze and Silver databases
bronze_engine = create_engine(_mysql_url(DB_BRONZE))
silver_engine = create_engine(_mysql_url(DB_SILVER))


## 2. Lấy dữ liệu Orders từ Bronze

In [11]:
# Pull every row of the Bronze orders_raw table into a DataFrame
orders_raw = pd.read_sql(sql="SELECT * FROM orders_raw", con=bronze_engine)
print("Shape:", orders_raw.shape)
orders_raw.head(2)

Shape: (40236, 4)


Unnamed: 0,shop_id,order_id,raw_json,extracted_at
0,230361475,40616,"{""total_quantity"": 0, ""transfer_money"": 0, ""ch...",2025-09-26 12:01:14
1,230361475,40615,"{""total_quantity"": 0, ""transfer_money"": 0, ""ch...",2025-09-26 12:01:14


## 3. Biến đổi

In [12]:
# Decode each raw_json string and keep the result in a new raw_dict column
orders_raw["raw_dict"] = orders_raw["raw_json"].map(json.loads)

# Look at the first decoded order to see which fields are available
sample_order = orders_raw["raw_dict"].iat[0]
sample_order.keys()


dict_keys(['total_quantity', 'transfer_money', 'charged_by_momo', 'shop_id', 'prepaid', 'inserted_at', 'order_sources_name', 'is_live_shopping', 'pke_mkter', 'note_image', 'status', 'ads_source', 'p_utm_id', 'order_link', 'items_length', 'total_price', 'tags', 'conversation_id', 'assigning_care', 'post_id', 'p_utm_campaign', 'advanced_platform_fee', 'note_print', 'assigning_care_id', 'sub_status', 'p_utm_content', 'bill_email', 'is_livestream', 'total_price_after_sub_discount', 'customer_needs', 'marketplace_id', 'total_discount', 'customer_pay_fee', 'id', 'warehouse_info', 'status_history', 'system_id', 'received_at_shop', 'marketer', 'p_utm_source', 'money_to_collect', 'order_sources', 'p_utm_medium', 'returned_reason', 'note', 'returned_reason_name', 'link', 'customer_referral_code', 'ad_id', 'cod', 'assigning_seller_id', 'is_free_shipping', 'is_smc', 'shipping_address', 'p_utm_term', 'page_id', 'tax', 'partner', 'link_confirm_order', 'partner_fee', 'creator', 'time_assign_seller', 

In [13]:
def _flatten_order(order):
    """Return the flat, order-level record built from one decoded order dict."""
    # A missing or empty "customer" entry yields customer_id = None
    customer = order.get("customer") or {}
    return {
        "order_id": order.get("id"),
        "shop_id": order.get("shop_id"),
        "page_id": order.get("page_id"),
        "customer_id": customer.get("customer_id"),
        "status": order.get("status"),
        "status_name": order.get("status_name"),
        "total_price": order.get("total_price"),
        "total_discount": order.get("total_discount"),
        "shipping_fee": order.get("shipping_fee"),
        "cod": order.get("cod"),
        "order_currency": order.get("order_currency"),
        "created_at": order.get("inserted_at"),
        "updated_at": order.get("updated_at"),
        "bill_full_name": order.get("bill_full_name"),
        "bill_phone_number": order.get("bill_phone_number"),
        "ad_id": order.get("ad_id"),
        "post_id": order.get("post_id"),
    }


orders_list = [_flatten_order(order) for order in orders_raw["raw_dict"]]

# Build the frame and normalise both timestamp columns;
# values that cannot be parsed become NaT (errors="coerce")
orders_clean = pd.DataFrame(orders_list).assign(
    created_at=lambda frame: pd.to_datetime(frame["created_at"], errors="coerce"),
    updated_at=lambda frame: pd.to_datetime(frame["updated_at"], errors="coerce"),
)

orders_clean.head(2)



Unnamed: 0,order_id,shop_id,page_id,customer_id,status,status_name,total_price,total_discount,shipping_fee,cod,order_currency,created_at,updated_at,bill_full_name,bill_phone_number,ad_id,post_id
0,40616,230361475,377626778776391,c7a583d6-392d-4308-a02b-67c001ee03db,0,new,0,0,0,0,VND,2025-08-16 02:19:59.079610,2025-08-16 02:19:59.339020,Thinh Bui,903693389,120231628597870704,377626778776391_122189613242517872
1,40615,230361475,377626778776391,590ba7de-e4fe-43b3-8013-27fc69937edf,0,new,0,0,0,0,VND,2025-08-15 12:56:50.861007,2025-08-15 12:56:50.960894,Truong Minh,907809070,120231628597870704,377626778776391_122189613242517872


In [14]:
# Flatten the nested "items" array of every order into one row per line item.
# `or []` covers both a missing "items" key and an explicit JSON null: with
# .get("items", []) a null comes back as None and iterating it raises TypeError.
items_list = [
    {
        "order_id": order.get("id"),
        "product_id": item.get("product_id"),
        "variation_id": item.get("variation_id"),
        "quantity": item.get("quantity"),
    }
    for order in orders_raw["raw_dict"]
    for item in (order.get("items") or [])
]

# Name the columns explicitly so the frame keeps its schema even when no
# order contains any item (an empty list would otherwise give a column-less frame).
order_items = pd.DataFrame(
    items_list,
    columns=["order_id", "product_id", "variation_id", "quantity"],
)
order_items.head(2)


Unnamed: 0,order_id,product_id,variation_id,quantity
0,40458,358156d8-2777-4aab-b584-a26813fb4b95,847cf093-8f90-4f01-85ae-800ec31432a1,1
1,40455,ea5b416d-511e-47d9-841a-957fb22e68a8,4e1af645-e80d-4e74-bcd1-e23c296999cf,1


## 4. Làm sạch (kiểm tra kiểu dữ liệu & giá trị thiếu)

In [15]:
# Print the dtype and non-null count of every orders_clean column
orders_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40236 entries, 0 to 40235
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   order_id           40236 non-null  object        
 1   shop_id            40236 non-null  int64         
 2   page_id            40226 non-null  object        
 3   customer_id        40236 non-null  object        
 4   status             40236 non-null  int64         
 5   status_name        40236 non-null  object        
 6   total_price        40236 non-null  int64         
 7   total_discount     40236 non-null  int64         
 8   shipping_fee       40236 non-null  int64         
 9   cod                40236 non-null  int64         
 10  order_currency     40236 non-null  object        
 11  created_at         40236 non-null  datetime64[ns]
 12  updated_at         40236 non-null  datetime64[ns]
 13  bill_full_name     40236 non-null  object        
 14  bill_p

In [16]:
# Print the dtype and non-null count of every order_items column
order_items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46611 entries, 0 to 46610
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   order_id      46611 non-null  object
 1   product_id    46611 non-null  object
 2   variation_id  46611 non-null  object
 3   quantity      46611 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 1.4+ MB


## 5. Load vào tầng Silver

In [17]:
# Write both frames to the Silver DB, one table per frame.
# Each entry: (target table, DataFrame to write, message printed on success)
silver_loads = (
    ("orders", orders_clean, "✅ Load orders_clean vào Silver DB thành công!"),
    ("order_items", order_items, "✅ Load order_items vào Silver DB thành công!"),
)

for table_name, frame, success_message in silver_loads:
    frame.to_sql(
        name=table_name,
        con=silver_engine,
        if_exists="replace",   # or "append" for incremental runs
        index=False,
    )
    print(success_message)


✅ Load orders_clean vào Silver DB thành công!
✅ Load order_items vào Silver DB thành công!
