In [1]:
# ==========================
# 1. Import & Kết nối DB
# ==========================
import os
from urllib.parse import quote

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine

# Load environment variables (python-dotenv's load_dotenv reads a .env file)
load_dotenv()

DB_USER   = os.getenv("DB_USER")
DB_PASS   = os.getenv("DB_PASS")
DB_HOST   = os.getenv("DB_HOST")
DB_PORT   = os.getenv("DB_PORT")
DB_SILVER = os.getenv("DB_SILVER")
DB_GOLD   = os.getenv("DB_GOLD")

# Fail fast: os.getenv returns None for an unset variable, which an f-string
# would otherwise format into the connection URL as the literal text "None".
_missing_env = [
    env_name
    for env_name in ("DB_USER", "DB_PASS", "DB_HOST", "DB_PORT", "DB_SILVER", "DB_GOLD")
    if os.getenv(env_name) is None
]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_env)
    )

# Percent-encode the credentials so reserved URL characters in the user name
# or password (e.g. "@", "/", ":", "%") cannot be mistaken for URL delimiters.
# safe='' makes quote() encode "/" as well, which it leaves untouched by default.
_DB_SERVER_URL = (
    f"mysql+pymysql://{quote(DB_USER, safe='')}:{quote(DB_PASS, safe='')}"
    f"@{DB_HOST}:{DB_PORT}"
)

# One engine per schema; both share the same server and credentials.
engine_silver = create_engine(f"{_DB_SERVER_URL}/{DB_SILVER}")
engine_gold = create_engine(f"{_DB_SERVER_URL}/{DB_GOLD}")


In [9]:
# Read the complete product dimension through the Silver engine
df_products = pd.read_sql(sql="SELECT * FROM dim_products", con=engine_silver)
print("dim_products:", df_products.shape)

# Read the distinct product ids referenced by order items, for comparison
df_items = pd.read_sql(
    sql="SELECT DISTINCT product_id FROM fact_order_items",
    con=engine_silver,
)
print("distinct product_id trong order_items:", df_items.shape)


dim_products: (37, 7)
distinct product_id trong order_items: (85, 1)


In [10]:
# ==========================
# Transform dim_products → gold_dim_products
# ==========================
def transform_products(df_products, df_items):
    """Build the gold product dimension from the product and order-item frames.

    Real products get a cleaned ``name``, a ``main_category`` copied from
    ``category_ids`` (``"Unknown"`` where it is null), ``is_active = 1`` and
    ``is_dummy = 0``. Every non-null ``product_id`` that appears in
    ``df_items`` but not in ``df_products`` is added as a placeholder row
    (``"Unknown Product"``, ``is_active = 0``, ``is_dummy = 1``).

    Parameters
    ----------
    df_products : pandas.DataFrame
        Must contain the columns ``product_id``, ``name`` and ``category_ids``.
        The frame is not modified.
    df_items : pandas.DataFrame
        Must contain a ``product_id`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``product_id``, ``name``, ``main_category``, ``is_active``,
        ``is_dummy``: the real products in their input order, followed by the
        placeholder rows sorted by the string form of their id.

    Raises
    ------
    KeyError
        If a required column is missing from either input frame.
    """
    keep_cols = ['product_id', 'name', 'main_category', 'is_active', 'is_dummy']

    # Work on a copy so the caller's frame is left untouched
    df = df_products.copy()

    # Step 1. Normalise names: trim surrounding whitespace, then title-case
    df['name'] = df['name'].str.strip().str.title()

    # Step 2. Basic category: reuse category_ids, labelling nulls explicitly
    df['main_category'] = df['category_ids'].fillna("Unknown")

    # Step 3. Every row coming from df_products is a real, active product
    df['is_active'] = 1
    df['is_dummy'] = 0

    # Keep only the output columns before appending placeholders, so both
    # frames being concatenated share the same schema.
    df_gold = df[keep_cols]

    # Step 4. Find ids referenced by order items but absent from the dimension.
    # Null ids are dropped first: a null is not a product key and must not
    # produce a placeholder row with a null product_id.
    product_ids_items = set(df_items['product_id'].dropna().unique())
    product_ids_dim = set(df['product_id'].unique())
    # Sets have no stable iteration order for strings across interpreter runs;
    # sorting makes the output reproducible. key=str avoids a TypeError if the
    # ids happen to be of mixed types.
    missing_products = sorted(product_ids_items - product_ids_dim, key=str)

    print("Số product_id bị thiếu:", len(missing_products))

    # Step 5. Append one placeholder row per missing id
    if missing_products:
        df_missing = pd.DataFrame({'product_id': missing_products})
        df_missing['name'] = "Unknown Product"
        df_missing['main_category'] = "Unknown"
        df_missing['is_active'] = 0
        df_missing['is_dummy'] = 1
        df_gold = pd.concat([df_gold, df_missing], ignore_index=True)

    return df_gold

# Build the gold product dimension from the two frames loaded earlier
df_products_gold = transform_products(
    df_products=df_products,
    df_items=df_items,
)
# Preview the first rows; as the cell's last expression it is rendered as output
df_products_gold.head(n=20)


Số product_id bị thiếu: 50


Unnamed: 0,product_id,name,main_category,is_active,is_dummy
0,3ec3103b-ee8e-484a-abf3-d542f4487c18,Áo Thu Đông.W033,1290005112,1,0
1,6baccc22-e6a9-4fde-9bd9-52c7d2e1c4d9,Áo Khoác.W032,1290028415,1,0
2,3508fe85-4a72-4965-aec6-7a908553617a,Áo Thu Đông.W029,1290021501,1,0
3,358156d8-2777-4aab-b584-a26813fb4b95,Áo Thu Đông.W031,1290005112,1,0
4,6afa1a3e-4e23-46e8-a434-3018fdaebf12,Áo Thu Đông.W030,1290005112,1,0
5,7c6bbb64-b133-440b-8bcf-5342d5c49567,Áo Thu Đông.W028,86775,1,0
6,5ce7d440-388c-4ab4-b94e-59baa0f8ba52,Set Nam.W027,860019429,1,0
7,a2e62141-7107-46a5-afcb-d79e2a6fa5a3,Áo Thu Đông.W026,430016598,1,0
8,ea5b416d-511e-47d9-841a-957fb22e68a8,Áo Vest.W025,860019429,1,0
9,9764f7be-e5d1-4c68-92fe-067b2437342b,Áo Vest.W024,1290005112,1,0


In [12]:
# Write the dimension through the Gold engine. if_exists="replace" drops any
# existing gold_dim_products table before inserting; the DataFrame index is
# not written as a column.
df_products_gold.to_sql(name="gold_dim_products", con=engine_gold, if_exists="replace", index=False)

print("✅ Đã load gold_dim_products vào schema Gold.")


✅ Đã load gold_dim_products vào schema Gold.
