## 1. Import thư viện & Config

In [15]:
import json
import os

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine
from sqlalchemy.engine import URL

# Load environment variables from the .env file
load_dotenv()

API_KEY   = os.getenv("API_KEY")
DB_USER   = os.getenv("DB_USER")
DB_PASS   = os.getenv("DB_PASS")
DB_HOST   = os.getenv("DB_HOST")
DB_PORT   = os.getenv("DB_PORT")
DB_BRONZE = os.getenv("DB_BRONZE")
DB_SILVER = os.getenv("DB_SILVER")

# Fail fast with a readable message instead of letting a missing setting end up
# as the literal text "None" inside the connection URL.
_missing_settings = [
    name
    for name, value in {
        "DB_USER": DB_USER,
        "DB_HOST": DB_HOST,
        "DB_BRONZE": DB_BRONZE,
        "DB_SILVER": DB_SILVER,
    }.items()
    if not value
]
if _missing_settings:
    raise RuntimeError(
        f"Missing required environment variables: {', '.join(_missing_settings)}"
    )


def _make_engine(database):
    """Return a MySQL (PyMySQL) engine for ``database`` on the configured server.

    The URL is assembled with ``URL.create`` rather than string formatting so
    that special characters in the user name or password (such as "@", ":" or
    "/") are handled without manual escaping. Values in .env must therefore be
    stored raw, not percent-encoded.
    """
    url = URL.create(
        "mysql+pymysql",
        username=DB_USER,
        password=DB_PASS,
        host=DB_HOST,
        # An unset DB_PORT falls back to the driver's default port.
        port=int(DB_PORT) if DB_PORT else None,
        database=database,
    )
    return create_engine(url)


# Engines for the Bronze and Silver databases
bronze_engine = _make_engine(DB_BRONZE)
silver_engine = _make_engine(DB_SILVER)

## 2. Lấy dữ liệu Products từ Bronze

In [None]:
# Read every row of the Bronze products_raw table into a DataFrame
BRONZE_PRODUCTS_QUERY = "SELECT * FROM products_raw"
products_raw = pd.read_sql(sql=BRONZE_PRODUCTS_QUERY, con=bronze_engine)

# Quick sanity check: dimensions first, then the first rows as the cell output
print("Shape:", products_raw.shape)
products_raw.head()

Shape: (37, 4)


Unnamed: 0,shop_id,product_id,raw_json,extracted_at
0,230361475,3ec3103b-ee8e-484a-abf3-d542f4487c18,"{""categories"": [{""id"": 1290005112, ""name"": ""Ta...",2025-09-26 12:27:04
1,230361475,6baccc22-e6a9-4fde-9bd9-52c7d2e1c4d9,"{""categories"": [{""id"": 1290028415, ""name"": ""Lê...",2025-09-26 12:27:04
2,230361475,3508fe85-4a72-4965-aec6-7a908553617a,"{""categories"": [{""id"": 1290021501, ""name"": ""Tr...",2025-09-26 12:27:04
3,230361475,358156d8-2777-4aab-b584-a26813fb4b95,"{""categories"": [{""id"": 1290005112, ""name"": ""Ta...",2025-09-26 12:27:04
4,230361475,6afa1a3e-4e23-46e8-a434-3018fdaebf12,"{""categories"": [{""id"": 1290005112, ""name"": ""Ta...",2025-09-26 12:27:04


## 3. Biến đổi

In [17]:
# Decode each raw_json string and keep the parsed object in a new "raw_dict" column
parsed_records = products_raw["raw_json"].map(json.loads)
products_raw["raw_dict"] = parsed_records

# Peek at the first parsed record to see which top-level keys it carries
sample_product = products_raw["raw_dict"].iat[0]
sample_product.keys()




In [28]:
# Build the Products table: flatten every parsed record, keep only the
# product-level columns and rename them for the Silver layer.
# The chain produces an independent frame (no in-place rename on a column
# slice), so later column assignments on products_clean do not operate on a
# frame that pandas has flagged as a copy.
# Notes:
#   - "category_ids" is exposed as "staff_id"; it is still a list at this point.
#   - "inserted_at" / "updated_at" are kept exactly as they appear in the JSON.
products_clean = (
    pd.json_normalize(products_raw["raw_dict"])
    .loc[:, ["id", "name", "inserted_at", "updated_at", "shop_id", "category_ids"]]
    .rename(columns={
        "id": "product_id",
        "name": "product_name",
        "category_ids": "staff_id",
    })
)

products_clean.head(2)


Unnamed: 0,product_id,product_name,inserted_at,updated_at,shop_id,staff_id
0,3ec3103b-ee8e-484a-abf3-d542f4487c18,Áo thu đông.W033,2024-12-23T08:36:41,2024-12-23T08:36:41,230361475,[1290005112]
1,6baccc22-e6a9-4fde-9bd9-52c7d2e1c4d9,Áo Khoác.W032,2024-12-23T06:00:26,2024-12-25T03:13:05,230361475,[1290028415]


In [19]:
# Build the Product Variations table (one row per variation of every product)

# Column order of the table; also guarantees the schema when there are no rows.
VARIATION_COLUMNS = [
    "variation_id",
    "product_id",
    "display_id",
    "color",
    "size",
    "retail_price",
    "retail_price_after_discount",
    "inserted_at",
]


def _extract_variations(records):
    """Flatten the "variations" list of each product record into row dicts.

    Parameters
    ----------
    records : iterable of dict
        Parsed product records; each must have an "id" key and may have a
        "variations" list (missing or null is treated as empty).

    Returns
    -------
    list of dict
        One dict per variation with the keys listed in VARIATION_COLUMNS.
        "color" and "size" stay None when the variation has no matching field.

    Raises
    ------
    KeyError
        If a product or a variation has no "id".
    """
    rows = []
    for record in records:
        product_id = record["id"]
        # `or []` covers an explicit null as well as a missing key.
        for v in record.get("variations") or []:
            row = {
                "variation_id": v["id"],
                "product_id": product_id,
                "display_id": v.get("display_id"),
                "color": None,
                "size": None,
                "retail_price": v.get("retail_price"),
                "retail_price_after_discount": v.get("retail_price_after_discount"),
                "inserted_at": v.get("inserted_at"),
            }
            # Pick the colour ("Màu") and size attributes out of the variation's
            # fields; names are compared case-insensitively and a field without
            # a name is skipped instead of raising.
            for field in v.get("fields") or []:
                field_name = (field.get("name") or "").lower()
                if field_name == "màu":
                    row["color"] = field.get("value")
                elif field_name == "size":
                    row["size"] = field.get("value")
            rows.append(row)
    return rows


variations_list = _extract_variations(products_raw["raw_dict"])
product_variations = pd.DataFrame(variations_list, columns=VARIATION_COLUMNS)
product_variations.head(2)


Unnamed: 0,variation_id,product_id,display_id,color,size,retail_price,retail_price_after_discount,inserted_at
0,b5e77d02-e957-4e8c-9d25-d8f06d036a06,3ec3103b-ee8e-484a-abf3-d542f4487c18,WINNER-AOTHUDONG.W033DENXS,Đen,XS,299000,299000,2024-12-23T08:36:41.417311
1,e1a00c27-324b-4013-8b36-f0a1c8f28a63,3ec3103b-ee8e-484a-abf3-d542f4487c18,WINNER-AOTHUDONG.W033DENS,Đen,S,299000,299000,2024-12-23T08:36:41.417830


In [21]:
# Build the Categories table (a standalone dimension): one row per distinct
# (id, name) pair found in the "categories" list of any product.
# `or []` treats an explicit null the same as a missing key.
categories_list = [
    {"staff_id": c.get("id"), "name": c.get("name")}
    for row in products_raw["raw_dict"]
    for c in row.get("categories") or []
]

# Naming the columns explicitly keeps the two-column schema even when no
# categories were found at all.
product_staff = pd.DataFrame(
    categories_list, columns=["staff_id", "name"]
).drop_duplicates()

product_staff

Unnamed: 0,staff_id,name
0,1290005112,Tam Anh
1,1290028415,Lê Phong
2,1290021501,Truong.NQ
5,86775,Thái
6,860019429,LQ.Long
7,430016598,Thien.TM
11,100905405,Đình Tùng
13,1720020941,Truong.LQ
18,17160,Quốc Đại


## 4. Làm sạch

In [25]:
# Show dtypes and non-null counts for the Products table.
# NOTE(review): the saved output lists staff_id as Int64, so it was captured
# after the staff_id cleaning cell below had already run; re-run the notebook
# top to bottom to refresh it.
products_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37 entries, 0 to 36
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   product_id    37 non-null     object
 1   product_name  37 non-null     object
 2   inserted_at   37 non-null     object
 3   updated_at    37 non-null     object
 4   shop_id       37 non-null     int64 
 5   staff_id      32 non-null     Int64 
dtypes: Int64(1), int64(1), object(4)
memory usage: 1.9+ KB


In [29]:
def _first_staff_id(value):
    """Reduce one staff_id cell to a single id.

    - list/tuple: its first element, or None when empty
    - missing value (None / NaN / <NA>): None
    - anything else (an id that was already extracted): returned unchanged

    Keeping scalars unchanged makes the cell safe to re-run: a second pass no
    longer turns every id that was already extracted into None.
    """
    if isinstance(value, (list, tuple)):
        return value[0] if len(value) > 0 else None
    if pd.isna(value):
        return None
    return value


# Take the first element of each list (if any), then cast to pandas' nullable
# integer dtype so products without an id stay as <NA>.
products_clean["staff_id"] = (
    products_clean["staff_id"].apply(_first_staff_id).astype("Int64")
)

products_clean[["product_id", "staff_id"]].head()


Unnamed: 0,product_id,staff_id
0,3ec3103b-ee8e-484a-abf3-d542f4487c18,1290005112
1,6baccc22-e6a9-4fde-9bd9-52c7d2e1c4d9,1290028415
2,3508fe85-4a72-4965-aec6-7a908553617a,1290021501
3,358156d8-2777-4aab-b584-a26813fb4b95,1290005112
4,6afa1a3e-4e23-46e8-a434-3018fdaebf12,1290005112


In [35]:
# Show dtypes and non-null counts for the Product Variations table.
# NOTE(review): the saved output lists inserted_at as datetime64[ns], but the
# to_datetime conversion sits in the next cell; on a fresh top-to-bottom run
# this summary is produced before that conversion.
product_variations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 415 entries, 0 to 414
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   variation_id                 415 non-null    object        
 1   product_id                   415 non-null    object        
 2   display_id                   415 non-null    object        
 3   color                        415 non-null    object        
 4   size                         415 non-null    object        
 5   retail_price                 415 non-null    int64         
 6   retail_price_after_discount  415 non-null    int64         
 7   inserted_at                  415 non-null    datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(5)
memory usage: 26.1+ KB


In [34]:
# Parse the inserted_at column of the variations table into datetime values
product_variations["inserted_at"] = product_variations["inserted_at"].pipe(pd.to_datetime)

## 5. Load vào tầng Silver

In [36]:
# Write the three tables to the Silver database, one after another.
# Each entry is (target table, DataFrame, label printed in the log line).
silver_loads = [
    ("products", products_clean, "products_clean"),
    ("product_variations", product_variations, "product_variations"),
    ("product_staff", product_staff, "product_staff"),
]

for table_name, frame, label in silver_loads:
    # if_exists="replace" rebuilds the table on every run; switch to "append"
    # to add rows to an existing table instead.
    frame.to_sql(table_name, silver_engine, if_exists="replace", index=False)
    print(f"✅ Load {label} vào Silver DB thành công!")


✅ Load products_clean vào Silver DB thành công!
✅ Load product_variations vào Silver DB thành công!
✅ Load product_staff vào Silver DB thành công!
