## Notebook: Transform dữ liệu từ tầng bronze lên tầng silver

### 1. Import & Connect DB

In [None]:
import json
import os
from urllib.parse import quote

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine
from sqlalchemy.types import BigInteger, VARCHAR, DateTime, Text

# Load DB_* settings (typically from a local .env file) into the environment.
load_dotenv()


def _require_env(name):
    """Return the value of environment variable ``name``.

    Raises:
        RuntimeError: if the variable is not set at all. Without this check
            the missing value would be formatted into the connection URL as
            the literal text "None" and fail much later with an unrelated
            error. An empty string is accepted (e.g. a blank password).
    """
    value = os.getenv(name)
    if value is None:
        raise RuntimeError(f"Missing required environment variable: {name}")
    return value


DB_USER   = _require_env("DB_USER")
DB_PASS   = _require_env("DB_PASS")
DB_HOST   = _require_env("DB_HOST")
DB_PORT   = _require_env("DB_PORT")
DB_BRONZE = _require_env("DB_BRONZE")
DB_SILVER = _require_env("DB_SILVER")


def _mysql_url(database):
    """Build the mysql+pymysql connection URL for ``database``.

    The user name and password are percent-encoded so that characters with a
    special meaning inside a URL (``@``, ``:``, ``/``, ``%`` ...) do not
    corrupt it. ``safe=""`` makes ``quote`` encode ``/`` as well.
    """
    user = quote(DB_USER, safe="")
    password = quote(DB_PASS, safe="")
    return f"mysql+pymysql://{user}:{password}@{DB_HOST}:{DB_PORT}/{database}"


# One engine per layer: Bronze is the read side, Silver is the write side.
bronze_engine = create_engine(_mysql_url(DB_BRONZE))
silver_engine = create_engine(_mysql_url(DB_SILVER))


### 2. Load raw shops từ Bronze

In [None]:
# Pull every raw shop record from the Bronze layer into memory.
shops_df = pd.read_sql("SELECT * FROM shops_raw", bronze_engine)
print("Shape:", shops_df.shape)

# Pretty-print one sample record. An empty table has no row 0, so guard
# against the IndexError that .iloc[0] would otherwise raise.
if shops_df.empty:
    print("shops_raw is empty - no sample record to show")
else:
    sample_record = json.loads(shops_df["raw_json"].iloc[0])
    print(json.dumps(sample_record, indent=2, ensure_ascii=False))


Shape: (1, 3)
{
  "avatar_url": "https://content.pancake.vn/2-24/2024/2/4/e0b42ce77a7b7922388df987610dbf255705a744.png",
  "currency": "VND",
  "id": 230361475,
  "link_post_marketer": [],
  "name": "Winner Group",
  "pages": [
    {
      "id": "490071977529040",
      "is_onboard_xendit": null,
      "name": "LiLi Store",
      "platform": "facebook",
      "progressive_catalog_error": "missing_business_id",
      "settings": {
        "auto_create_order": true,
        "current_settings_key": "vz+8J5GNL2xQu78S"
      },
      "shop_id": 230361475,
      "tags": [
        {
          "color": "#bd2727",
          "id": 5,
          "lighten_color": "#ebbebe",
          "text": "MHAO"
        },
        {
          "color": "#4b5577",
          "id": 0,
          "lighten_color": "#c9ccd6",
          "text": "Kiểm hàng"
        },
        {
          "color": "#a1992b",
          "id": 1,
          "lighten_color": "rgba(161,153,43,0.4)",
          "text": "Thanh "
        },
        

### 3. Liệt kê keys JSON (shops)

In [None]:
# Union of the top-level JSON keys found in the first 20 records.
sample_payloads = (json.loads(raw_text) for raw_text in shops_df["raw_json"].head(20))
all_keys = set().union(*(payload.keys() for payload in sample_payloads))

print("===== Tổng số keys:", len(all_keys), " =====")
print(sorted(all_keys))


===== Tổng số keys: 6  =====
['avatar_url', 'currency', 'id', 'link_post_marketer', 'name', 'pages']


### 4. Parse dim_shops

In [None]:
def parse_shop(row):
    """Extract the dim_shops fields from one raw shop record.

    Args:
        row: mapping-like object (e.g. a DataFrame row) whose ``"raw_json"``
            entry holds the shop payload as a JSON string.

    Returns:
        dict with keys ``shop_id``, ``shop_name``, ``currency`` and
        ``avatar_url`` (in that order); a field missing from the payload
        comes back as ``None``.
    """
    payload = json.loads(row["raw_json"])
    # (output column, key in the source JSON) - order defines the column order.
    field_map = (
        ("shop_id", "id"),
        ("shop_name", "name"),
        ("currency", "currency"),
        ("avatar_url", "avatar_url"),
    )
    return {column: payload.get(source_key) for column, source_key in field_map}

# Column layout of dim_shops; must match the keys returned by parse_shop.
DIM_SHOPS_COLUMNS = ["shop_id", "shop_name", "currency", "avatar_url"]

if shops_df.empty:
    # With zero rows, apply(..., result_type="expand") hands back a copy of
    # the input frame, so there is no "shop_id" column and drop_duplicates
    # would raise KeyError. Build an empty, correctly-shaped frame instead.
    dim_shops = pd.DataFrame(columns=DIM_SHOPS_COLUMNS)
else:
    # One output row per raw record, then keep the first row per shop_id.
    dim_shops = (
        shops_df
        .apply(parse_shop, axis=1, result_type="expand")
        .drop_duplicates("shop_id")
    )
print("✅ dim_shops shape:", dim_shops.shape)
display(dim_shops.head())


✅ dim_shops shape: (1, 4)


Unnamed: 0,shop_id,shop_name,currency,avatar_url
0,230361475,Winner Group,VND,https://content.pancake.vn/2-24/2024/2/4/e0b42...


### 5. Load vào Silver

In [None]:
# Explicit SQL column types handed to to_sql for the Silver table.
dtype_dim_shops = dict(
    shop_id=BigInteger(),
    shop_name=VARCHAR(255),
    currency=VARCHAR(10),
    avatar_url=VARCHAR(500),
)

# Full refresh: if_exists="replace" drops an existing dim_shops table before
# writing; index=False keeps the DataFrame index out of the table.
write_options = dict(if_exists="replace", index=False, dtype=dtype_dim_shops)
dim_shops.to_sql("dim_shops", con=silver_engine, **write_options)

print("✅ Loaded to Silver.dim_shops")


✅ Loaded to Silver.dim_shops
