In [1]:
# ==========================
# 1. Import & Kết nối DB
# ==========================
import os
import pandas as pd
from sqlalchemy import create_engine
from dotenv import load_dotenv

# Load biến môi trường
load_dotenv()

DB_USER   = os.getenv("DB_USER")
DB_PASS   = os.getenv("DB_PASS")
DB_HOST   = os.getenv("DB_HOST")
DB_PORT   = os.getenv("DB_PORT")
DB_SILVER = os.getenv("DB_SILVER")
DB_GOLD   = os.getenv("DB_GOLD")

engine_silver = create_engine(
    f"mysql+pymysql://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_SILVER}"
)
engine_gold = create_engine(
    f"mysql+pymysql://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_GOLD}"
)


In [2]:
# ==========================
# 2. Load dim_pages từ Silver
# ==========================
query = "SELECT * FROM dim_pages"
df_pages = pd.read_sql(query, engine_silver)

print("Shape:", df_pages.shape)
df_pages.head(10)


Shape: (74, 5)


Unnamed: 0,page_id,page_name,page_username,account,account_name
0,377626778776391,Duy Mạnh Authentic,duymanhauth10,377626778776391,Duy Mạnh Authentic
1,492088430643389,Sùn Outlet,sunoutl,492088430643389,Sùn Outlet
2,433231143199907,Đăng Authentic,dangauth2006,433231143199907,Đăng Authentic
3,491931647329576,Famous Authentic,famousauthentic1907,491931647329576,Famous Authentic
4,454214751109901,Săn hàng hiệu US giá rẻ,SanHangUs2,454214751109901,Săn hàng hiệu US giá rẻ
5,428194993710941,D&Đ - Chuyên Hàng Xách Tay US,,428194993710941,D&Đ - Chuyên Hàng Xách Tay US
6,461234203738139,Fame Outlet,fameo28,461234203738139,Fame Outlet
7,418470191354740,Hào Nam Authenio,haonamauthentio1907,418470191354740,Hào Nam Authenio
8,526129833907763,Gentleman’s Couture,gentlemancouture1907,526129833907763,Gentleman’s Couture
9,491458047386615,Sùn Hàng Hiệu Oulet,thunusa,491458047386615,Sùn Hàng Hiệu Oulet


In [3]:
# ==========================
# 3. Khám phá dữ liệu
# ==========================
df_pages.info()
df_pages.describe(include='all')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74 entries, 0 to 73
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   page_id        73 non-null     object
 1   page_name      73 non-null     object
 2   page_username  33 non-null     object
 3   account        73 non-null     object
 4   account_name   74 non-null     object
dtypes: object(5)
memory usage: 3.0+ KB


Unnamed: 0,page_id,page_name,page_username,account,account_name
count,73,73,33,73,74
unique,73,72,33,73,73
top,377626778776391,Xưởng May Kim Anh,duymanhauth10,377626778776391,Xưởng May Kim Anh
freq,1,2,1,1,2


In [4]:
# ==========================
# Transform dim_pages → gold_dim_pages
# ==========================
def transform_pages(df):
    df = df.copy()

    # B1. Loại bỏ record thiếu page_id hoặc page_name
    df = df.dropna(subset=['page_id', 'page_name'])

    # B2. Chuẩn hóa text
    df['page_name'] = df['page_name'].str.strip().str.title()
    if 'page_username' in df.columns:
        df['page_username'] = df['page_username'].str.strip().str.lower()


    # B4. Giữ lại cột quan trọng
    keep_cols = [
        'page_id', 'page_name', 'page_username', 
        'account', 'account_name'
    ]
    df_gold = df[keep_cols]

    return df_gold


In [6]:
df_gold_pages = transform_pages(df_pages)
df_gold_pages

Unnamed: 0,page_id,page_name,page_username,account,account_name
0,377626778776391,Duy Mạnh Authentic,duymanhauth10,377626778776391,Duy Mạnh Authentic
1,492088430643389,Sùn Outlet,sunoutl,492088430643389,Sùn Outlet
2,433231143199907,Đăng Authentic,dangauth2006,433231143199907,Đăng Authentic
3,491931647329576,Famous Authentic,famousauthentic1907,491931647329576,Famous Authentic
4,454214751109901,Săn Hàng Hiệu Us Giá Rẻ,sanhangus2,454214751109901,Săn hàng hiệu US giá rẻ
...,...,...,...,...,...
69,104093599354419,Anh Thư Shop,,104093599354419,Anh Thư Shop
70,110406248113415,Xưởng May Kim Anh,kimanh.thuonghieu,110406248113415,Xưởng May Kim Anh
71,241364392389685,Patino'S Store,,241364392389685,Patino's Store
72,104020928940913,Emmy Fashion,emmyfashionst1,104020928940913,Emmy Fashion


In [7]:
# load vào Gold
df_gold_pages.to_sql(
    "gold_dim_pages",
    engine_gold,
    if_exists="replace",
    index=False
)

73