In [7]:
# ==========================
# 1. Import & Kết nối DB
# ==========================
import os
from urllib.parse import quote_plus

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine

# Load environment variables (python-dotenv reads a local .env file when present)
load_dotenv()


def _require_env(name):
    """Return environment variable `name`, failing fast with a clear message if it is unset.

    Without this check an unset variable would be interpolated into the
    connection URL as the literal string "None".
    """
    value = os.getenv(name)
    if value is None:
        raise RuntimeError(f"Missing required environment variable: {name}")
    return value


def _make_engine(database):
    """Create a mysql+pymysql engine for `database` using the shared credentials.

    The user and password are percent-encoded so that characters such as
    '@', ':' or '/' cannot corrupt the URL. The values stored in the
    environment are therefore expected to be the raw (unencoded) credentials.
    """
    return create_engine(
        f"mysql+pymysql://{quote_plus(DB_USER)}:{quote_plus(DB_PASS)}"
        f"@{DB_HOST}:{DB_PORT}/{database}"
    )


DB_USER   = _require_env("DB_USER")
DB_PASS   = _require_env("DB_PASS")
DB_HOST   = _require_env("DB_HOST")
DB_PORT   = _require_env("DB_PORT")
DB_BRONZE = _require_env("DB_BRONZE")
DB_SILVER = _require_env("DB_SILVER")
DB_GOLD   = _require_env("DB_GOLD")

# One engine per medallion layer; all three share host, port and credentials.
engine_bronze = _make_engine(DB_BRONZE)
engine_silver = _make_engine(DB_SILVER)
engine_gold = _make_engine(DB_GOLD)


In [8]:
# ==========================
# Load province_raw from the Bronze layer
# ==========================
query = "SELECT * FROM province_raw"
df_provinces = pd.read_sql(sql=query, con=engine_bronze)

# Report the frame's dimensions, then preview the first ten rows
print("Shape:", df_provinces.shape)
df_provinces.head(n=10)


Shape: (63, 2)


Unnamed: 0,province_id,province_name
0,805,An Giang
1,221,Bắc Giang
2,207,Bắc Kạn
3,821,Bạc Liêu
4,106,Bắc Ninh
5,717,Bà Rịa-Vũng Tàu
6,811,Bến Tre
7,507,Bình Định
8,711,Bình Dương
9,707,Bình Phước


In [9]:
# ==========================
# 2. Load dim_customers from the Silver layer
# ==========================
query = "SELECT * FROM dim_customers"
df_customers = pd.read_sql(sql=query, con=engine_silver)

# Report the frame's dimensions, then preview the first five rows
print("Shape:", df_customers.shape)
df_customers.head(n=5)


Shape: (36090, 23)


Unnamed: 0,customer_id,shop_id,name,gender,phone,fb_id,order_count,succeed_order_count,returned_order_count,purchased_amount,...,inserted_at,updated_at,last_order_at,is_block,is_discount_by_level,active_levera_pay,province_id,full_address,referral_code,order_sources
0,c7a583d6-392d-4308-a02b-67c001ee03db,230361475,Thinh Bui,M,903693389,377626778776391_31407011708897267,1,0,0,0,...,2025-08-16 02:19:59,2025-08-16 02:19:59,NaT,0,1,0,,,GM5wdxsy,-1
1,590ba7de-e4fe-43b3-8013-27fc69937edf,230361475,Truong Minh,M,907809070,377626778776391_24536870619280831,1,0,0,0,...,2025-08-15 12:56:50,2025-08-15 12:56:50,NaT,0,1,0,,,6zqy77Bu,-1
2,a9e62388-57c2-4834-a87c-7c1025aea92d,230361475,Quach Quach,M,986533988,377626778776391_24712861355019072,1,0,0,0,...,2025-08-15 12:45:17,2025-08-15 12:45:17,NaT,0,1,0,,,uyMB3h6E,-1
3,002810c4-5e96-4882-b85a-a85122e47108,230361475,Dung Vu,M,977752936,377626778776391_24254873767534636,1,0,0,0,...,2025-08-15 12:04:38,2025-08-15 12:04:38,NaT,0,1,0,,,SnU6T9jj,-1
4,cd25d874-d782-40d1-8561-5507bbd6114d,230361475,Phạm Mụi,M,774443833,377626778776391_30782170181397391,1,0,0,0,...,2025-08-15 10:26:23,2025-08-15 10:26:23,NaT,0,1,0,,,ng7CfaOM,-1


In [10]:
# Column dtypes and non-null counts first, then summary statistics over every column
df_customers.info()
df_customers.describe(include="all")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36090 entries, 0 to 36089
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   customer_id           36090 non-null  object        
 1   shop_id               36090 non-null  int64         
 2   name                  36083 non-null  object        
 3   gender                30729 non-null  object        
 4   phone                 36000 non-null  object        
 5   fb_id                 35955 non-null  object        
 6   order_count           36090 non-null  int64         
 7   succeed_order_count   36090 non-null  int64         
 8   returned_order_count  36090 non-null  int64         
 9   purchased_amount      36090 non-null  int64         
 10  reward_point          36090 non-null  int64         
 11  current_debts         36090 non-null  int64         
 12  count_referrals       36090 non-null  int64         
 13  inserted_at     

Unnamed: 0,customer_id,shop_id,name,gender,phone,fb_id,order_count,succeed_order_count,returned_order_count,purchased_amount,...,inserted_at,updated_at,last_order_at,is_block,is_discount_by_level,active_levera_pay,province_id,full_address,referral_code,order_sources
count,36090,36090.0,36083,30729,36000.0,35955,36090.0,36090.0,36090.0,36090.0,...,36090,36090,26676,36090.0,36090.0,36090.0,32449.0,32459,36090,36060.0
unique,36090,,25253,3,35716.0,35955,,,,,...,,,,,,,63.0,32326,36090,2.0
top,c7a583d6-392d-4308-a02b-67c001ee03db,,Nguyễn Hà,F,335222986.0,377626778776391_31407011708897267,,,,,...,,,,,,,101.0,"Ấp minh phong, Xã Bình An, Huyện Châu Thành, K...",GM5wdxsy,-1.0
freq,1,,49,19486,4.0,1,,,,,...,,,,,,,6069.0,3,1,36059.0
mean,,230361475.0,,,,,1.115129,0.489,0.378498,222192.6,...,2024-08-10 23:41:22.697256960,2024-08-14 09:57:39.321363200,2024-07-14 17:23:43.588431616,2.8e-05,1.0,8.3e-05,,,,
min,,230361475.0,,,,,0.0,0.0,0.0,-1788000.0,...,2021-12-30 03:13:14,2024-02-05 01:07:19,2021-12-30 03:13:14,0.0,1.0,0.0,,,,
25%,,230361475.0,,,,,1.0,0.0,0.0,0.0,...,2024-05-05 03:00:41.750000128,2024-05-06 14:20:34.750000128,2024-04-17 15:02:32.750000128,0.0,1.0,0.0,,,,
50%,,230361475.0,,,,,1.0,0.0,0.0,0.0,...,2024-07-17 03:25:57,2024-07-19 10:21:15.500000,2024-06-28 01:57:25,0.0,1.0,0.0,,,,
75%,,230361475.0,,,,,1.0,1.0,1.0,398000.0,...,2024-12-03 01:10:28.500000,2024-12-04 07:41:24.750000128,2024-10-07 10:17:34.750000128,0.0,1.0,0.0,,,,
max,,230361475.0,,,,,11.0,5.0,4.0,6199000.0,...,2025-08-16 02:19:59,2025-08-16 02:19:59,2025-04-18 12:48:01,1.0,1.0,1.0,,,,


In [11]:
# Frequency of each raw gender code
df_customers["gender"].value_counts()

gender
F    19486
M    11239
O        4
Name: count, dtype: int64

In [22]:
# ==========================
# Transform dim_customers → gold_dim_customers
# ==========================
def _normalize_phone(phone):
    """Reduce raw phone values to their last 10 digits, keeping missing values missing.

    A Vietnamese country-code prefix ("84" or "0084" followed by exactly nine
    digits) is rewritten to the domestic leading "0" before truncation, so
    "+84 903 693 389" becomes "0903693389" rather than "4903693389".
    """
    digits = phone.astype(str).str.replace(r'\D', '', regex=True)
    digits = digits.str.replace(r'^(?:00)?84(\d{9})$', r'0\1', regex=True)
    digits = digits.str[-10:]
    # astype(str) turns None/NaN into "None"/"nan", which strip down to "";
    # mask those (and any value containing no digits) back to missing.
    return digits.where(phone.notna() & digits.ne(''))


def transform_customers(df, df_provinces=None, vip_threshold=500000, as_of=None):
    """Build the gold customer dimension from the silver `dim_customers` frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Customer rows. Must contain `gender`, `phone`, `purchased_amount`,
        `last_order_at` and `order_count`; `province_id` is required when
        `df_provinces` is given. The input frame is not modified.
    df_provinces : pandas.DataFrame, optional
        Lookup with `province_id` and `province_name`. When provided, the
        province name is left-joined onto each customer.
    vip_threshold : numeric, default 500000
        Minimum `purchased_amount` for a customer to be flagged as VIP.
    as_of : datetime-like, optional
        Reference date for `recency_days`. Defaults to the current local time;
        pass a fixed value to make the result reproducible.

    Returns
    -------
    pandas.DataFrame
        One row per input customer, restricted to the gold columns that exist
        (`province_name` is absent when no province lookup is supplied).
    """
    df = df.copy()

    # Step 1. Normalise gender codes; unmapped or missing values become 'Khác'
    df['gender'] = df['gender'].map({'M': 'Nam', 'F': 'Nữ', 'O': 'Khác'}).fillna('Khác')

    # Step 2. Normalise phone numbers
    df['phone'] = _normalize_phone(df['phone'])

    # Step 3. VIP flag (vectorised comparison; missing amounts compare False -> 0)
    df['is_vip'] = df['purchased_amount'].ge(vip_threshold).astype('int64')

    # Step 4. Basic RFM; recency is -1 for customers with no last_order_at
    today = pd.to_datetime("today") if as_of is None else pd.to_datetime(as_of)
    df['recency_days'] = (today - pd.to_datetime(df['last_order_at'])).dt.days.fillna(-1)
    df['frequency'] = df['order_count']
    df['monetary'] = df['purchased_amount']

    # Step 5. Join with the province lookup (if provided)
    if df_provinces is not None:
        # Keep exactly one row per non-null province_id: duplicate keys would
        # multiply customer rows, and pandas matches NaN keys to each other.
        province_lookup = (
            df_provinces[['province_id', 'province_name']]
            .dropna(subset=['province_id'])
            .drop_duplicates(subset='province_id')
        )
        df = df.merge(province_lookup, on='province_id', how='left')

    # Step 6. Keep only the columns that belong in the gold table
    keep_cols = [
        'customer_id', 'shop_id', 'name', 'gender', 'phone',
        'order_count', 'succeed_order_count', 'returned_order_count',
        'purchased_amount', 'last_order_at',
        'recency_days', 'frequency', 'monetary', 'is_vip',
        'province_id', 'province_name'
    ]

    df_gold = df[[c for c in keep_cols if c in df.columns]]

    return df_gold


In [23]:
# Build the gold customer dimension, enriching each customer with its province name
df_gold_customers = transform_customers(df=df_customers, df_provinces=df_provinces)
df_gold_customers

Unnamed: 0,customer_id,shop_id,name,gender,phone,order_count,succeed_order_count,returned_order_count,purchased_amount,last_order_at,recency_days,frequency,monetary,is_vip,province_id,province_name
0,c7a583d6-392d-4308-a02b-67c001ee03db,230361475,Thinh Bui,Nam,0903693389,1,0,0,0,NaT,-1.0,1,0,0,,
1,590ba7de-e4fe-43b3-8013-27fc69937edf,230361475,Truong Minh,Nam,0907809070,1,0,0,0,NaT,-1.0,1,0,0,,
2,a9e62388-57c2-4834-a87c-7c1025aea92d,230361475,Quach Quach,Nam,0986533988,1,0,0,0,NaT,-1.0,1,0,0,,
3,002810c4-5e96-4882-b85a-a85122e47108,230361475,Dung Vu,Nam,0977752936,1,0,0,0,NaT,-1.0,1,0,0,,
4,cd25d874-d782-40d1-8561-5507bbd6114d,230361475,Phạm Mụi,Nam,0774443833,1,0,0,0,NaT,-1.0,1,0,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36085,2f707d13-84e7-4128-bb45-0edc9eadd839,230361475,Hồ Hằng,Khác,0708125242,1,0,0,0,2022-01-26 16:26:37,1344.0,1,0,0,,
36086,38e64d8c-080f-44f7-bf9b-65bc6c8ce551,230361475,Hong Loan Thi,Khác,0977725898,1,0,0,0,2022-01-26 16:26:17,1344.0,1,0,0,,
36087,ea72b48f-b32f-408f-b749-b910a48bcd00,230361475,Trang Võ,Khác,0962377810,1,0,0,0,2022-01-26 16:25:53,1344.0,1,0,0,,
36088,30ad3a45-59d2-4015-a9fc-cc1cc0e56f2d,230361475,Thu Thuy,Khác,0976035779,1,0,0,0,2022-01-26 16:25:28,1344.0,1,0,0,,


In [24]:
# How many customers fall on each side of the VIP flag
df_gold_customers["is_vip"].value_counts()

is_vip
0    28767
1     7323
Name: count, dtype: int64

In [25]:
# Write the gold customer dimension through the Gold engine.
# if_exists="replace" recreates the table on every run; index=False leaves the
# DataFrame index out of the written columns.
df_gold_customers.to_sql(
    name="gold_dim_customers",
    con=engine_gold,
    index=False,
    if_exists="replace",
)
print("✅ Đã load gold_dim_customers vào schema Gold.")

✅ Đã load gold_dim_customers vào schema Gold.
