# A. Biến đổi và lấy những dữ liệu giá trị

## 1. Import thư viện & Config

In [1]:
import json
import os
from urllib.parse import quote_plus

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine

# Load environment variables from the .env file
load_dotenv()

API_KEY   = os.getenv("API_KEY")
DB_USER   = os.getenv("DB_USER")
DB_PASS   = os.getenv("DB_PASS")
DB_HOST   = os.getenv("DB_HOST")
DB_PORT   = os.getenv("DB_PORT")
DB_BRONZE = os.getenv("DB_BRONZE")
DB_SILVER = os.getenv("DB_SILVER")

# Fail fast with a readable message: os.getenv returns None for an unset
# variable, and the f-strings below would otherwise embed the literal text
# "None" into the connection URL.
_db_settings = {
    "DB_USER": DB_USER,
    "DB_PASS": DB_PASS,
    "DB_HOST": DB_HOST,
    "DB_PORT": DB_PORT,
    "DB_BRONZE": DB_BRONZE,
    "DB_SILVER": DB_SILVER,
}
_missing_settings = [key for key, value in _db_settings.items() if value is None]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(_missing_settings)
    )

# Percent-encode the credentials so characters such as "@", ":" or "/" in the
# user name or password cannot be mistaken for URL delimiters.
# NOTE(review): this assumes .env stores the raw (not already URL-encoded)
# values - confirm before deploying.
_db_base_url = (
    f"mysql+pymysql://{quote_plus(DB_USER)}:{quote_plus(DB_PASS)}"
    f"@{DB_HOST}:{DB_PORT}"
)

# Create the engines used to connect to the Bronze DB and the Silver DB
bronze_engine = create_engine(f"{_db_base_url}/{DB_BRONZE}")
silver_engine = create_engine(f"{_db_base_url}/{DB_SILVER}")


## 2. Lấy dữ liệu Customers và Province từ tầng Bronze

In [2]:
# Read every row of the customers_raw table through the Bronze engine,
# report the frame's dimensions, then preview the first rows.
customers_df = pd.read_sql(sql="SELECT * FROM customers_raw", con=bronze_engine)
print("Shape:", customers_df.shape)
customers_df.head()


Shape: (36090, 4)


Unnamed: 0,shop_id,customer_id,raw_json,extracted_at
0,230361475,e906c5c7-ea19-42e5-a971-57cf09c94417,"{""returned_order_count"": 0, ""is_block"": false,...",2025-09-26 12:35:36
1,230361475,6c5b40d2-ae42-4e13-8019-f3f9127d75a0,"{""returned_order_count"": 0, ""is_block"": false,...",2025-09-26 12:35:36
2,230361475,765c531b-743f-4f30-b6b0-0d6f3579630f,"{""returned_order_count"": 0, ""is_block"": false,...",2025-09-26 12:35:36
3,230361475,f169b9a8-9a87-4c74-a296-ba9b505c63a6,"{""returned_order_count"": 0, ""is_block"": false,...",2025-09-26 12:35:36
4,230361475,4b7b436f-93ff-49ee-89e5-157eb85ff1bb,"{""returned_order_count"": 0, ""is_block"": false,...",2025-09-26 12:35:36


## 3. Biến đổi

Lấy những cột dữ liệu có giá trị

In [9]:
def _first_or(items, fallback):
    """Return the first element of ``items`` when it is truthy, else ``fallback``."""
    return items[0] if items else fallback


def _flatten_customer(record):
    """Pick the fields of interest out of one parsed customer payload.

    Address fields come from the first entry of ``shop_customer_addresses``
    and the phone from the first entry of ``phone_numbers``; when either list
    is missing or empty the corresponding values are ``None``.
    """
    address = _first_or(record.get("shop_customer_addresses"), {})
    return {
        "customer_id": record.get("customer_id"),                    # customer ID
        "shop_id": record.get("shop_id"),                            # shop ID
        "name": record.get("name"),                                  # customer name
        "gender": record.get("gender"),                              # gender
        "phone": _first_or(record.get("phone_numbers"), None),       # phone number
        "fb_id": record.get("fb_id"),                                # Facebook ID
        "order_count": record.get("order_count"),                    # number of orders
        "succeed_order_count": record.get("succeed_order_count"),    # successful orders
        "returned_order_count": record.get("returned_order_count"),  # returned orders
        "purchased_amount": record.get("purchased_amount"),          # total spend
        "full_address": address.get("full_address"),                 # full address
        "province_id": address.get("province_id"),                   # province code
        "inserted_at": record.get("inserted_at"),                    # creation date
    }


# Decode the raw_json text of every row into a Python object.
customers_df["raw_dict"] = customers_df["raw_json"].apply(json.loads)

# Build one flat record per customer and assemble the frame in a single step.
customers_clean = pd.DataFrame(
    [_flatten_customer(payload) for payload in customers_df["raw_dict"]]
)

customers_clean.head()


Unnamed: 0,customer_id,shop_id,name,gender,phone,fb_id,order_count,succeed_order_count,returned_order_count,purchased_amount,full_address,province_id,inserted_at
0,c7a583d6-392d-4308-a02b-67c001ee03db,230361475,Thinh Bui,male,903693389,377626778776391_31407011708897267,1,0,0,0,,,2025-08-16T02:19:59
1,590ba7de-e4fe-43b3-8013-27fc69937edf,230361475,Truong Minh,male,907809070,377626778776391_24536870619280831,1,0,0,0,,,2025-08-15T12:56:50
2,a9e62388-57c2-4834-a87c-7c1025aea92d,230361475,Quach Quach,male,986533988,377626778776391_24712861355019072,1,0,0,0,,,2025-08-15T12:45:17
3,002810c4-5e96-4882-b85a-a85122e47108,230361475,Dung Vu,male,977752936,377626778776391_24254873767534636,1,0,0,0,,,2025-08-15T12:04:38
4,cd25d874-d782-40d1-8561-5507bbd6114d,230361475,Phạm Mụi,male,774443833,377626778776391_30782170181397391,1,0,0,0,,,2025-08-15T10:26:23


## 4. Làm sạch

In [10]:
# Display the frame produced by the extraction step.
customers_clean

Unnamed: 0,customer_id,shop_id,name,gender,phone,fb_id,order_count,succeed_order_count,returned_order_count,purchased_amount,full_address,province_id,inserted_at
0,c7a583d6-392d-4308-a02b-67c001ee03db,230361475,Thinh Bui,male,0903693389,377626778776391_31407011708897267,1,0,0,0,,,2025-08-16T02:19:59
1,590ba7de-e4fe-43b3-8013-27fc69937edf,230361475,Truong Minh,male,0907809070,377626778776391_24536870619280831,1,0,0,0,,,2025-08-15T12:56:50
2,a9e62388-57c2-4834-a87c-7c1025aea92d,230361475,Quach Quach,male,0986533988,377626778776391_24712861355019072,1,0,0,0,,,2025-08-15T12:45:17
3,002810c4-5e96-4882-b85a-a85122e47108,230361475,Dung Vu,male,0977752936,377626778776391_24254873767534636,1,0,0,0,,,2025-08-15T12:04:38
4,cd25d874-d782-40d1-8561-5507bbd6114d,230361475,Phạm Mụi,male,0774443833,377626778776391_30782170181397391,1,0,0,0,,,2025-08-15T10:26:23
...,...,...,...,...,...,...,...,...,...,...,...,...,...
36085,2f707d13-84e7-4128-bb45-0edc9eadd839,230361475,Hồ Hằng,,0708125242,107105121585735_4780002535412519,1,0,0,0,,,2022-01-26T16:26:37
36086,38e64d8c-080f-44f7-bf9b-65bc6c8ce551,230361475,Hong Loan Thi,,0977725898,107105121585735_4795694433885349,1,0,0,0,,,2022-01-26T16:26:17
36087,ea72b48f-b32f-408f-b749-b910a48bcd00,230361475,Trang Võ,,0962377810,107105121585735_4871714639578304,1,0,0,0,,,2022-01-26T16:25:53
36088,30ad3a45-59d2-4015-a9fc-cc1cc0e56f2d,230361475,Thu Thuy,,0976035779,107105121585735_4774890722602773,1,0,0,0,,,2022-01-26T16:25:28


### 4.1. Chuẩn hóa kiểu dữ liệu

In [13]:
# Summarise column dtypes and non-null counts.
# NOTE(review): this cell's execution count (13) is higher than that of the
# datetime-conversion cell that follows it (12), and the recorded output
# already shows inserted_at as datetime64[ns]; it appears to have been run
# after the conversion, so a fresh top-to-bottom run would likely show a
# different dtype here - confirm and consider moving this cell below it.
customers_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36090 entries, 0 to 36089
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   customer_id           36090 non-null  object        
 1   shop_id               36090 non-null  int64         
 2   name                  36083 non-null  object        
 3   gender                30729 non-null  object        
 4   phone                 36000 non-null  object        
 5   fb_id                 35955 non-null  object        
 6   order_count           36090 non-null  int64         
 7   succeed_order_count   36090 non-null  int64         
 8   returned_order_count  36090 non-null  int64         
 9   purchased_amount      36090 non-null  int64         
 10  full_address          32459 non-null  object        
 11  province_id           32449 non-null  object        
 12  inserted_at           36090 non-null  datetime64[ns]
dtypes: datetime64[ns

In [12]:
# Parse the inserted_at values into pandas datetimes, then store the parsed
# series back under the same column name.
inserted_at_parsed = pd.to_datetime(customers_clean["inserted_at"])
customers_clean["inserted_at"] = inserted_at_parsed

In [14]:
# Count the rows that are exact duplicates of an earlier row.
customers_clean.duplicated().sum()

np.int64(0)

### 4.2. Điền giá trị null bằng unknown

In [15]:
# Count missing values per column.
customers_clean.isna().sum()

customer_id                0
shop_id                    0
name                       7
gender                  5361
phone                     90
fb_id                    135
order_count                0
succeed_order_count        0
returned_order_count       0
purchased_amount           0
full_address            3631
province_id             3641
inserted_at                0
dtype: int64

In [16]:
# Fill missing values with the 'unknown' sentinel, but only in the text
# columns. A blanket DataFrame.fillna('unknown') would also write the string
# into numeric or datetime columns if they ever contained nulls, turning them
# into mixed-type object columns. Assigning the result back (instead of
# inplace=True) keeps the cell explicit about what it changes.
TEXT_COLUMNS = ["name", "gender", "phone", "fb_id", "full_address", "province_id"]
customers_clean[TEXT_COLUMNS] = customers_clean[TEXT_COLUMNS].fillna("unknown")

### 4.3. Join với bảng Province để có thông tin tỉnh/thành phố

In [17]:
# Read every row of the province_raw table through the Bronze engine,
# report the frame's dimensions, then preview the first rows.
provinces_df = pd.read_sql(sql="SELECT * FROM province_raw", con=bronze_engine)
print("Shape:", provinces_df.shape)
provinces_df.head()

Shape: (63, 2)


Unnamed: 0,province_id,province_name
0,805,An Giang
1,221,Bắc Giang
2,207,Bắc Kạn
3,821,Bạc Liêu
4,106,Bắc Ninh


In [18]:
# Summarise column dtypes and non-null counts of the province lookup table.
provinces_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63 entries, 0 to 62
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   province_id    63 non-null     object
 1   province_name  63 non-null     object
dtypes: object(2)
memory usage: 1.1+ KB


In [19]:
# Attach province_name to each customer via province_id.
# how="left" keeps every customer row even when there is no matching province.
# validate="many_to_one" makes pandas raise a MergeError if province_id is not
# unique in provinces_df, instead of silently duplicating customer rows.
customers_enriched = customers_clean.merge(
    provinces_df,
    on="province_id",
    how="left",
    validate="many_to_one",
)

customers_enriched.head()


Unnamed: 0,customer_id,shop_id,name,gender,phone,fb_id,order_count,succeed_order_count,returned_order_count,purchased_amount,full_address,province_id,inserted_at,province_name
0,c7a583d6-392d-4308-a02b-67c001ee03db,230361475,Thinh Bui,male,903693389,377626778776391_31407011708897267,1,0,0,0,unknown,unknown,2025-08-16 02:19:59,
1,590ba7de-e4fe-43b3-8013-27fc69937edf,230361475,Truong Minh,male,907809070,377626778776391_24536870619280831,1,0,0,0,unknown,unknown,2025-08-15 12:56:50,
2,a9e62388-57c2-4834-a87c-7c1025aea92d,230361475,Quach Quach,male,986533988,377626778776391_24712861355019072,1,0,0,0,unknown,unknown,2025-08-15 12:45:17,
3,002810c4-5e96-4882-b85a-a85122e47108,230361475,Dung Vu,male,977752936,377626778776391_24254873767534636,1,0,0,0,unknown,unknown,2025-08-15 12:04:38,
4,cd25d874-d782-40d1-8561-5507bbd6114d,230361475,Phạm Mụi,male,774443833,377626778776391_30782170181397391,1,0,0,0,unknown,unknown,2025-08-15 10:26:23,


In [23]:
# Inspect the non-null count and dtype of the merged province_name column.
# NOTE(review): this cell's execution count (23) is higher than that of the
# fillna cell that follows it (22), so the recorded "36090 non-null" appears
# to reflect the column after nulls were filled - confirm and consider moving
# this cell below the fill.
customers_enriched['province_name'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 36090 entries, 0 to 36089
Series name: province_name
Non-Null Count  Dtype 
--------------  ----- 
36090 non-null  object
dtypes: object(1)
memory usage: 282.1+ KB


In [22]:
# Customers whose province_id has no match in provinces_df come out of the
# left merge with a null province_name; label those 'unknown'. Only this
# column is filled so the string sentinel can never be written into numeric
# or datetime columns.
customers_enriched["province_name"] = customers_enriched["province_name"].fillna("unknown")

## 5. Load vào tầng Silver

In [24]:
# Write the enriched customers frame to the "customers" table through the
# Silver engine. if_exists="replace" overwrites an existing table (switch to
# "append" to add rows across repeated runs); index=False leaves the
# DataFrame index out of the table.
load_options = {"if_exists": "replace", "index": False}
customers_enriched.to_sql(name="customers", con=silver_engine, **load_options)

print("✅ Load customers vào Silver DB thành công!")

✅ Load customers vào Silver DB thành công!
