### LIBRARY

In [76]:
import pandas as pd
import requests
import random
import time

### SET UP THE ENVIRONMENT

In [77]:
# Request headers sent with every call in this notebook (browser-like User-Agent,
# tiki.vn as Referer).
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0",
    "Accept-Language": 'en-US,en;q=0.9',
    # Only advertise encodings that requests decodes out of the box. "br" and
    # "zstd" are decoded only when optional packages are installed; otherwise a
    # response compressed that way cannot be parsed by response.json().
    "Accept-Encoding": "gzip, deflate",
    "Referer": "https://tiki.vn/",
    # NOTE(review): the next two headers are sent with empty values — confirm
    # whether the API needs them at all.
    "From": "",
    "af-ac-enc-dat": "",
    "x-api-source": "pc"
}

### EXTRACT GROUP DATA

In [78]:
# Endpoint requested below to obtain the desktop menu configuration (the list of groups).
URL = "https://api.tiki.vn/raiden/v2/menu-config?platform=desktop"

In [79]:
# Fetch the menu configuration; on success `data` holds the decoded JSON payload.
# An explicit timeout is passed because requests otherwise waits indefinitely.
response = requests.get(URL, headers=HEADERS, timeout=30)
# Random pause after the call so that consecutive requests are spaced out.
time.sleep(random.uniform(3.2, 8.7))
if response.status_code == 200:
    data = response.json()
    # Computed outside the f-string: reusing double quotes inside an f-string
    # expression is a syntax error on Python versions before 3.12.
    group_count = len(data["menu_block"]["items"])
    print(f"Success to fetch {group_count} groups.")
else:
    print("Failed to fetch data:", response.status_code)
    # Stop here: the following cells read `data`, which would otherwise be
    # undefined (or left over from an earlier run of this cell).
    raise RuntimeError(f"Menu request failed with HTTP status {response.status_code}")


Success to fetch 26 groups.


### EXTRACT GROUP ID
Keep only the groups whose IDs are 1846 and 1789.

In [80]:
# IDs (as strings) of the menu groups to keep.
target_group_ids = ["1846", "1789"]

# Collect [GroupID, Name] pairs for the wanted groups, in menu order.
group_list = []
for menu_item in data["menu_block"]["items"]:
    link = menu_item["link"]
    # The ID is the last path segment minus its first character (presumably a
    # "c<ID>" slug — confirm against the API). Any query string or trailing
    # slash is dropped first so it cannot end up inside the extracted ID.
    last_segment = link.split("?")[0].rstrip("/").split("/")[-1]
    group_id = last_segment[1:]
    text = menu_item["text"]

    if group_id in target_group_ids:
        group_list.append([group_id, text])

# `group` is the DataFrame consumed by the later cells.
group = pd.DataFrame(group_list, columns=["GroupID", "Name"])
print(f"Success to extract {len(group)} needed groups.")
print(group["GroupID"].values)

Success to extract 2 needed groups.
['1789' '1846']


### EXTRACT CATEGORY HIERARCHY

In [81]:
def _fetch_child_categories(parent_id):
    """Request the categories whose parent is ``parent_id``.

    Returns the content of the response's "data" key (an empty list when it is
    empty or null), or None when the request did not return HTTP 200. A failed
    request is reported with a message instead of being dropped silently.
    """
    url = f"https://tiki.vn/api/v2/categories?parent_id={parent_id}"
    # Explicit timeout: requests otherwise waits indefinitely for a response.
    resp = requests.get(url, headers=HEADERS, timeout=30)
    # Random pause after every call so that requests are spaced out.
    time.sleep(random.uniform(3.2, 8.7))
    if resp.status_code != 200:
        print(f"Failed to fetch categories for parent_id={parent_id}:", resp.status_code)
        return None
    return resp.json()["data"] or []


# One row per leaf of the hierarchy: group -> master category -> category -> sub category.
# Levels that do not exist for a given branch are filled with None.
category_list = []
for group_id, group_name in zip(group["GroupID"], group["Name"]):
    parent_categories = _fetch_child_categories(group_id)
    if parent_categories is None:
        # Request failed: no row is recorded for this group.
        continue
    if not parent_categories:
        category_list.append([group_id, group_name, None, None, None, None, None, None])
        continue

    for parent_category in parent_categories:
        parent_id = parent_category["id"]
        parent_name = parent_category["name"]
        child_categories = _fetch_child_categories(parent_id)
        if child_categories is None:
            continue
        if not child_categories:
            category_list.append([group_id, group_name, parent_id, parent_name, None, None, None, None])
            continue

        for child_category in child_categories:
            child_id = child_category["id"]
            child_name = child_category["name"]
            type_items = _fetch_child_categories(child_id)
            if type_items is None:
                continue
            if not type_items:
                category_list.append([group_id, group_name, parent_id, parent_name, child_id, child_name, None, None])
                continue

            for type_item in type_items:
                type_id = type_item.get("id")
                type_name = type_item.get("name")
                category_list.append([group_id, group_name, parent_id, parent_name, child_id, child_name, type_id, type_name])

category_df = pd.DataFrame(category_list, columns=["GroupID", "GroupName", "MasterCategoryID", "MasterCategoryName", "CategoryID", "CategoryName", "SubCategoryID", "SubCategoryName"])

In [82]:
# Report how many rows the category hierarchy DataFrame contains.
print(f"Success to fetch {len(category_df)} categories.")

Success to fetch 82 categories.


### TRANSFORM CATEGORY HIERARCHY

In [83]:
def remove_single_category(df):
    """Blank out the category level for master categories with exactly one category.

    For every MasterCategoryID that has a single distinct CategoryID, the
    CategoryID and CategoryName of all its rows are set to None.

    Note: ``df`` is modified in place; the same object is returned.
    """
    # Number of distinct CategoryID values under each MasterCategoryID.
    distinct_per_master = df.groupby('MasterCategoryID')['CategoryID'].nunique()

    # MasterCategoryIDs that own exactly one category.
    lone_masters = distinct_per_master[distinct_per_master == 1].index

    # Rows belonging to those master categories lose their category columns.
    rows_to_blank = df['MasterCategoryID'].isin(lone_masters)
    df.loc[rows_to_blank, ['CategoryID', 'CategoryName']] = [None, None]

    return df

In [84]:
# remove_single_category modifies its argument in place and returns it, so
# cleaned_df refers to the same (now modified) object as category_df.
cleaned_df = remove_single_category(category_df)

In [85]:
def transform_category(row):
    """Fill missing hierarchy levels of one row from the level above.

    Returns a pd.Series with, in order: master category ID and name, category ID
    and name, isCategory flag, sub category ID and name, isSubCategory flag.
    A flag is 1 when the row carried its own value for that level and 0 when the
    value was inherited from the level above.
    """
    master_category_id = row['MasterCategoryID']
    master_category_name = row['MasterCategoryName']

    # Category level: fall back to the master category when absent.
    has_category = not pd.isna(row['CategoryID'])
    if has_category:
        category_id, category_name = row['CategoryID'], row['CategoryName']
    else:
        category_id, category_name = master_category_id, master_category_name

    # Sub category level: fall back to the (possibly inherited) category when absent.
    has_sub_category = not pd.isna(row['SubCategoryID'])
    if has_sub_category:
        sub_category_id, sub_category_name = row['SubCategoryID'], row['SubCategoryName']
    else:
        sub_category_id, sub_category_name = category_id, category_name

    values = [
        master_category_id, master_category_name,
        category_id, category_name, int(has_category),
        sub_category_id, sub_category_name, int(has_sub_category),
    ]
    return pd.Series(values)

In [86]:
print("Check null for each column in category")
# Percentage of missing values per column, computed for the whole frame at once.
null_share = category_df.isnull().mean() * 100
for column, null_percentage in null_share.items():
    print(f"Null percentage for {column}: {null_percentage:.2f}%")
category_df.head()

Check null for each column in category
Null percentage for GroupID: 0.00%
Null percentage for GroupName: 0.00%
Null percentage for MasterCategoryID: 0.00%
Null percentage for MasterCategoryName: 0.00%
Null percentage for CategoryID: 6.10%
Null percentage for CategoryName: 6.10%
Null percentage for SubCategoryID: 57.32%
Null percentage for SubCategoryName: 57.32%


Unnamed: 0,GroupID,GroupName,MasterCategoryID,MasterCategoryName,CategoryID,CategoryName,SubCategoryID,SubCategoryName
0,1789,Điện Thoại - Máy Tính Bảng,1795,Điện thoại Smartphone,,,,
1,1789,Điện Thoại - Máy Tính Bảng,1794,Máy tính bảng,,,,
2,1789,Điện Thoại - Máy Tính Bảng,28856,Máy đọc sách,,,,
3,1789,Điện Thoại - Máy Tính Bảng,1796,Điện thoại phổ thông,,,,
4,1789,Điện Thoại - Máy Tính Bảng,8061,Điện thoại bàn,,,,


In [87]:
# Apply transform_category to every row and write its eight returned values back,
# which fills the missing levels and adds the isCategory / isSubCategory flags.
# Guard against re-running this cell: once the ID columns have been filled, a
# second pass would see no missing values and set both flags to 1 on every row.
if "isCategory" not in category_df.columns:
    category_df[['MasterCategoryID', 'MasterCategoryName', 'CategoryID', 'CategoryName',
                 'isCategory', 'SubCategoryID', 'SubCategoryName', 'isSubCategory']] = category_df.apply(transform_category, axis=1, result_type='expand')
else:
    print("category_df is already transformed; skipping so the isCategory/isSubCategory flags stay intact.")

In [88]:
print("Check null for each column in category")
# Missing-value percentage for every column, taken from one frame-wide computation.
null_share = category_df.isnull().mean() * 100
for column, null_percentage in null_share.items():
    print(f"Null percentage for {column}: {null_percentage:.2f}%")
category_df.head()

Check null for each column in category
Null percentage for GroupID: 0.00%
Null percentage for GroupName: 0.00%
Null percentage for MasterCategoryID: 0.00%
Null percentage for MasterCategoryName: 0.00%
Null percentage for CategoryID: 0.00%
Null percentage for CategoryName: 0.00%
Null percentage for SubCategoryID: 0.00%
Null percentage for SubCategoryName: 0.00%
Null percentage for isCategory: 0.00%
Null percentage for isSubCategory: 0.00%


Unnamed: 0,GroupID,GroupName,MasterCategoryID,MasterCategoryName,CategoryID,CategoryName,SubCategoryID,SubCategoryName,isCategory,isSubCategory
0,1789,Điện Thoại - Máy Tính Bảng,1795,Điện thoại Smartphone,1795.0,Điện thoại Smartphone,1795.0,Điện thoại Smartphone,0,0
1,1789,Điện Thoại - Máy Tính Bảng,1794,Máy tính bảng,1794.0,Máy tính bảng,1794.0,Máy tính bảng,0,0
2,1789,Điện Thoại - Máy Tính Bảng,28856,Máy đọc sách,28856.0,Máy đọc sách,28856.0,Máy đọc sách,0,0
3,1789,Điện Thoại - Máy Tính Bảng,1796,Điện thoại phổ thông,1796.0,Điện thoại phổ thông,1796.0,Điện thoại phổ thông,0,0
4,1789,Điện Thoại - Máy Tính Bảng,8061,Điện thoại bàn,8061.0,Điện thoại bàn,8061.0,Điện thoại bàn,0,0


In [89]:
# Display the column index of category_df.
category_df.columns

Index(['GroupID', 'GroupName', 'MasterCategoryID', 'MasterCategoryName',
       'CategoryID', 'CategoryName', 'SubCategoryID', 'SubCategoryName',
       'isCategory', 'isSubCategory'],
      dtype='object')

In [90]:
# Split the flat hierarchy into one de-duplicated table per level. Each table
# keeps its own ID, the ID of its parent level, and its name under "Name".
group = group.drop_duplicates()

master_category = (
    category_df[["MasterCategoryID", "GroupID", "MasterCategoryName"]]
    .drop_duplicates()
    .rename(columns={"MasterCategoryName": "Name"})
)

category = (
    category_df[["CategoryID", "MasterCategoryID", "CategoryName", "isCategory"]]
    .drop_duplicates()
    .rename(columns={"CategoryName": "Name"})
)

sub_category = (
    category_df[["SubCategoryID", "CategoryID", "SubCategoryName", "isSubCategory"]]
    .drop_duplicates()
    .rename(columns={"SubCategoryName": "Name"})
)

In [91]:
# Report the row count of each de-duplicated table.
print(f"Success to transform {len(master_category)} master categories.")
print(f"Success to transform {len(category)} categories.")
print(f"Success to transform {len(sub_category)} sub categories.")

Success to transform 11 master categories.
Success to transform 59 categories.
Success to transform 82 sub categories.
