### LIBRARY

In [1]:
import pandas as pd
import requests
import random
import time

### SET UP THE ENVIRONMENT

In [2]:
# Request headers passed as `headers=HEADERS` to every requests.get call in this notebook.
# The User-Agent and Referer values present the client as a desktop browser coming from tiki.vn.
# NOTE(review): "br" and "zstd" are advertised in Accept-Encoding; requests/urllib3 presumably
# need optional decoder packages to decompress those encodings — confirm they are installed.
# NOTE(review): "From" and "af-ac-enc-dat" are sent with empty values — confirm they are required.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0",
    "Accept-Language": 'en-US,en;q=0.9',
    "Accept-Encoding": "gzip, deflate, br, zstd",
    "Referer": "https://tiki.vn/",
    "From": "",
    "af-ac-enc-dat": "",
    "x-api-source": "pc"
}

### EXTRACT GROUP DATA

In [3]:
# Menu-configuration endpoint; the fetch cell reads the groups from ["menu_block"]["items"] of its JSON.
URL = "https://api.tiki.vn/raiden/v2/menu-config?platform=desktop"

In [4]:
# Fetch the menu configuration; the groups live under data["menu_block"]["items"].
# The timeout makes a stalled connection raise instead of hanging the kernel indefinitely.
response = requests.get(URL, headers=HEADERS, timeout=30)
# Random pause after the call to space out requests to the API.
time.sleep(random.uniform(3.2, 8.7))
if response.status_code == 200:
    data = response.json()
    # Single quotes inside the f-string keep this line valid on Python versions before 3.12,
    # where reusing the enclosing quote character inside a replacement field is a SyntaxError.
    print(f"Success to fetch {len(data['menu_block']['items'])} groups.")
else:
    print("Failed to fetch data:", response.status_code)


Success to fetch 26 groups.


### EXTRACT GROUP ID
Extract the group IDs 8322, 1846, and 1789.

In [5]:
# IDs of the groups to keep, as strings because they are compared against text parsed from links.
TARGET_GROUP_IDS = {"8322", "1846", "1789"}

# Distinct names for the raw list, the loop item and the final frame: the previous version
# iterated `for group in group`, rebinding the iterable's own name on every pass.
menu_items = data["menu_block"]["items"]
group_list = []
for menu_item in menu_items:
    # The ID is the last "/"-separated segment of the link with its first character dropped
    # (presumably links end in ".../c<id>" — verify against the API response).
    group_id = menu_item["link"].split("/")[-1][1:]
    if group_id in TARGET_GROUP_IDS:
        group_list.append([group_id, menu_item["text"]])

# `group` is the name downstream cells read, so the resulting frame keeps it.
group = pd.DataFrame(group_list, columns=["GroupID", "Name"])
print(f"Success to extract {len(group)} needed groups.")
print(group["GroupID"].values)

Success to extract 3 needed groups.
['8322' '1789' '1846']


### EXTRACT CATEGORY HIERARCHY

In [6]:
def fetch_child_categories(parent_id):
    """Return the direct child categories of ``parent_id`` from the Tiki categories API.

    Sends one GET request, then sleeps for a random interval to space out calls.

    Args:
        parent_id: ID of the group or category whose children are requested.

    Returns:
        The list found under the ``"data"`` key of the JSON response (an empty
        list when that value is empty), or ``None`` when the response status is
        not 200 so the caller can tell a failed request from a childless node.
    """
    url = f"https://tiki.vn/api/v2/categories?parent_id={parent_id}"
    # The timeout makes a stalled connection raise instead of hanging the kernel indefinitely.
    response = requests.get(url, headers=HEADERS, timeout=30)
    time.sleep(random.uniform(3.2, 8.7))
    if response.status_code != 200:
        # Previously a failed request was skipped without any trace; report it so missing
        # branches of the hierarchy are visible in the cell output.
        print(f"Failed to fetch children of {parent_id}:", response.status_code)
        return None
    return response.json()["data"] or []


# Walk the hierarchy group -> master category -> category -> sub category.
# Each row has 8 fields; levels that do not exist below a node are left as None.
category_list = []
for group_id, group_name in zip(group["GroupID"], group["Name"]):
    parent_categories = fetch_child_categories(group_id)
    if parent_categories is None:
        continue  # request failed: nothing is recorded for this group
    if not parent_categories:
        category_list.append([group_id, group_name, None, None, None, None, None, None])
        continue

    for parent_category in parent_categories:
        parent_id = parent_category["id"]
        parent_name = parent_category["name"]
        child_categories = fetch_child_categories(parent_id)
        if child_categories is None:
            continue  # request failed: nothing is recorded for this master category
        if not child_categories:
            category_list.append([group_id, group_name, parent_id, parent_name, None, None, None, None])
            continue

        for child_category in child_categories:
            child_id = child_category["id"]
            child_name = child_category["name"]
            type_items = fetch_child_categories(child_id)
            if type_items is None:
                continue  # request failed: nothing is recorded for this category
            if not type_items:
                category_list.append([group_id, group_name, parent_id, parent_name, child_id, child_name, None, None])
                continue

            for type_item in type_items:
                # .get() is kept at this level: a missing id/name yields None rather than raising.
                type_id = type_item.get("id")
                type_name = type_item.get("name")
                category_list.append([group_id, group_name, parent_id, parent_name, child_id, child_name, type_id, type_name])

category_df = pd.DataFrame(category_list, columns=["GroupID", "GroupName", "MasterCategoryID", "MasterCategoryName", "CategoryID", "CategoryName", "SubCategoryID", "SubCategoryName"])

In [7]:
print(f"Success to fetch {len(category_df)} categories.")

Success to fetch 470 categories.


### TRANSFORM

In [8]:
def transform_row(row):
    """Build the five output fields for one category row.

    Args:
        row: Mapping-like row exposing 'CategoryID', 'CategoryName',
            'SubCategoryID' and 'SubCategoryName'.

    Returns:
        pd.Series of [CategoryID, CategoryName, leaf ID, leaf name, flag].
        When 'SubCategoryID' is missing, the leaf fields repeat the category
        fields and the flag is 0; otherwise they are the sub-category fields
        and the flag is 1.
    """
    has_sub_category = not pd.isna(row['SubCategoryID'])
    if has_sub_category:
        leaf_id, leaf_name, flag = row['SubCategoryID'], row['SubCategoryName'], 1
    else:
        leaf_id, leaf_name, flag = row['CategoryID'], row['CategoryName'], 0
    return pd.Series([row['CategoryID'], row['CategoryName'], leaf_id, leaf_name, flag])

In [9]:
# Report the share of missing values per column, then preview the first rows.
print("Check null for each column in category")
null_share = category_df.isnull().mean() * 100
for column, null_percentage in null_share.items():
    print(f"Null percentage for {column}: {null_percentage:.2f}%")
category_df.head()

Check null for each column in category
Null percentage for GroupID: 0.00%
Null percentage for GroupName: 0.00%
Null percentage for MasterCategoryID: 0.00%
Null percentage for MasterCategoryName: 0.00%
Null percentage for CategoryID: 0.85%
Null percentage for CategoryName: 0.85%
Null percentage for SubCategoryID: 14.68%
Null percentage for SubCategoryName: 14.68%


Unnamed: 0,GroupID,GroupName,MasterCategoryID,MasterCategoryName,CategoryID,CategoryName,SubCategoryID,SubCategoryName
0,8322,Nhà Sách Tiki,320,English Books,623.0,Art & Photography,625.0,Architecture
1,8322,Nhà Sách Tiki,320,English Books,623.0,Art & Photography,5770.0,Graphic Design
2,8322,Nhà Sách Tiki,320,English Books,623.0,Art & Photography,626.0,"Religion, Culture"
3,8322,Nhà Sách Tiki,320,English Books,623.0,Art & Photography,9423.0,Photography
4,8322,Nhà Sách Tiki,320,English Books,623.0,Art & Photography,112.0,Decorative Arts & Design


In [10]:
# transform_row copies CategoryID into SubCategoryID for rows that have no sub-category, so a
# second pass over already-transformed rows would see a populated SubCategoryID and set
# isSubCategory to 1 on them. Guarding on the output column keeps this cell safe to re-run.
if 'isSubCategory' not in category_df.columns:
    category_df[['CategoryID', 'CategoryName', 'SubCategoryID', 'SubCategoryName', 'isSubCategory']] = category_df.apply(transform_row, axis=1, result_type='expand')

In [11]:
# Re-check the share of missing values per column after the transform.
print("Check null for each column in category")
null_share = category_df.isnull().mean() * 100
for column, null_percentage in null_share.items():
    print(f"Null percentage for {column}: {null_percentage:.2f}%")

Check null for each column in category
Null percentage for GroupID: 0.00%
Null percentage for GroupName: 0.00%
Null percentage for MasterCategoryID: 0.00%
Null percentage for MasterCategoryName: 0.00%
Null percentage for CategoryID: 0.85%
Null percentage for CategoryName: 0.85%
Null percentage for SubCategoryID: 0.85%
Null percentage for SubCategoryName: 0.85%
Null percentage for isSubCategory: 0.00%


In [12]:
# Display the column index to confirm that isSubCategory is now part of the frame.
category_df.columns

Index(['GroupID', 'GroupName', 'MasterCategoryID', 'MasterCategoryName',
       'CategoryID', 'CategoryName', 'SubCategoryID', 'SubCategoryName',
       'isSubCategory'],
      dtype='object')

In [13]:
# Split the flat hierarchy into one de-duplicated table per level.
# Each table keeps its own ID, the ID of its parent level, and its name column renamed to "Name".
group = group.drop_duplicates()

master_category = (
    category_df[["MasterCategoryID", "GroupID", "MasterCategoryName"]]
    .drop_duplicates()
    .rename(columns={"MasterCategoryName": "Name"})
)

category = (
    category_df[["CategoryID", "MasterCategoryID", "CategoryName"]]
    .drop_duplicates()
    .rename(columns={"CategoryName": "Name"})
)

sub_category = (
    category_df[["SubCategoryID", "CategoryID", "SubCategoryName", "isSubCategory"]]
    .drop_duplicates()
    .rename(columns={"SubCategoryName": "Name"})
)

In [15]:
# Report the row count of each de-duplicated level table.
print(f"Success to transform {len(master_category)} master categories.")
print(f"Success to transform {len(category)} categories.")
print(f"Success to transform {len(sub_category)} sub categories.")

Success to transform 15 master categories.
Success to transform 136 categories.
Success to transform 467 sub categories.


In [18]:
# Write the transformed category_df to category.csv without the index column.
# The utf-8-sig codec prepends a UTF-8 byte-order mark to the file.
category_df.to_csv("category.csv", index=False, encoding="utf-8-sig")