### LIBRARY

In [2]:
import pandas as pd
import requests
import random
import time

### SET UP THE ENVIRONMENT

In [None]:
# Request headers sent with every call in this notebook; they mimic a desktop
# Edge/Chrome browser session coming from tiki.vn.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0",
    "Accept-Language": 'en-US,en;q=0.9',
    # Only advertise encodings that requests can always decode. "br" and "zstd"
    # need optional packages; if the server picks one of them and the package is
    # missing, response.json() fails on the still-compressed body.
    "Accept-Encoding": "gzip, deflate",
    "Referer": "https://tiki.vn/",
    # NOTE(review): the next two headers are sent with empty values; presumably
    # leftover placeholders - confirm whether the API needs them at all.
    "From": "",
    "af-ac-enc-dat": "",
    "x-api-source": "pc"
}

### EXTRACT GROUP DATA

In [None]:
# Menu-config endpoint (desktop platform) requested by the fetch cell below.
URL = (
    "https://api.tiki.vn/raiden/v2/"
    "menu-config?platform=desktop"
)

In [None]:
# Fetch the menu configuration. The timeout keeps a stalled connection from
# hanging the kernel indefinitely.
response = requests.get(URL, headers=HEADERS, timeout=30)
time.sleep(random.uniform(3.2, 8.7))  # randomized pause after the request
if response.status_code == 200:
    data = response.json()
    # Single quotes inside the f-string: reusing the outer double quotes is a
    # SyntaxError on Python versions before 3.12.
    print(f"Success to fetch {len(data['menu_block']['items'])} groups.")
else:
    print("Failed to fetch data:", response.status_code)
    # Stop here: later cells read `data`, which would otherwise be undefined
    # (or stale from a previous run of this cell).
    raise RuntimeError(f"Menu config request failed with status {response.status_code}")


### EXTRACT GROUP ID
Keep only the groups whose IDs are 8322, 1846, and 1789.

In [None]:
# IDs of the groups this notebook keeps (compared as strings, because they are
# sliced out of the link text).
TARGET_GROUP_IDS = ("8322", "1846", "1789")

# Use distinct names for the raw menu items, the loop variable and the final
# DataFrame, so the loop no longer shadows the sequence it iterates over.
menu_items = data["menu_block"]["items"]
group_list = []
for menu_item in menu_items:
    # The ID is the last path segment of the link minus its first character
    # (presumably a letter prefix such as "c" - confirm against the payload).
    group_id = menu_item["link"].split("/")[-1][1:]

    if group_id in TARGET_GROUP_IDS:
        group_list.append([group_id, menu_item["text"]])

# One row per kept group; later cells iterate over group["GroupID"].
group = pd.DataFrame(group_list, columns=["GroupID", "Name"])

In [None]:
# Display the filtered groups DataFrame (columns: GroupID, Name).
group

### EXTRACT CATEGORY HIERARCHY

In [None]:
# Bounds (seconds) of the randomized pause taken after every request, and a
# per-request timeout so a stalled connection cannot hang the kernel.
DELAY_RANGE_SECONDS = (3.2, 8.7)
REQUEST_TIMEOUT_SECONDS = 30


def _fetch_child_categories(parent_id):
    """Fetch the categories listed under ``parent_id``.

    Args:
        parent_id: Value sent as the ``parent_id`` query parameter.

    Returns:
        The value stored under the response's ``"data"`` key (an empty list
        when that value is falsy), or ``None`` when the server answers with a
        non-200 status. The failure is printed rather than dropped silently.
    """
    url = f"https://tiki.vn/api/v2/categories?parent_id={parent_id}"
    response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT_SECONDS)
    time.sleep(random.uniform(*DELAY_RANGE_SECONDS))

    if response.status_code != 200:
        print(f"Failed to fetch children of {parent_id}:", response.status_code)
        return None
    return response.json()["data"] or []


# Rows for the three hierarchy levels, filled depth-first:
# group -> master category -> category -> sub category.
master_category_list = []
category_list = []
sub_category_list = []

for group_id in group["GroupID"]:
    master_categories = _fetch_child_categories(group_id)
    if master_categories is None:
        continue  # request failed: nothing recorded for this group
    if not master_categories:
        # No children: keep a placeholder row that carries the group's own name.
        group_name = group.loc[group["GroupID"] == group_id, "Name"].item()
        master_category_list.append([None, group_id, group_name])
        continue

    for parent_category in master_categories:
        parent_id = parent_category["id"]
        master_category_list.append([parent_id, group_id, parent_category["name"]])

        categories = _fetch_child_categories(parent_id)
        if categories is None:
            continue
        if not categories:
            # Placeholder row marking a master category without children.
            category_list.append([None, parent_id, None])
            continue

        for child_category in categories:
            child_id = child_category["id"]
            category_list.append([child_id, parent_id, child_category["name"]])

            sub_categories = _fetch_child_categories(child_id)
            if sub_categories is None:
                continue
            if not sub_categories:
                # Placeholder row marking a category without children.
                sub_category_list.append([None, child_id, None])
                continue

            for type_item in sub_categories:
                sub_category_list.append(
                    [type_item.get("id"), child_id, type_item.get("name")]
                )

master_category = pd.DataFrame(master_category_list, columns=["MasterCategoryID", "GroupID", "Name"])
category = pd.DataFrame(category_list, columns=["CategoryID", "MasterCategoryID", "Name"])
sub_category = pd.DataFrame(sub_category_list, columns=["SubCategoryID", "CategoryID", "Name"])

In [None]:
# Report how many rows were collected for each level of the hierarchy.
for level_label, level_frame in (
    ("master categories", master_category),
    ("categories", category),
    ("sub categories", sub_category),
):
    print(f"Success to fetch {len(level_frame)} {level_label}.")

### TRANSFORM

In [4]:
# Load the three hierarchy tables from CSV, in the order
# master_category, category, sub_category.
# NOTE(review): no cell visible here writes these files - presumably they were
# exported from an earlier crawl; confirm.
master_category, category, sub_category = map(
    pd.read_csv,
    (
        'narrow_category_data/master_category.csv',
        'narrow_category_data/category.csv',
        'narrow_category_data/sub_category.csv',
    ),
)

In [7]:
# Print the percentage of missing values in every master_category column,
# then preview the first rows.
print("Check null for each column in master_category")
for col_name, col_values in master_category.items():
    pct_missing = 100 * col_values.isna().mean()
    print(f"Null percentage for {col_name}: {pct_missing:.2f}%")
master_category.head()

Check null for each column in master_category
Null percentage for MasterCategoryID: 0.00%
Null percentage for GroupID: 0.00%
Null percentage for Name: 0.00%


Unnamed: 0,MasterCategoryID,GroupID,Name
0,320,8322,English Books
1,316,8322,Sách tiếng Việt
2,7741,8322,Văn phòng phẩm
3,18328,8322,Quà lưu niệm
4,1795,1789,Điện thoại Smartphone


In [8]:
# Print the percentage of missing values in every category column,
# then preview the first rows.
print("Check null for each column in category")
for col_name, col_values in category.items():
    pct_missing = 100 * col_values.isna().mean()
    print(f"Null percentage for {col_name}: {pct_missing:.2f}%")
category.head()

Check null for each column in category
Null percentage for CategoryID: 2.94%
Null percentage for MasterCategoryID: 0.00%
Null percentage for Name: 2.94%


Unnamed: 0,CategoryID,MasterCategoryID,Name
0,623.0,320,Art & Photography
1,27.0,320,Biographies & Memoirs
2,4.0,320,Business & Economics
3,614.0,320,How-to - Self Help
4,7.0,320,Children's Books


In [9]:
# Print the percentage of missing values in every sub_category column,
# then preview the first rows.
print("Check null for each column in sub_category")
for col_name, col_values in sub_category.items():
    pct_missing = 100 * col_values.isna().mean()
    print(f"Null percentage for {col_name}: {pct_missing:.2f}%")
sub_category.head()

Check null for each column in sub_category
Null percentage for SubCategoryID: 13.95%
Null percentage for CategoryID: 0.00%
Null percentage for Name: 13.95%


Unnamed: 0,SubCategoryID,CategoryID,Name
0,625.0,623,Architecture
1,5770.0,623,Graphic Design
2,626.0,623,"Religion, Culture"
3,9423.0,623,Photography
4,112.0,623,Decorative Arts & Design
