### Import lib

In [1]:
import pandas as pd
import requests
import math
import time

### Config header and url for request

In [2]:
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0",
    "Accept-Language": 'en-US,en;q=0.9',
    "Accept-Encoding": "gzip, deflate, br, zstd",
    "Referer": "https://tiki.vn/",
    "From": "",
    "af-ac-enc-dat": "",
    "x-api-source": "pc"
}

In [3]:
URL = "https://api.tiki.vn/raiden/v2/menu-config?platform=desktop"

In [4]:
response = requests.get(URL, headers=HEADERS)
data = response.json()

### Retrieve Group ID

In [5]:
groups = data["menu_block"]["items"]
group_list = []

for group in groups:
    text = group["text"]
    link = group["link"]
    # Extract category ID from the link
    group_id = link.split("/")[-1][1:]  
    group_list.append([group_id, text])

group_df = pd.DataFrame(group_list, columns=["Group_ID", "Group_Name"])
group_df

Unnamed: 0,Group_ID,Group_Name
0,8322,Nhà Sách Tiki
1,1883,Nhà Cửa - Đời Sống
2,1789,Điện Thoại - Máy Tính Bảng
3,2549,Đồ Chơi - Mẹ & Bé
4,1815,Thiết Bị Số - Phụ Kiện Số
5,1882,Điện Gia Dụng
6,1520,Làm Đẹp - Sức Khỏe
7,8594,Ô Tô - Xe Máy - Xe Đạp
8,931,Thời trang nữ
9,4384,Bách Hóa Online


### Choosing the Group ID: 8322 and 17166

In [6]:
# Filtering 8322 and 17166 
filtered_df = group_df[group_df["Group_ID"].isin(["8322", "17166"])]
filtered_df

Unnamed: 0,Group_ID,Group_Name
0,8322,Nhà Sách Tiki
12,17166,Cross Border - Hàng Quốc Tế


In [7]:
parent_category_list = []
child_category_list = []
type_info_list = []

for group_id in filtered_df["Group_ID"]:
    parent_url = f"https://tiki.vn/api/v2/categories?parent_id={group_id}"   
    parent_response = requests.get(parent_url, headers=HEADERS)
    
    if parent_response.status_code == 200:
        parent_data = parent_response.json()
        if not parent_data["data"]:
            parent_category_list.append([group_id, filtered_df[filtered_df["Group_ID"] == group_id]["Name"].item(), None, None])
        else:
            for parent_category in parent_data["data"]:
                parent_id = parent_category["id"]
                parent_name = parent_category["name"]
                
                # Get group name 
                group_name = filtered_df[filtered_df["Group_ID"] == group_id]["Group_Name"].item()
                
                parent_category_list.append([group_id, group_name, parent_id, parent_name])
                
                # Now fetch child categories
                child_url = f"https://tiki.vn/api/v2/categories?parent_id={parent_id}" 
                child_response = requests.get(child_url, headers=HEADERS)
                
                if child_response.status_code == 200:
                    child_data = child_response.json()
                    if not child_data["data"]:
                        child_category_list.append([group_id, group_name, parent_id, parent_name, None, None])
                    else:
                        for child_category in child_data["data"]:
                            child_id = child_category["id"]
                            child_name = child_category["name"]
                            
                            child_category_list.append([group_id, group_name, parent_id, parent_name, child_id, child_name])
                            
                            # Fetch type information
                            if pd.isna(child_id):
                                type_info_list.append([group_id, group_name, parent_id, parent_name, None, None, None, None])
                            else:
                                type_url = f"https://tiki.vn/api/v2/categories?parent_id={child_id}"
                                type_response = requests.get(type_url, headers=HEADERS)

                                if type_response.status_code == 200:
                                    type_data = type_response.json()
                                    if type_data["data"]:
                                        for type_item in type_data["data"]:
                                            type_id = type_item.get("id")
                                            type_name = type_item.get("name")
                                            type_info_list.append([group_id, group_name, parent_id, parent_name, child_id, child_name, type_id, type_name])
                                    else:
                                        type_info_list.append([group_id, group_name, parent_id, parent_name, child_id, child_name, None, None])

type_info = pd.DataFrame(type_info_list, columns=["Group_ID", "Group_Name", "Parent_ID", "Parent_Name", "Child_ID", "Child_Name", "Type_ID", "Type_Name"])


In [8]:
type_info

Unnamed: 0,Group_ID,Group_Name,Parent_ID,Parent_Name,Child_ID,Child_Name,Type_ID,Type_Name
0,8322,Nhà Sách Tiki,320,English Books,623,Art & Photography,625.0,Architecture
1,8322,Nhà Sách Tiki,320,English Books,623,Art & Photography,5770.0,Graphic Design
2,8322,Nhà Sách Tiki,320,English Books,623,Art & Photography,626.0,"Religion, Culture"
3,8322,Nhà Sách Tiki,320,English Books,623,Art & Photography,9423.0,Photography
4,8322,Nhà Sách Tiki,320,English Books,623,Art & Photography,112.0,Decorative Arts & Design
...,...,...,...,...,...,...,...,...
503,17166,Cross Border - Hàng Quốc Tế,21442,Thời Trang,21488,Túi xách và Phụ kiện,21490.0,Vali
504,17166,Cross Border - Hàng Quốc Tế,21442,Thời Trang,21488,Túi xách và Phụ kiện,21492.0,Túi xách & Balo
505,17166,Cross Border - Hàng Quốc Tế,21442,Thời Trang,21488,Túi xách và Phụ kiện,21494.0,Phụ kiện
506,17166,Cross Border - Hàng Quốc Tế,21442,Thời Trang,25034,Phụ kiện thời trang khác,,


### Check null percentage for ID

In [9]:
columns_to_check = ['Group_ID', 'Parent_ID', 'Child_ID', 'Type_ID']

for column in columns_to_check:
    null_percentage = (type_info[column].isnull().sum() / len(type_info[column])) * 100
    print(f"Null percentage for {column}: {null_percentage:.2f}%")

Null percentage for Group_ID: 0.00%
Null percentage for Parent_ID: 0.00%
Null percentage for Child_ID: 0.00%
Null percentage for Type_ID: 21.85%


### Retrieved Product ID information based on Child_ID or Type_ID

In [10]:
def retrieve_product_ids(row):
    if pd.notnull(row['Child_ID']):
        category_id = row['Child_ID']
    elif pd.notnull(row['Type_ID']):
        category_id = row['Type_ID']
    else:
        return []  # Return empty list if both Child_ID and Type_ID are null

    base_url = "https://tiki.vn/api/personalish/v1/blocks/listings"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0",
        "Accept-Language": 'en-US,en;q=0.9',
        "Accept-Encoding": "gzip, deflate, br, zstd",
        "Referer": "https://tiki.vn/",
        "From": "",
        "af-ac-enc-dat": "",
        "x-api-source": "pc"
    }
    params = {"category": category_id, "page": 1}

    # Make initial request to get total count
    response = requests.get(base_url, headers=headers, params=params)
    data = response.json()
    total = data["paging"]["total"]
    per_page = data["paging"]["per_page"]
    last_page = data["paging"]["last_page"]

    # Calculate number of pages needed
    num_pages = math.ceil(total / per_page)

    # Fetch data from each page
    product_ids = []
    for page in range(1, num_pages + 1):
        params = {"category": category_id, "page": page}
        response = requests.get(base_url, headers=headers, params=params)
        data = response.json()

        for item in data["data"]:
            product_ids.append(item["id"])

        # Sleep for a short duration to avoid being blocked
        time.sleep(1)  # You can adjust the duration as needed

    return product_ids


In [11]:
# Apply the function to each row in type_info and retrieve product IDs
type_info['Product_IDs'] = type_info.apply(retrieve_product_ids, axis=1)

# Explode the list of product IDs into separate rows
general_data = type_info.explode('Product_IDs')

# Store the data in general_data.csv
general_data.to_csv('general_data.csv', index=False)

ConnectTimeout: HTTPSConnectionPool(host='tiki.vn', port=443): Max retries exceeded with url: /api/personalish/v1/blocks/listings?category=2320&page=7 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x0000028976938530>, 'Connection to tiki.vn timed out. (connect timeout=None)'))