### LIBRARY

In [2]:
import pandas as pd
import requests
import random
import time
import math

### SET UP THE ENVIRONMENT

In [3]:
# Request headers passed as `headers=` to the requests.get calls in this file.
# The User-Agent and Referer make the requests look like a desktop browser
# session coming from tiki.vn.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0",
    "Accept-Language": 'en-US,en;q=0.9',
    # NOTE(review): this advertises br and zstd; presumably the HTTP stack can
    # only decode those when the optional brotli/zstandard packages are
    # installed - confirm, or trim the list to "gzip, deflate".
    "Accept-Encoding": "gzip, deflate, br, zstd",
    "Referer": "https://tiki.vn/",
    # The next two headers are sent as empty strings.
    "From": "",
    "af-ac-enc-dat": "",
    "x-api-source": "pc"
}

### EXTRACT GROUP DATA

In [None]:
# Menu-configuration endpoint (desktop platform); its "menu_block" -> "items"
# list is read below as the list of product groups.
URL = "https://api.tiki.vn/raiden/v2/menu-config?platform=desktop"

In [None]:
# Fetch the menu configuration. The timeout keeps a stalled connection from
# hanging the notebook indefinitely.
response = requests.get(URL, headers=HEADERS, timeout=30)
# Random pause after the request to space out calls to the API.
time.sleep(random.uniform(3.2, 8.7))
if response.status_code == 200:
    data = response.json()
    # Computed outside the f-string: reusing double quotes inside a
    # double-quoted f-string is a SyntaxError before Python 3.12.
    group_count = len(data["menu_block"]["items"])
    print(f"Success to fetch {group_count} groups.")
else:
    print("Failed to fetch data:", response.status_code)


### EXTRACT GROUP ID
Keep only the groups whose IDs are 1846 and 1789.

In [None]:
# IDs of the groups to keep. They are strings because the ID is parsed out of
# the link text rather than read from a numeric field.
wanted_group_ids = {"1846", "1789"}

group_list = []
for menu_item in data["menu_block"]["items"]:
    # The ID is the last path segment of the link with its first character
    # dropped.
    group_id = menu_item["link"].split("/")[-1][1:]
    if group_id in wanted_group_ids:
        group_list.append([group_id, menu_item["text"]])

# One row per selected group: its ID (as a string) and its display text.
group = pd.DataFrame(group_list, columns=["GroupID", "Name"])
print(f"Success to extract {len(group)} needed groups.")
print(group["GroupID"].values)

### EXTRACT CATEGORY HIERARCHY

In [None]:
def _fetch_child_categories(parent_id):
    """Return the categories listed under ``parent_id``.

    Returns a (possibly empty) list when the request succeeds, or ``None``
    when the API answers with a non-200 status, so callers can tell
    "no children" apart from "request failed".
    """
    url = f"https://tiki.vn/api/v2/categories?parent_id={parent_id}"
    response = requests.get(url, headers=HEADERS, timeout=30)
    # Random pause after every request to space out calls to the API.
    time.sleep(random.uniform(3.2, 8.7))
    if response.status_code != 200:
        print(f"Failed to fetch categories under {parent_id}:", response.status_code)
        return None
    return response.json()["data"] or []


# Walk three levels below each group (master category -> category ->
# sub-category) and record one row per leaf that was reached. Levels that do
# not exist for a branch are stored as None. A branch whose request failed is
# skipped and produces no row.
category_list = []
for group_id, group_name in zip(group["GroupID"], group["Name"]):
    parent_categories = _fetch_child_categories(group_id)
    if parent_categories is None:
        continue
    if not parent_categories:
        category_list.append([group_id, group_name, None, None, None, None, None, None])
        continue

    for parent_category in parent_categories:
        parent_id = parent_category["id"]
        parent_name = parent_category["name"]

        child_categories = _fetch_child_categories(parent_id)
        if child_categories is None:
            continue
        if not child_categories:
            category_list.append([group_id, group_name, parent_id, parent_name, None, None, None, None])
            continue

        for child_category in child_categories:
            child_id = child_category["id"]
            child_name = child_category["name"]

            type_items = _fetch_child_categories(child_id)
            if type_items is None:
                continue
            if not type_items:
                category_list.append([group_id, group_name, parent_id, parent_name, child_id, child_name, None, None])
                continue

            for type_item in type_items:
                category_list.append([
                    group_id, group_name, parent_id, parent_name,
                    child_id, child_name, type_item.get("id"), type_item.get("name"),
                ])

category_df = pd.DataFrame(category_list, columns=["GroupID", "GroupName", "MasterCategoryID", "MasterCategoryName", "CategoryID", "CategoryName", "SubCategoryID", "SubCategoryName"])

In [None]:
# Report the number of rows collected in the hierarchy table.
print(f"Success to fetch {len(category_df)} categories.")

### TRANSFORM CATEGORY HIERARCHY

In [None]:
def remove_single_category(df):
    """Blank out the category level for master categories that have exactly one category.

    For every MasterCategoryID whose rows contain exactly one distinct
    (non-null) CategoryID, the CategoryID and CategoryName cells of those rows
    are set to None.

    The DataFrame is modified in place and the same object is returned.
    """
    # Number of distinct categories under each master category.
    distinct_per_master = df.groupby('MasterCategoryID')['CategoryID'].nunique()

    # Master categories that own exactly one category.
    lone_masters = distinct_per_master[distinct_per_master == 1].index

    # Rows belonging to those master categories lose their category level.
    affected_rows = df['MasterCategoryID'].isin(lone_masters)
    df.loc[affected_rows, ['CategoryID', 'CategoryName']] = [None, None]

    return df

In [None]:
# remove_single_category edits its argument in place and returns it, so
# cleaned_df and category_df are the same object after this call.
cleaned_df = remove_single_category(category_df)

In [None]:
def transform_category(row):
    """Fill missing hierarchy levels of one row from the level above.

    A missing category takes the master category's ID and name; a missing
    sub-category takes the (possibly filled-in) category's ID and name. Two
    0/1 flags record whether the row had its own category and sub-category.

    Returns a pd.Series with, in order: master category ID and name, category
    ID and name, category flag, sub-category ID and name, sub-category flag.
    """
    has_category = not pd.isna(row['CategoryID'])
    master_id = row['MasterCategoryID']
    master_name = row['MasterCategoryName']

    # Category level: fall back to the master category when absent.
    if has_category:
        cat_id, cat_name = row['CategoryID'], row['CategoryName']
    else:
        cat_id, cat_name = master_id, master_name

    # Sub-category level: fall back to the category resolved above.
    has_sub_category = not pd.isna(row['SubCategoryID'])
    if has_sub_category:
        sub_id, sub_name = row['SubCategoryID'], row['SubCategoryName']
    else:
        sub_id, sub_name = cat_id, cat_name

    resolved = [
        master_id, master_name,
        cat_id, cat_name, int(has_category),
        sub_id, sub_name, int(has_sub_category),
    ]
    return pd.Series(resolved)

In [None]:
# Share of missing values per column, as a percentage, before the transform.
print("Check null for each column in category")
null_share = category_df.isnull().mean() * 100
for column, null_percentage in null_share.items():
    print(f"Null percentage for {column}: {null_percentage:.2f}%")
category_df.head()

In [None]:
# Target columns, listed in the order transform_category returns its values.
transformed_columns = [
    'MasterCategoryID', 'MasterCategoryName', 'CategoryID', 'CategoryName',
    'isCategory', 'SubCategoryID', 'SubCategoryName', 'isSubCategory',
]
# Run the row-wise transform, then write the expanded result back.
transformed = category_df.apply(transform_category, axis=1, result_type='expand')
category_df[transformed_columns] = transformed

In [None]:
# Same null report as before the transform, to confirm the gaps were filled.
print("Check null for each column in category")
for column in category_df.columns:
    missing_flags = category_df[column].isna()
    null_percentage = missing_flags.mean() * 100
    print(f"Null percentage for {column}: {null_percentage:.2f}%")
category_df.head()

In [None]:
# Inspect the column labels of the hierarchy table (notebook cell output).
category_df.columns

In [None]:
# Cast every ID column to int, one column at a time in a fixed order.
for id_column in ("GroupID", "MasterCategoryID", "CategoryID", "SubCategoryID"):
    category_df[id_column] = category_df[id_column].astype(int)

In [None]:
# Split the flat hierarchy into one de-duplicated table per level. Each table
# carries its own ID, its parent's ID, and its name in a column called "Name".
group = group.drop_duplicates()

master_category = (
    category_df[["MasterCategoryID", "GroupID", "MasterCategoryName"]]
    .drop_duplicates()
    .rename(columns={"MasterCategoryName": "Name"})
)

category = (
    category_df[["CategoryID", "MasterCategoryID", "CategoryName", "isCategory"]]
    .drop_duplicates()
    .rename(columns={"CategoryName": "Name"})
)

sub_category = (
    category_df[["SubCategoryID", "CategoryID", "SubCategoryName", "isSubCategory"]]
    .drop_duplicates()
    .rename(columns={"SubCategoryName": "Name"})
)

In [None]:
# Report the row count of each level table, one message per line.
print(
    f"Success to transform {len(master_category)} master categories.",
    f"Success to transform {len(category)} categories.",
    f"Success to transform {len(sub_category)} sub categories.",
    sep="\n",
)

### EXTRACT PRODUCT ID THROUGH SUB-CATEGORY ID

In [None]:
# Reload the three level tables from disk. read_csv already returns a
# DataFrame, so no further conversion is needed.
# NOTE(review): the step that writes these files is not visible in this
# notebook - confirm they exist before running this cell.
master_category = pd.read_csv("category_data/master_category.csv")
category = pd.read_csv("category_data/category.csv")
sub_category = pd.read_csv("category_data/sub_category.csv")

In [None]:
def retrieve_product_ids(sub_category_id):
    """Collect the IDs of all products listed under one sub-category.

    The first listing page is requested to learn the total page count
    (``paging.last_page``); its items are reused rather than requested again,
    and the remaining pages are then fetched in order.

    Args:
        sub_category_id: Value sent as the ``category`` query parameter.

    Returns:
        list: The ``id`` of every item on every page, in page order.

    Raises:
        requests.exceptions.HTTPError: If a page answers with an error status.
        requests.exceptions.Timeout: If a request exceeds the timeout.
    """
    base_url = "https://tiki.vn/api/personalish/v1/blocks/listings"

    def fetch_page(page):
        """Request one listing page and return its decoded JSON body."""
        response = requests.get(
            base_url,
            headers=HEADERS,
            params={"category": sub_category_id, "page": page},
            timeout=30,
        )
        # Random pause after every request to space out calls to the API.
        time.sleep(random.uniform(3.2, 8.7))
        # Fail with a clear HTTP error instead of a KeyError further down.
        response.raise_for_status()
        return response.json()

    first_page = fetch_page(1)
    total_page = first_page["paging"]["last_page"]

    product_ids = []
    for page in range(1, total_page + 1):
        # Page 1 is already in hand; only later pages need a new request.
        data = first_page if page == 1 else fetch_page(page)
        product_ids.extend(item["id"] for item in data["data"])

    return product_ids

In [None]:
# Despite its name, product_df is a plain list of records at this point.
product_df = []

for sub_category_id in sub_category['SubCategoryID']:
    # One record per product found under this sub-category.
    product_df.extend(
        {'SubCategoryID': sub_category_id, 'ProductID': product_id}
        for product_id in retrieve_product_ids(sub_category_id)
    )

# Turn the accumulated records into a table.
product_ids_df = pd.DataFrame(product_df)

In [4]:
# product_ids_df.to_csv("product_ids.csv", index=False, encoding='utf-8-sig')
# Load the previously saved product ID table. read_csv already returns a
# DataFrame, so no further conversion is needed.
product_ids_df = pd.read_csv("data/product_ids.csv")

In [5]:
# Display the loaded product ID table (notebook cell output).
product_ids_df

Unnamed: 0,SubCategoryID,ProductID
0,1795,184036446
1,1795,184059211
2,1795,57809866
3,1795,120295859
4,1795,123345348
...,...,...
23821,28926,249024285
23822,28926,204425773
23823,28926,13429081
23824,28926,72843893


In [12]:
def get_product_data(row):
    """Fetch one product's details from the product API and flatten them into a dict.

    Connection errors and timeouts are retried up to ``max_retries`` times with
    a randomized delay. Any other failure (non-200 status, undecodable body) is
    reported and not retried.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing
            ``'ProductID'`` and ``'SubCategoryID'``.

    Returns:
        dict: The flattened product fields plus ``'category_id'``. When the
        product cannot be fetched, a dict holding only ``'SubCategoryID'`` and
        ``'ProductID'`` is returned, so every call yields the same kind of
        object and the remaining fields simply stay missing for that row.
    """
    product_id = row['ProductID']
    URL = f"https://tiki.vn/api/v2/products/{product_id}"

    max_retries = 5  # Maximum number of attempts
    retry_delay = 20  # Base delay between attempts in seconds

    # Minimal record returned whenever the product cannot be fetched.
    fallback = {'SubCategoryID': row['SubCategoryID'], 'ProductID': product_id}

    for attempt in range(max_retries):
        try:
            response = requests.get(URL, headers=HEADERS, timeout=30)
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
            if attempt == max_retries - 1:
                print(f"Failed to fetch product data for {product_id} after {max_retries} attempts.")
                return fallback
            print(f"Failed to fetch product data for {product_id}. Retrying in {retry_delay} seconds...")
            time.sleep(retry_delay + random.uniform(0, 5))  # Add a random delay to avoid synchronization issues
        else:
            # Random pause after a completed request to space out API calls.
            time.sleep(random.uniform(3.2, 6.2))
            break

    # Only a 200 response with a decodable JSON body counts as success.
    if response.status_code != 200:
        print(f"Failed to fetch product data for {product_id}: HTTP {response.status_code}")
        return fallback
    try:
        data = response.json()
    except ValueError:
        print(f"Failed to decode product data for {product_id}")
        return fallback
    print(f"Success to fetch product data for {product_id}")

    # Either nested object may be absent or null; treat both as empty.
    brand = data.get('brand') or {}
    seller = data.get('current_seller') or {}

    product_data = {
        'SubCategoryID': row['SubCategoryID'],
        'ProductID': data.get('id', product_id),
        'product_name': data.get('name'),
        'product_url': data.get('short_url'),
        'pricing_current': data.get('price'),
        'pricing_original': data.get('original_price'),
        'product_image_url': data.get('thumbnail_url'),
        'inventory_status': data.get('inventory_status'),
        'inventory_type': data.get('inventory_type'),
        'created_date': data.get('day_ago_created'),
        'quantity_sold': data.get('all_time_quantity_sold'),
        'brand_id': brand.get('id'),
        'brand_name': brand.get('name'),
        'brand_slug': brand.get('slug'),
        # Seller fields default to 0 (not None) when missing.
        'seller_id': seller.get('id', 0),
        'seller_name': seller.get('name', 0),
        'seller_link': seller.get('link', 0),
        'seller_image_url': seller.get('logo', 0),
    }

    # The category is read from the second-to-last breadcrumb entry; leave it
    # empty when the breadcrumb trail is missing or too short.
    breadcrumbs = data.get('breadcrumbs') or []
    if len(breadcrumbs) >= 2:
        product_data['category_id'] = breadcrumbs[-2].get('category_id')
    else:
        product_data['category_id'] = None

    return product_data

In [13]:
# Fetch the products one row at a time and collect the records as they
# arrive. The table is built in `finally`, so a run that is interrupted or
# fails part-way still keeps everything fetched so far; the exception itself
# is not swallowed.
product_records = []
fetched_index = []
try:
    for row_label, row in product_ids_df.iterrows():
        record = get_product_data(row)
        # Keep only non-empty dict results; anything else is not a record.
        if isinstance(record, dict) and record:
            product_records.append(record)
            fetched_index.append(row_label)
finally:
    # One row per fetched record, labelled with the source row's index.
    product = pd.DataFrame(product_records, index=fetched_index)
    product_df = product

Success to fetch feedback data for 184036446
Success to fetch feedback data for 184059211
Success to fetch feedback data for 57809866
Success to fetch feedback data for 120295859
Success to fetch feedback data for 123345348
Success to fetch feedback data for 121744434
Success to fetch feedback data for 189466001
Success to fetch feedback data for 125590981
Success to fetch feedback data for 126050455
Success to fetch feedback data for 172329599
Success to fetch feedback data for 29931445
Success to fetch feedback data for 181266509
Success to fetch feedback data for 140184098
Success to fetch feedback data for 192859201
Success to fetch feedback data for 181918564
Success to fetch feedback data for 197966909
Success to fetch feedback data for 155833852
Success to fetch feedback data for 143359355
Success to fetch feedback data for 151463645
Success to fetch feedback data for 158895857
Success to fetch feedback data for 140144656
Success to fetch feedback data for 147969070
Success to f

KeyboardInterrupt: 

In [None]:
# Save the product table without its index. The utf-8-sig encoding writes a
# byte-order mark, presumably so spreadsheet tools detect the encoding.
product_df.to_csv("product_data.csv", index=False, encoding='utf-8-sig')