In [1]:
import pandas as pd
import requests
import random
import time

# HTTP headers attached to every request made by the extract_* functions below.
HEADERS = {
    # Desktop browser identification string (Edge 121 on Windows 10).
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0"
    ),
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br, zstd",
    "Referer": "https://tiki.vn/",
    # The next two headers are sent with empty values.
    "From": "",
    "af-ac-enc-dat": "",
    "x-api-source": "pc",
}


In [2]:
# Load the SubCategoryID / ProductID / BrandName table.
# read_csv already returns a DataFrame, so no pd.DataFrame(...) re-wrapping is needed.
# NOTE(review): the file carries a saved index column (it shows up as "Unnamed: 0");
# it is kept here in case later cells rely on it.
data = pd.read_csv('brand.csv')
data

Unnamed: 0,SubCategoryID,ProductID,BrandName
0,1795,184036446,Apple
1,1795,184059211,Apple
2,1795,57809866,Xiaomi
3,1795,120295859,Nokia
4,1795,123345348,Apple
...,...,...,...
23757,28926,249024285,OEM
23758,28926,204425773,OEM
23759,28926,13429081,OEM
23760,28926,72843893,OEM


In [3]:
# One subset per brand of interest, selected by exact match on BrandName.
apple = data.loc[data['BrandName'].eq('Apple')]
asus = data.loc[data['BrandName'].eq('Asus')]
hp = data.loc[data['BrandName'].eq('HP')]
samsung = data.loc[data['BrandName'].eq('Samsung')]

# Row count of each subset.
print('Number of Apple:', apple.shape[0])
print('Number of Asus:', asus.shape[0])
print('Number of HP:', hp.shape[0])
print('Number of Samsung:', samsung.shape[0])

# NOTE(review): the figures below are the author's own notes; their meaning is not
# evident from this cell and they differ from the counts printed above - confirm or remove.
# Apple: 14 - 132 // Actual: 4862+
# Asus: 212 - 292
# HP: 332 - 175
# Samsung: 287 - 1135

Number of Apple: 15
Number of Asus: 210
Number of HP: 501
Number of Samsung: 311


In [4]:
def _product_record(data):
    """Flatten one product-detail payload (a dict) into the row kept per product."""
    # `.get(key, {})` only covers a missing key; a key that is present with a
    # null value would make the chained `.get` raise AttributeError, hence `or`.
    brand = data.get('brand') or {}
    seller = data.get('current_seller') or {}
    categories = data.get('categories') or {}
    breadcrumbs = data.get('breadcrumbs') or []

    # Prefer the product's own category when it is flagged as a leaf; otherwise
    # fall back to the second-to-last breadcrumb entry.
    if categories.get('is_leaf', False):
        category_id = categories.get('id')
    elif len(breadcrumbs) >= 2:
        category_id = breadcrumbs[-2].get('category_id')
    else:
        category_id = None

    return {
        'product_id': data['id'],
        'product_name': data.get('name', None),
        'product_url': data.get('short_url', None),
        'pricing_current': data.get('price', None),
        'pricing_original': data.get('original_price', None),
        'product_image_url': data.get('thumbnail_url', None),
        'inventory_status': data.get('inventory_status', None),
        'inventory_type': data.get('inventory_type', None),
        'created_date': data.get('day_ago_created', None),
        'quantity_sold': data.get('all_time_quantity_sold', None),
        'brand_id': brand.get('id', None),
        'brand_name': brand.get('name', None),
        'brand_slug': brand.get('slug', None),
        # Seller fields default to 0 (not None) when there is no current seller.
        'seller_id': seller.get('id', 0),
        'seller_name': seller.get('name', 0),
        'seller_link': seller.get('link', 0),
        'seller_image_url': seller.get('logo', 0),
        'category_id': category_id,
    }


def extract_product_data(product_ids_df, session=None, headers=None,
                         delay_range=(3.2, 4.7), timeout=30):
    """Fetch product details from the Tiki product API for every row of a frame.

    Products whose request fails, returns a non-success status, is not valid
    JSON, or lacks an 'id' field are reported with a printed message and
    skipped, so one bad product no longer discards everything fetched so far.

    Parameters
    ----------
    product_ids_df : pandas.DataFrame
        Frame with a 'ProductID' column; one request is made per row.
    session : object, optional
        Anything exposing ``get(url, headers=..., timeout=...)`` (for example a
        ``requests.Session``). Defaults to the ``requests`` module.
    headers : dict, optional
        Request headers. Defaults to the notebook-level ``HEADERS``.
    delay_range : tuple of (float, float), optional
        Lower/upper bound in seconds of the random pause after each request.
    timeout : float, optional
        Per-request timeout in seconds, so a stalled connection cannot hang
        the run indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per successfully fetched product.
    """
    http = requests if session is None else session
    request_headers = HEADERS if headers is None else headers
    product_data_list = []

    for _, row in product_ids_df.iterrows():
        product_id = row['ProductID']
        url = f"https://tiki.vn/api/v2/products/{product_id}"

        try:
            response = http.get(url, headers=request_headers, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Request failed for product {product_id}: {exc}")
            response = None

        # Pause after every attempt (successful or not) to keep the request rate low.
        time.sleep(random.uniform(*delay_range))

        if response is None:
            continue
        if not response.ok:
            print(f"Skipping product {product_id}: HTTP {response.status_code}")
            continue

        try:
            data = response.json()
        except ValueError:
            print(f"Skipping product {product_id}: response is not valid JSON")
            continue

        if not isinstance(data, dict) or 'id' not in data:
            print(f"Skipping product {product_id}: payload has no 'id' field")
            continue

        product_data_list.append(_product_record(data))
        print(f"Fetched data for product {product_id}")

    print(f"Success fetching data for {len(product_data_list)} products")

    # Build the frame once from the collected records.
    return pd.DataFrame(product_data_list)

In [5]:
def _fetch_review_page(http, url, headers, params, timeout, delay_range):
    """Request one page of reviews; return the decoded JSON dict, or None on any failure."""
    try:
        response = http.get(url, headers=headers, params=params, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Request failed for {params}: {exc}")
        response = None

    # Pause after every attempt (successful or not) to keep the request rate low.
    time.sleep(random.uniform(*delay_range))

    if response is None:
        return None
    if not response.ok:
        print(f"Skipping {params}: HTTP {response.status_code}")
        return None
    try:
        payload = response.json()
    except ValueError:
        print(f"Skipping {params}: response is not valid JSON")
        return None
    return payload if isinstance(payload, dict) else None


def _review_row(product_id, star_counts, reviews_count, review):
    """Build one output row (a list matching the result columns) for a single review."""
    # `or {}` covers keys that are present but null, which a .get() default does not.
    reviewer = review.get("created_by") or {}
    summary = (reviewer.get("contribute_info") or {}).get("summary") or {}
    return [
        product_id,
        *star_counts,
        reviews_count,
        review.get("id"),
        review.get("title"),
        review.get("content"),
        review.get("thank_count", 0),
        review.get("rating"),
        review.get("created_at"),
        reviewer.get("id"),
        reviewer.get("name"),
        reviewer.get("created_time"),
        summary.get("total_review", 0),
        summary.get("total_thank", 0),
    ]


def extract_feedback_data(product_df, session=None, headers=None,
                          delay_range=(3.2, 4.7), timeout=30):
    """Fetch every review page from the Tiki reviews API for each row of a frame.

    The page count is read from the first page's own response, so page 1 is
    requested once instead of twice. Pages whose request fails, returns a
    non-success status, or is not valid JSON are reported with a printed
    message and skipped.

    Parameters
    ----------
    product_df : pandas.DataFrame
        Frame with 'SubCategoryID' and 'ProductID' columns. 'SubCategoryID' is
        sent as the ``spid`` query parameter and 'ProductID' as ``product_id``.
    session : object, optional
        Anything exposing ``get(url, headers=..., params=..., timeout=...)``
        (for example a ``requests.Session``). Defaults to the ``requests`` module.
    headers : dict, optional
        Request headers. Defaults to the notebook-level ``HEADERS``.
    delay_range : tuple of (float, float), optional
        Lower/upper bound in seconds of the random pause after each request.
    timeout : float, optional
        Per-request timeout in seconds.

    Returns
    -------
    pandas.DataFrame
        One row per review, carrying the product id, the per-star counts and
        review total reported on that page, the review fields and the
        reviewer fields.
    """
    http = requests if session is None else session
    request_headers = HEADERS if headers is None else headers
    URL = "https://tiki.vn/api/v2/reviews"
    columns = ["ProductID", "OneStarCount", "TwoStarCount", "ThreeStarCount", "FourStarCount", "FiveStarCount", "reviews_count", "review_id", "review_title", "review_content", "review_upvote", "review_rating", "review_created_at", "user_id", "username", "joined_time", "total_reviews", "total_upvotes"]
    feedback_data_list = []

    for _, row in product_df.iterrows():
        sub_category_id = row['SubCategoryID']
        product_id = row['ProductID']

        total_pages = 1  # refined from the first page's paging info
        page = 1
        while page <= total_pages:
            current_page = page
            page += 1

            params = {"limit": 20, "spid": sub_category_id, "product_id": product_id, "page": current_page}
            data = _fetch_review_page(http, URL, request_headers, params, timeout, delay_range)
            if data is None:
                continue

            if current_page == 1:
                total_pages = (data.get("paging") or {}).get("last_page") or 1

            stars = data.get("stars") or {}
            star_counts = [(stars.get(str(level)) or {}).get("count", 0) for level in range(1, 6)]
            reviews_count = data.get("reviews_count", 0)

            for review in data.get("data") or []:
                feedback_data_list.append(_review_row(product_id, star_counts, reviews_count, review))

    print(f"Success fetching data for {len(feedback_data_list)} feedbacks")
    return pd.DataFrame(feedback_data_list, columns=columns)

In [6]:
# Run the product extractor over the Apple subset, then report the number of rows returned
apple_product_data = extract_product_data(apple)
print(f"Number of Apple product data: {apple_product_data.shape[0]}")

Fetched data for product 184036446
Fetched data for product 184059211
Fetched data for product 123345348
Fetched data for product 197214029
Fetched data for product 271966786
Fetched data for product 197214015
Fetched data for product 271973414
Fetched data for product 271967379
Fetched data for product 271972435
Fetched data for product 183330021
Fetched data for product 184248127
Fetched data for product 184253251
Fetched data for product 205815576
Fetched data for product 124742926
Fetched data for product 189371737
Success fetching data for 15 products
Number of Apple product data: 15
