In [1]:
import requests
import time
import random
import pandas as pd
from tqdm import tqdm
from datetime import datetime

## Crawl product data from tiki.vn

In [2]:
# Parallel lists: position k in every list describes the same tiki.vn subcategory.
# URL slug of each subcategory (sent as the `urlKey` query parameter).
urlKey_lst = [
    'dien-thoai-smartphone',
    'may-tinh-bang',
    'may-doc-sach',
    'dien-thoai-pho-thong',
    'dien-thoai-ban',
    'laptop-truyen-thong',
    'macbook-imac',
    'laptop-gaming',
    'laptop-2-trong-1',
]
# Category id of each subcategory (sent as the `category` query parameter).
category_lst = [
    '1795',
    '1794',
    '28856',
    '1796',
    '8061',
    '29010',
    '2458',
    '5584',
    '29008',
]
# `src` query parameter: the category id prefixed with the letter "c".
src_lst = [f'c{category_id}' for category_id in category_lst]
# Number of listing pages to request per subcategory (pages 1..last_page).
last_page = [7, 2, 2, 1, 2, 11, 1, 2, 1]

In [3]:
# Bundle the parallel lists into one (urlKey, category, src, last_page) tuple per subcategory.
product_tpls = [
    (url_key, category_id, src, page_count)
    for url_key, category_id, src, page_count in zip(urlKey_lst, category_lst, src_lst, last_page)
]
product_tpls

[('dien-thoai-smartphone', '1795', 'c1795', 7),
 ('may-tinh-bang', '1794', 'c1794', 2),
 ('may-doc-sach', '28856', 'c28856', 2),
 ('dien-thoai-pho-thong', '1796', 'c1796', 1),
 ('dien-thoai-ban', '8061', 'c8061', 2),
 ('laptop-truyen-thong', '29010', 'c29010', 11),
 ('macbook-imac', '2458', 'c2458', 1),
 ('laptop-gaming', '5584', 'c5584', 2),
 ('laptop-2-trong-1', '29008', 'c29008', 1)]

In [4]:
def parser_product(json):
    """Flatten one product record from the listings response into a flat dict.

    Parameters
    ----------
    json : dict
        A single element of the ``data`` array returned by the listings API.

    Returns
    -------
    dict
        Always contains the same eleven keys. A field that is absent from the
        record (at any nesting level) is returned as ``None`` rather than
        aborting extraction of the fields that follow it.
    """
    def _dig(*keys):
        """Follow `keys` through nested dicts; return None once a level is missing."""
        node = json
        for key in keys:
            # A missing parent (None) or an unexpected non-dict value ends the walk.
            if not isinstance(node, dict):
                return None
            node = node.get(key)
        return node

    return {
        'id': _dig('id'),
        'product_name': _dig('name'),
        'price(vnd)': _dig('price'),
        'original_price': _dig('original_price'),
        'discount': _dig('discount'),
        'discount_rate(%)': _dig('discount_rate'),
        'review_count': _dig('review_count'),
        'rating_average': _dig('rating_average'),
        # `quantity_sold` is a nested object; it can be missing from a record.
        'quantity_sold': _dig('quantity_sold', 'value'),
        'brand_name': _dig('visible_impression_info', 'amplitude', 'brand_name'),
        'origin': _dig('visible_impression_info', 'amplitude', 'origin'),
    }

In [5]:
# Crawl every listing page of every subcategory and collect the parsed records.
product_data_lst = []
# One timestamp for the whole run, so every row of this crawl shares the same `updated_at`.
current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

LISTINGS_URL = 'https://tiki.vn/api/personalish/v1/blocks/listings'
# Seconds to wait for the server before giving up on one page; without a
# timeout a stalled connection would block the notebook indefinitely.
REQUEST_TIMEOUT = 30

# The headers do not depend on the subcategory, so they are built once.
# The User-Agent uses implicit string concatenation: a backslash continuation
# inside the literal would embed the next line's indentation in the header.
# NOTE(review): `x-guest-token` (and `trackity_id` below) are hard-coded session
# values — consider loading them from configuration instead of the notebook.
headers = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/130.0.0.0 Safari/537.36 Edg/130.0.0.0'),
    'Accept': 'application/json, text/plain, */*',
    'Accept-Language': 'vi-VN,vi;q=0.8,en-US;q=0.5,en;q=0.3',
    'Referer': 'https://tiki.vn/?src=header_tiki',
    'x-guest-token': '1QJlEwfALPxDmIkXyaFVheKH73SNqTtp',
    'Connection': 'keep-alive',
    'TE': 'Trailers',
}

for tpl in tqdm(product_tpls, total=len(product_tpls)):
    # tpl is (urlKey, category id, src, last page number).
    params = {     # query-string payload
        'limit': '40',
        'include': 'advertisement',
        'aggregations': '2',
        'trackity_id': 'e754a09e-ead7-eac2-8f8e-44b00fa8f486',
        'category': tpl[1],
        'page': '1',
        'src': tpl[2],
        'urlKey':  tpl[0],
    }

    for i in range(1, tpl[3] + 1):
        params['page'] = i

        # `records` stays None when the page could not be fetched or decoded.
        records = None
        try:
            response = requests.get(LISTINGS_URL, headers=headers, params=params, timeout=REQUEST_TIMEOUT)
            if response.status_code == 200:
                # A 200 response without a `data` array is treated as an empty page.
                records = response.json().get('data') or []
        except (requests.RequestException, ValueError):
            # Network error, timeout, or a body that is not valid JSON:
            # skip this page instead of aborting the whole crawl.
            records = None

        if records is None:
            print('request failed!!!')
        else:
            print('request successed!!!')
            for product_record in records:
                product_data = parser_product(product_record)
                product_data['subcategory_name'] = tpl[0]
                product_data['updated_at'] = current_time
                product_data_lst.append(product_data)

        # Pause after every request, successful or not, so that a failing
        # endpoint is not hit again immediately.
        time.sleep(random.randrange(3, 10))

  0%|          | 0/9 [00:00<?, ?it/s]

request successed!!!
request successed!!!
request successed!!!
request successed!!!
request successed!!!
request successed!!!
request successed!!!


 11%|█         | 1/9 [00:52<06:57, 52.21s/it]

request successed!!!
request successed!!!


 22%|██▏       | 2/9 [01:04<03:21, 28.84s/it]

request successed!!!
request successed!!!


 33%|███▎      | 3/9 [01:18<02:13, 22.19s/it]

request successed!!!


 44%|████▍     | 4/9 [01:24<01:18, 15.67s/it]

request successed!!!
request successed!!!


 56%|█████▌    | 5/9 [01:36<00:57, 14.46s/it]

request successed!!!
request successed!!!
request successed!!!
request successed!!!
request successed!!!
request successed!!!
request successed!!!
request successed!!!
request successed!!!
request successed!!!
request successed!!!


 67%|██████▋   | 6/9 [02:53<01:46, 35.48s/it]

request successed!!!


 78%|███████▊  | 7/9 [03:00<00:52, 26.34s/it]

request successed!!!
request successed!!!


 89%|████████▉ | 8/9 [03:10<00:21, 21.15s/it]

request successed!!!


100%|██████████| 9/9 [03:18<00:00, 22.04s/it]


## Data cleansing

### Drop duplicate rows and fill missing values

In [6]:
# Build the frame, drop exact duplicate rows, then keep a single row per product id.
# Rows are sorted by (id, price, discount) first, so `keep='last'` retains the row
# that sorts last within each id.
df_product_data = (
    pd.DataFrame(product_data_lst)
    .drop_duplicates()
    .sort_values(by=['id', 'price(vnd)', 'discount'])
    .drop_duplicates(subset='id', keep='last')
)

In [7]:
# Products with no recorded sold quantity are counted as zero sales.
df_product_data = df_product_data.assign(
    quantity_sold=df_product_data['quantity_sold'].fillna(0)
)

### Categorize product data

In [8]:
# Subcategory slugs grouped by the top-level category they belong to.
phone_lst = [
    'dien-thoai-smartphone',
    'dien-thoai-pho-thong',
    'dien-thoai-ban',
]
tablet_lst = [
    'may-tinh-bang',
    'may-doc-sach',
]
laptop_lst = [
    'laptop-truyen-thong',
    'macbook-imac',
    'laptop-gaming',
    'laptop-2-trong-1',
]

In [9]:
def categorize_product(product_type):
    """Return the top-level category label for a subcategory slug.

    Slugs listed in `phone_lst` map to 'Điện thoại' and slugs listed in
    `tablet_lst` map to 'Máy tính bảng'. Every other value, including slugs
    that appear in neither list, falls through to 'Laptop'.
    """
    if product_type in phone_lst:
        return 'Điện thoại'
    if product_type in tablet_lst:
        return 'Máy tính bảng'
    # Fallback for anything that is neither a phone nor a tablet subcategory.
    return 'Laptop'

In [10]:
# Derive the top-level category of each row from its subcategory slug.
df_product_data['category_name'] = df_product_data['subcategory_name'].map(categorize_product)

In [11]:
# Preview the first five cleaned rows.
df_product_data.head(n=5)

Unnamed: 0,id,product_name,price(vnd),original_price,discount,discount_rate(%),review_count,rating_average,quantity_sold,brand_name,origin,subcategory_name,category_name
230,4048043,Điện Thoại Bàn Panasonic KX-TS500,334000,369000,35000,9,25,4.5,315.0,Panasonic,,dien-thoai-ban,Điện thoại
233,4048581,Điện Thoại Bàn Panasonic KX-TSC11,714000,759000,45000,6,9,4.5,66.0,Panasonic,,dien-thoai-ban,Điện thoại
223,11251711,Điện thoại bàn không dây Panasonic KX-TGD312 -...,2190000,2190000,0,0,2,5.0,11.0,Panasonic,,dien-thoai-ban,Điện thoại
224,11251770,Điện thoại bàn không dây Panasonic KX-TGD310 -...,1430000,1430000,0,0,0,0.0,4.0,Panasonic,,dien-thoai-ban,Điện thoại
205,11251951,Điện thoại bàn không dây Panasonic KX-TGB110-H...,890000,890000,0,0,1,5.0,17.0,Panasonic,,dien-thoai-ban,Điện thoại


In [12]:
# Write the cleaned data to CSV without the index column, encoded as UTF-8 with a BOM.
df_product_data.to_csv(
    'tiki_product_data.csv',
    encoding='utf-8-sig',
    index=False,
)