### **This code below will crawling data of products from Tiki website**
The products' link is here: [Tiki](https://tiki.vn/dien-gia-dung/c1882)

In [4]:

import requests
import pandas as pd

### **Create the Request class for initializing and sending request**

In [30]:
class Request():
    def __init__(self, headers="", params="", cookies="") -> None:
        self.cookies = cookies
        self.headers = headers
        self.params = params
    
    @property
    def cookies(self):
        return self._cookies
    @cookies.setter
    def cookies(self, value):
        self._cookies = value
        
    @property
    def headers(self):
        return self._headers
    @headers.setter
    def headers(self, value):
        self._headers = value
        
    @property
    def params(self):
        return self._params
    @params.setter
    def params(self, value):
        self._params = value
    
    def request(self, url:str):
        resp = requests.get(url, headers=self.headers, params=self.params)
        return resp

### **Get the all of product id of each category**

#### *Initializing the Request object with `header` and `param` parameters*

In [28]:
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Referer': 'https://tiki.vn/dien-gia-dung/c1882',
    'Accept': 'application/json, text/plain, */*',
    'Accept-Language': 'vi-VN,vi;q=0.8,en-US;q=0.5,en;q=0.3',
    'X-Guest-Token': 'r527Y1pU8EZl0BJSkKdDMwcieoAhvqFP',
    'Connection': 'keep-alive',
    'TE': 'Trailers'
}

params = {
    'limit': 40,
    'include': 'advertisement',
    'aggregations': 2,
    'version': 'home-persionalized',
    'trackity_id': 'afacc84a-95b0-912f-cae8-bee2fc1771ed',
    'category': 1882,
    'page': 1,
    'src': 'c1882',
    'urlKey': 'dien-gia-dung'
}

In [31]:
req = Request()

#### *Starting crawl the `product_id` from 50 pages*

In [42]:
prod_id_list = []
request_success_times = 0
req.headers = headers
req.params = params

for page in range(1, 51):
    req.params['page'] = page
    resp = req.request('https://tiki.vn/api/personalish/v1/blocks/listings')
    if resp.status_code == 200:
        request_success_times += 1
        for item in resp.json().get('data'):
            prod_id_list.append({'id':item.get('id')})

print(f'request successed: {request_success_times}')
pd.DataFrame(prod_id_list).shape

request successed: 50


Unnamed: 0,id
0,48962414
1,79561024
2,2393139
3,102432143
4,263488400
...,...
2025,128823103
2026,252678794
2027,17010845
2028,116875360


#### *Checking the duplication*

In [43]:
pd.DataFrame(prod_id_list).duplicated().value_counts()

False    2001
True       29
Name: count, dtype: int64

In [44]:
prod_id_list = pd.DataFrame(prod_id_list).drop_duplicates()['id'].to_list()

### **Get all information of each product**

In [48]:
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Referer': 'https://tiki.vn/may-hut-bui-lock-lock-env336blk-400w-hang-chinh-hang-p66512832.html?itm_campaign=CTP_YPD_TKA_PLA_UNK_ALL_UNK_UNK_UNK_UNK_X.285509_Y.1867829_Z.3911590_CN.AUTO---May-Hut-Bui-Lock%26Lock-ENV336BLK-%28400W%29---Hang-Chinh-Hang---2024%2F01%2F10-02%3A45%3A29&itm_medium=CPC&itm_source=tiki-ads&spid=66512833',
    'Accept': 'application/json, text/plain, */*',
    'Accept-Language': 'vi-VN,vi;q=0.8,en-US;q=0.5,en;q=0.3',
    'X-Guest-Token': 'r527Y1pU8EZl0BJSkKdDMwcieoAhvqFP',
    'Connection': 'keep-alive',
    'TE': 'Trailers'
}

params = {
    'platform': 'web',
    'spid': 66512833,
    'version': 3
}

#### *The attributes were retrieved*
* `id`: the id of the current sell product

* `name_product`: the full name of that product

* `product_id`: the id of that product

* `brand_name`: the brand of that product

* `category`: the type of product

* `link_store`: the link references to that product's store

* `store_name`: the name of that store

* `final_price`: the official price

* `original_price`: the initial price

* `discount`: the amount of price was decreased

* `discount_rate`: the percentage of that product's discount

* `gift_item`: the additional item was attached

* `num_review`: the number of reviews

* `avg_rating`: the average of rating

* `quantity_sold`: the number of items were sold

#### *Defining a function for retrieving neccessary attributes of each product*

In [49]:
def retrive_attrs(item):
    info_prod = dict()
    info_prod['id'] = item.get('id')
    info_prod['name_product'] = item.get('name')
    info_prod['product_id'] = item.get('current_seller').get('product_id')
    info_prod['brand_name'] = item.get('brand').get('name')
    info_prod['category'] = item.get('categories').get('name')
    info_prod['store_name'] = item.get('current_seller').get('name')
    info_prod['store_link'] = item.get('current_seller').get('link')
    info_prod['final_price'] = item.get('price')
    info_prod['original_price'] = item.get('original_price')
    info_prod['discount'] = item.get('discount')
    info_prod['discount_rate'] = item.get('discount_rate')
    info_prod['gift_item'] = item.get('gift_item_title')
    info_prod['num_review'] = item.get('review_count')
    info_prod['avg_rating'] = item.get('rating_average')
    info_prod['quantity_sold'] = item.get('all_time_quantity_sold')
    # print(info_prod['quantity_sold'])
    
    return info_prod

#### *Starting crawl the information of thoes attributes*

In [50]:
prod_list = []
request_success_times = 0
req.headers = headers
req.params = params

for pid in prod_id_list:
    # req = requests.get('https://tiki.vn/api/v2/products/{}'.format(pid["id"]), headers=headers, params=params)
    resp = req.request('https://tiki.vn/api/v2/products/{}'.format(pid))
    try: 
        if resp.status_code == 200 and resp.headers["content-type"].strip().startswith("application/json"):
            # print('request successed')
            item = retrive_attrs(resp.json())
            prod_list.append(item)
            request_success_times += 1
    except Exception as ex:
        print(str(ex))
    
print(f'The number of successful requests: {request_success_times}\nThe total number of requests: {len(prod_id_list)}')
print(f'The rate of requesting successfully: {request_success_times/len(prod_id_list)}')

'NoneType' object has no attribute 'get'
The number of successful requests: 1080
The total number of requests: 2001
The rate of requesting successfully: 0.5397301349325337


#### *Checking the crawling result*

In [52]:
dienmayxanh = pd.DataFrame(prod_list)
dienmayxanh.head(5)

Unnamed: 0,id,name_product,product_id,brand_name,category,store_name,store_link,final_price,original_price,discount,discount_rate,gift_item,num_review,avg_rating,quantity_sold
0,48962414,Lò Vi Sóng Sharp R-208VN-WS (20L) - Hàng Chính...,48962415,Sharp,Điện Gia Dụng,Tiki Trading,https://tiki.vn/cua-hang/tiki-trading,1199000,1560000,361000,23,0 quà tặng kèm,293,4.7,1342.0
1,79561024,"Máy làm sữa hạt, xay sinh tố và nấu đa năng Te...",79561025,Tefal,Điện Gia Dụng,Tefal Official Store - Chính hãng,https://tiki.vn/cua-hang/tefal-official-store-...,3099000,5390000,2291000,43,1 quà tặng kèm,1029,4.7,3423.0
2,2393139,Nồi Lẩu Điện Sunhouse SHD4521 (3L) - Hàng chín...,2393141,Sunhouse,Điện Gia Dụng,Tiki Trading,https://tiki.vn/cua-hang/tiki-trading,449000,776000,327000,42,0 quà tặng kèm,612,4.7,6551.0
3,102432143,Ấm Đun Siêu Tốc Inox 2 Lớp Sunhouse SHD1351 (1...,102432144,Sunhouse,Điện Gia Dụng,Tiki Trading,https://tiki.vn/cua-hang/tiki-trading,249000,422000,173000,41,0 quà tặng kèm,2250,4.7,9326.0
4,263488400,Nồi cơm điện tử áp suất kép Cuckoo CRP-ST1010F...,263488401,Cuckoo,Nồi cơm điện tử,CUCKOO Official Store,https://tiki.vn/cua-hang/cuckoo-official-store,4290000,7990000,3700000,46,0 quà tặng kèm,1,5.0,3.0


In [53]:
dienmayxanh.duplicated().value_counts()

False    1080
Name: count, dtype: int64

In [54]:
dienmayxanh.isna().sum()

id                 0
name_product       0
product_id         0
brand_name         0
category           0
store_name         0
store_link         0
final_price        0
original_price     0
discount           0
discount_rate      0
gift_item          0
num_review         0
avg_rating         0
quantity_sold     41
dtype: int64