# Crawler Task

As Backend Engineer, we would like you to create a programming Utility in your
favorite programming language (preferably JAVA) that extracts top 100 products of
category Mobile phones/Handphone from Tokopedia and stores it in a csv file. It
should include the following information:
1. Name of Product
2. Description
3. Image Link
4. Price
5. Rating (out of 5 stars)
6. Name of store or merchant

In [1]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [2]:
URL = "https://www.tokopedia.com/p/handphone-tablet/handphone"

In [3]:
# Some parameter to update sorting mechanism
# query_params = {
#     "terbaru": "ob=9",
#     "ulasan": "ob=5",
#     "sesuai": "ob=23",
# }

In [4]:
params = {
    "ob": 23,  # sort by 'sesuai'
    "page": 1, # avaialble up to 50 ish 
}

In [5]:
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", 
    "Accept-Encoding": "gzip, deflate", 
    "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8", 
    "Dnt": "1", 
    "Upgrade-Insecure-Requests": "1", 
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36", 
}

## Gathering product urls

In [6]:
from urllib.parse import unquote

class ProductListingPageSoup:
    def __init__(self, content):
        self.parsed = BeautifulSoup(content, "html.parser")
        
    def product_urls(self):
        result_urls = []
        
        product_list_soup = self.parsed.find_all("div", class_="css-bk6tzz e1nlzfl3")
        for p in product_list_soup:
            url = self._parse_product_get_url(p)
            if self._is_promo_url(url):
                result_urls.append(self._parse_promo_url(url))
            else:
                result_urls.append(url)
                
        return result_urls
        
    def _parse_product_get_url(self, soup):
        return soup.find("a")["href"]
        
    def _is_promo_url(self, url):
        return url[:30] == "https://ta.tokopedia.com/promo"
    
    def _parse_promo_url(self, url):
        real_url_idx = url.find("r=")
        real_url = url[real_url_idx+2:]
        return unquote(real_url)

In [7]:
NUMBER_OF_PAGES = 15

product_urls = []
for i in tqdm(range(1, NUMBER_OF_PAGES+1)):
    params["page"] = i
    r = requests.get(URL, headers=headers, params=params)
    product_listing = ProductListingPageSoup(r.content)
    product_urls += product_listing.product_urls()

100%|██████████| 15/15 [00:24<00:00,  1.65s/it]


In [8]:
# sample product urls  
print(len(product_urls))
product_urls[:5]

100


['https://www.tokopedia.com/newrizkyapple/iphone-13-pro-max-dual-5g-1tb-512gb-256gb-128gb-pro-not-mini-12-14-13-pro-128gb-graphite-single?src=topads&page=1&ob=23&src=directory&management_type=1',
 'https://www.tokopedia.com/studioponsel/ibox-iphone-13-128gb-dual-sim-nano-mini-red-black-blue-128-not-12-13-mini-single-red?src=topads&management_type=1&src=directory&ob=23',
 'https://www.tokopedia.com/pratamaponsel112/infinix-hot-10s-6-128-garansi-resmi-purple?src=topads&ob=23&src=directory&management_type=1&t=desktop',
 'https://www.tokopedia.com/studioponsel/google-pixel-6-6-pro-5g-12gb-128gb-8gb-256gb-black-coral-seafoam-stormy-black-pixel-6-8-256gb?src=topads&ob=23&page=1',
 'https://www.tokopedia.com/studioponsel/ibox-iphone-13-mini-512gb-dual-256gb-128gb-purple-white-black-blue-12-128gb-inter-blue?src=topads&page=1&ob=23&src=directory&management_type=1&t=desktop']

## Navigate each product page, to find product details

In [19]:
class ProductPageSoup:
    """
    Wrapper class for GET product page response
    """
    def __init__(self, content):
        self.parsed = BeautifulSoup(content, "html.parser")
        self.parsed_str = str(self.parsed)
        
    def title(self):
        return self.parsed.find("h1").text
    
    def price(self):
        return self.parsed.find("div", class_="price").text
    
    def main_image_url(self):
        return self.parsed.find("div", class_="css-1nchjne").img["src"]
    
    def store_name(self):
        nameStartIdx = self.parsed_str.find("shopName") + 11
        nameEndIdx = self.parsed_str.find('"', nameStartIdx)
        return self.parsed_str[nameStartIdx:nameEndIdx]
    
    def rating(self):
        try:
            return self.parsed.find("meta", {"itemprop": "ratingValue"})["content"]
        except Exception as e:
            return "no rating"
    
    def description(self):
        desc = self._parse_description()
        listical_desc = self._parse_description_listical(desc)
        block_desc = self._parse_description_block(desc)
        
        if len(listical_desc) > 0:
            return listical_desc + "\n\n" + block_desc
        
        return block_desc
    
    def _parse_description(self):
        return self.parsed.find("div", role="tabpanel")

    def _parse_description_listical(self, soup):
        try:
            out = [x.text for x in soup.ul.find_all("li")]
            return "\n".join(out)
        except Exception as e:
            return ""

    def _parse_description_block(self, soup):
        try:
            return soup.find("div", {"data-testid": "lblPDPDescriptionProduk"}).text
        except Exception as e:
            return ""

### Demo parsing product page

In [10]:
sample_url = 'https://www.tokopedia.com/alteccellular/samsung-galaxy-a52-8-128-ram-8gb-rom-128gb-garansi-resmi-sein-black?whid=0'
r = requests.get(sample_url, headers=headers)
sample_product = ProductPageSoup(r.content)

In [11]:
sample_product.title()

'Samsung Galaxy A52 8/128 RAM 8GB ROM 128GB GARANSI RESMI SEIN - BLACK'

In [12]:
sample_product.description()

'Kondisi: Baru\nBerat: 500 Gram\nKategori: Android OS\nEtalase: SAMSUNG\n\nGaransi Resmi 1 Tahun Samsung Indonesia100% Original Baru dan SegelImei Terdaftar Resmi di KemenperinLayarDimensi\t159.9 x 75.1 x 8.4 mmBerat\t187 gUkuran Layar\t6.5 inches, Super AmoledResolusi Layar\t1080 x 2400 pixelsRasio Layar\t 20:9Kerapatan Layar\t~405 ppiFitur\tCorning Gorilla GlassHardware & Software Chipset\tQualcomm SM7225 Snapdragon 750G (8 nm)Processor\tOcta-core (2×2.2 GHz Kryo 570 & 6×1.8 GHz Kryo 570)GPU\tAdreno 619Sistem Operasi\tAndroid 11User Interface\tOne UI 3.0Ram / Memori Internal8GB / 128GBMemori Eksternal\tmicroSDXCKamera BelakangResolusi\t48 MP, f/2.0, 26mm (wide), 1/2.0″, 0.8µm, PDAF12 MP, f/2.2, 123˚ (ultrawide)5 MP, f/2.4, (macro)5 MP, f/2.2, (depth)Fitur\tLED flash, panorama, HDR, Video\t4K@30fps, 1080p@30/120fps; gyro-EISKamera DepanResolusi\t32 MP, f/2.2, 26mm (wide), 1/2.8″, 0.8µmFitur\tHDR, Video 4K@30fps, 1080p@30fpsKonektivitasSim Card\tDual-SIMInternet\tHSPA, LTE-A, Wi-Fi\tWi

In [13]:
sample_product.main_image_url()

'https://images.tokopedia.net/img/cache/500-square/VqbcmM/2021/4/18/b649ae51-57b4-47b4-9746-f596c9486a95.jpg.webp?ect=4g'

In [14]:
sample_product.rating()

'5.0'

In [15]:
sample_product.price()

'Rp4.798.000'

In [16]:
sample_product.store_name()

'Altec Cellular'

## Get product details for all url founds

In [20]:
product_details = []
for url in tqdm(product_urls):
    r = requests.get(url, headers=headers)
    product_page = ProductPageSoup(r.content)
    product_details.append({
        "title": product_page.title(),
        "description": product_page.description(),
        "image_link": product_page.main_image_url(),
        "price": product_page.price(),
        "rating": product_page.rating(),
        "store_name": product_page.store_name(),
    })

100%|██████████| 100/100 [01:03<00:00,  1.57it/s]


## Write to CSV

In [22]:
import csv

filename = "output.csv"
f = open(filename, 'w')

# create the csv writer
writer = csv.writer(f)

# write header
writer.writerow(["title","description","image_link","price","rating","store_name"])

# write data
for p in product_details:
    writer.writerow([p["title"], p["description"], p["image_link"], p["price"], p["rating"], p["store_name"]])

# close the file
f.close()