In [29]:
import csv
import re
import time
import urllib.parse

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By

### Helper Functions

In [3]:
def setup(chromedriver, headless=False):
    """Create and return a Chrome WebDriver.

    Args:
        chromedriver: Path to the chromedriver executable.
        headless: When true, Chrome is started with the ``--headless`` flag.

    Returns:
        A new ``webdriver.Chrome`` instance. Nothing is stored globally; the
        caller keeps the returned driver.
    """
    # Imported here so the cell is self-contained.
    from selenium.webdriver.chrome.service import Service

    opt = webdriver.ChromeOptions()
    # Exclude Chrome's 'enable-logging' switch (presumably to keep the
    # console output quiet).
    opt.add_experimental_option('excludeSwitches', ['enable-logging'])
    if headless:
        opt.add_argument("--headless")

    try:
        # Selenium 4 style: the driver path is supplied through a Service.
        return webdriver.Chrome(
            service=Service(executable_path=chromedriver), options=opt)
    except TypeError:
        # Older Selenium releases do not accept `service=` and take the
        # driver path directly instead.
        return webdriver.Chrome(executable_path=chromedriver, options=opt)

In [4]:
def convert_cookies(path):
    """Read a cookies.txt export and return its cookies as name/value dicts.

    Intended for bypassing authentication with a file produced by the Chrome
    extension "Get Cookies.txt LOCALLY" (does not work on tokopedia).

    Args:
        path: Path to the cookies text file.

    Returns:
        A list of ``{"name": ..., "value": ...}`` dicts, one per cookie line.
        The first four lines of the file are skipped, and the name and value
        are taken from the last two whitespace-separated fields of each line.
    """
    cookies = []
    with open(path) as f:
        for index, row in enumerate(f):
            if index <= 3:
                # The first four lines are skipped (file header).
                continue
            fields = row.split()
            if len(fields) < 2:
                # Blank or truncated line: there is no name/value pair to
                # read, so skip it rather than fail with an IndexError.
                continue
            cookies.append({"name": fields[-2], "value": fields[-1]})
    return cookies

In [5]:
def get_categories(path):
    """Return the first-column value of every data row in a CSV file.

    The file is expected to contain a single category column. Its first line
    is consumed as the header by ``csv.DictReader`` and is not returned.

    Args:
        path: Path to a UTF-8 encoded, comma-delimited CSV file.

    Returns:
        A list with one entry per data row, in file order.
    """
    with open(path, encoding='utf-8') as handle:
        records = csv.DictReader(handle, delimiter=',')
        # Each record maps header -> cell; only the first cell is kept.
        return [next(iter(record.values())) for record in records]

### Main Code

In [6]:
# Load the search categories (one per row) from the cleaned category CSV.
CATEGORY_CSV_PATH = "resources/category_clean.csv"
categories = get_categories(CATEGORY_CSV_PATH)


In [None]:
# Start the browser. chromedriver.exe must be downloaded beforehand and its
# version has to match the installed Chrome.
CHROMEDRIVER_PATH = "resources/chromedriver/chromedriver.exe"
driver = setup(CHROMEDRIVER_PATH)

In [58]:
def _inner_html(element, xpath):
    """Return the innerHTML of the first node under `element` matching `xpath`."""
    return element.find_element(By.XPATH, xpath).get_attribute("innerHTML")


def _parse_sold(label):
    """Convert the text of a 'Terjual ...' label into an integer count."""
    if "rb" in label:
        # A label containing 'rb' is scaled by 1000. The decimal comma is
        # kept so that '1,5 rb' yields 1500; stripping it would read the
        # number as 15 and yield 15000.
        amount = re.sub('[^0-9,]', '', label).replace(',', '.')
        return int(round(float(amount) * 1000))
    return int(re.sub('[^0-9]', '', label))


def get_details(detail_container, category, rank):
    """Scrape one product card into a dict.

    Args:
        detail_container: Element exposing ``find_element(by, value)``; the
            product fields are looked up relative to it via XPath.
        category: Category the card was found under; stored as-is.
        rank: Position of the card in the results; stored as-is.

    Returns:
        A dict that always has 'rank', 'category', 'rating' and 'sold'.
        'name', 'price' (float, digits only) and 'location' are present only
        when the matching node was found and parsed. 'rating' and 'sold'
        default to 0 when they cannot be read.
    """
    detail = {'rank': rank, 'category': category}

    # Name, price and location are best effort: on any failure the key is
    # simply left out of the result.
    try:
        detail['name'] = _inner_html(
            detail_container, ".//div[@data-testid='spnSRPProdName']")
    except Exception:
        pass

    try:
        price_html = _inner_html(
            detail_container, ".//div[@data-testid='spnSRPProdPrice']")
        # Keep only the digits (drops currency prefix and separators).
        detail['price'] = float(re.sub('[^0-9]', '', price_html))
    except Exception:
        pass

    try:
        detail['location'] = _inner_html(
            detail_container, ".//span[@data-testid='spnSRPProdTabShopLoc']")
    except Exception:
        pass

    # Rating: located relative to the 'Terjual' node, as its second
    # preceding <span> sibling.
    try:
        sold_node = detail_container.find_element(
            By.XPATH, ".//*[contains(text(),'Terjual')]")
        detail['rating'] = float(
            _inner_html(sold_node, "preceding-sibling::span[2]"))
    except Exception:
        detail['rating'] = 0

    # Sold
    try:
        detail['sold'] = _parse_sold(_inner_html(
            detail_container, ".//span[contains(text(),'Terjual')]"))
    except Exception:
        detail['sold'] = 0

    return detail

In [67]:
data = list()  # Start from an empty result list (re-run this cell to reset)

In [None]:
# Visit result pages 1..LAST_PAGE of the product search for every category
# and append one record per product card to `data`.
LAST_PAGE = 10
# Offset added per page when computing a card's rank; presumably the number
# of cards on one results page -- TODO confirm.
RESULTS_PER_PAGE = 80
# Each page is read this many times, scrolling after each read (presumably so
# lazily rendered cards are picked up -- confirm). Records collected twice
# are identical and are removed by the later de-duplication step.
PASSES_PER_PAGE = 2

for page in range(1, LAST_PAGE + 1):
    print(f"On page {page}")
    for cat in categories:
        url_safe_cat = urllib.parse.quote(cat)
        url = f"https://www.tokopedia.com/search?st=product&q={url_safe_cat}&page={page}"
        print(f'Scraping for category {cat}..')

        driver.get(url)

        for _ in range(PASSES_PER_PAGE):
            time.sleep(1)
            containers = driver.find_elements(
                By.XPATH, "//div[@data-testid='master-product-card']")
            for index, container in enumerate(containers):
                try:
                    # Walk card -> div -> div -> second child div -> link.
                    detail_container = (
                        container.find_element(By.TAG_NAME, "div")
                        .find_element(By.TAG_NAME, "div")
                        .find_elements(By.XPATH, "./div")[1]
                        .find_element(By.TAG_NAME, "a"))
                except (IndexError, WebDriverException) as err:
                    # A card without the expected structure must not abort
                    # the whole run; report it and move on to the next card.
                    print(f'Skipping card {index + 1} of {cat}, page {page}: '
                          f'{type(err).__name__}')
                    continue
                data.append(get_details(
                    detail_container, cat,
                    RESULTS_PER_PAGE * (page - 1) + index + 1))
            driver.execute_script("window.scrollTo(0, 1000);")


In [69]:
# Remove duplicates
# Records are dicts (unhashable), so each one is compared through its tuple
# of items. dict.fromkeys keeps the first occurrence of every distinct record
# in scrape order, which makes the result order reproducible between runs.
# Records without a 'name' are dropped.
unique_records = (
    dict(items) for items in dict.fromkeys(tuple(d.items()) for d in data))
clean_data = [record for record in unique_records if 'name' in record]
len(clean_data)

39113

In [70]:
# Save dict to csv
# Columns are the union of the keys of all records, in first-seen order.
# Records may lack optional keys (e.g. 'price'); taking the header from the
# first record alone would make DictWriter reject any later record that has
# a key the first one lacks. Missing values are written as empty cells.
keys = list(dict.fromkeys(key for record in clean_data for key in record))

with open('resources/item_details.csv', 'w', encoding='utf-8', newline='') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(clean_data)