In [None]:
import pandas as pd
import json
from crawlbase import CrawlingAPI
from urllib.parse import quote
import os
import datetime
import time

# Load the API tokens from tokens.json; the file must contain the keys
# "normal_token" and "js_token".
with open('tokens.json') as f:
    tokens = json.loads(f.read())
normal_token = tokens["normal_token"]
js_token = tokens["js_token"]

# Amazon

Amazon has a dedicated Crawlbase scraper (`amazon-serp`), which the cells below use for the search result pages.

In [None]:
import requests
from requests.adapters import HTTPAdapter

def scrape_url_individual(url):
    """Fetch one page through the Crawlbase scraper endpoint and return its JSON.

    The request is sent through a session whose HTTPS adapter retries failed
    connections up to 5 times. The session is closed when the call finishes.

    Args:
        url: Address of the page to scrape. It is percent-encoded before being
            appended to the API URL.

    Returns:
        The decoded JSON body of the API response.

    Raises:
        requests.exceptions.RequestException: If the request fails or does not
            complete within the 60 second timeout.
        ValueError: If the response body is not valid JSON.
    """
    api_url = f'https://api.crawlbase.com/scraper?token={js_token}&url=' + quote(url)
    # Retries only apply when the adapter is mounted on a matching URL prefix
    # AND the request is issued through that session (not via requests.get).
    with requests.Session() as s:
        s.mount('https://', HTTPAdapter(max_retries=5))
        res = s.get(api_url, timeout=60)
        return res.json()

def remove_special_chars(mystr):
    """Return `mystr` with every non-alphanumeric character dropped.

    Characters are kept in their original order; a character is kept when
    its `isalnum()` method returns True.
    """
    kept = []
    for ch in mystr:
        if ch.isalnum():
            kept.append(ch)
    return ''.join(kept)

In [None]:
# Search terms to crawl; each term gets its own output folder.
search_terms = []

# Initialize the Crawling API with your Crawlbase token
api = CrawlingAPI({ 'token': js_token })
# Page number at which the crawl of a search term stops.
page_limit = 50
# Page requested first; it is reset to 1 at the end of every search term.
starting_page = 1
for search_term in search_terms:
    source_name = "amazon"
    folder_name = search_term
    # Timestamp of this run, used later to name the log and CSV files.
    ct = datetime.datetime.now()
    # makedirs creates missing parent folders and does not fail when the
    # folder (or only part of the path) already exists, unlike paired mkdir calls.
    os.makedirs(f"extracts/{source_name}/{folder_name}/items", exist_ok=True)

    # URL of the Amazon search page you want to scrape
    # NOTE(review): search_term is inserted into the URL as-is; terms with
    # spaces or '&' may need encoding -- confirm against the API behaviour.
    base_url = f'https://www.amazon.com/s?k={search_term}'
    amazon_search_url = f'https://www.amazon.com/s?k={search_term}&page={starting_page}'

    # options for Crawling API
    options = {
        'page_wait': 2000,
        'ajax_wait': 'true',
        'scraper': 'amazon-serp'
    }

    def get_reviews(url, max_tries=50, retry_delay=10):
        """Scrape one product page and save the result as a JSON file.

        The last path segment of `url` is dropped before scraping. The page is
        fetched with `scrape_url_individual`, retrying after every failure.
        A non-empty result is written to
        extracts/<source_name>/<folder_name>/items/amazon_<name>.json, where
        <name> is the first 30 characters of the final segment of the
        shortened URL.

        Args:
            url: Product URL as returned by the search scraper.
            max_tries: Maximum number of fetch attempts.
            retry_delay: Seconds to wait after a failed attempt.

        Returns:
            None. Nothing is written when `url` is empty or no attempt
            succeeds.
        """
        if not url:
            # Without a URL there is nothing to fetch; avoid retrying an empty request.
            print("no product url given, skipping")
            return
        url = '/'.join(url.split("/")[:-1])
        print(f"scraping {url}")
        # Pre-bind so the check below is safe even when every attempt fails.
        product_info = None
        for attempt in range(max_tries):
            print(f"number of tries: {attempt}")
            try:
                product_info = scrape_url_individual(url)
                break
            except Exception:
                # Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
                print(f"trying to get info on {url} again")
                print(f"number of tries: {attempt + 1}")
                time.sleep(retry_delay)
        if product_info:
            product_name = url.split("/")[-1]
            with open(f"extracts/{source_name}/{folder_name}/items/amazon_{product_name[:30]}.json", "w") as outfile:
                json.dump(product_info, outfile, indent=4)
        else:
            print(f"no product info retrieved for {url}")

    # One dict per product found for the current search term.
    product_data = []

    def scrape_url(url, max_tries=50, retry_delay=10):
        """Scrape one search result page and collect its products.

        The page is requested with `api.get(url, options)`, retrying after
        every exception. On a 200 status each listed product is appended to
        `product_data` and passed to `get_reviews`.

        Args:
            url: Search result page URL.
            max_tries: Maximum number of request attempts.
            retry_delay: Seconds to wait after a failed attempt.

        Returns:
            The "pagination" entry of the scraper result, or None when it is
            absent, the status code is not 200, or every attempt failed.
        """
        product_fields = (
            "url", "name", "asin", "image", "price", "isPrime",
            "offer", "customerReview", "customerReviewCount",
        )
        # Pre-bind so a run of failed attempts ends in a clean None return
        # instead of an UnboundLocalError.
        response = None
        for attempt in range(max_tries):
            print(f"number of tries: {attempt}")
            try:
                response = api.get(url, options)
                break
            except Exception:
                # Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
                print(f"trying to get info on {url} again")
                print(f"number of tries: {attempt + 1}")
                time.sleep(retry_delay)
        if response is None:
            print(f"Failed to retrieve the page after {max_tries} tries: {url}")
            return None
        if response['status_code'] != 200:
            print("Failed to retrieve the page. Status code:", response['status_code'])
            return None

        raw_body = response['body']
        try:
            # JSON is normally UTF-8; decoding it as latin1 garbles non-ASCII text.
            body_text = raw_body.decode('utf-8')
        except UnicodeDecodeError:
            # Fall back to the previous behaviour for bodies that are not valid UTF-8.
            body_text = raw_body.decode('latin1')
        scraper_result = json.loads(body_text)['body']
        for product in scraper_result.get("products", []):
            # Missing fields default to "" so every record has the same keys.
            product_info = {field: product.get(field, "") for field in product_fields}
            product_data.append(product_info)
            get_reviews(product.get("url", ""))
        return scraper_result.get("pagination")

    # Progress messages are printed and also collected for the log file.
    log_lines = []

    def _log(message):
        """Print a progress message and remember it for the log file."""
        print(message)
        log_lines.append(message)

    _log(f"starting crawler for {source_name}/{folder_name}")
    _log(f"scraping on {amazon_search_url}")

    pagination_info = scrape_url(amazon_search_url)

    if pagination_info:
        total_pages = pagination_info.get('totalPages', 1)
        if isinstance(total_pages, int):
            # Cap the range up front: an equality check after scraping never
            # fires when page_limit <= starting_page, which crawled every page.
            # Raise page_limit to scrape everything (unadvisable).
            last_page = min(total_pages, page_limit)
            for page_number in range(starting_page + 1, last_page + 1):
                _log(f"going through page number: {page_number}")
                page_url = f'{base_url}&page={page_number}'
                _log(f"executing scraper on: {page_url}")
                scrape_url(page_url)
            if page_limit <= total_pages:
                _log(f"page limit reached: {page_limit}")
    log_str = "\n".join(log_lines)
    # Timestamp with ':' / ' ' replaced by '_' and '-' removed, for use in file names.
    clean_ct = str(ct).replace(':', '_').replace('-', '').replace(' ', '_')
    log_file_name = f"extracts/{source_name}/{folder_name}/log_{source_name}_{clean_ct}"
    print(f"writing log file to {log_file_name}.txt")
    with open(log_file_name + '.txt', "w") as logfile:
        logfile.write(log_str)
    # The collected products are saved next to the log under the same base name.
    df = pd.DataFrame(product_data)
    df.to_csv(f"{log_file_name}.csv", index=False)
    print(df.shape)
    display(df.head(5))
    # Only the first search term may start on a later page; the rest start at 1.
    starting_page = 1