First, we send a GET request to Amazon's search page to retrieve the results for a query. For that we need to install the `requests` library and build the URL of the search.

In [None]:
pip install requests

We replace the spaces in the search query (the keywords) with '+'.

In [1]:
from urllib.parse import quote_plus

# quote_plus turns spaces into '+' and also percent-encodes every other
# character that is not safe inside a URL query string (e.g. '&' or '#'),
# which a plain .replace(' ', '+') would leave untouched.
search_query = quote_plus('Refrigerator')
# search_query = quote_plus('t-shirt women') --> 't-shirt+women'

In [4]:
# The URL of a search always has this standard format
# (sometimes with additional optional parameters):
# the prepared keywords go into `k`, and `page=1` selects the first page of results.
search_url = f"https://www.amazon.com/s?k={search_query}&page=1"
# Print the final URL so it can be checked by eye.
print(search_url)

https://www.amazon.com/s?k=Refrigerator&page=1


In [5]:
import requests

# Browser-like headers lower the chance of the request being blocked and tell
# the server which languages, content types and encodings we accept.
# NOTE(review): advertising "br" assumes a Brotli decoder is installed for
# requests/urllib3 — confirm, otherwise drop "br" from accept-encoding.
HEADERS = {
    "accept-language": "en-US,en;q=0.9",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "accept-encoding": "gzip, deflate, br",
}

# requests.get returns a Response object whose body is available through the
# `content` (raw bytes) and `text` (decoded string) attributes.
# Without a timeout the call could block forever if the server never answers.
response = requests.get(search_url, headers=HEADERS, timeout=30)  # retrieve the results from the first page
# check the type of the object
print(type(response))
# Show only the beginning of the body: the complete HTML page is very large
# and would flood the notebook output.
# print(response.content[:1000])
print(response.text[:1000])

<class 'requests.models.Response'>
<!doctype html><html lang="en-us" class="a-no-js" data-19ax5a9jf="dingo"><!-- sp:feature:head-start -->
<head><script>var aPageStart = (new Date()).getTime();</script><meta charset="utf-8"/>
<!-- sp:end-feature:head-start -->
<!-- sp:feature:csm:head-open-part1 -->

<script type='text/javascript'>var ue_t0=ue_t0||+new Date();</script>
<!-- sp:end-feature:csm:head-open-part1 -->
<!-- sp:feature:cs-optimization -->
<meta http-equiv='x-dns-prefetch-control' content='on'>
<link rel="dns-prefetch" href="https://images-na.ssl-images-amazon.com">
<link rel="dns-prefetch" href="https://m.media-amazon.com">
<link rel="dns-prefetch" href="https://completion.amazon.com">
<!-- sp:end-feature:cs-optimization -->
<!-- sp:feature:csm:head-open-part2 -->
<script type='text/javascript'>
window.ue_ihb = (window.ue_ihb || window.ueinit || 0) + 1;
if (window.ue_ihb === 1) {

var ue_csm = window,
    ue_hob = +new Date();
(function(d){var e=d.ue=d.ue||{},f=Date.now||funct

In [None]:
pip install httpx parsel loguru

The `parse_search` function parses the items of a single page of the search results (one response), but it skips the ads (sponsored results).

[Documentation about the parsel library](https://parsel.readthedocs.io/en/v1.0.1/parsel.html)

In [6]:
# The Selector class parses the response with CSS and XPath selectors (CSS selectors are the same ones normally used to style an HTML web page)
from parsel import Selector
# The logger prints colored log messages in the output, which report on the results and help to debug the code
from loguru import logger as log
# urljoin combines a base URL with a (possibly relative) link into one absolute URL
from urllib.parse import urljoin 

# This function parses a response page using parsel's Selector
# (as an alternative to Beautiful Soup).
# It takes any search response page as an argument and returns a list of dictionaries
# with the titles and URLs, which we will use later to get the ASINs of the products and their reviews.
def parse_search(resp):
    """Parse one search-results page into a list of product previews.

    Args:
        resp: HTTP response of a search page. Only its ``text`` (the HTML
            body) and ``url`` attributes are used.

    Returns:
        list[dict]: one ``{"url": ..., "title": ...}`` dictionary per kept
        result. ``url`` is absolute and has its query string removed;
        ``title`` is the text of the ``<span>`` inside the heading link, or
        ``None`` when that span is missing.

    Result boxes without a heading link, sponsored results and redirect links
    are skipped.
    """
    previews = []
    sel = Selector(text=resp.text)
    page_url = str(resp.url)

    # Each search hit sits in a <div class="s-result-item"> that carries
    # data-component-type="s-search-result" (visible in the browser dev tools).
    product_boxes = sel.css("div.s-result-item[data-component-type=s-search-result]")

    for box in product_boxes:
        href = box.css("h2>a::attr(href)").get()
        if not href:
            # Without a link urljoin would simply return the search page URL
            # and a bogus preview would be recorded, so skip the box.
            continue

        # Resolve the (usually relative) link once and reuse it below.
        full_url = urljoin(page_url, href)
        url = full_url.split("?")[0]

        # skip ads etc.: links that split into exactly six "/"-separated
        # parts, redirect links and sponsored ("sspa") links are dropped.
        has_six_parts = len(full_url.split("/")) == 6
        is_sponsored = "/slredirect/" in url or "sspa" in url
        if has_six_parts or is_sponsored:
            continue

        previews.append(
            {
                "url": url,
                "title": box.css("h2>a>span::text").get(),
            }
        )
    log.debug(f"found {len(previews)} product listings in {resp.url}")  # summary line for the debug log
    return previews

In [7]:
# Main scope: download the first results page and parse it.
# The timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(search_url, headers=HEADERS, timeout=30)
parse_search(response)

2022-10-14 01:16:12.811 | DEBUG    | __main__:parse_search:39 - found 16 product listings in https://www.amazon.com/s?k=Refrigerator&page=1


[{'url': 'https://www.amazon.com/Galanz-Retro-Refrigerator-Mounted-STAR/dp/B07QYXCFLW/ref=sr_1_2',
  'title': 'Galanz GLR10TBKEFR True Top Freezer Retro Refrigerator Frost Free, Dual Door Fridge, Adjustable Electrical Thermostat Control, Black, 10.0 Cu Ft'},
 {'url': 'https://www.amazon.com/Frigidaire-CUREFR331BK-Cubic-ft-Eraser-Fridge/dp/B07NJ8SM6D/ref=sr_1_3',
  'title': 'FRIGIDAIRE EFR331-BLACK 3.2 Cu ft Eraser Board Mini Compact Dorm Fridge (Black)'},
 {'url': 'https://www.amazon.com/Frigidaire-EFR376-BLACK-Fridge-Bottle-Opener/dp/B07LDWLNVF/ref=sr_1_4',
  'title': 'FRIGIDAIRE EFR376-BLACK 3.1 Cu Ft Black Retro Bar Fridge with Side Bottle Opener'},
 {'url': 'https://www.amazon.com/WHD-113FSS1-Freezer-Adjustable-Refrigerator-Stainless/dp/B00MWXSFM8/ref=sr_1_5',
  'title': 'Midea WHD-113FSS1 Compact Refrigerator, 3.1 cu ft, Stainless Steel'},
 {'url': 'https://www.amazon.com/Frigidaire-EFMIS129-Portable-Personal-Freon-Free/dp/B07KZLJ7PB/ref=sr_1_6',
  'title': 'Frigidaire RED EFMIS12

Now we need to get the results for the other pages.
- We need to determine how many pages there are in total.
- We need to loop over those pages.

The reference tutorial had a bug in reading the total number of result pages, which has been fixed below.
The function **search** does this: it takes a search query as its argument and collects the results of all pages into a single list of product previews.

In [8]:
# async def search(query, session):
async def search(query):
    """Scrape every result page of a search and collect the product previews.

    The first page is downloaded to read the total number of pages from the
    pagination bar. The remaining pages are then downloaded concurrently and
    every page is parsed with ``parse_search``.

    Args:
        query: search keywords already prepared for use in the URL
            (e.g. spaces replaced by '+').

    Returns:
        list[dict]: the previews returned by ``parse_search`` for all pages,
        first page first.
    """
    # Local imports keep the function usable regardless of which notebook
    # cells have already been executed.
    import asyncio
    import functools

    loop = asyncio.get_running_loop()

    def fetch(page_number):
        # requests is a blocking library and its Response is not awaitable,
        # so each download runs in the loop's default thread pool; the
        # returned future can be awaited or passed to asyncio.gather.
        url = f"https://www.amazon.com/s?k={query}&page={page_number}"
        return loop.run_in_executor(
            None, functools.partial(requests.get, url, headers=HEADERS, timeout=30)
        )

    log.info(f"{query}: scraping first page")

    # first, scrape the first page to find out how many pages there are in total
    first_page = await fetch(1)
    sel = Selector(text=first_page.text)

    # On the first page the last page number is rendered as a disabled <span>
    # (it has no hyperlink). The "previous" button is disabled as well, so it
    # is excluded explicitly.
    disabled_labels = sel.xpath(
        '//span[has-class("s-pagination-disabled")][not(has-class("s-pagination-previous"))]/text()'
    ).getall()
    numeric_labels = [label.strip() for label in disabled_labels if label.strip().isdecimal()]
    # No numeric label means there is no usable pagination bar (e.g. a single
    # page of results); fall back to the first page only instead of crashing.
    total_pages = int(numeric_labels[0]) if numeric_labels else 1
    log.info(f"{query}: found {total_pages} pages, scraping them concurrently")

    # now scrape the remaining pages concurrently (empty when total_pages == 1)
    other_pages = await asyncio.gather(
        *(fetch(page_number) for page_number in range(2, total_pages + 1))
    )

    # parse all search pages for product preview data
    previews = []
    for response in [first_page, *other_pages]:
        previews.extend(parse_search(response))

    log.info(f"{query}: found total of {len(previews)} product previews")
    return previews

In [11]:
# The asyncio library lets us run the requests concurrently instead of waiting for each one to finish before starting the next

In [None]:
# asyncio is part of the Python standard library, so nothing has to be
# installed; the PyPI package named "asyncio" is an outdated backport.


In [None]:
import asyncio
query= 'Refrigerator'.replace(' ', '+')
search(query)
# asyncio.run(search(query))
await search(query)

In [10]:
import json
import pandas as pd

In [11]:
# Write the search results to a JSON file and to an Excel file
def get_product_search_list(query):
    """Run the search for ``query`` and save the previews as JSON and Excel.

    Prompts for an integer file number ``i`` and writes
    ``query_results_{i}.json`` and ``query_results_{i}.xlsx`` to the current
    working directory.

    Args:
        query: search keywords, passed unchanged to ``search``.

    Raises:
        ValueError: if the entered file number is not an integer.
    """
    import asyncio
    import concurrent.futures
    import inspect

    data = search(query)
    if inspect.iscoroutine(data):
        # search() is a coroutine function, so calling it only creates a
        # coroutine; it has to be driven to completion to obtain the list.
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No event loop in this thread (plain script): run it directly.
            data = asyncio.run(data)
        else:
            # A loop is already running in this thread (as in Jupyter), where
            # asyncio.run refuses to start; use a worker thread with its own loop.
            with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
                data = pool.submit(asyncio.run, data).result()

    i = int(input("Enter the file number four the output: "))
    with open(f'query_results_{i}.json', 'w') as file:
        json.dump(data, file, indent=2)
    # print(json.dumps(data, indent=2))  # alternative: print the JSON in the output instead of writing a file

    df = pd.DataFrame(data)
    df.to_excel(f"query_results_{i}.xlsx", index=False)
    

In [12]:
# Entry point: build the '+'-separated query and export its search results.
query = '+'.join('Refrigerator'.split(' '))
get_product_search_list(query)