In [None]:
!pip install beautifulsoup4



In [None]:
!pip install lxml



In [None]:
!pip install requests



In [None]:
from bs4 import BeautifulSoup
import lxml
import requests
from urllib.parse import urljoin
import pandas as pd
import os
import shutil


In [None]:
# Test run on a URL
url = 'https://www.amazon.com/s?k=high+heels&crid=16WJUBN9DN36K&sprefix=high+heel%2Caps%2C112&ref=nb_sb_noss_1'

# Browser-like request headers; the scraping functions later in the notebook
# send these same headers with every page request.
custom_headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
                  'accept-language':'en-US,en;q=0.9'}

# requests waits indefinitely by default; the timeout (seconds) keeps this
# cell from hanging if the server never answers.
response = requests.get(url, headers=custom_headers, timeout=30)

print(response.status_code)
# 503 is a block by Amazon

200


In [None]:
# Test run to scrape ratings and title
soup = BeautifulSoup(response.text, "lxml")

# Scraping ratings
title_element = soup.select_one('#productTitle')
rating_element = soup.select_one('#acrPopover')
# select_one returns None when nothing matches, and the element may lack a
# title attribute, so guard both before doing string work on the value.
rating_text = rating_element.attrs.get('title') if rating_element else None

if rating_text is None:
    rating = None
    print("No rating element found on this page")
else:
    print(rating_text)

    # Strip the fixed suffix, leaving only the numeric part of the rating text.
    rating = rating_text.replace('out of 5 stars', '')
    print(rating)


4.0 out of 5 stars
4.0 


### Web Scraping sneakers

In [None]:
# Category page for sneakers
# URL: https://www.amazon.com/s?k=sneakers&ref=nb_sb_noss

# Product URLs already handed to get_product_info; parse_listing checks and
# updates this set so the same product page is not fetched twice.
visited_urls = set()

def get_product_info(url):
    """Fetch a product page and extract the URL of its main image.

    Args:
        url: Absolute URL of the product page to download.

    Returns:
        A dict ``{"image": src}`` where ``src`` is the ``src`` attribute of
        the ``#landingImage`` element, or ``None`` when the page has no such
        element. Returns ``None`` (no dict) when the request fails or the
        response status is not 200.
    """
    try:
        # Bound the wait (seconds) so one stalled connection cannot hang the crawl.
        response = requests.get(url, headers=custom_headers, timeout=30)
    except requests.RequestException as exc:
        # A single unreachable product link should not abort the whole listing.
        print(f"Error in getting webpage: {url} ({exc})")
        return None

    if response.status_code != 200:
        print(f"Error in getting webpage: {url}")
        return None

    soup = BeautifulSoup(response.text, "lxml")
    image_element = soup.select_one("#landingImage")
    image = image_element.attrs.get("src") if image_element else None

    return {
        "image": image
    }


def parse_listing(listing_url, pages_remaining=1):
    """Scrape product image info from a search-results page and its successors.

    Every product link on the page that is not yet in the module-level
    ``visited_urls`` set is recorded there and fetched with
    ``get_product_info``. The "next page" link is followed only while the
    page budget allows it.

    Args:
        listing_url: URL of the search-results page to scrape.
        pages_remaining: Maximum number of listing pages to process,
            counting this one.

    Returns:
        A list of the dicts returned by ``get_product_info``. Empty when the
        page budget is exhausted or the listing page could not be fetched.
    """
    print(f"Now scraping page: {listing_url}")
    print(f"Pages remaining: {pages_remaining}")
    if pages_remaining <= 0:
        return []  # Stop recursion when limit is reached

    try:
        # Bound the wait (seconds) so a stalled connection cannot hang the crawl.
        response = requests.get(listing_url, headers=custom_headers, timeout=30)
    except requests.RequestException as exc:
        print(f"Error in getting webpage: {listing_url} ({exc})")
        return []

    print(response.status_code)
    if response.status_code != 200:
        # An error page carries no product links worth parsing.
        print(f"Error in getting webpage: {listing_url}")
        return []

    soup_search = BeautifulSoup(response.text, "lxml")
    page_data = []

    # visited_urls is only mutated here, never rebound, so no `global` is needed.
    for link in soup_search.select("[data-asin] h2 a"):
        href = link.attrs.get("href")
        if not href:
            # urljoin would fall back to the listing URL itself for a missing href.
            continue
        full_url = urljoin(listing_url, href)
        if full_url in visited_urls:
            continue
        visited_urls.add(full_url)
        print(f"Scraping product from {full_url[:100]}", flush=True)
        product_info = get_product_info(full_url)
        if product_info:
            page_data.append(product_info)

    next_page_el = soup_search.select_one('a.s-pagination-next')
    next_href = next_page_el.attrs.get('href') if next_page_el else None
    if next_href and pages_remaining > 1:
        next_page_url = urljoin(listing_url, next_href)
        print(f'Scraping next page: {next_page_url}', flush=True)
        # Pass the reduced budget on; without it the default of 1 is used again
        # and the page limit never takes effect.
        page_data += parse_listing(next_page_url, pages_remaining - 1)

    return page_data


def main():
    """Scrape one page of sneaker search results and save the image URLs to sneakers.csv."""
    search_url = "https://www.amazon.com/s?k=sneakers&ref=nb_sb_noss"
    data = parse_listing(search_url, 1)
    # Name the column explicitly so the CSV keeps its "image" header even when
    # nothing was scraped; the download cell reads that column by name.
    df = pd.DataFrame(data, columns=["image"])
    df.to_csv("sneakers.csv", index=False)


if __name__ == '__main__':
    main()


Now scraping page: https://www.amazon.com/s?k=sneakers&ref=nb_sb_noss
Pages remaining: 1
200
Scraping product from https://www.amazon.com/Reebok-Womens-Extra-Sneaker-Chalk/dp/B09Z73K77H/ref=sr_1_1?dib=eyJ2IjoiMSJ9.G
Scraping product from https://www.amazon.com/adidas-Womens-Park-Sneaker-White/dp/B0BHPWNFQ8/ref=sr_1_2?dib=eyJ2IjoiMSJ9.G7
Scraping product from https://www.amazon.com/New-Balance-Womens-Sneaker-Nimbus/dp/B093QK8S8R/ref=sr_1_3?dib=eyJ2IjoiMSJ9.G
Scraping product from https://www.amazon.com/Rocket-Dog-Womens-Cheery-Sneaker/dp/B08DXLQBY4/ref=sr_1_4?dib=eyJ2IjoiMSJ9.G7
Scraping product from https://www.amazon.com/Skechers-Womens-DLites-Lace-up-Sneaker/dp/B014GNJS22/ref=sr_1_5?dib=eyJ2IjoiM
Scraping product from https://www.amazon.com/PUMA-Womens-Carina-Sneaker-Silver/dp/B07HJRV1YQ/ref=sr_1_6?dib=eyJ2IjoiMSJ9.G
Scraping product from https://www.amazon.com/Sorel-Womens-Kinetic-Breakthru-Sneaker/dp/B0BNDZWRTB/ref=sr_1_7?dib=eyJ2IjoiM
Scraping product from https://www.amazon.com/S

#### Extracting images of sneakers from URLs

In [None]:
df_sneakers = pd.read_csv("sneakers.csv")
sneakers_urls = df_sneakers['image'].tolist()

# Create directory (no error if it already exists)
image_dir = "/content/downloaded_images"
os.makedirs(image_dir, exist_ok=True)

for idx, url in enumerate(sneakers_urls, start=1):
    # Rows scraped without an image are read back from the CSV as NaN (a
    # float), not a URL. Skip them but keep idx aligned with the CSV row.
    if not isinstance(url, str) or not url:
        print(f"Skipping image {idx}: no image URL was scraped for this product")
        continue

    print(f"Downloading image {idx}...")
    try:
        # Bound the wait (seconds) so one stalled download cannot hang the loop.
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        # Define the path for saving the image
        filepath = os.path.join(image_dir, f"image_{idx}.jpg")

        # Open the file in binary-write mode and save the image
        with open(filepath, "wb") as img_file:
            img_file.write(response.content)
    except requests.RequestException as e:
        print(f"Error downloading {url}: {str(e)}")


Downloading image 1...
Downloading image 2...
Downloading image 3...
Downloading image 4...
Downloading image 5...
Downloading image 6...
Downloading image 7...
Downloading image 8...
Downloading image 9...
Downloading image 10...
Downloading image 11...
Downloading image 12...
Downloading image 13...
Downloading image 14...
Downloading image 15...
Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?
Downloading image 16...
Downloading image 17...
Downloading image 18...
Downloading image 19...
Downloading image 20...
Downloading image 21...
Downloading image 22...
Downloading image 23...
Downloading image 24...
Downloading image 25...
Downloading image 26...
Downloading image 27...
Downloading image 28...
Downloading image 29...
Downloading image 30...
Downloading image 31...
Downloading image 32...
Downloading image 33...
Downloading image 34...
Downloading image 35...
Downloading image 36...
Downloading image 37...
Downloading image 38...
Down

In [None]:
# Zip the image directory; the archive is written next to it with ".zip" appended.
archive_base = "/content/downloaded_images"
shutil.make_archive(archive_base, 'zip', archive_base)

# Send the archive to the browser (the Colab file browser can be used instead).
from google.colab import files
files.download(archive_base + ".zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Web Scraping sandals

In [None]:
# Category page for sandals (the search term in the URL is spelled "sandles")
# URL: https://www.amazon.com/s?k=sandles&crid=1SSSPWTX9WKFQ&sprefix=sandle%2Caps%2C141&ref=nb_sb_noss_2

# Product URLs already handed to get_product_info; parse_listing checks and
# updates this set so the same product page is not fetched twice.
visited_urls = set()


def get_product_info(url):
    """Fetch a product page and extract the URL of its main image.

    Args:
        url: Absolute URL of the product page to download.

    Returns:
        A dict ``{"image": src}`` where ``src`` is the ``src`` attribute of
        the ``#landingImage`` element, or ``None`` when the page has no such
        element. Returns ``None`` (no dict) when the request fails or the
        response status is not 200.
    """
    try:
        # Bound the wait (seconds) so one stalled connection cannot hang the crawl.
        response = requests.get(url, headers=custom_headers, timeout=30)
    except requests.RequestException as exc:
        # A single unreachable product link should not abort the whole listing.
        print(f"Error in getting webpage: {url} ({exc})")
        return None

    if response.status_code != 200:
        print(f"Error in getting webpage: {url}")
        return None

    soup = BeautifulSoup(response.text, "lxml")
    image_element = soup.select_one("#landingImage")
    image = image_element.attrs.get("src") if image_element else None

    return {
        "image": image
    }


def parse_listing(listing_url, pages_remaining=1):
    """Scrape product image info from a search-results page and its successors.

    Every product link on the page that is not yet in the module-level
    ``visited_urls`` set is recorded there and fetched with
    ``get_product_info``. The "next page" link is followed only while the
    page budget allows it.

    Args:
        listing_url: URL of the search-results page to scrape.
        pages_remaining: Maximum number of listing pages to process,
            counting this one.

    Returns:
        A list of the dicts returned by ``get_product_info``. Empty when the
        page budget is exhausted or the listing page could not be fetched.
    """
    print(f"Now scraping page: {listing_url}")
    print(f"Pages remaining: {pages_remaining}")
    if pages_remaining <= 0:
        return []  # Stop recursion when limit is reached

    try:
        # Bound the wait (seconds) so a stalled connection cannot hang the crawl.
        response = requests.get(listing_url, headers=custom_headers, timeout=30)
    except requests.RequestException as exc:
        print(f"Error in getting webpage: {listing_url} ({exc})")
        return []

    print(response.status_code)
    if response.status_code != 200:
        # An error page carries no product links worth parsing.
        print(f"Error in getting webpage: {listing_url}")
        return []

    soup_search = BeautifulSoup(response.text, "lxml")
    page_data = []

    # visited_urls is only mutated here, never rebound, so no `global` is needed.
    for link in soup_search.select("[data-asin] h2 a"):
        href = link.attrs.get("href")
        if not href:
            # urljoin would fall back to the listing URL itself for a missing href.
            continue
        full_url = urljoin(listing_url, href)
        if full_url in visited_urls:
            continue
        visited_urls.add(full_url)
        print(f"Scraping product from {full_url[:100]}", flush=True)
        product_info = get_product_info(full_url)
        if product_info:
            page_data.append(product_info)

    next_page_el = soup_search.select_one('a.s-pagination-next')
    next_href = next_page_el.attrs.get('href') if next_page_el else None
    if next_href and pages_remaining > 1:
        next_page_url = urljoin(listing_url, next_href)
        print(f'Scraping next page: {next_page_url}', flush=True)
        # Pass the reduced budget on; without it the default of 1 is used again
        # and the page limit never takes effect.
        page_data += parse_listing(next_page_url, pages_remaining - 1)

    return page_data


def main():
    """Scrape one page of sandal search results and save the image URLs to sandles.csv."""
    search_url = "https://www.amazon.com/s?k=sandles&crid=1SSSPWTX9WKFQ&sprefix=sandle%2Caps%2C141&ref=nb_sb_noss_2"
    data = parse_listing(search_url, 1)
    # Name the column explicitly so the CSV keeps its "image" header even when
    # nothing was scraped; the download cell reads that column by name.
    df = pd.DataFrame(data, columns=["image"])
    df.to_csv("sandles.csv", index=False)


if __name__ == '__main__':
    main()


Now scraping page: https://www.amazon.com/s?k=sandles&crid=1SSSPWTX9WKFQ&sprefix=sandle%2Caps%2C141&ref=nb_sb_noss_2
Pages remaining: 1
200
Scraping product from https://www.amazon.com/Soda-Womens-Nubuck-Syntheticsandals-numeric_8/dp/B07RZ4L311/ref=sr_1_1?crid=1
Scraping product from https://www.amazon.com/CUSHIONAIRE-Womens-Slide-Sandals-Brown/dp/B075PBVR9N/ref=sr_1_2?crid=1SSSPWTX
Scraping product from https://www.amazon.com/CUSHIONAIRE-Womens-Footbed-Sandal-Comfort/dp/B087D8YVJD/ref=sr_1_3?crid=1SSSP
Scraping product from https://www.amazon.com/Amazon-Essentials-Womens-Strappy-Natural/dp/B07FQPZWWW/ref=sr_1_4?crid=1SSSPW
Scraping product from https://www.amazon.com/KuaiLu-Support-Comfortable-Walking-Sandals/dp/B0C5ZRQS2X/ref=sr_1_5?crid=1SSS
Scraping product from https://www.amazon.com/FITORY-Sandals-Fashion-Leather-Slippers/dp/B0BN3HW9YZ/ref=sr_1_6?crid=1SSSPWT
Scraping product from https://www.amazon.com/Rekayla-Elastic-Sandals-Women-Khaki/dp/B07BMQ4RBK/ref=sr_1_7?crid=1SSSPWTX9WK

#### Extracting images from URLs

In [None]:
df_sandles = pd.read_csv("sandles.csv")
sandles_urls = df_sandles['image'].tolist()

# Create directory (no error if it already exists)
image_dir = "/content/sandles_images"
os.makedirs(image_dir, exist_ok=True)

for idx, url in enumerate(sandles_urls, start=1):
    # Rows scraped without an image are read back from the CSV as NaN (a
    # float), not a URL. Skip them but keep idx aligned with the CSV row.
    if not isinstance(url, str) or not url:
        print(f"Skipping image {idx}: no image URL was scraped for this product")
        continue

    print(f"Downloading image {idx}...")
    try:
        # Make a request to download the image; the timeout (seconds) keeps
        # one stalled download from hanging the loop.
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        # Define the path for saving the image
        filepath = os.path.join(image_dir, f"image_{idx}.jpg")

        # Open the file in binary-write mode and save the image
        with open(filepath, "wb") as img_file:
            img_file.write(response.content)
    except requests.RequestException as e:
        print(f"Error downloading {url}: {str(e)}")


Downloading image 1...
Downloading image 2...
Downloading image 3...
Downloading image 4...
Downloading image 5...
Downloading image 6...
Downloading image 7...
Downloading image 8...
Downloading image 9...
Downloading image 10...
Downloading image 11...
Downloading image 12...
Downloading image 13...
Downloading image 14...
Downloading image 15...
Downloading image 16...
Downloading image 17...
Downloading image 18...
Downloading image 19...
Downloading image 20...
Downloading image 21...
Downloading image 22...
Downloading image 23...
Downloading image 24...
Downloading image 25...
Downloading image 26...
Downloading image 27...
Downloading image 28...
Downloading image 29...
Downloading image 30...
Downloading image 31...
Downloading image 32...
Downloading image 33...
Downloading image 34...
Downloading image 35...
Downloading image 36...
Downloading image 37...
Downloading image 38...
Downloading image 39...
Downloading image 40...
Downloading image 41...
Downloading image 42...
D

In [None]:
# Zip the image directory; the archive is written next to it with ".zip" appended.
archive_base = "/content/sandles_images"
shutil.make_archive(archive_base, 'zip', archive_base)

# Send the archive to the browser (the Colab file browser can be used instead).
from google.colab import files
files.download(archive_base + ".zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Web Scraping heels

In [None]:
# Category page for heels
# URL: https://www.amazon.com/s?k=high+heels&crid=16WJUBN9DN36K&sprefix=high+heel%2Caps%2C112&ref=nb_sb_noss_1

# Browser-like request headers sent with every page request made by
# get_product_info and parse_listing in this cell.
custom_headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
                  'accept-language':'en-US,en;q=0.9'}

# Product URLs already handed to get_product_info; parse_listing checks and
# updates this set so the same product page is not fetched twice.
visited_urls = set()


def get_product_info(url):
    """Fetch a product page and extract the URL of its main image.

    Args:
        url: Absolute URL of the product page to download.

    Returns:
        A dict ``{"image": src}`` where ``src`` is the ``src`` attribute of
        the ``#landingImage`` element, or ``None`` when the page has no such
        element. Returns ``None`` (no dict) when the request fails or the
        response status is not 200.
    """
    try:
        # Bound the wait (seconds) so one stalled connection cannot hang the crawl.
        response = requests.get(url, headers=custom_headers, timeout=30)
    except requests.RequestException as exc:
        # A single unreachable product link should not abort the whole listing.
        print(f"Error in getting webpage: {url} ({exc})")
        return None

    if response.status_code != 200:
        print(f"Error in getting webpage: {url}")
        return None

    soup = BeautifulSoup(response.text, "lxml")
    image_element = soup.select_one("#landingImage")
    image = image_element.attrs.get("src") if image_element else None

    return {
        "image": image
    }


def parse_listing(listing_url, pages_remaining=1):
    """Scrape product image info from a search-results page and its successors.

    Every product link on the page that is not yet in the module-level
    ``visited_urls`` set is recorded there and fetched with
    ``get_product_info``. The "next page" link is followed only while the
    page budget allows it.

    Args:
        listing_url: URL of the search-results page to scrape.
        pages_remaining: Maximum number of listing pages to process,
            counting this one.

    Returns:
        A list of the dicts returned by ``get_product_info``. Empty when the
        page budget is exhausted or the listing page could not be fetched.
    """
    print(f"Now scraping page: {listing_url}")
    print(f"Pages remaining: {pages_remaining}")
    if pages_remaining <= 0:
        return []  # Stop recursion when limit is reached

    try:
        # Bound the wait (seconds) so a stalled connection cannot hang the crawl.
        response = requests.get(listing_url, headers=custom_headers, timeout=30)
    except requests.RequestException as exc:
        print(f"Error in getting webpage: {listing_url} ({exc})")
        return []

    print(response.status_code)
    if response.status_code != 200:
        # An error page carries no product links worth parsing.
        print(f"Error in getting webpage: {listing_url}")
        return []

    soup_search = BeautifulSoup(response.text, "lxml")
    page_data = []

    # visited_urls is only mutated here, never rebound, so no `global` is needed.
    for link in soup_search.select("[data-asin] h2 a"):
        href = link.attrs.get("href")
        if not href:
            # urljoin would fall back to the listing URL itself for a missing href.
            continue
        full_url = urljoin(listing_url, href)
        if full_url in visited_urls:
            continue
        visited_urls.add(full_url)
        print(f"Scraping product from {full_url[:100]}", flush=True)
        product_info = get_product_info(full_url)
        if product_info:
            page_data.append(product_info)

    next_page_el = soup_search.select_one('a.s-pagination-next')
    next_href = next_page_el.attrs.get('href') if next_page_el else None
    if next_href and pages_remaining > 1:
        next_page_url = urljoin(listing_url, next_href)
        print(f'Scraping next page: {next_page_url}', flush=True)
        # Pass the reduced budget on; without it the default of 1 is used again
        # and the page limit never takes effect.
        page_data += parse_listing(next_page_url, pages_remaining - 1)

    return page_data


def main():
    """Scrape one page of high-heel search results and save the image URLs to heels.csv."""
    search_url = "https://www.amazon.com/s?k=high+heels&crid=16WJUBN9DN36K&sprefix=high+heel%2Caps%2C112&ref=nb_sb_noss_1"
    data = parse_listing(search_url, 1)
    # Name the column explicitly so the CSV keeps its "image" header even when
    # nothing was scraped; the download cell reads that column by name.
    df = pd.DataFrame(data, columns=["image"])
    df.to_csv("heels.csv", index=False)


if __name__ == '__main__':
    main()


Now scraping page: https://www.amazon.com/s?k=high+heels&crid=16WJUBN9DN36K&sprefix=high+heel%2Caps%2C112&ref=nb_sb_noss_1
Pages remaining: 1
200
Scraping product from https://www.amazon.com/DREAM-PAIRS-Womens-Nubuck-Sandals/dp/B071Y39MNN/ref=sr_1_1?crid=16WJUBN9DN36K
Scraping product from https://www.amazon.com/DREAM-PAIRS-Womens-Platform-Sandals/dp/B07226QLZY/ref=sr_1_2?crid=16WJUBN9DN3
Scraping product from https://www.amazon.com/Fashare-Womens-Pointed-Bowtie-Buckle/dp/B07T6721C3/ref=sr_1_3?crid=16WJUBN9DN
Scraping product from https://aax-us-iad.amazon.com/x/c/RKk5iccpQC6AlDjxpOEsWz8AAAGOv9CTPQEAAAH2AQBvbm9fdHhuX2JpZDIgICBvbm
Scraping product from https://www.amazon.com/DREAM-PAIRS-Hi-Chunk-GOLD-RHINESTONE-HI-CHUNK-1/dp/B0CPJ4X3XL/ref=sr_1_4?crid
Scraping product from https://www.amazon.com/DREAM-PAIRS-Womens-Stiletto-Sandals/dp/B0785L9R34/ref=sr_1_5?crid=16WJUBN9DN3
Scraping product from https://www.amazon.com/DREAM-PAIRS-Platform-Gladiator-SDHS2205W/dp/B09MJVDWJJ/ref=sr_1_6?crid=

#### Extracting images from URLs

In [None]:
df_heels = pd.read_csv("heels.csv")
heels_urls = df_heels['image'].tolist()

# Create directory (no error if it already exists)
image_dir = "/content/heels_images"
os.makedirs(image_dir, exist_ok=True)

for idx, url in enumerate(heels_urls, start=1):
    # Rows scraped without an image are read back from the CSV as NaN (a
    # float), not a URL. Skip them but keep idx aligned with the CSV row.
    if not isinstance(url, str) or not url:
        print(f"Skipping image {idx}: no image URL was scraped for this product")
        continue

    print(f"Downloading image {idx}...")
    try:
        # Make a request to download the image; the timeout (seconds) keeps
        # one stalled download from hanging the loop.
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        # Define the path for saving the image
        filepath = os.path.join(image_dir, f"image_{idx}.jpg")

        # Open the file in binary-write mode and save the image
        with open(filepath, "wb") as img_file:
            img_file.write(response.content)
    except requests.RequestException as e:
        print(f"Error downloading {url}: {str(e)}")


Downloading image 1...
Downloading image 2...
Downloading image 3...
Downloading image 4...
Downloading image 5...
Downloading image 6...
Downloading image 7...
Downloading image 8...
Downloading image 9...
Downloading image 10...
Downloading image 11...
Downloading image 12...
Downloading image 13...
Downloading image 14...
Downloading image 15...
Downloading image 16...
Downloading image 17...
Downloading image 18...
Downloading image 19...
Downloading image 20...
Downloading image 21...
Downloading image 22...
Downloading image 23...
Downloading image 24...
Downloading image 25...
Downloading image 26...
Downloading image 27...
Downloading image 28...
Downloading image 29...
Downloading image 30...
Downloading image 31...
Downloading image 32...
Downloading image 33...
Downloading image 34...
Downloading image 35...
Downloading image 36...
Downloading image 37...
Downloading image 38...
Downloading image 39...
Downloading image 40...
Downloading image 41...
Downloading image 42...
D

In [None]:
# Zip the image directory; the archive is written next to it with ".zip" appended.
archive_base = "/content/heels_images"
shutil.make_archive(archive_base, 'zip', archive_base)

# Send the archive to the browser (the Colab file browser can be used instead).
from google.colab import files
files.download(archive_base + ".zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Web Scraping boots

In [None]:
# Category page for boots
# URL: https://www.amazon.com/s?k=boots&crid=1SBRGQ57J9WRF&sprefix=boots%2Caps%2C126&ref=nb_sb_ss_w_hit-vc-lth_boots_k0_1_5


# Browser-like request headers sent with every page request made by
# get_product_info and parse_listing in this cell.
custom_headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
                  'accept-language':'en-US,en;q=0.9'}

# Product URLs already handed to get_product_info; parse_listing checks and
# updates this set so the same product page is not fetched twice.
visited_urls = set()


def get_product_info(url):
    """Fetch a product page and extract the URL of its main image.

    Args:
        url: Absolute URL of the product page to download.

    Returns:
        A dict ``{"image": src}`` where ``src`` is the ``src`` attribute of
        the ``#landingImage`` element, or ``None`` when the page has no such
        element. Returns ``None`` (no dict) when the request fails or the
        response status is not 200.
    """
    try:
        # Bound the wait (seconds) so one stalled connection cannot hang the crawl.
        response = requests.get(url, headers=custom_headers, timeout=30)
    except requests.RequestException as exc:
        # A single unreachable product link should not abort the whole listing.
        print(f"Error in getting webpage: {url} ({exc})")
        return None

    if response.status_code != 200:
        print(f"Error in getting webpage: {url}")
        return None

    soup = BeautifulSoup(response.text, "lxml")
    image_element = soup.select_one("#landingImage")
    image = image_element.attrs.get("src") if image_element else None

    return {
        "image": image
    }


def parse_listing(listing_url, pages_remaining=1):
    """Scrape product image info from a search-results page and its successors.

    Every product link on the page that is not yet in the module-level
    ``visited_urls`` set is recorded there and fetched with
    ``get_product_info``. The "next page" link is followed only while the
    page budget allows it.

    Args:
        listing_url: URL of the search-results page to scrape.
        pages_remaining: Maximum number of listing pages to process,
            counting this one.

    Returns:
        A list of the dicts returned by ``get_product_info``. Empty when the
        page budget is exhausted or the listing page could not be fetched.
    """
    print(f"Now scraping page: {listing_url}")
    print(f"Pages remaining: {pages_remaining}")
    if pages_remaining <= 0:
        return []  # Stop recursion when limit is reached

    try:
        # Bound the wait (seconds) so a stalled connection cannot hang the crawl.
        response = requests.get(listing_url, headers=custom_headers, timeout=30)
    except requests.RequestException as exc:
        print(f"Error in getting webpage: {listing_url} ({exc})")
        return []

    print(response.status_code)
    if response.status_code != 200:
        # An error page carries no product links worth parsing.
        print(f"Error in getting webpage: {listing_url}")
        return []

    soup_search = BeautifulSoup(response.text, "lxml")
    page_data = []

    # visited_urls is only mutated here, never rebound, so no `global` is needed.
    for link in soup_search.select("[data-asin] h2 a"):
        href = link.attrs.get("href")
        if not href:
            # urljoin would fall back to the listing URL itself for a missing href.
            continue
        full_url = urljoin(listing_url, href)
        if full_url in visited_urls:
            continue
        visited_urls.add(full_url)
        print(f"Scraping product from {full_url[:100]}", flush=True)
        product_info = get_product_info(full_url)
        if product_info:
            page_data.append(product_info)

    next_page_el = soup_search.select_one('a.s-pagination-next')
    next_href = next_page_el.attrs.get('href') if next_page_el else None
    if next_href and pages_remaining > 1:
        next_page_url = urljoin(listing_url, next_href)
        print(f'Scraping next page: {next_page_url}', flush=True)
        # Pass the reduced budget on; without it the default of 1 is used again
        # and the page limit never takes effect.
        page_data += parse_listing(next_page_url, pages_remaining - 1)

    return page_data


def main():
    """Scrape one page of boot search results and save the image URLs to boots.csv."""
    search_url = "https://www.amazon.com/s?k=boots&crid=1SBRGQ57J9WRF&sprefix=boots%2Caps%2C126&ref=nb_sb_ss_w_hit-vc-lth_boots_k0_1_5"
    data = parse_listing(search_url, 1)
    # Name the column explicitly so the CSV keeps its "image" header even when
    # nothing was scraped; the download cell reads that column by name.
    df = pd.DataFrame(data, columns=["image"])
    df.to_csv("boots.csv", index=False)


if __name__ == '__main__':
    main()


Now scraping page: https://www.amazon.com/s?k=boots&crid=1SBRGQ57J9WRF&sprefix=boots%2Caps%2C126&ref=nb_sb_ss_w_hit-vc-lth_boots_k0_1_5
Pages remaining: 1
200
Scraping product from https://www.amazon.com/NORTIV-Military-Tactical-Leather-Motorcycle/dp/B07V8HJF5Y/ref=sr_1_1?crid=1SB
Scraping product from https://www.amazon.com/SOVANYOU-Leather-Platform-Chunky-Heeled/dp/B0BRKNKLRX/ref=sr_1_2?crid=1SBRGQ5
Scraping product from https://www.amazon.com/Soda-FLING-Chunky-Fashion-numeric_7_point_5/dp/B09DR3VCCH/ref=sr_1_3?crid=1SB
Scraping product from https://www.amazon.com/Wolverine-Overpass-Composite-Waterproof-Summer/dp/B01N6I3GXK/ref=sr_1_4?crid=
Scraping product from https://www.amazon.com/Timberland-White-Ledge-Waterproof-Ankle/dp/B002YOMJYU/ref=sr_1_5?crid=1SBRGQ5
Scraping product from https://www.amazon.com/Soda-Firm-Combat-Bootie-Numeric_7_Point_5/dp/B08H5TR6H4/ref=sr_1_6?crid=1SBRG
Scraping product from https://www.amazon.com/Soda-Pilot-Chelsea-Fashion-Numeric_8/dp/B08KHTLSMF/ref=sr_

#### Extracting images from URLs

In [None]:
df_boots = pd.read_csv("boots.csv")
boots_urls = df_boots['image'].tolist()

# Create directory (no error if it already exists)
image_dir = "/content/boots_images"
os.makedirs(image_dir, exist_ok=True)

for idx, url in enumerate(boots_urls, start=1):
    # Rows scraped without an image are read back from the CSV as NaN (a
    # float), not a URL. Skip them but keep idx aligned with the CSV row.
    if not isinstance(url, str) or not url:
        print(f"Skipping image {idx}: no image URL was scraped for this product")
        continue

    print(f"Downloading image {idx}...")
    try:
        # Make a request to download the image; the timeout (seconds) keeps
        # one stalled download from hanging the loop.
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        # Define the path for saving the image
        filepath = os.path.join(image_dir, f"image_{idx}.jpg")

        # Open the file in binary-write mode and save the image
        with open(filepath, "wb") as img_file:
            img_file.write(response.content)
    except requests.RequestException as e:
        print(f"Error downloading {url}: {str(e)}")


Downloading image 1...
Downloading image 2...
Downloading image 3...
Downloading image 4...
Downloading image 5...
Downloading image 6...
Downloading image 7...
Downloading image 8...
Downloading image 9...
Downloading image 10...
Downloading image 11...
Downloading image 12...
Downloading image 13...
Downloading image 14...
Downloading image 15...
Downloading image 16...
Downloading image 17...
Downloading image 18...
Downloading image 19...
Downloading image 20...
Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?
Downloading image 21...
Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?
Downloading image 22...
Downloading image 23...
Downloading image 24...
Downloading image 25...
Downloading image 26...
Downloading image 27...
Downloading image 28...
Downloading image 29...
Downloading image 30...
Downloading image 31...
Downloading image 32...
Downloading image 33...
Downloading image 34...
Downloa

In [None]:
# Zip the image directory; the archive is written next to it with ".zip" appended.
archive_base = "/content/boots_images"
shutil.make_archive(archive_base, 'zip', archive_base)

# Send the archive to the browser (the Colab file browser can be used instead).
from google.colab import files
files.download(archive_base + ".zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>