In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
import numpy as np
# Function to extract Product Name
def get_name(soup):
    """Return the product title from a parsed product page.

    Looks up the ``span`` whose id is ``productTitle`` and returns its text
    with leading/trailing whitespace stripped.

    Args:
        soup: Parsed page object exposing a BeautifulSoup-style ``find``.

    Returns:
        The stripped title text, or ``""`` when the lookup raises
        ``AttributeError`` (for example because no such span was found).
    """
    try:
        return soup.find("span", attrs={"id": 'productTitle'}).text.strip()
    except AttributeError:
        # Title element unavailable: fall back to an empty name.
        return ""

# Function to extract Product Price
def get_price(soup):
    """Return the displayed price text from a parsed product page.

    Finds the "price to pay" span and, inside it, the ``a-offscreen`` span,
    and returns that inner span's text unchanged (no stripping).

    Args:
        soup: Parsed page object exposing a BeautifulSoup-style ``find``.

    Returns:
        The price text, or ``""`` when either lookup fails with
        ``AttributeError``.
    """
    try:
        price_box = soup.find(
            "span",
            attrs={'class': 'a-price aok-align-center reinventPricePriceToPayMargin priceToPay'},
        )
        offscreen = price_box.find("span", attrs={"class": 'a-offscreen'})
        return offscreen.text
    except AttributeError:
        # Either span is missing on this page layout.
        return ""
# Function to extract Product Rating
def get_rating(soup):
    """Return the star-rating text from a parsed product page.

    Reads the text of the first ``span`` with class ``a-icon-alt``
    (e.g. ``"4.1 out of 5 stars"``).

    The lookup is done on the ``soup`` argument. The previous version read
    the notebook-level ``new_soup`` variable instead, so it only worked when
    called from the scraping loop and ignored the page actually passed in.

    Args:
        soup: Parsed page object exposing a BeautifulSoup-style ``find``.

    Returns:
        The rating text, or ``""`` when the span is not available.
    """
    try:
        rating = soup.find("span", attrs={"class": 'a-icon-alt'}).text
    except AttributeError:
        # No rating element on this page.
        rating = ""

    return rating

# Function to extract Number of Ratings
def get_rating_count(soup):
    """Return the number of ratings shown on a parsed product page.

    Takes the string of the ``span`` with id ``acrCustomerReviewText``,
    strips it, and returns everything before the first space.

    Args:
        soup: Parsed page object exposing a BeautifulSoup-style ``find``.

    Returns:
        The leading token as a string, or the integer ``0`` when the span
        (or its ``.string``) is unavailable.
    """
    try:
        label = soup.find("span", attrs={'id': 'acrCustomerReviewText'}).string.strip()
        # Keep only the text before the first space.
        leading_token, _, _ = label.partition(" ")
        return leading_token
    except AttributeError:
        return 0
#Function to extract Number of total Customer Reviews
def get_review_count(soup):
    """Return the total number of customer reviews for a product.

    Follows the "all reviews" link found on the product page (``a`` with
    class ``a-link-emphasis a-text-bold``), downloads that page, and returns
    the third-from-last whitespace-separated token of the summary ``div``
    (class ``a-row a-spacing-base a-size-base``).

    The link is looked up on the ``soup`` argument. The previous version used
    the notebook-level ``new_soup`` variable, so the function depended on
    hidden state from the scraping loop.

    Args:
        soup: Parsed product page exposing a BeautifulSoup-style ``find``.

    Returns:
        The review-count token as a string, or the integer ``0`` when the
        link or summary cannot be found/parsed or the request fails.
    """
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.44', 'Accept-Language': 'en-US, en;q=0.5'}
    try:
        href = soup.find("a", attrs={"class": 'a-link-emphasis a-text-bold'}).get('href')
        # Build the URL before touching the network so a missing href
        # (None) fails fast with TypeError and no request is attempted.
        reviews_url = "https://www.amazon.in" + href
        reviews_page = requests.get(reviews_url, headers=headers)
        reviews_soup = BeautifulSoup(reviews_page.content, "html.parser")
        tokens = reviews_soup.find("div", attrs={"class": 'a-row a-spacing-base a-size-base'}).string.split()
        review_count = tokens[-3]
    except (AttributeError, IndexError, TypeError):
        # Missing link/summary element, missing href, or a summary with
        # fewer than three tokens.
        review_count = 0
    except requests.RequestException:
        # Best effort: a failed reviews-page request must not abort the
        # whole scrape.
        review_count = 0

    return review_count
#Function to extract Description
def get_description(soup):
    """Return the concatenated feature bullets of a parsed product page.

    Walks ``div#feature-bullets`` -> its bullet ``ul`` -> every ``li`` and
    joins the text of each item's ``a-list-item`` span with no separator.

    Args:
        soup: Parsed page object exposing BeautifulSoup-style ``find`` /
            ``find_all``.

    Returns:
        The joined bullet text, or ``""`` if any lookup along the way fails
        with ``AttributeError`` (including on a single bullet).
    """
    try:
        bullets = (
            soup.find("div", attrs={"id": 'feature-bullets'})
            .find("ul", attrs={"class": "a-unordered-list a-vertical a-spacing-mini"})
            .find_all('li')
        )
        return "".join(
            bullet.find("span", attrs={"class": 'a-list-item'}).text
            for bullet in bullets
        )
    except AttributeError:
        return ""
#Function to extract Manufacturer and ASIN
def get_manufacturer_and_asin(soup):
    """Return ``(manufacturer, ASIN)`` from the detail-bullets list of a page.

    Iterates over the ``li`` items of the detail bullet list inside
    ``div#detailBullets_feature_div``. For each item the first word of the
    bold label decides what the item holds; the value is the text of the
    last ``span`` inside the item. Only the first ``Manufacturer`` and the
    first ``ASIN`` item are used.

    Items that lack the expected spans or have an empty label are skipped
    individually, so a single malformed item no longer discards values that
    were already extracted (and an empty label no longer raises IndexError).

    Args:
        soup: Parsed page object exposing BeautifulSoup-style ``find`` /
            ``find_all``.

    Returns:
        A ``(manufacturer, ASIN)`` tuple of strings; each element is ``""``
        when the corresponding entry is not present.
    """
    manufacturer = ""
    ASIN = ""
    try:
        items = (
            soup.find("div", attrs={"id": 'detailBullets_feature_div'})
            .find("ul", attrs={"class": 'a-unordered-list a-nostyle a-vertical a-spacing-none detail-bullet-list'})
            .find_all('li')
        )
    except AttributeError:
        # The page has no detail-bullets section in this layout.
        return manufacturer, ASIN

    manufacturer_pending = True
    asin_pending = True
    for item in items:
        entry = item.find("span", attrs={"class": 'a-list-item'})
        label = entry.find("span", attrs={"class": 'a-text-bold'}) if entry is not None else None
        if label is None:
            # Not a "label: value" bullet; ignore it.
            continue
        label_words = label.text.split()
        if not label_words:
            continue

        if manufacturer_pending and label_words[0] == 'Manufacturer':
            # The value is the last span inside the bullet.
            manufacturer = entry.find_all("span")[-1].text
            manufacturer_pending = False
        if asin_pending and label_words[0] == 'ASIN':
            ASIN = entry.find_all("span")[-1].text
            asin_pending = False

        if not (manufacturer_pending or asin_pending):
            # Both values found; the remaining bullets cannot change them.
            break

    return manufacturer, ASIN
#Function to extract the Product Description
def get_product_description(soup):
    """Return the long product description from a parsed product page.

    Reads the first ``p`` inside ``div#productDescription`` and returns its
    text with surrounding whitespace stripped.

    Args:
        soup: Parsed page object exposing a BeautifulSoup-style ``find``.

    Returns:
        The stripped paragraph text, or ``""`` when the div or paragraph
        lookup fails with ``AttributeError``.
    """
    try:
        container = soup.find("div", attrs={'id': 'productDescription'})
        paragraph = container.find('p')
        return paragraph.text.strip()
    except AttributeError:
        return ""

In [2]:
# Browser-like request headers (User-Agent and Accept-Language) sent with every request
Headers = ({'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.44', 'Accept-Language': 'en-US, en;q=0.5'})

# URL of the first search-results page to scrape
url = "https://www.amazon.in/s?k=bags&crid=2M096C61O4MLT&qid=1653308124&sprefix=ba%2Caps%2C283&ref=sr_pg_1"
page = 1

# One list per output column; every scraped product appends exactly one value to each list
d = {"Product URL":[],"Product Name":[], "Product Price":[], "Rating":[], "Number of ratings":[], "Number of reviews":[],"Description":[],"ASIN":[],"Product Description":[],"Manufacturer":[]}

# Walk through the search results, at most 20 pages
while page <= 20:
    print(f"Scraping page {page}")
    response = requests.get(url, headers=Headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    products = soup.find_all('div', {'data-component-type': 's-search-result'})

    # Visit each product page and extract the required fields
    for product in products:
        link = product.find('a', {'class': 'a-link-normal s-no-outline'})
        if link is None or not link.get('href'):
            # Result card without a product link: skip it instead of
            # crashing with "'NoneType' object is not subscriptable".
            continue
        product_url = 'https://www.amazon.in' + link['href']
        new_webpage = requests.get(product_url, headers=Headers)
        new_soup = BeautifulSoup(new_webpage.content, "html.parser")

        # Part 1: listing-level fields
        name = get_name(new_soup)
        price = get_price(new_soup)
        rating = get_rating(new_soup)
        rating_count = get_rating_count(new_soup)
        review_count = get_review_count(new_soup)
        # Part 2: detail fields
        manufacturer, asin = get_manufacturer_and_asin(new_soup)
        description = get_description(new_soup)
        product_description = get_product_description(new_soup)

        # Append all columns together, only after every value has been
        # computed, so the lists always stay the same length.
        d["Product URL"].append(product_url)
        d['Product Name'].append(name)
        d['Product Price'].append(price)
        d['Rating'].append(rating)
        d['Number of ratings'].append(rating_count)
        d["Number of reviews"].append(review_count)
        d['Description'].append(description)
        d['ASIN'].append(asin)
        d['Product Description'].append(product_description)
        d['Manufacturer'].append(manufacturer)
        time.sleep(2)  # Delay between product requests

    # Locate the "next page" link of the current results page
    next_page = soup.find('a', {'class': 's-pagination-item s-pagination-next s-pagination-button s-pagination-separator'})
    if next_page is None:
        print("No next page")
        break
    url = 'https://www.amazon.in' + next_page['href']
    page += 1
    time.sleep(2) # Adding delay to avoid getting blocked by Amazon

# Build the dataframe, drop rows without a product name, and export to CSV
amazon_df = pd.DataFrame.from_dict(d)
# Assign the result back: an in-place replace on a column selection does not
# update the parent frame under pandas Copy-on-Write.
amazon_df['Product Name'] = amazon_df['Product Name'].replace('', np.nan)
amazon_df = amazon_df.dropna(subset=['Product Name'])
amazon_df.to_csv("amazon_data.csv", header=True, index=False)
amazon_df

Scraping page 1
Scraping page 2
Scraping page 3
Scraping page 4
Scraping page 5
Scraping page 6
Scraping page 7
Scraping page 8
Scraping page 9
Scraping page 10
Scraping page 11
Scraping page 12
Scraping page 13
Scraping page 14
Scraping page 15
Scraping page 16
Scraping page 17
Scraping page 18
Scraping page 19
Scraping page 20
No next page


Unnamed: 0,Product URL,Product Name,Product Price,Rating,Number of ratings,Number of reviews,Description,ASIN,Product Description,Manufacturer
0,https://www.amazon.in/American-Tourister-AMT-S...,American Tourister 32 Ltrs Black Casual Backpa...,"₹1,199",4.1 out of 5 stars,52288,7878,"Laptop Compatibility: No, Strap Type: Adjusta...",,,
1,https://www.amazon.in/Skybags-Brat-Black-Casua...,Skybags Brat Black 46 Cms Casual Backpack,₹659,4.1 out of 5 stars,3770,364,Combination of functional & safety features i...,B08Z1HHHTD,Ideal for a college student who does not carry...,VIP Industries Ltd
2,https://www.amazon.in/Wesley-Milestone-Waterpr...,Wesley Milestone 2.0 Casual Waterproof Laptop ...,₹565,4.3 out of 5 stars,9794,1192,30L Capacity: The Backpack has a padded lapto...,,,
3,https://www.amazon.in/Lavie-Sport-Duffle-Lugga...,Lavie Sport Lino Large Size 63 cms Wheel Duffl...,₹949,3.9 out of 5 stars,6297,673,The Lino wheel Duffle Bag from Lavie Sport is...,B097RJ22Q3,This unisex stylish travel Bag is made of prem...,Bagzone Lifestyle Pvt Ltd
4,https://www.amazon.in/ADISA-Laptop-Backpack-Of...,ADISA 15.6 inch Laptop Backpack Office Bag Col...,₹499,3.8 out of 5 stars,566,71,Material: Water Resistant Light-Weight Polyes...,B09TPX22NF,,ADISA
...,...,...,...,...,...,...,...,...,...,...
313,https://www.amazon.in/Half-Moon-Waterproof-Lap...,Half Moon 45L Hammer Waterproof Laptop Travell...,"₹1,299",4.1 out of 5 stars,1013,103,ROOMY ENOUGH : This Rucksack bag is lightweig...,B09XLNNQC1,This Rucksack backpack ensures complete safety...,AB Amazing Bags
314,https://www.amazon.in/Lavie-Sport-Business-Bac...,Lavie Sport Mode Gear 30L Business Laptop Back...,"₹1,409",4.1 out of 5 stars,9,8,A dependable laptop backpack! A Lavie Sport M...,B0BQMTLDGY,,Bagzone Lifestyle Pvt Ltd
315,https://www.amazon.in/Wildcraft-Laptop-Trident...,wildcraft Nylon 40 ltrs Black Laptop Bag (Trid...,"₹2,999",4.4 out of 5 stars,564,138,"Outer Material: Nylon, Color: Black Water R...",B019DWD8SQ,A stylish and trendy laptop backpacks from the...,Wildcraft
316,https://www.amazon.in/Wildcraft-Laptop-Trident...,wildcraft Nylon 40 ltrs Black Laptop Bag (Trid...,"₹2,999",4.4 out of 5 stars,564,138,"Outer Material: Nylon, Color: Black Water R...",B019DWD8SQ,A stylish and trendy laptop backpacks from the...,Wildcraft
