In [2]:
import time
from random import uniform
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Functions to extract product details
def get_title(soup):
    """Return the stripped text of the ``<span id="productTitle">`` element.

    Args:
        soup: Parsed product page; only its ``find`` method is used.

    Returns:
        The span's text with surrounding whitespace removed, or ``None``
        when any step of the lookup raises ``AttributeError`` (for example
        because ``find`` returned ``None``).
    """
    try:
        title_span = soup.find("span", attrs={"id": "productTitle"})
        result = title_span.text.strip()
    except AttributeError:
        result = None
    return result

def get_price(soup):
    """Return the stripped text of the first ``span.a-price-whole`` element.

    Args:
        soup: Parsed product page; only its ``find`` method is used.

    Returns:
        The element's text with surrounding whitespace removed, exactly as
        it appears in the markup (no numeric conversion is done), or
        ``None`` when the lookup raises ``AttributeError``.
    """
    price_selector = {"class": "a-price-whole"}
    try:
        price_span = soup.find("span", attrs=price_selector)
        return price_span.text.strip()
    except AttributeError:
        # find() gave back None (or soup itself was unusable).
        return None

def get_rating(soup):
    """Return the stripped text of the first ``span.a-icon-alt`` element.

    Args:
        soup: Parsed product page; only its ``find`` method is used.

    Returns:
        The element's text with surrounding whitespace removed, or ``None``
        when the lookup raises ``AttributeError``.
    """
    try:
        rating_span = soup.find("span", attrs={"class": "a-icon-alt"})
        rating_text = rating_span.text
        return rating_text.strip()
    except AttributeError:
        return None

def get_review_count(soup):
    """Return the stripped text of ``<span id="acrCustomerReviewText">``.

    Args:
        soup: Parsed product page; only its ``find`` method is used.

    Returns:
        The span's text with surrounding whitespace removed, or ``None``
        when the lookup raises ``AttributeError``.
    """
    count = None
    try:
        reviews_span = soup.find("span", attrs={"id": "acrCustomerReviewText"})
        count = reviews_span.text.strip()
    except AttributeError:
        # Leave the default in place when the span is missing.
        pass
    return count

def get_availability(soup):
    """Return the availability text from the ``<div id="availability">`` block.

    The text is read from the first ``<span>`` found inside that div.

    Args:
        soup: Parsed product page; only its ``find`` method is used.

    Returns:
        The span's text with surrounding whitespace removed, or the string
        ``"Not Available"`` when the div or its span cannot be read
        (any ``AttributeError`` during the lookup).
    """
    try:
        container = soup.find("div", attrs={"id": "availability"})
        status_span = container.find("span")
        status = status_span.text.strip()
    except AttributeError:
        status = "Not Available"
    return status

def get_product_url(link):
    """Build an absolute amazon.in URL from a product anchor's ``href``.

    Args:
        link: The ``href`` string. Site-relative paths (``/dp/...``) are the
            expected input; paths without a leading slash and already
            absolute URLs are also resolved correctly.

    Returns:
        The absolute URL as a string.
    """
    # urljoin resolves the href against the site root instead of blindly
    # concatenating, so "dp/X" does not become "https://www.amazon.indp/X"
    # and an absolute URL is not prefixed a second time.
    return urljoin("https://www.amazon.in", link)

def _fetch_with_retries(url, headers, attempts=3, timeout=10, retry_delay=2):
    """GET *url*, retrying when the request fails.

    Args:
        url: Address to request.
        headers: HTTP headers sent with every attempt.
        attempts: Maximum number of tries before giving up.
        timeout: Per-request timeout in seconds, passed to ``requests.get``.
        retry_delay: Seconds to sleep between two attempts.

    Returns:
        The ``requests.Response`` of the first attempt for which
        ``raise_for_status`` did not raise, or ``None`` when every attempt
        raised a ``requests.exceptions.RequestException``.
    """
    for attempt in range(1, attempts + 1):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Attempt {attempt} failed for {url}: {e}")
            # Only wait when another attempt will actually follow.
            if attempt < attempts:
                time.sleep(retry_delay)
        else:
            return response
    return None


if __name__ == '__main__':
    # Add your user agent
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
        'Accept-Language': 'en-US, en;q=0.5'
    }

    # Base URL (Without page number)
    BASE_URL = "https://www.amazon.in/s?k=laptops&crid=33BYNY0AIDK93&sprefix=laptops%2Caps%2C587&ref=nb_sb_noss_2"

    # Get the number of pages to scrape; keep asking until the answer is a
    # positive integer instead of crashing on a typo.
    while True:
        try:
            num_pages = int(input("Enter the number of pages to scrape: "))
        except ValueError:
            print("Please enter a whole number.")
            continue
        if num_pages >= 1:
            break
        print("Please enter a number of pages of at least 1.")

    # Data dictionary to store product details (one list per CSV column)
    data = {"title": [], "price": [], "rating": [], "reviews": [], "availability": [], "product_url": []}

    for page in range(1, num_pages + 1):
        print(f"Scraping page {page}...")
        URL = f"{BASE_URL}&page={page}"

        # Fetch the search-result page; skip it if every attempt fails.
        webpage = _fetch_with_retries(URL, HEADERS)
        if webpage is None:
            print(f"Failed to fetch page {page} after multiple attempts.")
            continue

        # Soup Object containing all data
        soup = BeautifulSoup(webpage.content, "html.parser")

        # Fetch links as List of Tag Objects
        links = soup.find_all("a", attrs={'class': 'a-link-normal s-no-outline'})

        # Keep only site-relative hrefs; anchors without an href are dropped.
        links_list = [
            href
            for href in (anchor.get('href') for anchor in links)
            if href and href.startswith("/")
        ]

        # Loop for extracting product details from each link
        for link in links_list:
            product_url = get_product_url(link)

            # Fetch the product page; skip the product if every attempt fails.
            product_page = _fetch_with_retries(product_url, HEADERS)
            if product_page is None:
                print(f"Failed to fetch product page {product_url} after multiple attempts.")
                continue

            product_soup = BeautifulSoup(product_page.content, "html.parser")

            # Extract and append product details (all columns grow together,
            # so the lists stay the same length for the DataFrame below).
            data['title'].append(get_title(product_soup))
            data['price'].append(get_price(product_soup))
            data['rating'].append(get_rating(product_soup))
            data['reviews'].append(get_review_count(product_soup))
            data['availability'].append(get_availability(product_soup))
            data['product_url'].append(product_url)

            # Pause to reduce request frequency (randomized delay)
            time.sleep(uniform(1, 3))

    # Create DataFrame, drop rows whose title is missing or empty, save to CSV
    amazon_df = pd.DataFrame.from_dict(data)
    amazon_df['title'] = amazon_df['title'].replace('', np.nan)
    amazon_df.dropna(subset=['title'], inplace=True)
    amazon_df.to_csv("amazon_laptop_data.csv", header=True, index=False)

    print("Data has been saved to 'amazon_laptop_data.csv'")

Enter the number of pages to scrape: 1
Scraping page 1...
Data has been saved to 'amazon_laptop_data.csv'
