In [None]:
# Import necessary libraries
import requests                    # For sending HTTP requests to web pages
from bs4 import BeautifulSoup      # For parsing HTML content
import pandas as pd                # For storing data in tabular format (DataFrame)
import random                      # For selecting random user-agents to avoid being blocked
import time                        # (Optional) Can be used to slow down requests to avoid detection

# Pool of browser user-agent strings. One is drawn at random below so the
# requests sent by this script carry a browser-like User-Agent header.
user_agents = [
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
    (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
    (
        "Mozilla/5.0 (X11; Linux x86_64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
]

# The user-agent is picked once here; the same headers dict is then reused
# for every request made later in the script.
_selected_user_agent = random.choice(user_agents)
headers = {}
headers["User-Agent"] = _selected_user_agent
headers["Accept-Language"] = "en-Us,en;q=0.9"

# Network settings shared by every request made below
REQUEST_TIMEOUT = 15            # Seconds to wait for a response before giving up
REQUEST_DELAY_RANGE = (1, 3)    # Min/max seconds to pause between product-page requests
BASE_URL = "https://www.amazon.in"

# Lists to store scraped data. They are always appended to together, so they
# stay the same length (one entry per product link that was processed).
product_name = []
product_price = []
product_rating = []

# URL of the Amazon product listing page (here: kitchen category, page 1)
url = "https://www.amazon.in/s?i=kitchen&bbn=81107433031&rh=n%3A81107433031%2Cp_85%3A10440599031&page=1&_encoding=UTF8&content-id=amzn1.sym.58c90a12-100b-4a2f-8e15-7c06f1abe2be&pd_rd_r=eb705f4e-d34b-456d-a496-b52f6602d46b&pd_rd_w=hwFSy&pd_rd_wg=MVPlH&qid=1745483311&xpid=thnXwLcmHk4-q&ref=sr_pg_2"

# Send GET request to fetch HTML content. The timeout keeps the script from
# hanging forever on a stalled connection, and raise_for_status() stops the run
# on an HTTP error response instead of silently scraping an error page and
# producing an empty result.
response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
response.raise_for_status()

# Parse HTML using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# Find all product links using their CSS class
links = soup.find_all("a", attrs={"class": "a-link-normal s-line-clamp-4 s-link-style a-text-normal"})

# Loop through each product link
for link in links:
    href = link.get('href')  # Product URL as written in the listing page
    if not href:
        continue  # Anchor without a target: there is no detail page to fetch

    # Form the complete product URL; only prefix the site when the link is relative
    product_link = href if href.startswith("http") else BASE_URL + href

    try:
        # Send GET request to product detail page
        product_response = requests.get(product_link, headers=headers, timeout=REQUEST_TIMEOUT)
        product_response.raise_for_status()
    except requests.RequestException as exc:
        # One failing page must not discard everything collected so far:
        # record the product as "NA" and carry on with the next link.
        print(f"Could not fetch {product_link}: {exc}")
        name = price = rating = None
    else:
        product_soup = BeautifulSoup(product_response.content, "html.parser")

        # Extract product name, price, and rating. Each lookup returns the first
        # matching element on the page, or None when nothing matches.
        name = product_soup.find("span", attrs={"id": "productTitle"})
        price = product_soup.find("span", attrs={"class": "a-price-whole"})
        rating = product_soup.find("span", attrs={"class": "a-icon-alt"})

    # Append data to lists (with fallback "NA" if data is missing)
    product_name.append(name.text.strip() if name else "NA")
    # rstrip('.') drops a trailing "." left at the end of the whole-number price text
    product_price.append(price.text.strip().rstrip('.') if price else "NA")
    product_rating.append(rating.text.strip() if rating else "NA")

    # Pause for a short, random interval so requests are not sent back-to-back
    time.sleep(random.uniform(*REQUEST_DELAY_RANGE))

# Report how many values were collected for each scraped field
print(f"Total number of records:\n Product Names: {len(product_name)}\n Prices: {len(product_price)}\n Ratings: {len(product_rating)}")

# Assemble the scraped lists into a table, one column per field
data = dict(
    product_name=product_name,
    price=product_price,
    ratings=product_rating,
)
df = pd.DataFrame.from_dict(data)

# Write the table to a CSV file, leaving out the DataFrame index column
df.to_csv("amazon_products_single_page.csv", index=False)

print("Scraping completed. Data saved to 'amazon_products_single_page.csv'")

In [None]:
# Preview the first five rows of the scraped table (handy for a quick sanity check)
df.head(n=5)