In [1]:
import os
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock
import random

# Destination workbook; save_to_excel() reads it back (when it exists) and rewrites it.
save_path = r"E:\DHP\Group_Project\amazon_product_mobiles_data.xlsx"

# Rows scraped since the last save; extended by the driver loop and cleared by save_to_excel().
all_data = []
lock = Lock()  # Guards every access to all_data

# Inclusive page range to scrape, read interactively (int() raises ValueError on non-numeric input)
start_page = int(input("Enter the starting page number: "))
end_page = int(input("Enter the ending page number: "))

# Worker count passed to ThreadPoolExecutor(max_workers=...)
NUM_THREADS = 8
MAX_RETRIES = 7  # Maximum fetch attempts per page inside scrape_page()
SAVE_INTERVAL = 10  # Flush all_data to disk after every 10 completed pages (not products)

# Pool of User-Agent header values; scrape_page() picks one with random.choice()
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/89.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:35.0) Gecko/20100101 Firefox/35.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36 Edge/91.0.864.59"
]

def _text_or_none(tag):
    """Return the text content of a BeautifulSoup tag, or None if the tag was not found."""
    return tag.get_text() if tag is not None else None


def _parse_product(product):
    """Build one [title, price, rating, product_url] row from a search-result element."""
    from urllib.parse import urljoin

    # NOTE(review): this takes the first <span> inside the result element;
    # confirm against the live markup that it is really the product title.
    title = _text_or_none(product.find("span"))
    price = _text_or_none(product.find("span", class_="a-price-whole"))
    rating = _text_or_none(product.find("span", class_="a-icon-alt"))

    # Start from None for every product so a result without a usable link
    # neither raises NameError nor inherits the previous product's URL.
    product_url = None
    link = product.find("a", class_="a-link-normal")
    href = link.get("href") if link is not None else None
    if href:
        # urljoin yields the same URL as plain concatenation for "/path"
        # hrefs and leaves an already-absolute href untouched.
        product_url = urljoin("https://www.amazon.in", href)

    return [title, price, rating, product_url]


def scrape_page(page_number):
    """Scrape one Amazon search-results page, retrying on transient failures.

    Args:
        page_number: Page index substituted into the search URL.

    Returns:
        A list of ``[title, price, rating, product_url]`` rows, one per search
        result. Any field may be ``None`` when the matching element is absent.
        An empty list is returned when the page answers with a status other
        than 200/503, when the results container is missing from the HTML, or
        when all ``MAX_RETRIES`` attempts fail.
    """
    site = f"https://www.amazon.in/s?k=mobiles&crid=2HHJVMGRXCC2I&qid=1744563410&sprefix=mobiles%2Caps%2C783&xpid=k4m8Obu5IevXA&ref=sr_pg_{page_number}"

    delay = 3  # Base backoff in seconds; doubled after every failed attempt

    for attempt in range(1, MAX_RETRIES + 1):
        # Pick a User-Agent per attempt so retries actually rotate it.
        headers = {"User-Agent": random.choice(USER_AGENTS)}

        # Only the network call is guarded: parsing problems must not be
        # mistaken for transport errors and retried.
        try:
            response = requests.get(site, timeout=30, headers=headers)
        except requests.exceptions.RequestException as e:
            print(f"⚠️ Error on page {page_number} (attempt {attempt}/{MAX_RETRIES}): {e}")
            if attempt < MAX_RETRIES:  # No point sleeping before giving up
                time.sleep(delay + random.uniform(1, 3))  # Randomized retry delay
            delay *= 2  # Exponential backoff
            continue

        if response.status_code == 503:  # Service Unavailable: back off and retry
            wait_time = delay + random.uniform(1, 3)  # Randomized backoff
            print(f"⚠️ Service unavailable on page {page_number}, retrying in {wait_time:.2f}s (Attempt {attempt}/{MAX_RETRIES})...")
            if attempt < MAX_RETRIES:  # No point sleeping before giving up
                time.sleep(wait_time)
            delay *= 2  # Exponential backoff
            continue

        if response.status_code != 200:
            print(f"❌ Failed to fetch page {page_number}: {response.status_code}")
            return []

        soup = BeautifulSoup(response.text, "html.parser")

        # A 200 response without the results container would otherwise raise
        # IndexError, which would propagate out of the worker thread.
        main_slot = soup.find("div", class_="s-main-slot")
        if main_slot is None:
            print(f"❌ No search results container found on page {page_number}.")
            return []

        products = main_slot.find_all("div", {"data-component-type": "s-search-result"})
        page_data = [_parse_product(product) for product in products]

        print(f"✅ Scraped page {page_number} successfully!")
        return page_data

    print(f"❌ Skipping page {page_number} after {MAX_RETRIES} failed attempts.")
    return []  # Every attempt failed

def save_to_excel():
    """Flush the rows buffered in ``all_data`` to the workbook at ``save_path``.

    Runs entirely under ``lock``. When the buffer is empty nothing happens.
    Otherwise the buffered rows are turned into a DataFrame, any existing
    workbook is read back in full and placed ahead of the new rows, the
    combined table is written out again, and the buffer is emptied.
    """
    headers = ["Product Name", "Price", "Rating", "Product URL"]

    with lock:
        if all_data:
            pending = pd.DataFrame(all_data, columns=headers)

            if os.path.exists(save_path):
                # Earlier rows first, then the freshly scraped batch.
                frames = [pd.read_excel(save_path), pending]
                df = pd.concat(frames, ignore_index=True)
            else:
                df = pending

            df.to_excel(save_path, index=False)
            print(f"💾 Data saved to: {save_path} (Total rows: {len(df)})")

            # The buffer is only emptied once the write has succeeded.
            all_data.clear()

# Fan the page range out over a thread pool. Results are gathered on this
# (main) thread in completion order, not page order.
failed_pages = []
with ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
    future_to_page = {executor.submit(scrape_page, i): i for i in range(start_page, end_page + 1)}

    for i, future in enumerate(as_completed(future_to_page), start=1):
        page = future_to_page[future]
        try:
            result = future.result()
        except Exception as e:
            # Top-level boundary: one page failing unexpectedly must not abort
            # the run and throw away rows that have not been saved yet.
            print(f"❌ Page {page} raised an unexpected error and was skipped: {e}")
            failed_pages.append(page)
            result = []

        with lock:
            all_data.extend(result)

        # Flush to disk after every SAVE_INTERVAL completed pages
        if i % SAVE_INTERVAL == 0 and all_data:
            save_to_excel()

# Final save for whatever is left once every page has completed
if all_data:
    save_to_excel()

if failed_pages:
    print(f"⚠️ Pages skipped because of unexpected errors: {sorted(failed_pages)}")

print("🎉 Data scraping completed successfully!")


✅ Scraped page 8 successfully!
✅ Scraped page 3 successfully!
✅ Scraped page 1 successfully!
✅ Scraped page 6 successfully!
✅ Scraped page 7 successfully!
✅ Scraped page 5 successfully!
✅ Scraped page 4 successfully!
✅ Scraped page 9 successfully!
✅ Scraped page 2 successfully!
✅ Scraped page 10 successfully!
✅ Scraped page 11 successfully!
✅ Scraped page 14 successfully!
✅ Scraped page 12 successfully!
✅ Scraped page 13 successfully!
✅ Scraped page 15 successfully!
✅ Scraped page 16 successfully!
✅ Scraped page 17 successfully!
✅ Scraped page 19 successfully!
✅ Scraped page 18 successfully!
✅ Scraped page 20 successfully!
✅ Scraped page 21 successfully!
💾 Data saved to: E:\DHP\Group_Project\amazon_product_mobiles_data.xlsx (Total rows: 207)
💾 Data saved to: E:\DHP\Group_Project\amazon_product_mobiles_data.xlsx (Total rows: 427)
💾 Data saved to: E:\DHP\Group_Project\amazon_product_mobiles_data.xlsx (Total rows: 449)
🎉 Data scraping completed successfully!
