In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import os

# Bright Data Direct API Access
# The API key is read from the environment so the secret never lives in the
# notebook. NOTE(review): any key that was previously committed in this cell
# should be treated as leaked and rotated.
BRIGHTDATA_API_KEY = os.environ.get("BRIGHTDATA_API_KEY", "")
if not BRIGHTDATA_API_KEY:
    print("⚠️ BRIGHTDATA_API_KEY is not set; Bright Data requests will be rejected.")
ZONE_NAME = "web_unlocker_amazonv1"

# Seller storefront whose listing pages are walked in Phase 1.
SELLER_ID = "A3HOUT2SFZ52HU"
# The doubled braces survive the f-string as a literal "{}" placeholder that is
# later filled with the page number via BASE_URL.format(page).
BASE_URL = f"https://www.amazon.com/s?i=merchant-items&me={SELLER_ID}&marketplaceID=ATVPDKIKX0DER&page={{}}"
HEADERS = {"User-Agent": "Mozilla/5.0"}

# Mutable run state shared by the cells below.
request_count = 0   # incremented by fetch_html_from_brightdata
failed_pages = []   # listing pages that could not be loaded
all_asins = []      # ASINs collected from the listing pages


import threading

# Serialises updates to the shared request counter: this fetcher is submitted
# to a ThreadPoolExecutor, so several threads may increment it at once.
_request_count_lock = threading.Lock()


def fetch_html_from_brightdata(target_url):
    """Fetch ``target_url`` through the Bright Data request API.

    Args:
        target_url: URL of the page to retrieve.

    Returns:
        The response body as text when the API answers with HTTP 200,
        otherwise ``None`` (non-200 status, or any exception raised while
        performing the request).

    Side effects:
        Increments the module-level ``request_count`` once for every request
        that produced a response, and prints a diagnostic line on failure.
    """
    global request_count
    headers = {
        "Authorization": f"Bearer {BRIGHTDATA_API_KEY}",
        "Content-Type": "application/json"
    }
    data = {
        "zone": ZONE_NAME,
        "url": target_url,
        "format": "raw"
    }
    try:
        res = requests.post("https://api.brightdata.com/request", json=data, headers=headers, timeout=30)
        with _request_count_lock:
            request_count += 1
        if res.status_code == 200:
            return res.text
        # Previously dropped silently; surface the status so failures can be diagnosed.
        print(f"Bright Data returned HTTP {res.status_code} for {target_url}")
    except Exception as exc:
        # Deliberately best-effort: a failed fetch must not abort the whole
        # run, but it is reported instead of being swallowed.
        print(f"Error fetching HTML for {target_url}: {exc}")
    return None


def extract_product_details(asin):
    """Scrape one Amazon product page and return its CSV row.

    Args:
        asin: Product identifier used to build ``https://www.amazon.com/dp/<asin>``.

    Returns:
        A nine-element list:
        ``[name, asin, rating, price, product_url, image_count, review_count,
        breadcrumbs, best_seller_rank]``. When the page cannot be fetched the
        row contains empty strings, an image count of ``0`` and a review count
        of ``"0"``. Fields whose element is missing from the page fall back to
        ``""`` (or ``"0"`` for the review count).
    """
    product_url = f"https://www.amazon.com/dp/{asin}"
    html = fetch_html_from_brightdata(product_url)
    if not html:
        return ["", asin, "", "", product_url, 0, "0", "", ""]

    soup = BeautifulSoup(html, "html.parser")

    name_el = soup.select_one("h1 span")
    name = name_el.text.strip() if name_el else ""

    rating_el = soup.select_one("span.a-icon-alt")
    rating = rating_el.text.strip() if rating_el else ""

    # The price is only reported when both the whole and fractional parts exist.
    price_whole = soup.select_one("span.a-price-whole")
    price_frac = soup.select_one("span.a-price-fraction")
    price = f"{price_whole.text.strip()}.{price_frac.text.strip()}" if price_whole and price_frac else ""

    # Distinct image URLs from the thumbnail strip plus the main image.
    image_urls = set()
    for img in soup.select("#altImages img"):
        url = img.get("src") or img.get("data-src") or img.get("data-image-src")
        if url:
            image_urls.add(url)
    main_image = soup.select_one("#landingImage")
    if main_image:
        url = main_image.get("src") or main_image.get("data-old-hires")
        if url:
            image_urls.add(url)

    # Review count: first whitespace-separated token with thousands separators
    # removed. An element with no text used to raise IndexError here.
    review_count = "0"
    review_el = soup.select_one("#acrCustomerReviewText")
    if review_el:
        review_tokens = review_el.text.split()
        if review_tokens:
            review_count = review_tokens[0].replace(",", "")

    breadcrumbs = " > ".join(
        [li.text.strip() for li in soup.select("ul.a-unordered-list.a-horizontal.a-size-small li a")]
    )

    # Best Sellers Rank, strategy 1: the product-details table.
    bsr = ""
    for row in soup.select("#productDetails_detailBullets_sections1 tr"):
        if "Best Sellers Rank" in row.text:
            cell = row.select_one("td")
            # A matching row without a <td> used to raise AttributeError;
            # now it simply falls through to the next strategy.
            if cell is not None:
                bsr = cell.text.strip().split("(")[0].strip()
            break

    # Strategy 2: the detail-bullets list.
    if not bsr:
        for li in soup.select("#detailBulletsWrapper_feature_div li"):
            if "Best Sellers Rank" in li.text:
                bsr = li.text.strip().split(":")[-1].split("(")[0].strip()
                break

    # Strategy 3: the text node immediately preceding a "bestsellers" link.
    if not bsr:
        bestseller_link = soup.select_one('#detailBulletsWrapper_feature_div a[href*="bestsellers"]')
        sibling = bestseller_link.previous_sibling if bestseller_link else None
        # Only text nodes (str subclasses) can be tokenised; the sibling may
        # also be a tag or whitespace-only text, both of which used to crash.
        if isinstance(sibling, str):
            sibling_tokens = sibling.split()
            if sibling_tokens:
                bsr = sibling_tokens[0]

    return [name, asin, rating, price, product_url, len(image_urls), review_count, breadcrumbs, bsr]


# Phase 1: Sequentially collect ASINs from each page
MAX_PAGE_ATTEMPTS = 3        # fetch attempts per listing page before giving up
RETRY_BACKOFF_SECONDS = 2    # base delay; multiplied by the attempt number

# Tracks ASINs already collected so repeated listings are scraped only once.
seen_asins = set(all_asins)

page = 1
while True:
    url = BASE_URL.format(page)

    # Retry transient failures instead of abandoning pagination on the first one.
    html = None
    for attempt in range(1, MAX_PAGE_ATTEMPTS + 1):
        html = fetch_html_from_brightdata(url)
        if html:
            break
        if attempt < MAX_PAGE_ATTEMPTS:
            time.sleep(RETRY_BACKOFF_SECONDS * attempt)

    if not html:
        print(f"❌ Failed to load page {page}")
        failed_pages.append(page)
        break

    soup = BeautifulSoup(html, "html.parser")
    product_divs = soup.select("div.s-main-slot div[data-asin]:not([data-asin=''])")
    page_asins = [div.get("data-asin", "") for div in product_divs if div.get("data-asin")]

    if not page_asins:
        print(f"🛑 No products found on page {page}, stopping.")
        break

    # Keep first-seen order while dropping ASINs collected earlier.
    new_asins = []
    for asin in page_asins:
        if asin not in seen_asins:
            seen_asins.add(asin)
            new_asins.append(asin)

    all_asins.extend(new_asins)
    print(f"📄 Page {page} | Found {len(page_asins)} products")

    # A page made up solely of already-seen ASINs means the listing is
    # repeating; stop rather than paginate indefinitely.
    if not new_asins:
        print(f"🛑 Page {page} contained no new products, stopping.")
        break

    next_disabled = bool(soup.select_one("ul.a-pagination li.a-disabled.a-last"))
    if next_disabled:
        print("🛑 'Next' button disabled. Last page reached.")
        break

    page += 1

# Phase 2: Scrape product details in parallel
with open("products_with_review_counts.csv", "w", newline='', encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow([
        "Name", "ASIN", "Rating", "Price", "ProductURL",
        "ImageCount", "ReviewCount", "Breadcrumbs", "BestSellerRank"
    ])

    with ThreadPoolExecutor(max_workers=10) as executor:
        # Map each future back to its ASIN so a crashed worker can still be
        # reported and given a placeholder row.
        future_to_asin = {
            executor.submit(extract_product_details, asin): asin for asin in all_asins
        }
        # Rows are written in completion order, not input order.
        for future in as_completed(future_to_asin):
            asin = future_to_asin[future]
            try:
                row = future.result()
            except Exception as exc:
                # future.result() re-raises whatever the worker raised; one bad
                # page must not abort the export of every remaining product.
                print(f"⚠️ Failed to scrape {asin}: {exc}")
                row = ["", asin, "", "", f"https://www.amazon.com/dp/{asin}", 0, "0", "", ""]
            writer.writerow(row)

print("🎉 Scraping completed! File saved as 'products_with_review_counts.csv'")
print(f"📊 Total requests made via Bright Data API: {request_count}")
print(f"🧾 Total products scraped: {len(all_asins)}")

if failed_pages:
    print(f"⚠️ Pages that failed to load: {failed_pages}")


In [None]:
# Imported here so this cell also runs on a fresh kernel, before the later
# import cell has been executed.
import os

import pandas as pd

# The API key comes from the environment rather than being stored in the notebook.
BRIGHTDATA_API_KEY = os.environ.get("BRIGHTDATA_API_KEY", "")
if not BRIGHTDATA_API_KEY:
    print("⚠️ BRIGHTDATA_API_KEY is not set; Bright Data requests will be rejected.")
ZONE_NAME = "web_unlocker1"

input_df = pd.read_excel("sample_ASIN.xlsx")
# An explicit dtype avoids the empty-Series default-dtype warning on older pandas.
asin_column = input_df.get("asin", pd.Series(dtype="object"))

In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import time
import pandas as pd
import re
from concurrent.futures import ThreadPoolExecutor, as_completed

import os

# Bright Data Direct API Access
# The API key is read from the environment so the secret never lives in the
# notebook. NOTE(review): any key that was previously committed in this cell
# should be treated as leaked and rotated.
BRIGHTDATA_API_KEY = os.environ.get("BRIGHTDATA_API_KEY", "")
if not BRIGHTDATA_API_KEY:
    print("⚠️ BRIGHTDATA_API_KEY is not set; Bright Data requests will be rejected.")
ZONE_NAME = "web_unlocker1"

HEADERS = {"User-Agent": "Mozilla/5.0"}

# Incremented by fetch_html_from_brightdata.
request_count = 0


import threading

# Guards the shared request counter, which is updated from the worker threads
# of the ThreadPoolExecutor that runs the scraper.
_request_count_lock = threading.Lock()


def fetch_html_from_brightdata(target_url):
    """Retrieve ``target_url`` via the Bright Data request API.

    Args:
        target_url: URL of the page to retrieve.

    Returns:
        The response text for an HTTP 200 answer; ``None`` for any other
        status code or when the request raises.

    Side effects:
        Adds one to the module-level ``request_count`` for each request that
        returned a response, and prints a diagnostic line on failure.
    """
    global request_count
    headers = {
        "Authorization": f"Bearer {BRIGHTDATA_API_KEY}",
        "Content-Type": "application/json"
    }
    data = {
        "zone": ZONE_NAME,
        "url": target_url,
        "format": "raw"
    }
    try:
        res = requests.post("https://api.brightdata.com/request", json=data, headers=headers, timeout=30)
        # "+=" on a global is not atomic across threads; take the lock.
        with _request_count_lock:
            request_count += 1
        if res.status_code == 200:
            return res.text
        # Non-200 answers used to vanish without a trace.
        print(f"Bright Data returned HTTP {res.status_code} for {target_url}")
    except Exception as e:
        # Best-effort on purpose: report the failure and let the caller
        # fall back to a placeholder row.
        print(f"Error fetching HTML for {target_url}: {e}")
    return None


def extract_product_details(asin):
    """Scrape one Amazon product page and return its CSV row.

    Args:
        asin: Product identifier used to build ``https://www.amazon.com/dp/<asin>``.

    Returns:
        A nine-element list:
        ``[name, asin, rating, price, product_url, image_count, review_count,
        breadcrumbs, best_seller_rank]``. If the page cannot be fetched, the
        rating is ``"No rating yet"``, the review count ``"No reviews yet"``,
        the image count ``0`` and the remaining scraped fields are empty.
    """
    product_url = f"https://www.amazon.com/dp/{asin}"
    html = fetch_html_from_brightdata(product_url)
    if not html:
        return ["", asin, "No rating yet", "", product_url, 0, "No reviews yet", "", ""]

    soup = BeautifulSoup(html, "html.parser")

    # Product Name
    name_el = soup.select_one("span#productTitle")
    name = name_el.text.strip() if name_el else ""

    # Rating
    rating = "No rating yet"
    rating_el = soup.select_one("span[data-hook='rating-out-of-text']")
    if rating_el:
        rating = rating_el.text.strip()

    # Price
    price = ""
    price_el = soup.select_one("div.a-section.a-spacing-none.aok-align-center.aok-relative span.aok-offscreen")
    if price_el:
        price = price_el.text.strip()

    # Images: distinct URLs from the thumbnail strip plus the main image
    image_urls = set()
    for img in soup.select("#altImages img"):
        url = img.get("src") or img.get("data-src") or img.get("data-image-src")
        if url:
            image_urls.add(url)
    main_image = soup.select_one("#landingImage")
    if main_image:
        url = main_image.get("src") or main_image.get("data-old-hires")
        if url:
            image_urls.add(url)

    # Review Count: first token with thousands separators removed. An element
    # with no text used to raise IndexError; the default is kept instead.
    review_count = "No reviews yet"
    review_el = soup.select_one("span[data-hook='total-review-count']")
    if review_el:
        review_tokens = review_el.text.split()
        if review_tokens:
            review_count = review_tokens[0].replace(",", "")

    # Breadcrumbs
    breadcrumbs = " > ".join([
        a.text.strip() for a in soup.select("#wayfinding-breadcrumbs_feature_div ul.a-unordered-list li a")
    ])

    # Best Seller Rank: first "#<digits,commas>" inside the first element
    # mentioning "Best Sellers Rank"; the selectors are tried in order.
    bsr = ""
    for section in ["#productDetails_detailBullets_sections1 tr", "#detailBulletsWrapper_feature_div"]:
        for tag in soup.select(section):
            if "Best Sellers Rank" not in tag.text:
                continue
            match = re.search(r"#([\d,]+)", tag.get_text())
            if match:
                bsr = match.group(1)
                break
        if bsr:
            break

    return [name, asin, rating, price, product_url, len(image_urls), review_count, breadcrumbs, bsr]


# Read ASINs from Excel and validate
MAX_ASINS = 20  # testing cap on how many ASINs are scraped; set to None for a full run

input_df = pd.read_excel("sample_ASIN.xlsx")
if "asin" not in input_df.columns:
    # Previously a missing column silently produced an empty run.
    print("⚠️ Column 'asin' not found in sample_ASIN.xlsx; no ASINs will be scraped.")
# An explicit dtype avoids the empty-Series default-dtype warning on older pandas.
asin_column = input_df.get("asin", pd.Series(dtype="object"))
asins_raw = asin_column.dropna().astype(str).unique().tolist()

# Normalise first, then de-duplicate: values differing only in case or
# surrounding whitespace would otherwise be scraped more than once.
valid_asins = []
seen_asins = set()
for asin in asins_raw:
    asin = asin.strip().upper()
    if re.fullmatch(r"[A-Z0-9]{10}", asin) and asin not in seen_asins:
        seen_asins.add(asin)
        valid_asins.append(asin)

# Limit to first MAX_ASINS ASINs for testing (a None cap keeps everything)
valid_asins = valid_asins[:MAX_ASINS]

print(f"📥 Loaded {len(valid_asins)} valid ASINs from Excel input")

# Scrape products in parallel
with open("products_with_review_counts.csv", "w", newline='', encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow([
        "Name", "ASIN", "Rating", "Price", "ProductURL",
        "ImageCount", "ReviewCount", "Breadcrumbs", "BestSellerRank"
    ])

    with ThreadPoolExecutor(max_workers=10) as executor:
        # Keep the ASIN next to its future so a failed worker can be reported.
        future_to_asin = {
            executor.submit(extract_product_details, asin): asin for asin in valid_asins
        }
        # Rows are written in completion order, not input order.
        for future in as_completed(future_to_asin):
            asin = future_to_asin[future]
            try:
                row = future.result()
            except Exception as exc:
                # future.result() re-raises the worker's exception; write a
                # placeholder row instead of aborting the whole export.
                print(f"⚠️ Failed to scrape {asin}: {exc}")
                row = ["", asin, "No rating yet", "", f"https://www.amazon.com/dp/{asin}", 0, "No reviews yet", "", ""]
            writer.writerow(row)

print("🎉 Scraping completed! File saved as 'products_with_review_counts.csv'")
print(f"📊 Total requests made via Bright Data API: {request_count}")
print(f"🧾 Total products scraped: {len(valid_asins)}")

📥 Loaded 20 valid ASINs from Excel input
🎉 Scraping completed! File saved as 'products_with_review_counts.csv'
📊 Total requests made via Bright Data API: 20
🧾 Total products scraped: 20


In [None]:
# Imported here so this cell also runs on a fresh kernel, before the later
# import cell has been executed.
import os

import pandas as pd

# The API key comes from the environment rather than being stored in the notebook.
BRIGHTDATA_API_KEY = os.environ.get("BRIGHTDATA_API_KEY", "")
if not BRIGHTDATA_API_KEY:
    print("⚠️ BRIGHTDATA_API_KEY is not set; Bright Data requests will be rejected.")
ZONE_NAME = "web_unlocker1"

input_df = pd.read_excel("sample_ASIN.xlsx")
# An explicit dtype avoids the empty-Series default-dtype warning on older pandas.
asin_column = input_df.get("asin", pd.Series(dtype="object"))

In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import time
import pandas as pd
import re
from concurrent.futures import ThreadPoolExecutor, as_completed

import os

# Bright Data Direct API Access
# The API key is read from the environment so the secret never lives in the
# notebook. NOTE(review): any key that was previously committed in this cell
# should be treated as leaked and rotated.
BRIGHTDATA_API_KEY = os.environ.get("BRIGHTDATA_API_KEY", "")
if not BRIGHTDATA_API_KEY:
    print("⚠️ BRIGHTDATA_API_KEY is not set; Bright Data requests will be rejected.")
ZONE_NAME = "web_unlocker1"

HEADERS = {"User-Agent": "Mozilla/5.0"}

# Incremented by fetch_html_from_brightdata.
request_count = 0

import threading

# Guards the shared request counter, which is updated from the worker threads
# of the ThreadPoolExecutor that runs the scraper.
_request_count_lock = threading.Lock()


def fetch_html_from_brightdata(target_url):
    """Retrieve ``target_url`` via the Bright Data request API.

    Args:
        target_url: URL of the page to retrieve.

    Returns:
        The response text for an HTTP 200 answer; ``None`` for any other
        status code or when the request raises.

    Side effects:
        Adds one to the module-level ``request_count`` for each request that
        returned a response, and prints a diagnostic line on failure.
    """
    global request_count
    headers = {
        "Authorization": f"Bearer {BRIGHTDATA_API_KEY}",
        "Content-Type": "application/json"
    }
    data = {
        "zone": ZONE_NAME,
        "url": target_url,
        "format": "raw"
    }
    try:
        res = requests.post("https://api.brightdata.com/request", json=data, headers=headers, timeout=30)
        # "+=" on a global is not atomic across threads; take the lock.
        with _request_count_lock:
            request_count += 1
        if res.status_code == 200:
            return res.text
        # Non-200 answers used to vanish without a trace.
        print(f"Bright Data returned HTTP {res.status_code} for {target_url}")
    except Exception as e:
        # Best-effort on purpose: report the failure and let the caller
        # fall back to a placeholder row.
        print(f"Error fetching HTML for {target_url}: {e}")
    return None


def extract_product_details(asin):
    """Scrape one Amazon product page and return its CSV row.

    Args:
        asin: Product identifier used to build ``https://www.amazon.com/dp/<asin>``.

    Returns:
        A ten-element list:
        ``[name, asin, rating, price, product_url, image_count, review_count,
        breadcrumbs, best_seller_rank, make_sure_fits]``. If the page cannot
        be fetched, the rating is ``"No rating yet"``, the review count
        ``"No reviews yet"``, the image count ``0``, ``make_sure_fits`` is
        ``"Not found"`` and the remaining scraped fields are empty.
    """
    product_url = f"https://www.amazon.com/dp/{asin}"
    html = fetch_html_from_brightdata(product_url)
    if not html:
        return ["", asin, "No rating yet", "", product_url, 0, "No reviews yet", "", "", "Not found"]

    soup = BeautifulSoup(html, "html.parser")

    # Product Name
    name_el = soup.select_one("span#productTitle")
    name = name_el.text.strip() if name_el else ""

    # Rating
    rating = "No rating yet"
    rating_el = soup.select_one("span[data-hook='rating-out-of-text']")
    if rating_el:
        rating = rating_el.text.strip()

    # Price
    price = ""
    price_el = soup.select_one("div.a-section.a-spacing-none.aok-align-center.aok-relative span.aok-offscreen")
    if price_el:
        price = price_el.text.strip()

    # Images: distinct URLs from the thumbnail strip plus the main image
    image_urls = set()
    for img in soup.select("#altImages img"):
        url = img.get("src") or img.get("data-src") or img.get("data-image-src")
        if url:
            image_urls.add(url)
    main_image = soup.select_one("#landingImage")
    if main_image:
        url = main_image.get("src") or main_image.get("data-old-hires")
        if url:
            image_urls.add(url)

    # Review Count: first token with thousands separators removed. An element
    # with no text used to raise IndexError; the default is kept instead.
    review_count = "No reviews yet"
    review_el = soup.select_one("span[data-hook='total-review-count']")
    if review_el:
        review_tokens = review_el.text.split()
        if review_tokens:
            review_count = review_tokens[0].replace(",", "")

    # Breadcrumbs
    breadcrumbs = " > ".join([
        a.text.strip() for a in soup.select("#wayfinding-breadcrumbs_feature_div ul.a-unordered-list li a")
    ])

    # Best Seller Rank: first "#<digits,commas>" inside the first element
    # mentioning "Best Sellers Rank"; the selectors are tried in order.
    bsr = ""
    for section in ["#productDetails_detailBullets_sections1 tr", "#detailBulletsWrapper_feature_div"]:
        for tag in soup.select(section):
            if "Best Sellers Rank" not in tag.text:
                continue
            match = re.search(r"#([\d,]+)", tag.get_text())
            if match:
                bsr = match.group(1)
                break
        if bsr:
            break

    # Make Sure This Fits presence check
    fits_widget = soup.select_one("#automotive-pf-primary-view-default-make-sure-this-fits")
    make_sure_fits = "Yes" if fits_widget else "No"

    return [name, asin, rating, price, product_url, len(image_urls), review_count, breadcrumbs, bsr, make_sure_fits]


# Read ASINs from Excel and validate
MAX_ASINS = None  # set to a small number (e.g. 20) for a quick test run

input_df = pd.read_excel("sample_ASIN.xlsx")
if "asin" not in input_df.columns:
    # Previously a missing column silently produced an empty run.
    print("⚠️ Column 'asin' not found in sample_ASIN.xlsx; no ASINs will be scraped.")
# An explicit dtype avoids the empty-Series default-dtype warning on older pandas.
asin_column = input_df.get("asin", pd.Series(dtype="object"))
asins_raw = asin_column.dropna().astype(str).unique().tolist()

# Normalise first, then de-duplicate: values differing only in case or
# surrounding whitespace would otherwise be scraped more than once.
valid_asins = []
seen_asins = set()
for asin in asins_raw:
    asin = asin.strip().upper()
    if re.fullmatch(r"[A-Z0-9]{10}", asin) and asin not in seen_asins:
        seen_asins.add(asin)
        valid_asins.append(asin)

# Optional testing cap; slicing with None keeps the full list.
valid_asins = valid_asins[:MAX_ASINS]

print(f"📥 Loaded {len(valid_asins)} valid ASINs from Excel input")

# Scrape products in parallel
with open("products_with_review_counts.csv", "w", newline='', encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow([
        "Name", "ASIN", "Rating", "Price", "ProductURL",
        "ImageCount", "ReviewCount", "Breadcrumbs", "BestSellerRank", "MakeSureFits"
    ])

    with ThreadPoolExecutor(max_workers=10) as executor:
        # Keep the ASIN next to its future so a failed worker can be reported.
        future_to_asin = {
            executor.submit(extract_product_details, asin): asin for asin in valid_asins
        }
        # Rows are written in completion order, not input order.
        for future in as_completed(future_to_asin):
            asin = future_to_asin[future]
            try:
                row = future.result()
            except Exception as exc:
                # future.result() re-raises the worker's exception; write a
                # placeholder row instead of aborting the whole export.
                print(f"⚠️ Failed to scrape {asin}: {exc}")
                row = ["", asin, "No rating yet", "", f"https://www.amazon.com/dp/{asin}", 0, "No reviews yet", "", "", "Not found"]
            writer.writerow(row)

print("🎉 Scraping completed! File saved as 'products_with_review_counts.csv'")
print(f"📊 Total requests made via Bright Data API: {request_count}")
print(f"🧾 Total products scraped: {len(valid_asins)}")

📥 Loaded 432 valid ASINs from Excel input
🎉 Scraping completed! File saved as 'products_with_review_counts.csv'
📊 Total requests made via Bright Data API: 432
🧾 Total products scraped: 432
