In [10]:
import requests
from bs4 import BeautifulSoup
import time
import random
import pandas as pd
import re
import urllib3
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

# Disable SSL certificate warnings
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

BASE_URL = "https://www.rehabmart.com"
CATEGORY_URL = "https://www.rehabmart.com/category/abdominal_supports.htm"

USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:75.0) Gecko/20100101 Firefox/75.0"
]

# ScraperAPI Key for proxy
SCRAPERAPI_KEY = "a7939ea9627f8a0322ded8ad05bbe245"

In [11]:
def rotate_user_agents():
    """Randomly select a User-Agent from the list."""
    return random.choice(USER_AGENTS)

In [12]:
def get_proxy():
    proxies = {
        "http": f"http://scraperapi:{SCRAPERAPI_KEY}@proxy-server.scraperapi.com:8001",
        "https": f"http://scraperapi:{SCRAPERAPI_KEY}@proxy-server.scraperapi.com:8001"
    }
    return proxies

session = requests.Session()
retries = Retry(total=5, backoff_factor=2, status_forcelist=[500, 502, 503, 504])
session.mount("https://", HTTPAdapter(max_retries=retries))

In [13]:
def fetch_url(url):
    """Fetch a URL and return the response text."""
    headers = {"User-Agent": rotate_user_agents()}
    try:
        response = session.get(
            url, 
            headers=headers, 
            proxies=get_proxy(), 
            timeout=15, 
            verify=False
        )
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

In [14]:
def get_product_urls(category_url):
    """Fetch all product URLs from a given category, including pagination."""
    print(f"Fetching products from: {category_url}")
    product_urls = []
    
    while category_url:
        html = fetch_url(category_url)
        if not html:
            break

        soup = BeautifulSoup(html, "html.parser")

        # Check if the product grid exists (using id="categorygrid")
        product_grid = soup.find("div", id="categorygrid")
        if product_grid:
            product_boxes = product_grid.find_all("div", class_="col-xs-6 col-sm-3 col-md-3 boxed productTD")
            for box in product_boxes:
                product_link = box.find("div", class_="findproducttitle producttitle").find("a")["href"]
                # Ensure the URL is correctly formed
                if product_link.startswith("/"):
                    product_urls.append(f"{BASE_URL}{product_link}")
                else:
                    product_urls.append(product_link)

        # Check for the second pagination div
        pagination = soup.find_all("div", class_="pagination")
        if len(pagination) > 1:  # Ensure there are multiple pagination divs
            next_page = pagination[1].find("a", string="Next")
            if next_page:
                category_url = f"{BASE_URL}{next_page['href']}"  # Navigate to next page
            else:
                break  # No next page, stop the loop
        else:
            break  # No pagination div found, stop the loop

    return product_urls

In [15]:
def extract_product_info(url):
    """Extract product details from a product URL."""
    html = fetch_url(url)
    if not html:
        return None

    soup = BeautifulSoup(html, "html.parser")
    try:
        name = soup.find("h1", itemprop=True).text.strip()
    except AttributeError:
        name = None

    try:
        breadcrumb_list = soup.find("ol", class_="breadcrumb hidden-xsx").find_all("li")
        category = breadcrumb_list[1].find("span", itemprop="name").text.strip() if len(breadcrumb_list) > 1 else None
    except AttributeError:
        category = None

    try:
        list_price = soup.find("s", style="font-size:1.2em;").text.strip()
    except AttributeError:
        list_price = None

    try:
        special_price = soup.find("div", class_="text-danger").find("b").find("span", class_="text-danger price").text.strip()
    except AttributeError:
        special_price = None

    try:
        description = soup.find("div", itemprop="description").text.strip()
        description = re.sub(r"^(Product Overview:\s*)", "", description, flags=re.I).strip()
    except AttributeError:
        description = None

    try:
        rating_text = soup.find("div", class_="starreview-container").find("strong").text.strip()
        rating_match = re.search(r"(\d(?:\.\d)?) of (\d) stars", rating_text)
        rating = f"{rating_match.group(1)} out of {rating_match.group(2)} stars" if rating_match else None
    except AttributeError:
        rating = None

    try:
        img_url = soup.find("div", class_="gallery").find("img", class_="img-responsive")["src"]
    except AttributeError:
        img_url = None

    return {
        "name": name,
        "category": category,
        "list_price": list_price,
        "special_price": special_price,
        "description": description,
        "rating": rating,
        "img_url": img_url,
        "product_url": url
    }

In [16]:
def scrape_products_from_category(category_url):
    """Scrape all products from a single category and return all product data."""
    product_urls = get_product_urls(category_url)
    all_product_data = []

    for url in tqdm(product_urls, desc="Fetching Product Details"):
        product_info = extract_product_info(url)
        if product_info:
            all_product_data.append(product_info)

    return all_product_data

In [17]:
def get_category_urls():
    """Get all category URLs from the RehabMart categories page."""
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # Run in headless mode
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    driver.get("https://www.rehabmart.com/all-categories.asp")

    # Wait for elements to load
    driver.implicitly_wait(5)

    category_urls = set()

    # Locate all category list items
    category_elements = driver.find_elements(By.CSS_SELECTOR, "div.panel-body.cats ul.ul-mobile.cat-list li a")
    
    for category in category_elements:
        url = category.get_attribute("href")
        if url:
            category_urls.add(url)

    driver.quit()
    return list(category_urls)

# Get all category URLs
category_urls = get_category_urls()

# Scrape product details from each category and collect all the product data
all_product_data = []
for category_url in category_urls:
    category_data = scrape_products_from_category(category_url)
    all_product_data.extend(category_data)

# Save all product details to a single CSV file
df = pd.DataFrame(all_product_data)
df.to_csv("rehabmart.csv", index=False)
print("Data saved to rehabmart.csv")

Fetching products from: https://www.rehabmart.com/category/urology_table.htm


Fetching Product Details: 100%|██████████| 1/1 [00:03<00:00,  3.82s/it]


Fetching products from: https://www.rehabmart.com/category/skin_cleanser.htm


Fetching Product Details:  42%|████▏     | 8/19 [01:20<01:50, 10.08s/it]


KeyboardInterrupt: 