In [1]:
import pandas as pd
import numpy as np
import requests
import time
from bs4 import BeautifulSoup
import logging
from typing import List, Dict

  from pandas.core import (


In [2]:
class BusinessDataScraper:
    """
    Scrape company cards (name, rating, domain/location) from a listing page.

    The page at ``base_url`` is fetched with ``requests``, parsed with
    BeautifulSoup using the ``lxml`` parser, and every company card found is
    turned into one row of a pandas DataFrame.
    """

    # Output field -> (tag name, CSS class) locating that field inside a card.
    # Insertion order defines both the lookup order and the column order.
    _FIELD_SELECTORS = {
        'name': ('h2', 'companyCardWrapper__companyName'),
        'rating': ('div', 'rating_star_container'),
        'domain_location': ('span', 'companyCardWrapper__interLinking'),
    }

    def __init__(self, base_url: str, max_retries: int = 3):
        """
        Initialize the web scraper with configuration options

        Args:
            base_url (str): Base URL to scrape
            max_retries (int): Maximum number of request attempts made by
                ``make_request`` before it gives up
        """
        self.base_url = base_url
        self.max_retries = max_retries
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept-Language': 'en-US,en;q=0.9'
        }
        # basicConfig configures the root logger; it does nothing when the
        # root logger already has handlers, so repeated construction is safe.
        logging.basicConfig(level=logging.INFO,
                            format='%(asctime)s - %(levelname)s: %(message)s')
        self.logger = logging.getLogger(__name__)

    def _empty_frame(self) -> pd.DataFrame:
        """Return a zero-row DataFrame that still carries the output columns."""
        return pd.DataFrame(columns=list(self._FIELD_SELECTORS))

    def make_request(self, url: str) -> "requests.Response":
        """
        Fetch ``url``, retrying with exponential backoff on request errors

        Up to ``max_retries`` attempts are made. After a failed attempt the
        method waits 1s, 2s, 4s, ... before trying again; no wait happens
        after the final attempt because nothing follows it.

        Args:
            url (str): URL to request

        Returns:
            requests.Response: Response whose status passed ``raise_for_status``

        Raises:
            RuntimeError: If every attempt failed. The last
                ``requests.RequestException`` is attached as ``__cause__``.
        """
        last_error = None
        for attempt in range(self.max_retries):
            try:
                response = requests.get(url, headers=self.headers, timeout=10)
                response.raise_for_status()
                return response
            except requests.RequestException as e:
                # Keep a reference: the name bound by `except` is cleared
                # as soon as the handler exits.
                last_error = e
                self.logger.warning(f"Request failed (Attempt {attempt + 1}): {e}")
                if attempt < self.max_retries - 1:
                    time.sleep(2 ** attempt)  # Exponential backoff

        raise RuntimeError(
            f"Failed to retrieve {url} after {self.max_retries} attempts"
        ) from last_error

    def extract_business_data(self, soup: "BeautifulSoup") -> List[Dict]:
        """
        Extract structured business data from BeautifulSoup object

        A card is kept only when all three fields are present. Cards missing
        any field are skipped, and a warning naming the first missing field
        is logged.

        Args:
            soup (BeautifulSoup): Parsed HTML content

        Returns:
            List[Dict]: One dict per complete card with the keys ``name``,
                ``rating`` and ``domain_location`` (whitespace-stripped text)
        """
        business_data = []
        company_elements = soup.find_all('div', class_='companyCardWrapper__primaryInformation')

        for element in company_elements:
            record = {}
            missing_field = None
            for field, (tag, css_class) in self._FIELD_SELECTORS.items():
                node = element.find(tag, class_=css_class)
                if node is None:
                    missing_field = field
                    break
                record[field] = node.text.strip()

            if missing_field is not None:
                self.logger.warning(
                    f"Could not extract full data for an element: missing '{missing_field}'"
                )
                continue

            business_data.append(record)

        return business_data

    def scrape_data(self) -> pd.DataFrame:
        """
        Main scraping method to collect and process business data

        Fetches ``base_url``, extracts the company cards, then drops rows
        without a name and keeps only the first row for each distinct name.

        Returns:
            pd.DataFrame: Cleaned data with the columns ``name``, ``rating``
                and ``domain_location``. The frame has zero rows (but the same
                columns) when no cards were found or when any step failed;
                failures are logged rather than raised.
        """
        try:
            response = self.make_request(self.base_url)
            soup = BeautifulSoup(response.content, 'lxml')

            raw_data = self.extract_business_data(soup)
            if not raw_data:
                # A frame built from [] has no columns, so the cleaning below
                # would raise KeyError('name'); treat "nothing found" as a
                # normal, clearly reported outcome instead.
                self.logger.warning("No company records were found on the page")
                return self._empty_frame()

            # Data Cleaning
            return (
                pd.DataFrame(raw_data)
                .dropna(subset=['name'])
                .drop_duplicates(subset=['name'], keep='first')
            )

        except Exception as e:
            self.logger.error(f"Scraping failed: {e}")
            return self._empty_frame()

    def save_data(self, df: pd.DataFrame, filename: str = 'business_data.csv'):
        """
        Save scraped data to a UTF-8 CSV file without the index column

        The frame is written exactly as given; no anonymization is applied.

        Args:
            df (pd.DataFrame): DataFrame to save
            filename (str): Output filename
        """
        df.to_csv(filename, index=False, encoding='utf-8')
        self.logger.info(f"Data saved to {filename}")



In [3]:
def main(url: str = 'https://www.ambitionbox.com/list-of-companies'):
    """
    Scrape the company listing at ``url`` and save the result as CSV

    Nothing is written when the scrape returns no rows, so an earlier
    ``business_data.csv`` is not overwritten by an empty file after a failed
    or empty run.

    Args:
        url (str): Listing page to scrape; defaults to the AmbitionBox
            list-of-companies page
    """
    scraper = BusinessDataScraper(url)
    business_data = scraper.scrape_data()

    if business_data.empty:
        scraper.logger.warning("No data scraped; skipping save")
        return

    scraper.save_data(business_data)

In [4]:
# Entry point: run the scrape-and-save pipeline when this code is executed
# directly (script or notebook cell) rather than imported as a module.
if __name__ == "__main__":
    main()

2024-12-08 11:08:04,945 - INFO: Data saved to business_data.csv
