In [14]:
import requests
from bs4 import BeautifulSoup
import json
import re
from urllib.parse import urljoin, urlparse
from datetime import datetime

class CompanyProfiler:
    def __init__(self, base_url):
        self.base_url = base_url
        self.domain = urlparse(base_url).netloc
        self.data = {
            "identity": {"company_name": None, "website": base_url, "tagline": ""},
            "business_summary": {"summary": "", "offerings": [], "segments": []},
            "evidence": {"social_links": [], "priority_pages": [], "signals": []},
            "contact_details": {"emails": [], "phones": [], "address": "Not found", "contact_url": None},
            "metadata": {"scraped_at": datetime.now().isoformat(), "pages_crawled": []}
        }
        self.visited = set()

    def get_soup(self, url):
        try:
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
            res = requests.get(url, headers=headers, timeout=10)
            self.data["metadata"]["pages_crawled"].append(url)
            return BeautifulSoup(res.text, 'html.parser'), res.text
        except:
            return None, ""

    def extract_contact_info(self, html_text):
        # Email Regex
        emails = re.findall(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', html_text)
        self.data["contact_details"]["emails"].extend(emails)
        
        # Phone Regex (Handles international and local formats)
        phones = re.findall(r'(\+?\d{1,3}[-.\s]?\(?\d{2,3}\)?[-.\s]?\d{3,4}[-.\s]?\d{4})', html_text)
        self.data["contact_details"]["phones"].extend(phones)

    def scrape(self):
        # 1. Start with Homepage to find navigation
        soup, html = self.get_soup(self.base_url)
        if not soup: return "Error: Could not reach site."

        self.data["identity"]["company_name"] = soup.title.string.split('|')[0].strip() if soup.title else self.domain
        self.extract_contact_info(html)

        # 2. Identify Priority URLs (Contact, About, Products)
        priority_urls = {}
        for a in soup.find_all('a', href=True):
            link = urljoin(self.base_url, a['href'])
            text = a.text.lower().strip()
            
            if self.domain in link:
                if 'contact' in text or 'contact' in link: priority_urls['contact'] = link
                if 'about' in text or 'about' in link: priority_urls['about'] = link
                if 'product' in text or 'solution' in text: priority_urls['products'] = link

        # 3. Explicitly visit the Contact Page if found
        if 'contact' in priority_urls:
            self.data["contact_details"]["contact_url"] = priority_urls['contact']
            c_soup, c_html = self.get_soup(priority_urls['contact'])
            if c_html:
                self.extract_contact_info(c_html)
                # Simple Address Detection (look for zip code patterns)
                address_match = re.search(r'(\d{1,6},\s?[A-Za-z\s]+,\s?[A-Za-z\s]+\s\d{5,6})', c_html)
                if address_match: self.data["contact_details"]["address"] = address_match.group(0)

        # 4. Cleanup and Deduplicate
        self.data["contact_details"]["emails"] = list(set(self.data["contact_details"]["emails"]))
        self.data["contact_details"]["phones"] = list(set([p for p in self.data["contact_details"]["phones"] if len(p) > 9]))
        
        return self.data

# --- EXECUTION ---
profiler = CompanyProfiler("https://velbiom.com")
profile = profiler.scrape()

# Show Results Cleanly
print(json.dumps(profile, indent=2))

{
  "identity": {
    "company_name": "Velbiom- For a Better, Healthier Future",
    "website": "https://velbiom.com",
    "tagline": ""
  },
  "business_summary": {
    "summary": "",
    "offerings": [],
    "segments": []
  },
  "evidence": {
    "social_links": [],
    "priority_pages": [],
    "signals": []
  },
  "contact_details": {
    "emails": [],
    "phones": [
      "39724176225802",
      "1673261809",
      "1673262188",
      "1672666691 1950",
      "1672666691",
      "82185315313628",
      "1672666691 2048",
      "1672836166",
      "1737124344",
      "1.91277735271",
      "13481043045248",
      "1672665328",
      "1662980153",
      "1672817268",
      "1671784796",
      "1672666691 1296",
      "1046348016",
      "10827811164526",
      "8384562888944",
      "17158186378",
      "15795882391562",
      "1671713848",
      "998-051-8484",
      "67207134794593",
      "1671772457",
      "81623144665447",
      "11174276526578",
      "55589419236163",
    