In [17]:
import requests
from bs4 import BeautifulSoup
import json
import re
from urllib.parse import urljoin, urlparse
from datetime import datetime

# --- CONFIGURATION: Signals Database ---
# Maps a signal category to the keywords searched for in page text.
# Keyword order is kept as-is because it determines the order of reported hits.
SIGNALS_DB = {
    # Credibility markers that are not tied to a particular industry.
    "Generic": [
        "Case Study",
        "Testimonials",
        "Awards",
        "Certifications",
        "Clients",
        "Partners",
        "ISO 9001",
        "Patent",
        "Copyright",
        "All Rights Reserved",
    ],
    # Vocabulary specific to probiotics manufacturers and brands.
    "Probiotics_Specific": [
        "CFU",
        "Strain",
        "Lactobacillus",
        "Bifidobacterium",
        "L. casei",
        "Shirota",
        "Spore-forming",
        "Lyophilization",
        "Microencapsulation",
        "Fermentation",
        "Gut Health",
        "Immunity",
        "Clinical Trial",
        "Science",
        "Research",
        "GRAS",
        "FSSAI",
        "GMP Certified",
    ],
}

class UniversalScraper:
    def __init__(self, base_url, max_pages=12):
        """Set up the HTTP session and the empty report for one website.

        Args:
            base_url: Site root, with or without a scheme. ``https://`` is
                prepended when the value does not start with ``http://`` or
                ``https://`` (compared case-insensitively).
            max_pages: Maximum number of URLs ``get_soup`` will attempt for
                this scraper instance. Defaults to 12.
        """
        base_url = base_url.strip()
        # Check for a real scheme rather than the bare "http" prefix: a host
        # such as "httpbin.org" also starts with "http" and would otherwise be
        # left without a scheme.
        if not base_url.lower().startswith(("http://", "https://")):
            base_url = "https://" + base_url
        self.base_url = base_url
        self.domain = urlparse(self.base_url).netloc

        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

        # Output structure: one section per report area, filled in by the
        # extract_* methods and run().
        self.data = {
            "A_Identity": {"Company_Name": None, "Website": self.base_url, "Tagline": None},
            "B_Business_Summary": {"What_They_Do": "", "Primary_Offerings": [], "Target_Segments": []},
            "C_Evidence": {"Key_Pages_Detected": [], "Signals_Found": [], "Social_Links": []},
            "D_Contact_Location": {"Emails": [], "Phones": [], "Address": None, "Contact_URL": None},
            "E_Team_Hiring": {"Careers_URL": None, "Roles_Mentioned": []},
            "F_Metadata": {"Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "Pages_Visited": [], "Errors": []}
        }
        # URLs already attempted; also used to enforce the page budget.
        self.visited = set()
        self.max_pages = max_pages

    def get_soup(self, url):
        """Fetches URL with basic limits."""
        if url in self.visited or len(self.visited) >= self.max_pages: return None
        self.visited.add(url)
        self.data["F_Metadata"]["Pages_Visited"].append(url)
        try:
            resp = self.session.get(url, timeout=10)
            if resp.status_code == 200: return BeautifulSoup(resp.text, 'html.parser')
        except Exception as e:
            self.data["F_Metadata"]["Errors"].append(f"Failed {url}: {str(e)}")
        return None

    def get_text_safe(self, soup):
        """Prevents text merging by adding space separators."""
        if not soup: return ""
        return re.sub(r'\s+', ' ', soup.get_text(separator=' ')).strip()

    def clean_text(self, text):
        return re.sub(r'\s+', ' ', text).strip()

    # --- CORE LOGIC 1: Contact & Address Extraction ---
    def extract_contacts(self, soup):
        """Accepts SOUP object to handle footer text separation correctly."""
        text = self.get_text_safe(soup)
        
        # 1. PHONE EXTRACTION (Strict Logic)
        phone_patterns = [
            r'(?:Ph|Phone|Tel|Fax|Call|Mobile)[:\-\.\s]{0,3}([\+\(]?\d{1,4}[\)\s\.-]{0,3}\d{2,5}[\s\.-]?\d{3,4}[\s\.-]?\d{3,5})', # Labeled
            r'(?:\+91|011|022|044)[\s\-]??\d{3,5}[\s\-]??\d{3,5}', # Indian Landlines
            r'1800[\s-]?\d{3}[\s-]?\d{3,4}' # Toll Free
        ]
        for pat in phone_patterns:
            matches = re.findall(pat, text, re.IGNORECASE)
            for m in matches:
                clean_num = re.sub(r'[^\d+]', '', m)
                if 8 <= len(clean_num) <= 14:
                    self.data["D_Contact_Location"]["Phones"].append(m.strip())

        # 2. EMAIL EXTRACTION
        self.data["D_Contact_Location"]["Emails"].extend(
            re.findall(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', text)
        )

        # 3. ADDRESS EXTRACTION (Robust Context + Pincode Logic)
        if not self.data["D_Contact_Location"]["Address"]:
            mandatory_context = [
                "New Delhi", "Mumbai", "Bangalore", "Bengaluru", "Chennai", "Kolkata", 
                "Hyderabad", "Pune", "Gurgaon", "Noida", "Baddi", "Vadodara", "Nashik", 
                "India", "State", "City", "District", "Tamil Nadu", "Delhi", "Egmore"
            ]
            markers = [
                "Plot No", "Shop No", "Unit", "Building", "Tower", "Floor", 
                "Sector", "Phase", "Block", "Industrial Area", "Industrial Estate", 
                "MIDC", "GIDC", "Street", "Road", "Rd", "Lane", "Opp", "Near", 
                "Jhaver", "Complex" # Added specific markers for Tablets India
            ]

            # Regex matches 6-digit Pincodes, allowing optional space (e.g., 600 008)
            zip_matches = re.finditer(r'\b\d{3}\s?\d{3}\b', text)
            
            for z in zip_matches:
                end_pos = z.end()
                start_pos = max(0, end_pos - 200)
                candidate = text[start_pos:end_pos]
                
                # Check 1: Must contain City/State
                if any(c.lower() in candidate.lower() for c in mandatory_context):
                    # Check 2: Must contain an Address Marker
                    if any(m.lower() in candidate.lower() for m in markers):
                        
                        # Find the start of the address
                        pattern = r'(?:' + '|'.join(re.escape(m) for m in markers) + r').+'
                        match = re.search(pattern, candidate, re.IGNORECASE)
                        if match:
                            final_addr = match.group(0).strip()
                            if "et al" not in final_addr and len(final_addr) > 15:
                                self.data["D_Contact_Location"]["Address"] = final_addr + " " + z.group(0)
                                break

    # --- CORE LOGIC 2: Hybrid Offering Extractor ---
    def extract_offerings_hybrid(self, soup):
        """Tiered extraction: Links (Catalogs) -> Headers (Services) -> Images (Visual Sites)."""
        offerings = []
        blacklist = [
            "learn more", "read more", "view all", "add to cart", "buy now", 
            "quick view", "subscribe", "login", "search", "menu", "account",
            "recently viewed", "related products", "description", "ingredients"
        ]
        
        # TIER 1: Link-Based (Catalog Sites)
        for a in soup.find_all('a', href=True):
            href = a['href'].lower()
            text = self.clean_text(a.text)
            if 4 < len(text) < 50:
                if any(k in href for k in ['/product', '/item', '/shop', '/store']):
                    if not any(b in text.lower() for b in blacklist):
                        offerings.append(text)

        # TIER 2: Header-Based (Service/Brand Sites)
        for tag in soup.find_all(['h3', 'h4', 'h5']):
            if tag.find_parent(['nav', 'footer', 'header']): continue
            text = self.clean_text(tag.text)
            if 5 < len(text) < 45 and not any(b in text.lower() for b in blacklist):
                if text[0].isupper(): offerings.append(text)

        # TIER 3: Visual/Alt-Text (Fallback)
        if len(offerings) < 3:
            for img in soup.find_all('img', alt=True):
                alt = self.clean_text(img['alt'])
                if any(k in alt.lower() for k in ['bottle', 'pack', 'capsule', 'sachet', 'powder']):
                    if 5 < len(alt) < 40: offerings.append(alt)

        return list(set(offerings))[:15]

    def extract_signals(self, text):
        """Append every ``SIGNALS_DB`` keyword found in ``text`` to the report.

        Matching is a case-insensitive substring test. Hits are appended to
        ``C_Evidence["Signals_Found"]`` in database order, one per keyword per
        call, so repeated calls can add duplicates.
        """
        haystack = text.lower()
        signals_found = self.data["C_Evidence"]["Signals_Found"]
        for keywords in SIGNALS_DB.values():
            for keyword in keywords:
                if keyword.lower() in haystack:
                    signals_found.append(keyword)

    # --- EXECUTION LOOP ---
    def run(self):
        print(f"ðŸš€ Scanning {self.base_url}...")
        soup_home = self.get_soup(self.base_url)
        if not soup_home: return {"Error": "Site Unreachable"}

        # 1. Identity
        self.data["A_Identity"]["Company_Name"] = soup_home.title.string.split('|')[0].strip() if soup_home.title else self.domain
        meta = soup_home.find('meta', attrs={'name': 'description'})
        if meta: self.data["A_Identity"]["Tagline"] = meta['content'].strip()

        # 2. Initial Scan (Home) - PASSING SOUP for text fusion fix
        self.extract_contacts(soup_home)
        self.extract_signals(self.get_text_safe(soup_home))

        # 3. Intelligent Navigation Queue
        priority_queue = []
        nav_map = {
            "contact": "Contact", "support": "Contact",
            "about": "About", "story": "About", "heritage": "About",
            "career": "Careers", "job": "Careers",
            "product": "Products", "shop": "Products", "flavour": "Products",
            "science": "Science", "research": "Science", "technology": "Science"
        }

        for a in soup_home.find_all('a', href=True):
            href = urljoin(self.base_url, a['href'])
            text = a.text.lower()
            
            # Socials
            if any(x in href for x in ['linkedin', 'twitter', 'facebook', 'instagram', 'youtube']):
                self.data["C_Evidence"]["Social_Links"].append(href)
                continue

            # Internal Pages
            if self.domain in href:
                for kw, category in nav_map.items():
                    if kw in text or kw in href:
                        entry = f"{category}: {href}"
                        if entry not in self.data["C_Evidence"]["Key_Pages_Detected"]:
                            self.data["C_Evidence"]["Key_Pages_Detected"].append(entry)
                        
                        if category in ["Contact", "Products", "Science", "About"]:
                            priority_queue.append((href, category))
                        if category == "Contact": self.data["D_Contact_Location"]["Contact_URL"] = href
                        if category == "Careers": self.data["E_Team_Hiring"]["Careers_URL"] = href
                        break

        # 4. Deep Crawl
        for url, cat in priority_queue[:8]:
            sub_soup = self.get_soup(url)
            if not sub_soup: continue
            
            # Extract Contacts on sub-pages (Critical for Contact pages)
            self.extract_contacts(sub_soup)
            self.extract_signals(self.get_text_safe(sub_soup))

            if cat in ["Products", "Science"]:
                items = self.extract_offerings_hybrid(sub_soup)
                self.data["B_Business_Summary"]["Primary_Offerings"].extend(items)
            
            if cat == "About" and not self.data["B_Business_Summary"]["What_They_Do"]:
                paras = [p.text for p in sub_soup.find_all('p') if len(p.text) > 80]
                if paras: self.data["B_Business_Summary"]["What_They_Do"] = self.clean_text(paras[0])

        # 5. Final Deduplication
        for section in ["C_Evidence", "D_Contact_Location", "B_Business_Summary"]:
            for key, val in self.data[section].items():
                if isinstance(val, list):
                    self.data[section][key] = list(set(val))
        
        return self.data

# --- RUN EXAMPLE ---
# Demo entry point: scrapes one site and prints the resulting report as JSON.
if __name__ == "__main__":
    # Target site for the demo. To test a different site, comment out the
    # active assignment and enable one of the alternatives.
    # url = "https://velbiom.com"
    # url = "https://yakult.co.in"
    url = "https://tabletsindia.com"
    
    scraper = UniversalScraper(url) 
    # run() returns a plain dict (the report, or {"Error": ...} when the home
    # page cannot be fetched), so it can be passed straight to json.dumps.
    print(json.dumps(scraper.run(), indent=4))

ðŸš€ Scanning https://tabletsindia.com...
{
    "A_Identity": {
        "Company_Name": "Tablets India",
        "Website": "https://tabletsindia.com",
        "Tagline": "Tablets India offers advanced probiotics, nutraceuticals and pharma solutions designed to support immunity, gut health and overall wellness for all age groups."
    },
    "B_Business_Summary": {
        "What_They_Do": "Tablets (India) Limited was one of the early pharma ventures in India and was founded in 1938 by Mr Sri Krishna Jhaver.",
        "Primary_Offerings": [
            "Capsule in capsule",
            "Sachet in Sachet",
            "Oral Dispersible Powder"
        ],
        "Target_Segments": []
    },
    "C_Evidence": {
        "Key_Pages_Detected": [
            "Science: https://tabletsindia.com/clinical-research",
            "Contact: https://tabletsindia.com/contact-us",
            "About: https://tabletsindia.com/about-us",
            "Careers: https://tabletsindia.com/careers",
          