In [1]:
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin
import warnings
warnings.filterwarnings("ignore")

def scrape_text(url):
    """Fetch and clean visible text from a webpage."""
    try:
        resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        resp.raise_for_status()
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return ""

    soup = BeautifulSoup(resp.text, "html.parser")
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()

    text = soup.get_text(separator=" ", strip=True)
    text = re.sub(r"\s+", " ", text)
    return text

def find_relevant_pages(base_url, keywords):
    """Find pages by keyword relevance."""
    try:
        resp = requests.get(base_url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        resp.raise_for_status()
    except Exception as e:
        print(f"Error fetching {base_url}: {e}")
        return [base_url]

    soup = BeautifulSoup(resp.text, "html.parser")
    links = [a.get("href") for a in soup.find_all("a", href=True)]

    relevant_pages = []
    for link in links:
        if any(k in link.lower() for k in keywords):
            full_link = urljoin(base_url, link)
            if full_link not in relevant_pages:
                relevant_pages.append(full_link)

    return relevant_pages

def summarize_text(text, max_sentences=3):
    """Generate a short, crisp company summary."""
    sentences = re.split(r'(?<=[.!?]) +', text)
    keywords = ["about", "vision", "mission", "objective", "focus", "establish",
                "function", "research", "development", "sustain", "industry"]

    scored = []
    for s in sentences:
        score = sum(k in s.lower() for k in keywords)
        if 40 < len(s) < 250:  # avoid junk sentences
            scored.append((score, s.strip()))

    scored.sort(key=lambda x: (-x[0], len(x[1])))
    top_sentences = [s for _, s in scored[:max_sentences]]
    return " ".join(top_sentences)

def extract_contacts(text):
    """Extract emails, phone numbers, and possible addresses."""
    emails = list(set(re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", text)))
    phones = list(set(re.findall(r"(?:\+?\d{1,3}[-.\s]?)?\(?\d{2,4}\)?[-.\s]?\d{3,5}[-.\s]?\d{3,5}", text)))

    # crude heuristic for address
    address_candidates = re.findall(r"([A-Z][^,]+(?:Street|Strasse|Road|Rd|Avenue|Ave|Lane|Ln|Boulevard|Blvd|Way|Place|Pl|Square|Sq|Drive|Dr|Building|Bldg|No\.|Kaiserstraße).+)", text, re.IGNORECASE)
    address = address_candidates[0] if address_candidates else None

    return {
        "emails": emails,
        "phones": phones,
        "address": address
    }

def company_profile(base_url):
    """Scrape summary + contacts separately for better results."""
    # Pages for summary
    summary_pages = [base_url] + find_relevant_pages(base_url, ["about", "vision", "mission", "who-we-are", "company", "corporate"])
    
    # Pages for contact info
    contact_pages = find_relevant_pages(base_url, ["contact", "impressum", "imprint"])
    if not contact_pages:
        contact_pages = [base_url]  # fallback

    # Build summary
    summary_text = ""
    for page in summary_pages[:5]:
        summary_text += " " + scrape_text(page)
    summary = summarize_text(summary_text)

    # Extract contact info
    contact_text = ""
    for page in contact_pages[:3]:
        contact_text += " " + scrape_text(page)
    contacts = extract_contacts(contact_text)

    return {
        "url": base_url,
        "summary": summary,
        "contacts": contacts
    }

# Example usage
if __name__ == "__main__":
    urls = ["https://bathcanalcraft.co.uk","https://www.kit.edu/","https://mpob.gov.my/"] 
    for url in urls:
        print(f"\n---Company {url} Summary ---\n")
        profile = company_profile(url)
        print("Website:", profile["url"])
        print("\nSummary:", profile["summary"])
        print("\nEmails:", profile["contacts"]["emails"])
        print("Phones:", profile["contacts"]["phones"])
        print("Address:", profile["contacts"]["address"])



---Company https://bathcanalcraft.co.uk Summary ---

Website: https://bathcanalcraft.co.uk

Summary: From the initial concept to the maiden voyage, we prioritise your vision every step of the way. Are you looking for a beautifully formed narrowboat, uniquely designed to fit your lifestyle? Join us in redefining the future of narrowboat cruising, where tradition meets innovation in harmony with the environment.

Emails: ['bathcanalcraft@icloud.com']
Phones: ['+44 7538 784 613']
Address: Contact - Bath Canal Craft ‍ ‍ Home Boats All Boats NB ‘Alica-Lee‘ (in planning) NB ‘Fram‘ NB ‘Silly Knot To‘ NB ‘Slapdash Lilly‘ NB ‘The Doran‘ (sailaway) NB ‘Elaine‘ NB ‘Velore‘ Contact If you’d like to discuss a personalised narrowboat build then please feel free call, email, or come visit… +44 7538 784 613 bathcanalcraft@icloud.com Bath Canal Craft Ltd. Deverill Storage Longbridge Deverill Warminster Wiltshire BA12 7FB Bath Canal Craft Ltd. | All Rights Reserved © 2025

---Company https://www.kit.ed

***Smart summary about company using facebook HuggingFace model.***

In [None]:
import re
import requests
from bs4 import BeautifulSoup
from transformers import pipeline
import warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", message=".*max_new_tokens.*")
warnings.filterwarnings("ignore", category=UserWarning)

# Optional JS rendering
try:
    from requests_html import HTMLSession
    JS_RENDER_AVAILABLE = True
except ImportError:
    JS_RENDER_AVAILABLE = False

# Load local summarizer
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
# summarizer = pipeline("summarization", model="t5-small")
# -------------------------------
# 1. Scrape Website Text
# -------------------------------
def scrape_website(url):
    text = ""
    try:
        r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        soup = BeautifulSoup(r.text, "html.parser")
        for script in soup(["script", "style", "noscript"]):
            script.extract()
        text = " ".join(soup.stripped_strings)
    except Exception as e:
        print(f"Normal request failed: {e}")

    # Fallback to JS-rendered
    if len(text.split()) < 50 and JS_RENDER_AVAILABLE:
        try:
            session = HTMLSession()
            r = session.get(url)
            r.html.render(timeout=20)
            text = r.html.text
        except Exception as e:
            print(f"JS render failed: {e}")

    return text

# -------------------------------
# 3. Extract Objectives and Services
# -------------------------------
def extract_objective_service_text(text):
    paragraphs = [p.strip() for p in text.split("\n") if len(p.strip()) > 40]

    # Keywords
    objective_keywords = ["mission", "objective", "goal", "vision", "purpose", "focus", "aim"]
    service_keywords = ["service", "product", "offer", "specialize", "provide", "solutions", "design", "build", "manufacture"]

    # Extract paragraphs
    objective_paragraphs = [p for p in paragraphs if any(k in p.lower() for k in objective_keywords)]
    service_paragraphs = [p for p in paragraphs if any(k in p.lower() for k in service_keywords)]

    combined = objective_paragraphs + service_paragraphs
    if combined:
        return " ".join(combined)
    else:
        # fallback to first few paragraphs
        return " ".join(paragraphs[:10])

# -------------------------------
# 4. Generate Crisp Summary
# -------------------------------
def generate_summary(text, company_name="The company"):
    text = text[:2000]  # HuggingFace token limit
    input_length = len(text.split())
    max_length = min(120, input_length)  # Ensure max_length <= input_length
    min_length = min(40, max_length - 1) if max_length > 40 else 10
    result = summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)
    return f"**{company_name}**: {result[0]['summary_text']}"

# -------------------------------
# 5. Full Company Analyzer
# -------------------------------
def     analyze_company(url, company_name="The company"):
    text = scrape_website(url)
    objective_service_text = extract_objective_service_text(text)
    if objective_service_text:
        summary = generate_summary(objective_service_text, company_name)
    else:
        summary = "None"
    
    return summary
    

# -------------------------------
# Example Usage
# -------------------------------
if __name__ == "__main__":
    websites = ['https://www.asml.com', 'https://www.liwest.at', 'https://www.novem.com', 'https://www.steinhaus.net', 'https://www.ost.ch', 'https://www.audi.de', 'https://www.landpack.de', 'https://www.schaumaplast.de', 'https://www.zukunfts.haus', 'https://www.googlemail.com', 'https://www.federation.edu.au', 'https://www.ciiae.org', 'https://www.hb-ingenieure.de', 'https://www.windowslive.com', 'https://www.kyocera.jp', 'https://www.elite-tec.com.cn', 'https://www.fineeng.eu', 'https://www.influtherm.com', 'https://www.lengheim-entwicklung.at', 'https://www.synthesis-spa.com', 'https://www.jimdo.de', 'https://www.sonoco.com', 'https://www.izeau.fr', 'https://www.mas-sp.pl', 'https://www.kreon.com',
            'https://www.braebo.lu', 'https://www.tamu.edu', 'https://www.lms-germany.de', 'https://www.lifebiotek.com', 'https://www.tuke.sk', 'https://www.adelaide.edu.au', 'https://www.greyb.com', 'https://www.tuke.sk', 'https://www.heatventors.com', 'https://www.unimore.it', 'https://www.ua.pt', 'https://www.puffinpackaging.co.uk', 'https://www.voltaspace.co', 'https://www.altileo.com', 'https://www.eaf.org.br', 'https://www.cilas.com', 'https://www.lut.fi', 'https://www.utwente.nl', 'https://www.intudiagnostics.com', 'https://www.ucl.ac.uk', 'https://www.sanbs.org.za', 'https://www.rit.edu', 'https://www.pole-cristal.fr', 'https://www.uni-freiburg.de', 'https://www.fusemat.com']


    for url in websites:
        print(f"\n--- Scraping {url} ---\n")
        summary = analyze_company(url)
        print(summary)


Device set to use cpu



--- Scraping https://www.bathcanalcraft.co.uk ---

Summary:
 **https://www.bathcanalcraft.co.uk**: Bath Canal Craft offers a truly bespoke service, working closely with you to ensure you have control over the design decisions that matter most. Join us in redefining the future of narrowboat cruising, where tradition meets innovation in harmony with the environment.

--- Scraping https://www.kit.edu/ ---

Summary:
 **https://www.kit.edu/**: KIT - Karlsruher Institut für Technologie Navigation überspringen Home Leichte Sprache Gebärdensprache Impressum Datenschutz Barrierefreiheit Sitemap Intranet KIT en suchen suchen.

--- Scraping https://www.mpob.gov.my/ ---

Summary:
 **https://www.mpob.gov.my/**: Malaysian Palm Oil Board – 6, Persiaran Institusi, Bandar Baru Bangi 43000 Kajang Selangor, Malaysia CERTIFIED TO ISO 9001:2015 CERT. NO.: QMS 02602 Contact Us Sitemap Menu Corporate Info About Us MPOB Logo Vision & Mission Organisation Chart Clientsâ Charter Top Management Board Members 

In [1]:
import re
import requests
from bs4 import BeautifulSoup
from transformers import pipeline
from concurrent.futures import ThreadPoolExecutor

# Optional JS rendering
try:
    from requests_html import HTMLSession
    JS_RENDER_AVAILABLE = True
except ImportError:
    JS_RENDER_AVAILABLE = False

# -------------------------------
# Load summarizer once
# -------------------------------
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
# summarizer = pipeline("summarization", model="t5-small")
# -------------------------------
# 1. Scrape Website Text
# -------------------------------
def scrape_website(url):
    """Fetch visible text from webpage with headers to avoid 403 errors."""
    text = ""
    try:
        r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()
        text = " ".join(soup.stripped_strings)
    except Exception as e:
        print(f"Normal request failed for {url}: {e}")

    # Fallback to JS-rendered
    if len(text.split()) < 50 and JS_RENDER_AVAILABLE:
        try:
            session = HTMLSession()
            r = session.get(url)
            r.html.render(timeout=20)
            text = r.html.text
        except Exception as e:
            print(f"JS render failed for {url}: {e}")

    return text

# -------------------------------
# 2. Extract Objectives & Services
# -------------------------------
def extract_objective_service_text(text):
    paragraphs = [p.strip() for p in text.split("\n") if len(p.strip()) > 40]

    objective_keywords = ["mission", "objective", "goal", "vision", "purpose", "focus", "aim"]
    service_keywords = ["service", "product", "offer", "specialize", "provide", "solutions", "design", "build", "manufacture"]

    objective_paragraphs = [p for p in paragraphs if any(k in p.lower() for k in objective_keywords)]
    service_paragraphs = [p for p in paragraphs if any(k in p.lower() for k in service_keywords)]

    combined = objective_paragraphs + service_paragraphs
    if combined:
        return " ".join(combined)
    return " ".join(paragraphs[:10]) if paragraphs else ""

# -------------------------------
# 3. Generate Crisp Summary
# -------------------------------
def generate_summary(text, company_name="The company"):
    text = text[:2000]  # HuggingFace token limit
    input_length = len(text.split())
    max_length = min(120, input_length)
    min_length = min(40, max_length - 1) if max_length > 40 else 10
    result = summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)
    return f"**{company_name}**: {result[0]['summary_text']}"

# -------------------------------
# 4. Full Company Analyzer
# -------------------------------
def analyze_company(url, company_name="The company"):
    text = scrape_website(url)
    objective_service_text = extract_objective_service_text(text)
    summary = generate_summary(objective_service_text, company_name) if objective_service_text else "None"
    return {
        "summary": summary
    }

# -------------------------------
# 5. Process Multiple Companies Efficiently
# -------------------------------
def analyze_multiple_companies(websites, max_workers=4):
    results = {}
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_company = {executor.submit(analyze_company, url, name): name for url, name in websites}
        for future in future_to_company:
            name = future_to_company[future]
            try:
                results[name] = future.result()
            except Exception as e:
                results[name] = {"summary": f"Error: {e}"}
    return results

# -------------------------------
# Example Usage
# -------------------------------
if __name__ == "__main__":
    websites = [
        ("http://schaumaplast.de/", "Bath Canal Craft"),
        # ("https://www.kit.edu/", "Karlsruhe Institute of Technology"),
        # ("https://mpob.gov.my/", "Malaysian Palm Oil Board"),
        # ("https://www.nseindia.com/", "NSE India")
    ]

    results = analyze_multiple_companies(websites, max_workers=4)
    for name, data in results.items():
        print(f"\n--- {name} ---\n")
        print("Summary:\n", data["summary"])


  from .autonotebook import tqdm as notebook_tqdm
Device set to use cpu
Device set to use cpu


Normal request failed for http://schaumaplast.de/: HTTPConnectionPool(host='schaumaplast.de', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x000001E65248E610>: Failed to resolve 'schaumaplast.de' ([Errno 11001] getaddrinfo failed)"))

--- Bath Canal Craft ---

Summary:
 None


In [6]:
import re
import requests
from bs4 import BeautifulSoup

def normalize_phone(number: str) -> str:
    number = number.strip()
    number = re.sub(r"[^\d+]", "", number)  # keep only digits and '+'

    # Malaysia example
    if number.startswith("603"):
        return "+60 " + number[2:]
    if number.startswith("+60"):
        return number
    return number

def extract_address(text_lines):
    ignore_keywords = ["skip to content", "menu", "footer", "sitemap", "disclaimer", "policy"]
    address_keywords = r"(Persiaran|Bandar|Selangor|Kajang|Malaysia)"

    for i, line in enumerate(text_lines):
        line_lower = line.lower()
        if any(k in line_lower for k in ignore_keywords):
            continue  # skip UI/footer lines

        if re.search(r"\d+", line) and re.search(address_keywords, line, re.IGNORECASE):
            # include next line if short and not ignored
            candidate = line
            if i + 1 < len(text_lines):
                next_line = text_lines[i+1].strip()
                if len(next_line) < 80 and not any(k in next_line.lower() for k in ignore_keywords):
                    candidate += " " + next_line
            return candidate.strip()
    return None


def scrape_contact_info(url):
    try:
        resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        resp.raise_for_status()
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return {}

    soup = BeautifulSoup(resp.text, "html.parser")

    # Replace <br> with newline to preserve line separation
    for br in soup.find_all("br"):
        br.replace_with("\n")

    # Extract visible text lines
    text_lines = [line.strip() for line in soup.stripped_strings if line.strip()]

    # ----------------- Extract emails -----------------
    emails = set()
    # from <a href="mailto:...">
    for a in soup.find_all("a", href=True):
        if a['href'].lower().startswith("mailto:"):
            emails.add(a['href'].split(":", 1)[1].strip())
    # fallback: regex in text
    for line in text_lines:
        for e in re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", line):
            emails.add(e)

    # ----------------- Extract phones -----------------
    phones = set()
    for a in soup.find_all("a", href=True):
        if a['href'].lower().startswith("tel:"):
            phones.add(normalize_phone(a['href'].split(":", 1)[1].strip()))
    # fallback: regex in text
    phone_pattern = r"(?:Tel|Phone|Telefon)[:.]?\s*([+0-9\s\-]{6,})"
    for line in text_lines:
        match = re.search(phone_pattern, line, re.IGNORECASE)
        if match:
            phones.add(normalize_phone(match.group(1)))

    # ----------------- Extract fax -----------------
    fax_numbers = set()
    fax_pattern = r"(?:Fax)[:.]?\s*([+0-9\s\-]{6,})"
    for line in text_lines:
        match = re.search(fax_pattern, line, re.IGNORECASE)
        if match:
            fax_numbers.add(normalize_phone(match.group(1)))

    # ----------------- Extract address -----------------
    address_keywords = r"(Persiaran|Bandar|Selangor|Kajang|Malaysia)"
    # Replace the previous address parsing with:
    address = extract_address(text_lines)

    return {
        "emails": list(emails),
        "phones": list(phones),
        "fax": list(fax_numbers),
        "address": address
    }

# -------------------------------
# Example
# -------------------------------
if __name__ == "__main__":
    websites = ['https://www.bathcanalcraft.co.uk',
        "https://www.kit.edu/", 
        "https://www.mpob.gov.my/",
        "https://www.nseindia.com/", 'https://www.research.gla.ac.uk', 
        'https://www.bebob.de', 'https://www.as.gov.qa', 'https://www.airjouletech.com',
        'https://www.moser-konstruktion.de', 'https://www.ac.sce.ac.il'
    ]

    for url in websites:
        print(f"\n--- Scraping {url} ---\n")
        summary = scrape_contact_info(url)
        print(summary)


--- Scraping https://www.bathcanalcraft.co.uk ---

{'emails': [], 'phones': [], 'fax': [], 'address': None}

--- Scraping https://www.kit.edu/ ---

{'emails': [], 'phones': [], 'fax': [], 'address': None}

--- Scraping https://www.mpob.gov.my/ ---

{'emails': ['general@mpob.gov.my'], 'phones': ['+60 387694400'], 'fax': ['+60 389259446'], 'address': 'Malaysian Palm Oil Board – 6, Persiaran Institusi, Bandar Baru Bangi<br>43000 Kajang Selangor, Malaysia'}

--- Scraping https://www.nseindia.com/ ---

{'emails': [], 'phones': [], 'fax': [], 'address': None}

--- Scraping https://www.research.gla.ac.uk ---

Error fetching https://www.research.gla.ac.uk: HTTPSConnectionPool(host='www.research.gla.ac.uk', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001E31EFB1B10>: Failed to resolve 'www.research.gla.ac.uk' ([Errno 11001] getaddrinfo failed)"))
{}

--- Scraping https://www.bebob.de ---

Error fetching https://w

In [8]:
import requests
from bs4 import BeautifulSoup
import re

def extract_address_from_text(text_lines):
    # Keywords commonly found in addresses
    address_keywords = [
        'street', 'strasse', 'road', 'rd', 'avenue', 'ave', 'lane', 'ln', 'boulevard', 'blvd',
        'way', 'place', 'pl', 'square', 'sq', 'drive', 'dr', 'building', 'bldg', 'no.', 'nr.',
        'suite', 'block', 'kaiserstraße', 'jalan', 'city', 'state', 'zip', 'postcode', 'country'
    ]
    postal_code_pattern = re.compile(r'\b\d{4,6}\b')

    for line in text_lines:
        line_lower = line.lower()
        if any(k in line_lower for k in address_keywords) or postal_code_pattern.search(line):
            return line.strip()
    return None

def scrape_address(url):
    try:
        resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        resp.raise_for_status()
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return None

    soup = BeautifulSoup(resp.text, "html.parser")
    # Replace <br> with newline to preserve line separation
    for br in soup.find_all("br"):
        br.replace_with("\n")

    # Extract visible text lines
    text_lines = [line.strip() for line in soup.stripped_strings if line.strip()]

    address = extract_address_from_text(text_lines)
    return address

# Example usage
websites = [
    'https://www.bathcanalcraft.co.uk',
    'https://www.kit.edu/',
    'https://www.mpob.gov.my/',
    'https://www.nseindia.com/',
    'https://www.research.gla.ac.uk',
    'https://www.bebob.de',
    'https://www.as.gov.qa',
    'https://www.airjouletech.com',
    'https://www.moser-konstruktion.de',
    'https://www.ac.sce.ac.il'
]

for url in websites:
    print(f"\n--- Scraping address from {url} ---")
    address = scrape_address(url)
    print("Address:", address if address else "Not found")


--- Scraping address from https://www.bathcanalcraft.co.uk ---
Address: NB ‘Alica-Lee‘ (in planning)

--- Scraping address from https://www.kit.edu/ ---
Address: Gebärdensprache

--- Scraping address from https://www.mpob.gov.my/ ---
Address: Malaysian Palm Oil Board – 6, Persiaran Institusi, Bandar Baru Bangi<br>43000 Kajang Selangor, Malaysia

--- Scraping address from https://www.nseindia.com/ ---
Address: 04-Sep-2025 14:46 IST

--- Scraping address from https://www.research.gla.ac.uk ---
Error fetching https://www.research.gla.ac.uk: HTTPSConnectionPool(host='www.research.gla.ac.uk', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001E31EA7CB90>: Failed to resolve 'www.research.gla.ac.uk' ([Errno 11001] getaddrinfo failed)"))
Address: Not found

--- Scraping address from https://www.bebob.de ---
Error fetching https://www.bebob.de: HTTPSConnectionPool(host='www.bebob.de', port=443): Max retries exceeded

In [9]:
import requests
from bs4 import BeautifulSoup
import re

def extract_full_address(text_lines):
    # Keywords commonly found in addresses
    address_keywords = [
        'street', 'strasse', 'road', 'rd', 'avenue', 'ave', 'lane', 'ln', 'boulevard', 'blvd',
        'way', 'place', 'pl', 'square', 'sq', 'drive', 'dr', 'building', 'bldg', 'no.', 'nr.',
        'suite', 'block', 'kaiserstraße', 'jalan', 'city', 'state', 'zip', 'postcode', 'country',
        'post', 'mail', 'address', 'p.o.', 'po box', 'plz', 'ort', 'hausnummer', 'straße', 'str.'
    ]
    postal_code_pattern = re.compile(r'\b\d{4,6}\b')

    addresses = []
    block = []
    for line in text_lines:
        line_lower = line.lower()
        # Check if line is part of an address block
        if any(k in line_lower for k in address_keywords) or postal_code_pattern.search(line):
            block.append(line.strip())
        else:
            if block:
                # If block has at least street, city, and postal code, consider it a full address
                block_text = ' '.join(block)
                if postal_code_pattern.search(block_text) and any(k in block_text.lower() for k in address_keywords):
                    addresses.append(block_text)
                block = []
    # Catch any trailing block
    if block:
        block_text = ' '.join(block)
        if postal_code_pattern.search(block_text) and any(k in block_text.lower() for k in address_keywords):
            addresses.append(block_text)
    return addresses if addresses else None

def scrape_full_address(url):
    try:
        resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        resp.raise_for_status()
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return None

    soup = BeautifulSoup(resp.text, "html.parser")
    for br in soup.find_all("br"):
        br.replace_with("\n")

    text_lines = [line.strip() for line in soup.stripped_strings if line.strip()]

    addresses = extract_full_address(text_lines)
    return addresses

# Example usage
websites = [
    'https://www.bathcanalcraft.co.uk',
    'https://www.kit.edu/',
    'https://www.mpob.gov.my/',
    'https://www.nseindia.com/',
    'https://www.research.gla.ac.uk',
    'https://www.bebob.de',
    'https://www.as.gov.qa',
    'https://www.airjouletech.com',
    'https://www.moser-konstruktion.de',
    'https://www.ac.sce.ac.il'
]

for url in websites:
    print(f"\n--- Scraping full address from {url} ---")
    addresses = scrape_full_address(url)
    if addresses:
        for addr in addresses:
            print("Full Address:", addr)
    else:
        print("Full Address: Not found")


--- Scraping full address from https://www.bathcanalcraft.co.uk ---
Full Address: Not found

--- Scraping full address from https://www.kit.edu/ ---
Full Address: KIT. Ort der Zukunft. Seit 1825.
Full Address: Save the Date: KIT Science Week 2025 startet im Oktober.
Full Address: 04. September 2025 13:30 - 16:30 Fortbildungszentrum für Technik und Umwelt (FTU)
Full Address: Für eine Million Jahre muss der Müll des Atomzeitalters, also abgebrannte Brennstäbe aus Kernkraftwerken und andere radioaktive Materialien, in Deutschland sicher verwahrt werden. Die hochradioaktiven Abfälle sollen tief unter der Erde gelagert werden – sicher verschlossen und unzugänglich für zukünftige Generationen. Aber wer entscheidet eigentlich, wo und wie so ein Lager entsteht?

 

Die Ausstellung „Wer entscheidet denn sowas? Endlager und Demokratie“ des Instituts für Technikfolgenabschätzung und Systemanalyse (ITAS) des KIT lädt dazu ein, sich mit dieser Frage auseinanderzusetzen. Kooperationspartner sind da