In [None]:
!pip install lxml
!pip install chromadb
!pip install requests beautifulsoup4 selenium webdriver-manager



In [None]:
import requests
from bs4 import BeautifulSoup
import json
import time
import random
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service

In [None]:
# Smoke test: parse a tiny document with the "xml" feature (the same parser
# setting the sitemap fetch relies on) and print the pretty-printed tree.
s = BeautifulSoup(markup="<root><item>Hello</item></root>", features="xml")
print(s.prettify())

<?xml version="1.0" encoding="utf-8"?>
<root>
 <item>
  Hello
 </item>
</root>



In [None]:
def fetch_sitemap_urls(sitemap_url):
    """Download a sitemap and return the URLs found in its ``<loc>`` elements.

    Args:
        sitemap_url: Address of the sitemap XML document to fetch.

    Returns:
        A list of URL strings with surrounding whitespace removed and empty
        entries dropped. On any failure (network error, non-2xx status,
        parse error) the error is printed and an empty list is returned, so
        callers never see an exception from this function.
    """
    try:
        response = requests.get(sitemap_url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "xml")
        # <loc> text may be pretty-printed with leading/trailing whitespace or
        # newlines; strip it so each entry is a clean single-line URL, and
        # skip elements that turn out to be empty.
        urls = []
        for loc in soup.find_all("loc"):
            url = loc.text.strip()
            if url:
                urls.append(url)
        return urls
    except Exception as e:
        # Deliberately best-effort: report the problem and keep the notebook running.
        print(f"Error fetching {sitemap_url}: {e}")
        return []

# Sitemap URLs
main_sitemap = "https://www.changiairport.com/sitemap.xml"
# The NowBoarding blog sitemap is disabled: the recorded run of this cell shows
# its host failing the TLS handshake (SSL: TLSV1_ALERT_INTERNAL_ERROR), so the
# fetch yields no URLs. Re-enable all NowBoarding lines below together.
# nowboarding_sitemap = "https://nowboarding-xml.changiairport.com/pro-sitemaps-4227446.php?sn=sitemap.xml"

# Fetch URLs
main_urls = fetch_sitemap_urls(main_sitemap)
# Kept commented out alongside the definition above; calling it while
# `nowboarding_sitemap` is undefined raises NameError on a fresh kernel.
# nowboarding_urls = fetch_sitemap_urls(nowboarding_sitemap)

# Persist the main-site links, one URL per line, for the scraping cells.
with open("main_changiairport_links.txt", "w", encoding="utf-8") as f:
    for url in main_urls:
        f.write(url + "\n")

# with open("nowboarding_links.txt", "w", encoding="utf-8") as f:
#     for url in nowboarding_urls:
#         f.write(url + "\n")

print(f"Fetched {len(main_urls)} main site URLs.")
# print(f"Fetched {len(nowboarding_urls)} NowBoarding blog URLs.")


Error fetching https://nowboarding-xml.changiairport.com/pro-sitemaps-4227446.php?sn=sitemap.xml: HTTPSConnectionPool(host='nowboarding-xml.changiairport.com', port=443): Max retries exceeded with url: /pro-sitemaps-4227446.php?sn=sitemap.xml (Caused by SSLError(SSLError(1, '[SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error (_ssl.c:992)')))
✅ Fetched 2889 main site URLs.
✅ Fetched 0 NowBoarding blog URLs.


In [None]:

# Load the saved sitemap links (one URL per line). The file is written as
# UTF-8, so read it back with the same encoding instead of the platform
# default, and drop the trailing newline / blank lines so every entry is a
# directly usable URL.
with open('main_changiairport_links.txt', 'r', encoding='utf-8') as file:
    urls = [line.strip() for line in file if line.strip()]

In [None]:
# --- Scraper configuration ---
MAX_RETRIES = 3  # NOTE(review): not referenced anywhere in the visible cells
DELAY_SECONDS = 1.5  # pause applied by the scrape loop between pages
OUTPUT_FILE = "changi_airport_content_6.jsonl"

# User agents for request header randomization
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/115.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 Version/15.1 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
]

# Read the link list with the encoding it was written in, and skip blank
# lines so no empty URL is ever handed to the scraper.
with open('main_changiairport_links.txt', 'r', encoding='utf-8') as file:
    urls = [line.strip() for line in file if line.strip()]

# Setup headless browser for JS-rendered content
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--log-level=3')
# The browser's user agent is chosen once here and stays fixed for the
# lifetime of this driver instance.
chrome_options.add_argument(f"user-agent={random.choice(USER_AGENTS)}")

# The chromedriver path is supplied through a Service object passed as the
# `service=` keyword rather than as a positional argument.
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)
def is_likely_dynamic_html(html):
    """Heuristically decide whether a page's HTML needs a real browser.

    Returns True when any of these holds, otherwise False:
      * the markup contains the literal ``<div id="root">``,
      * ``<script`` occurs more than 10 times,
      * the markup, stripped of surrounding whitespace, is under 2000 characters.
    """
    if '<div id="root">' in html:
        return True
    if html.count("<script") > 10:
        return True
    return len(html.strip()) < 2000

def scrape_page(url):
    """Fetch ``url`` and return ``(soup, method)``.

    A plain HTTP GET is tried first. If it fails for any reason, or the
    returned markup looks JavaScript-driven according to
    ``is_likely_dynamic_html``, the page is loaded through the shared Selenium
    ``driver`` instead.

    Returns:
        A tuple of the parsed ``BeautifulSoup`` document and the string
        ``"requests"`` or ``"selenium"`` naming the path that produced it.

    Raises:
        Exception: with a ``"Selenium failed: ..."`` message when the browser
            fallback itself raises.
    """
    request_headers = {"User-Agent": random.choice(USER_AGENTS)}

    try:
        reply = requests.get(url, headers=request_headers, timeout=10)
        reply.raise_for_status()
        markup = reply.text

        if not is_likely_dynamic_html(markup):
            return BeautifulSoup(markup, "html.parser"), "requests"

        # Reuse the failure path below to switch over to the browser.
        raise Exception("Likely dynamic, fallback to Selenium")

    except Exception:
        try:
            driver.get(url)
            # Fixed pause to give client-side scripts time to render.
            time.sleep(5)
            rendered = driver.page_source
            return BeautifulSoup(rendered, "html.parser"), "selenium"
        except Exception as se:
            raise Exception(f"Selenium failed: {se}")

# Scrape every URL, collecting one record per page that yields text content.
# The outer try/finally guarantees that whatever has been collected so far is
# written to OUTPUT_FILE and that the browser is shut down, even if the run
# is interrupted or an unexpected error escapes the per-page handler.
results = []
try:
    for idx, url in enumerate(urls):
        try:
            soup, method = scrape_page(url)

            # Remove elements that carry no page-specific content.
            for tag in soup(["script", "style", "nav", "footer", "header", "noscript"]):
                tag.decompose()

            # `.string` is None when <title> is empty or contains child nodes,
            # which would make `.strip()` raise and discard the whole page;
            # get_text() handles both cases.
            title_tag = soup.title
            title = title_tag.get_text(strip=True) if title_tag else ""
            h1 = [tag.text.strip() for tag in soup.find_all("h1")]
            h2 = [tag.text.strip() for tag in soup.find_all("h2")]
            paragraphs = [tag.text.strip() for tag in soup.find_all("p")]
            # NOTE(review): a <ul>/<ol>'s text already includes its <li> text,
            # so list items appear more than once here — confirm this is wanted.
            lists = [tag.text.strip() for tag in soup.find_all(["ul", "ol", "li"])]

            # Only used to decide whether the page produced any text at all.
            content = "\n".join(h1 + h2 + paragraphs + lists).strip()

            if content:
                page_data = {
                    "url": url,
                    "title": title,
                    "h1": h1,
                    "h2": h2,
                    "paragraphs": paragraphs,
                    "lists": lists
                }
                results.append(page_data)
                print(f"✅ [{idx+1}] Scraped ({method}): {url}")
            else:
                print(f"⚠️ [{idx+1}] No meaningful content: {url}")

            time.sleep(DELAY_SECONDS)

        except Exception as e:
            # Best-effort per page: report and move on to the next URL.
            print(f"[{idx+1}] Failed to scrape {url}: {e}")
finally:
    try:
        # One JSON object per line (JSON Lines); non-ASCII text is kept as-is.
        with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
            for item in results:
                f.write(json.dumps(item, ensure_ascii=False) + "\n")
    finally:
        # Always release the browser, even if writing the output failed.
        driver.quit()

print(f"\n Finished scraping {len(results)} out of {len(urls)} URLs.")


✅ [1] Scraped (selenium): https://www.changiairport.com/in/en/at-changi/facilities-and-services-directory/shower-and-spa-services.html
✅ [2] Scraped (selenium): https://www.changiairport.com/in/en/at-changi/facilities-and-services-directory/swimming-pool-with-jacuzzi.html
✅ [3] Scraped (selenium): https://www.changiairport.com/in/en/at-changi/facilities-and-services-directory/banking-services-and-cash-machines.html
✅ [4] Scraped (selenium): https://www.changiairport.com/in/en/at-changi/facilities-and-services-directory/transit-hotels.html
✅ [5] Scraped (selenium): https://www.changiairport.com/in/en/at-changi/facilities-and-services-directory/berelax-spa.html
✅ [6] Scraped (selenium): https://www.changiairport.com/in/en/at-changi/facilities-and-services-directory/singapore-city-sightseeing-hop-on-hop-off-tours.html
✅ [7] Scraped (selenium): https://www.changiairport.com/in/en/at-changi/facilities-and-services-directory/clinics-and-pharmacies.html
✅ [8] Scraped (selenium): https://www.c

In [None]:
# # Process each URL
# for url in urls:
#     url = url.strip()  # Remove any leading/trailing whitespace
#     response = requests.get(url)
#     if response.status_code == 200:
#         soup = BeautifulSoup(response.text, 'html.parser')
#         # Extract and process data here
#         # For example, print the title of the page
#         print(f"Title of {url}: {soup.title.text}")
#     else:
#         print(f"Failed to retrieve data from {url}")