In [None]:
# import scraping libraries
import requests
from bs4 import BeautifulSoup
import time

In [None]:
# Crawl configuration plus the empty containers that later cells populate.
BASE_URL = "https://www.sunmarke.com"  # site root; the homepage is fetched from here

# Top-level menu label -> list of link hrefs found under that menu entry.
menu_dict = dict()

# Not referenced by the cells visible here; presumably reserved for
# de-duplicating URLs and holding page results — confirm before removing.
visited = set()
documents = dict()

In [None]:
# Parse the homepage navigation menu and collect, per top-level entry, the
# links nested beneath it. Results are stored in menu_dict as
# {top-level label: [href, ...]}.

# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() stops an HTTP error page from being parsed as if it were
# the real menu (which would silently yield an empty menu_dict).
homepage_response = requests.get(BASE_URL, timeout=30)
homepage_response.raise_for_status()
homepage = homepage_response.text
soup = BeautifulSoup(homepage, "html.parser")

# Direct <li> children of the main navigation list, i.e. the top-level entries.
top_lis = soup.select("#menu-website-menu-1.sf-menu > li")

for li in top_lis:
    top_anchor = li.find("a")
    if top_anchor is None:
        # Without an anchor there is no label to file the links under.
        continue
    top_label = top_anchor.get_text(strip=True)

    # Every link nested under this <li>, keeping only anchors that carry no
    # "class" attribute. hrefs are stored exactly as written in the page; they
    # are not resolved against BASE_URL.
    urls = [a["href"] for a in li.find_all("a", href=True) if "class" not in a.attrs]

    # Special case: "Contact Us" is itself a real page, so its own link is
    # placed first. (Presumably the class filter above excludes it otherwise
    # — confirm against the page markup.)
    if top_label.lower() == "contact us":
        own_href = top_anchor.get("href")
        if own_href:
            urls.insert(0, own_href)

    # Store under top-level label
    menu_dict[top_label] = urls

In [None]:
# Debug aid: number of top-level menu entries, then the link count for each.
print("total items: ", len(menu_dict))
for label, links in menu_dict.items():
    print(label, " : ", len(links))

total items:  8
About  :  11
Learning  :  29
Signature Programmes  :  9
Admissions  :  7
For Parents  :  10
Activities  :  3
News & Events  :  4
Contact Us  :  2


In [None]:
# Dump each top-level label together with its full list of links for inspection.
for section, links in menu_dict.items():
    print(section, links)

About ['https://www.sunmarke.com/about/principals-message/', 'https://www.sunmarke.com/about/mission-vision-values/', 'https://www.sunmarke.com/about/a-positive-education-school/', 'https://www.sunmarke.com/about/leadership/', 'https://www.sunmarke.com/about/academic-results/', 'https://www.sunmarke.com/about/inspection-reports/', 'https://www.sunmarke.com/about/our-achievements/', 'https://www.sunmarke.com/about/sunmarke-alumni/', 'https://www.sunmarke.com/about/wellbeing/', 'https://www.sunmarke.com/about/the-achievement-centre-inclusion-department/', 'https://www.sunmarke.com/about/our-campus/']
Learning ['https://www.sunmarke.com/learning/nursery/our-approach/', 'https://www.sunmarke.com/learning/nursery/enriched-learning/', 'https://www.sunmarke.com/learning/eyfs/our-approach-eyfs/', 'https://www.sunmarke.com/learning/eyfs/early-years-curriculum/', 'https://www.sunmarke.com/learning/eyfs/eyfs-enriched-learning/', 'https://www.sunmarke.com/learning/primary/our-approach-primary/', '

In [None]:
# Result store: one empty list per menu section, keyed by the lower-cased label.
data = {}
for section_label in menu_dict:
    data[section_label.lower()] = []

# HTTP headers carrying a desktop-browser User-Agent string.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
}

In [None]:
# Visit every collected URL, extract the page's main text with BeautifulSoup,
# and append {"url", "content"} records to data[<lower-cased menu label>].
REQUEST_TIMEOUT = 30     # seconds before a stalled request is abandoned
CRAWL_DELAY = 10         # seconds to pause after every request
MIN_CONTENT_CHARS = 300  # pages with no more text than this are skipped

for label, urls in menu_dict.items():
    for url in urls:
        try:
            print(f"Scraping {url}")
            response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
            # Treat 4xx/5xx as failures so an error page is never stored as
            # if it were real content; handled by the except below.
            response.raise_for_status()

            soup = BeautifulSoup(response.text, "html.parser")

            # Prefer the page-builder body container. Note that a class_
            # string containing a space is matched against the exact class
            # attribute value, so this is sensitive to class order.
            main_div = soup.find("div", class_="gdlr-core-page-builder-body clearfix")
            if main_div:
                text = main_div.get_text(separator="\n", strip=True)
            else:
                # Fallback: join the non-empty paragraph and heading texts.
                tags = soup.find_all(["p", "h1", "h2"])
                fragments = (t.get_text(strip=True) for t in tags)
                text = " ".join(fragment for fragment in fragments if fragment)

            text = text.strip()
            if len(text) > MIN_CONTENT_CHARS:  # skip tiny/empty pages
                document = {
                    "url": url,
                    "content": text
                }
                print(f"  ✅ Extracted {len(text.split())} words")
                data[label.lower()].append(document)
            else:
                print("  ⚠️ No meaningful text found")
        except Exception as e:
            # Best effort: report the failure and carry on with the next URL.
            print(f"  ❌ Error scraping {url}: {e}")
        finally:
            # Throttle after every request, not only successful ones, so
            # failures and tiny pages do not trigger rapid-fire requests.
            time.sleep(CRAWL_DELAY)

Scraping https://www.sunmarke.com/about/principals-message/
  ✅ Extracted 457 words
Scraping https://www.sunmarke.com/about/mission-vision-values/
  ✅ Extracted 494 words
Scraping https://www.sunmarke.com/about/a-positive-education-school/
  ✅ Extracted 625 words
Scraping https://www.sunmarke.com/about/leadership/
  ✅ Extracted 447 words
Scraping https://www.sunmarke.com/about/academic-results/
  ✅ Extracted 275 words
Scraping https://www.sunmarke.com/about/inspection-reports/
  ✅ Extracted 295 words
Scraping https://www.sunmarke.com/about/our-achievements/
  ✅ Extracted 511 words
Scraping https://www.sunmarke.com/about/sunmarke-alumni/
  ✅ Extracted 307 words
Scraping https://www.sunmarke.com/about/wellbeing/
  ✅ Extracted 375 words
Scraping https://www.sunmarke.com/about/the-achievement-centre-inclusion-department/
  ✅ Extracted 480 words
Scraping https://www.sunmarke.com/about/our-campus/
  ✅ Extracted 79 words
Scraping https://www.sunmarke.com/learning/nursery/our-approach/
  ✅ Ext

In [None]:
# Persist the scraped documents to data.json as 4-space-indented JSON.
import json

with open("data.json", mode="w") as out_file:
    json.dump(data, out_file, indent=4)