In [None]:
pip install requests beautifulsoup4 lxml


In [None]:
import json
import os
import re
import time
from collections import deque
from dataclasses import dataclass
from typing import Optional, Set, Tuple
from urllib.parse import urljoin, urlparse, urlunparse

import requests
from bs4 import BeautifulSoup
from urllib import robotparser


@dataclass(frozen=True)
class CrawlScope:
    """
    Immutable pair of restrictions describing which URLs belong to a crawl.
    """

    # Host name the crawl is limited to, e.g. "orfalea.calpoly.edu"
    allowed_domain: str

    # Leading part of the URL path that pages must share, e.g. "/graduate-programs/"
    allowed_path_prefix: str


# URL suffixes that identify downloadable files rather than HTML pages.
SKIP_EXTENSIONS = (
    # PDF documents
    ".pdf",
    # images
    ".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp",
    # archives
    ".zip", ".rar", ".7z",
    # audio / video
    ".mp4", ".mov", ".mp3", ".wav",
    # office documents
    ".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx",
)

def normalize_url(url: str) -> str:
    """
    Normalize a URL so that equivalent links collapse to one canonical string.

    - drops the fragment (#...) and the query string
    - lower-cases the host (netloc)
    - collapses runs of "/" in the path into a single "/"
    - strips the trailing slash from every path except the root "/"
    - turns an empty path on a host into "/", so "https://host" and
      "https://host/" are recognised as the same page

    Returns the normalized URL as a string.
    """
    parts = urlparse(url)

    # Collapse duplicate slashes first so "//" is also recognised as the root.
    path = re.sub(r"/{2,}", "/", parts.path)
    if path.endswith("/") and path != "/":
        path = path[:-1]
    elif not path and parts.netloc:
        # "https://host" has an empty path; give it the same form as "https://host/".
        path = "/"

    canonical = parts._replace(
        netloc=parts.netloc.lower(),
        path=path,
        query="",
        fragment="",
    )
    return urlunparse(canonical)


def is_in_scope(url: str, scope: CrawlScope) -> bool:
    """
    Decide whether `url` belongs to the crawl described by `scope`.

    A URL is in scope when it uses http or https, its host equals
    `scope.allowed_domain` (case-insensitive), and its path starts with
    `scope.allowed_path_prefix`.

    The root of the section is accepted as well: normalize_url() strips
    trailing slashes, so with a prefix of "/graduate-programs/" the section
    index arrives here as "/graduate-programs" and would otherwise be rejected.
    """
    parts = urlparse(url)
    if parts.scheme not in ("http", "https"):
        return False
    if parts.netloc.lower() != scope.allowed_domain.lower():
        return False

    prefix = scope.allowed_path_prefix
    if parts.path.startswith(prefix):
        return True

    # Same directory as the prefix, just written without its trailing slash.
    return prefix.endswith("/") and parts.path == prefix.rstrip("/")


def looks_like_file(url: str) -> bool:
    """
    Return True when the URL's path ends with one of SKIP_EXTENSIONS.

    Only the path component is inspected (case-insensitively). A bare host
    on a TLD such as ".zip" or ".mov" is therefore not mistaken for a file,
    and a query string or fragment cannot hide a real file extension.
    """
    path = urlparse(url).path.lower()
    # str.endswith accepts a tuple of candidate suffixes.
    return path.endswith(tuple(SKIP_EXTENSIONS))


def get_robots_parser(base_url: str) -> robotparser.RobotFileParser:
    """
    Build a robots.txt parser for the site at `base_url`.

    The parser is pointed at "/robots.txt" resolved against `base_url` and
    the file is read immediately. If reading fails for any reason, a warning
    is printed and the returned parser allows every URL.
    """
    rp = robotparser.RobotFileParser()
    robots_url = urljoin(base_url, "/robots.txt")
    rp.set_url(robots_url)
    try:
        rp.read()
    except Exception as exc:
        # A parser whose read() never completed answers False to every
        # can_fetch() call, which would silently block the whole crawl.
        # Opt in to the intended "allow when robots.txt is unreadable" policy.
        print(f"ROBOTS WARN: could not read {robots_url} ({exc}); allowing all URLs")
        rp.allow_all = True
    return rp


def fetch_html(session: requests.Session, url: str, timeout: int = 20) -> Optional[str]:
    """
    Download `url` with `session` and return its body as text.

    Returns None when the request fails, the server answers with an error
    status, or the response is not declared as HTML.

    Parameters:
        session: the requests session used to issue the GET.
        url: address to download.
        timeout: seconds passed to the request as its timeout.
    """
    try:
        resp = session.get(url, timeout=timeout, headers={
            "User-Agent": "Mozilla/5.0 (compatible; RAG-Crawler/1.0; +https://example.com/bot)"
        })
        resp.raise_for_status()

        # Media types are case-insensitive, so compare in lower case.
        ctype = resp.headers.get("Content-Type", "").lower()
        if "text/html" not in ctype:
            return None

        # When the header carries no charset, requests decodes text/* as
        # ISO-8859-1, which garbles UTF-8 pages. Use the encoding detected
        # from the body instead.
        if "charset" not in ctype:
            resp.encoding = resp.apparent_encoding

        return resp.text
    except requests.RequestException:
        return None


def extract_text_and_title(html: str, min_line_length: int = 30) -> Tuple[str, str]:
    """
    Pull the readable text and the <title> out of an HTML document.

    <script>, <style> and <noscript> elements are removed. Text is taken from
    the <main> element when the page has one, otherwise from the whole
    document. Each line is stripped, and lines shorter than `min_line_length`
    characters are discarded to reduce noise.

    Parameters:
        html: the page markup.
        min_line_length: minimum stripped length a line needs to be kept.
            The default of 30 matches the previous fixed threshold; lower it
            to keep short headings and list items.

    Returns:
        (cleaned_text, title) where cleaned_text joins the kept lines with
        newlines and title is "" when the page has no <title>.
    """
    soup = BeautifulSoup(html, "lxml")

    # Drop elements whose text is never meant for readers.
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()

    title = (soup.title.get_text(strip=True) if soup.title else "").strip()

    # Prefer the main content area when the page marks one up.
    main = soup.find("main")
    container = main if main else soup

    kept = []
    for raw_line in container.get_text(separator="\n").splitlines():
        line = raw_line.strip()
        if len(line) >= min_line_length:
            kept.append(line)

    return "\n".join(kept), title


def extract_links(current_url: str, html: str) -> Set[str]:
    """
    Collect the normalized absolute URLs of all <a href> links in `html`.

    Relative links are resolved against `current_url`. Links using the
    mailto:, tel: or javascript: schemes are skipped regardless of letter
    case, and hrefs that cannot be parsed as URLs are ignored.
    """
    soup = BeautifulSoup(html, "lxml")
    links: Set[str] = set()

    for anchor in soup.find_all("a", href=True):
        href = anchor["href"].strip()

        # URL schemes are case-insensitive ("MAILTO:" is as valid as "mailto:").
        if href.lower().startswith(("mailto:", "tel:", "javascript:")):
            continue

        try:
            links.add(normalize_url(urljoin(current_url, href)))
        except ValueError:
            # urllib raises ValueError for malformed URLs (e.g. unmatched
            # brackets in the host); one bad link must not abort the crawl.
            continue

    return links


def safe_filename_from_url(url: str) -> str:
    """
    Turn a URL into a stable, filesystem-safe JSON filename.

    Only the path is used: surrounding slashes are removed, inner slashes
    become "__", and every character other than letters, digits, "_", "."
    and "-" is replaced by "_". An empty path maps to "root.json".
    """
    path = urlparse(url).path.strip("/").replace("/", "__")

    # Characters such as ":", "%", "\\", "?" or "*" are invalid or have a
    # special meaning on some filesystems, so replace them.
    stem = re.sub(r"[^\w.\-]", "_", path)
    if not stem:
        stem = "root"
    return f"{stem}.json"


def crawl_site(
    seed_url: str,
    scope: CrawlScope,
    out_dir: str = "crawl_output",
    max_pages: int = 500,
    delay_seconds: float = 0.5,
    respect_robots: bool = True,
):
    """
    Crawl pages reachable from `seed_url` and save each one as a JSON file.

    URLs are processed first-in, first-out. A URL is fetched only when it is
    in `scope`, does not look like a downloadable file, and (when
    `respect_robots` is true) is allowed by the site's robots.txt. Each saved
    document holds "source_url", "title" and "content" and is written to
    `out_dir` under a name derived from the URL.

    Parameters:
        seed_url: first page to visit.
        scope: domain and path prefix a URL must match to be crawled.
        out_dir: directory for the JSON files; created if missing.
        max_pages: stop after this many pages have been saved.
        delay_seconds: pause after every fetch attempt, successful or not.
        respect_robots: consult robots.txt before fetching a URL.
    """
    os.makedirs(out_dir, exist_ok=True)

    seed = normalize_url(seed_url)
    seed_parts = urlparse(seed)
    base = f"{seed_parts.scheme}://{seed_parts.netloc}/"

    rp = get_robots_parser(base) if respect_robots else None

    visited: Set[str] = set()
    # Everything ever put on the queue, so a link that appears on many pages
    # is queued only once.
    enqueued: Set[str] = {seed}
    queue = deque([seed])

    pages_saved = 0

    # The context manager closes the session's pooled connections on exit,
    # including when an exception interrupts the crawl.
    with requests.Session() as session:
        while queue and pages_saved < max_pages:
            url = queue.popleft()
            if url in visited:
                continue
            visited.add(url)

            if not is_in_scope(url, scope):
                continue
            if looks_like_file(url):
                continue
            if rp is not None and not rp.can_fetch("*", url):
                print(f"ROBOTS SKIP: {url}")
                continue

            print(f"FETCH: {url}")
            html = fetch_html(session, url)

            if html:
                text, title = extract_text_and_title(html)

                # Save JSON
                doc = {
                    "source_url": url,
                    "title": title,
                    "content": text,
                }
                fpath = os.path.join(out_dir, safe_filename_from_url(url))
                with open(fpath, "w", encoding="utf-8") as f:
                    json.dump(doc, f, ensure_ascii=False, indent=2)

                pages_saved += 1

                # Discover more links
                for link in extract_links(url, html):
                    if link in visited or link in enqueued:
                        continue
                    if is_in_scope(link, scope) and not looks_like_file(link):
                        enqueued.add(link)
                        queue.append(link)

            # Pause after every request, including failed ones, so a run of
            # errors does not turn into a burst of back-to-back requests.
            time.sleep(delay_seconds)

    print(f"\nDone. Visited={len(visited)} Saved={pages_saved} OutputDir={out_dir}")


if __name__ == "__main__":
    # Start from the MS Business Analytics page and stay inside the
    # graduate-programs section of orfalea.calpoly.edu.
    SEED = "https://orfalea.calpoly.edu/graduate-programs/ms-business-analytics"
    SCOPE = CrawlScope(allowed_domain="orfalea.calpoly.edu", allowed_path_prefix="/graduate-programs/")

    # Save at most 300 pages, pausing 0.7 s between requests and honouring robots.txt.
    crawl_site(
        SEED,
        SCOPE,
        out_dir="orfalea_graduate_programs",
        max_pages=300,
        delay_seconds=0.7,
        respect_robots=True,
    )


In [None]:
import json
import os

# Convert every crawled JSON document into a plain-text file with a small
# "Title / URL" header followed by the page content.
INPUT_DIR = "orfalea_graduate_programs"
OUTPUT_DIR = "orfalea_graduate_programs_txt"

os.makedirs(OUTPUT_DIR, exist_ok=True)

# sorted() gives a reproducible processing order; os.listdir's order is arbitrary.
for fname in sorted(os.listdir(INPUT_DIR)):
    if not fname.endswith(".json"):
        continue

    with open(os.path.join(INPUT_DIR, fname), "r", encoding="utf-8") as f:
        doc = json.load(f)

    # `or ""` also covers keys that are present but hold null.
    title = (doc.get("title") or "").strip()
    url = (doc.get("source_url") or "").strip()
    content = (doc.get("content") or "").strip()

    txt = f"""Title: {title}
URL: {url}

{content}
"""

    # Swap only the trailing extension; str.replace would also rewrite a
    # ".json" that appears earlier in the name.
    out_name = fname[: -len(".json")] + ".txt"
    with open(os.path.join(OUTPUT_DIR, out_name), "w", encoding="utf-8") as out:
        out.write(txt)

print("Conversion complete.")
