In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, urlunparse, parse_qsl, urlencode
import time

# Crawl scope: the entry page, and the domain suffix used to decide whether a
# link is in scope.
domain = 'musashino-u.ac.jp'
base_url = 'https://www.musashino-u.ac.jp/'

# Crawl state: collected titles keyed by URL, URLs already taken from the
# queue, and the queue itself (seeded with the entry page).
url_title_dict = dict()
visited_urls = set()
urls_to_visit = list((base_url,))

# Path prefixes that are never crawled (e.g. everything under the portal);
# add more entries as needed.
excluded_paths = ['/portal/']

# Query parameters removed during normalization (tracking / click identifiers).
drop_query_keys = {
    'utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content',
    'gclid', 'fbclid', 'mc_cid', 'mc_eid', 'yclid',
}


def normalize_url(current, href):
    """Resolve *href* against *current* and return a canonical URL string.

    The canonical form is used for de-duplication, so URLs that name the same
    page should normalize to the same string:

    * the link is made absolute with ``urljoin``;
    * ``;param`` suffixes (e.g. ``;jsessionid=...``) are removed from every
      path segment;
    * an empty path on a URL that has a host becomes ``/``;
    * the host (and port) is lower-cased, userinfo is left untouched;
    * query keys listed in ``drop_query_keys`` are removed and the remaining
      pairs are sorted (blank values are kept);
    * the fragment is dropped.

    Args:
        current: URL of the page on which the link was found.
        href: Raw link target, absolute or relative.

    Returns:
        The normalized URL as a string.

    Raises:
        ValueError: Propagated from ``urllib.parse`` for malformed URLs.
    """
    parsed = urlparse(urljoin(current, href))

    # urlparse only splits ";params" off the LAST path segment, so earlier
    # segments can still carry them. Strip per segment; cutting the whole
    # path at the first ';' would throw away every following segment.
    path = '/'.join(
        segment.split(';', 1)[0] for segment in parsed.path.split('/')
    )
    if parsed.netloc and not path:
        # "https://host" and "https://host/" are the same resource.
        path = '/'

    # Host names are case-insensitive; anything before '@' (userinfo) is not.
    userinfo, at_sign, host_port = parsed.netloc.rpartition('@')
    netloc = f'{userinfo}{at_sign}{host_port.lower()}'

    query_pairs = sorted(
        (key, value)
        for key, value in parse_qsl(parsed.query, keep_blank_values=True)
        if key not in drop_query_keys
    )
    norm_query = urlencode(query_pairs)

    # Empty params and fragment: both are intentionally discarded.
    return urlunparse((parsed.scheme, netloc, path, '', norm_query, ''))

def is_same_domain(url):
    """Return True if *url*'s host is ``domain`` itself or one of its subdomains.

    The comparison uses the parsed host name (lower-cased, without port or
    userinfo) and requires a label boundary, so ``ef.musashino-u.ac.jp`` and
    ``musashino-u.ac.jp:8443`` match while ``evilmusashino-u.ac.jp`` does not.
    URLs without a host (``mailto:``, ``javascript:`` ...) never match.
    """
    # hostname is None when the URL has no network location.
    host = (urlparse(url).hostname or '').rstrip('.')
    return host == domain or host.endswith('.' + domain)

def is_excluded(url):
    """Return True if the path of *url* starts with any entry of ``excluded_paths``."""
    path = urlparse(url).path
    for prefix in excluded_paths:
        if path.startswith(prefix):
            return True
    return False

def is_asset(url):
    """Return True if *url* points at a non-HTML file, judged by its extension.

    Only the path component is inspected (case-insensitively), so a query
    string or fragment after the file name does not hide the extension, and
    a query value such as ``?file=a.pdf`` does not cause a false positive.
    """
    path = urlparse(url).path.lower()
    return path.endswith((
        '.pdf', '.jpg', '.jpeg', '.png', '.gif', '.zip', '.rar', '.7z',
        '.mp3', '.mp4', '.mov', '.avi', '.wmv', '.mkv',
        '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.csv',
        '.svg', '.webp', '.ico',
    ))

# Shared HTTP session; every request made through it sends this User-Agent.
session = requests.Session()
session.headers['User-Agent'] = 'SimpleCrawler/1.0 (+for study; contact@example.com)'

# Breadth-first crawl: take URLs from the front of urls_to_visit, record the
# <title> of each HTML page, and queue every unseen in-scope link found on it.
# NOTE(review): no delay is applied between requests and robots.txt is not
# consulted (`time` is imported but unused) -- consider throttling before
# pointing this at a live site.

# Every URL that has been queued or visited so far. A URL leaves the queue only
# by being added to visited_urls, so this set always equals
# "visited or still queued" and replaces a linear scan of the queue per link.
seen_urls = visited_urls | set(urls_to_visit)

while urls_to_visit:
    current_url = urls_to_visit.pop(0)
    if current_url in visited_urls:
        continue
    visited_urls.add(current_url)

    print(f"訪問中: {current_url}")

    try:
        resp = session.get(current_url, timeout=10, allow_redirects=True)
        if resp.status_code != 200:
            continue

        # Redirects are followed, so the page actually served may differ from
        # the one requested. Skip it if the redirect left the crawl scope.
        page_url = resp.url
        if not is_same_domain(page_url) or is_excluded(page_url):
            continue

        # Only process HTML (media types are case-insensitive).
        ctype = resp.headers.get('Content-Type', '')
        if 'text/html' not in ctype.lower():
            continue

        # Decode using the encoding detected from the body rather than the
        # one declared in the headers.
        resp.encoding = resp.apparent_encoding
        soup = BeautifulSoup(resp.text, 'html.parser')

        title_tag = soup.find('title')
        title = title_tag.get_text(strip=True) if title_tag else 'タイトルなし'
        # Titles stay keyed by the URL that was requested.
        url_title_dict[current_url] = title

        for a in soup.find_all('a', href=True):
            rel_values = a.get('rel') or []
            if 'nofollow' in [r.lower() for r in rel_values]:
                continue
            try:
                # Relative links must be resolved against the URL the page
                # was really served from, not the pre-redirect URL.
                cand = normalize_url(page_url, a['href'])
            except ValueError:
                # urllib.parse rejects some malformed hrefs; drop only that
                # link instead of abandoning the rest of the page.
                continue
            if not is_same_domain(cand) or is_excluded(cand) or is_asset(cand):
                continue
            if cand not in seen_urls:
                seen_urls.add(cand)
                urls_to_visit.append(cand)

    except Exception as e:
        # Best-effort crawl: report the failure and move on to the next URL.
        print(f"エラー: {current_url} - {e}")

print(url_title_dict)

訪問中: https://www.musashino-u.ac.jp/
訪問中: https://ef.musashino-u.ac.jp/donation/
訪問中: https://www.musashino-u.ac.jp/access.html
訪問中: https://www.musashino-u.ac.jp/admission/request.html
訪問中: https://www.musashino-u.ac.jp/contact.html
訪問中: https://www.musashino-u.ac.jp/prospective-students.html
訪問中: https://www.musashino-u.ac.jp/students.html
訪問中: https://www.musashino-u.ac.jp/alumni.html
訪問中: https://www.musashino-u.ac.jp/parents.html
訪問中: https://www.musashino-u.ac.jp/business.html
訪問中: https://www.musashino-u.ac.jp/guide/
訪問中: https://www.musashino-u.ac.jp/guide/profile/
訪問中: https://www.musashino-u.ac.jp/guide/activities/
訪問中: https://www.musashino-u.ac.jp/guide/campus/
訪問中: https://www.musashino-u.ac.jp/guide/facility/
訪問中: https://www.musashino-u.ac.jp/guide/information/
訪問中: https://www.musashino-u.ac.jp/guide/profile/media/
訪問中: https://www.musashino-u.ac.jp/admission/
訪問中: https://www.musashino-u.ac.jp/admission/faculty/
訪問中: https://www.musashino-u.ac.jp/admission/international