<a href="https://colab.research.google.com/github/Mynn2/EEOC-Newsroom-Summarizer/blob/main/EEOC_newsroom_scraper_summarizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
# [Step 0] Install the third-party libraries this notebook imports below.
# NOTE(review): `%pip` is an IPython line magic, so this line is not plain Python.
# The trailing `# ...` text is presumably passed to the magic as part of its
# argument string rather than parsed as a Python comment, so it is left untouched.
%pip install --quiet transformers sentencepiece torch requests beautifulsoup4 pandas  # 라이브러리 설치 / install libs

# [스텝 1] 임포트와 전역 설정 / Imports & globals
import re, time, html, datetime as dt, os  # 유틸 / utils
import requests  # HTTP 요청 / http requests
from bs4 import BeautifulSoup  # HTML 파싱 / html parsing
import pandas as pd  # 저장/테이블 / storage & tables
from transformers import pipeline  # 요약 파이프라인 / summarization pipeline

# Module-level setup shared by every function below.
# Summarization pipeline backed by facebook/bart-large-cnn; device placement is
# delegated to the library via device_map="auto".
summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device_map="auto")  # English summarizer
# One shared HTTP session, reused by get_soup for every request.
SESSION = requests.Session()  # reuse session
# Identify the scraper with a descriptive User-Agent that carries a contact address.
SESSION.headers.update({"User-Agent": "EEOC-NewsSummarizer/0.1 (contact: you@example.com)"})  # polite UA
# Output directory for the CSV/HTML results; created up front, no error if it already exists.
OUT_DIR = "/content/results"; os.makedirs(OUT_DIR, exist_ok=True)  # results dir

# [스텝 2] 기본 유틸 함수들 / Utility functions
def get_soup(url, retries=3, sleep=1.0):
    """Fetch *url* through the shared session and parse it with BeautifulSoup.

    Args:
        url: Address to request.
        retries: Maximum number of attempts.
        sleep: Base delay in seconds; the wait after the i-th attempt
            (0-based) is ``sleep * (i + 1)``.

    Returns:
        The parsed document when a response with status 200 is received,
        otherwise None once all attempts are used up.
    """
    for attempt in range(retries):
        try:
            response = SESSION.get(url, timeout=12)
            if response.status_code == 200:
                return BeautifulSoup(response.text, "html.parser")
        except requests.RequestException:
            # Deliberate best-effort: a network error just counts as a failed attempt.
            pass
        # Back off only when another attempt follows; sleeping after the last
        # failure would merely delay the None result.
        if attempt < retries - 1:
            time.sleep(sleep * (attempt + 1))
    return None

def find_next_page_eeoc(current_url):
    """Return *current_url* with its ``page=N`` query parameter advanced by one.

    Returns None when the URL carries no ``page=<digits>`` parameter, which
    callers use as the signal to stop paginating.
    """
    found = re.search(r"([?&])page=(\d+)", current_url)
    if found is None:
        return None
    following = int(found.group(2)) + 1
    # \1 re-inserts the original separator ('?' or '&') in front of the parameter.
    replacement = fr"\1page={following}"
    return re.sub(r"([?&])page=\d+", replacement, current_url)

def clean_text(s: str) -> str:
    """Collapse every whitespace run in *s* to one space and trim both ends.

    A falsy argument (None or "") yields the empty string.
    """
    raw = s if s else ""
    collapsed = re.sub(r"\s+", " ", raw)
    return collapsed.strip()

# [스텝 3] 링크 수집기: EEOC 뉴스룸 최신 N개 / Collect latest N article links from EEOC newsroom
# Path fragments that must never be fetched; a URL containing any of them is rejected.
DISALLOWED_FRAGMENTS = ["/admin/", "/user/", "/search/", "/comment/reply/", "/media/oembed", "/index.php/admin/"]


def is_allowed(url: str) -> bool:
    """Tell whether *url* may be crawled.

    A URL is allowed when it starts with ``https://www.eeoc.gov/`` and
    contains none of the entries in DISALLOWED_FRAGMENTS.
    """
    on_site = url.startswith("https://www.eeoc.gov/")
    return on_site and not any(fragment in url for fragment in DISALLOWED_FRAGMENTS)

def get_article_links_eeoc(list_url: str, max_pages: int = 5, max_articles: int = 10):
    """Collect unique newsroom article URLs, walking listing pages from *list_url*.

    Args:
        list_url: First listing page to load.
        max_pages: Upper bound on the number of listing pages visited.
        max_articles: Upper bound on the number of links returned.

    Returns:
        Article URLs in the order they were first encountered.
    """
    newsroom_root = "https://www.eeoc.gov/newsroom"
    collected = []
    visited = set()
    page_url = list_url
    page_count = 0

    while page_url and page_count < max_pages and len(collected) < max_articles:
        soup = get_soup(page_url)
        if not soup:
            break  # give up on the first listing page that cannot be loaded

        for anchor in soup.find_all("a", href=True):
            link = requests.compat.urljoin(page_url, anchor["href"])  # make absolute

            # The newsroom landing page itself is not an article.
            if link.rstrip('/') == newsroom_root:
                continue
            # Keep only links under /newsroom/ that pass the crawl filter.
            if "/newsroom/" not in link or not is_allowed(link):
                continue
            if link in visited:
                continue

            visited.add(link)
            collected.append(link)
            if len(collected) >= max_articles:
                break

        page_url = find_next_page_eeoc(page_url)
        page_count += 1
        time.sleep(1.0)  # pause between listing pages

    return collected

# [스텝 4] 기사 본문 추출 / Extract article title & body
def extract_article_eeoc(url: str):
    """Load an article page and return its ``(title, body_text)`` pair.

    Returns:
        ``(None, None)`` when the page cannot be loaded;
        ``(title, None)`` when the title starts with "newsroom" or the
        extracted body has fewer than 60 words;
        ``(title, text)`` otherwise.
    """
    soup = get_soup(url)
    if not soup:
        return None, None

    # Title: prefer the og:title meta tag, fall back to <title>.
    title = None
    og_tag = soup.find("meta", property="og:title")
    if og_tag and og_tag.get("content"):
        title = og_tag["content"].strip()
    if not title and soup.title:
        title = soup.title.get_text(strip=True)
    if title and re.match(r"^\s*newsroom(\b|$)", title.strip(), re.I):
        return title, None

    # Candidate body containers, in priority order.
    lookups = (
        soup.find("article"),
        soup.find(attrs={"role": "main"}),
        soup.select_one(".region-content"),
        soup.select_one(".field--name-body"),
    )
    candidates = [node for node in lookups if node]

    # Pick the container with the most <p> tags; max() keeps the earliest on ties.
    target = max(candidates, key=lambda node: len(node.find_all("p")), default=None)
    if target is None:
        target = soup  # last resort: scan the whole document

    paragraphs = []
    for p_tag in target.find_all("p"):
        paragraph = p_tag.get_text(" ", strip=True)
        if len(paragraph) > 40:  # drop very short paragraphs
            paragraphs.append(paragraph)

    text = clean_text(" ".join(paragraphs))
    if not text or len(text.split()) < 60:
        return title, None  # too little body text to be worth summarizing
    return title, text

# [스텝 5] 요약 함수: BART 기본, 3문장 폴백 / Summarize with BART, fallback to 3 sentences
def summarize_text(text: str, max_len=180, min_len=60):
    """Summarize *text* with the module-level summarizer, falling back to its opening sentences.

    Args:
        text: Article body to summarize.
        max_len: Passed to the summarizer as ``max_length``.
        min_len: Passed to the summarizer as ``min_length``.

    Returns:
        The cleaned model summary when it has at least 25 words; otherwise the
        first three sentences of *text*, cleaned. Any exception raised while
        summarizing also triggers the fallback.
    """
    try:
        result = summarizer(
            text,
            max_length=max_len,
            min_length=min_len,
            do_sample=False,
            truncation=True,
        )
        summary = clean_text(result[0]["summary_text"])
        if len(summary.split()) < 25:
            raise ValueError  # too short: route into the fallback below
        return summary
    except Exception:
        # Split after sentence-ending punctuation and keep the first three pieces.
        pieces = re.split(r"(?<=[.!?])\s+", text)
        if not pieces:
            return text[:400]
        return clean_text(" ".join(pieces[:3]))

# [스텝 6] HTML 렌더링과 CSV 저장 / Render HTML & save CSV
def render_html(rows, outfile):
    """Write *rows* to *outfile* as a standalone HTML page, one card per row.

    Each row is a mapping with the keys ``title``, ``article_url`` and
    ``summary``; all three are HTML-escaped, and a falsy title is shown as
    "Untitled". The file is written as UTF-8.
    """
    page_head = """<!doctype html><meta charset="utf-8">
    <title>EEOC Auto News Summaries</title>
    <style>
      body{font-family:system-ui,-apple-system,Segoe UI,Roboto,Arial;max-width:960px;margin:40px auto;padding:0 20px}
      .card{border:1px solid #ddd;border-radius:12px;padding:16px;margin:14px 0;box-shadow:0 2px 6px rgba(0,0,0,0.05)}
      a{color:#1a73e8;text-decoration:none}
      h1{margin-bottom:8px}
      small{color:#666}
    </style>"""
    # Page fragments are gathered in order and joined once at the end.
    fragments = [
        page_head,
        "<h1>EEOC Auto News Summaries</h1><small>Source: https://www.eeoc.gov/newsroom</small>",
    ]
    for row in rows:
        heading = html.escape(row['title'] or 'Untitled')
        link = html.escape(row['article_url'])
        blurb = html.escape(row['summary'])
        fragments.append(f"""
        <div class="card">
          <h2>{heading}</h2>
          <p><a href="{link}" target="_blank">원문 보기 / View source</a></p>
          <p>{blurb}</p>
        </div>""")
    with open(outfile, "w", encoding="utf-8") as handle:
        handle.write("".join(fragments))

def save_csv(rows, csv_path):
    """Write *rows* (a list of dicts) to *csv_path* as CSV without the index column.

    The file is encoded as ``utf-8-sig``, i.e. UTF-8 with a leading byte-order mark.
    """
    frame = pd.DataFrame(rows)
    frame.to_csv(csv_path, index=False, encoding="utf-8-sig")

# [스텝 7] 전체 파이프라인 실행 / Run the whole pipeline
def run_pipeline_eeoc(start_url: str, max_pages=7, max_articles=50):
    """Run the full pipeline: collect links, extract and summarize, then save CSV and HTML.

    Args:
        start_url: Listing page where link collection starts; also stored in
            every row as ``source_url``.
        max_pages: Forwarded to get_article_links_eeoc.
        max_articles: Forwarded to get_article_links_eeoc.

    Returns:
        None. Prints a notice and stops early when no links or no article
        bodies were obtained; otherwise writes ``eeoc_<today>.csv`` and
        ``eeoc_summary.html`` under OUT_DIR and prints both paths.
    """
    article_urls = get_article_links_eeoc(start_url, max_pages=max_pages, max_articles=max_articles)
    if not article_urls:
        print("링크 수집 0건. / No links")
        return

    records = []
    for article_url in article_urls:
        headline, body_text = extract_article_eeoc(article_url)
        if body_text:
            digest = summarize_text(body_text)
            record = {
                "date": dt.date.today().isoformat(),
                "source_url": start_url,
                "article_url": article_url,
                "title": headline,
                "summary": digest,
                "word_count": len(body_text.split()),
                "fetched_at": dt.datetime.now().isoformat(timespec="seconds"),
            }
            records.append(record)
            time.sleep(1.0)  # pause only after an article that was actually processed

    if not records:
        print("본문 추출 0건. / No bodies")
        return

    # The CSV name carries today's date; the HTML file name is fixed.
    csv_path = f"{OUT_DIR}/eeoc_{dt.date.today().isoformat()}.csv"
    html_path = f"{OUT_DIR}/eeoc_summary.html"
    save_csv(records, csv_path)
    render_html(records, html_path)
    print(f"완료 CSV → {csv_path}")
    print(f"완료 HTML → {html_path}")

# [스텝 8] 실행 예시 / Run example
# The start URL carries an explicit page=0 parameter; find_next_page_eeoc returns
# None for URLs without a page=<n> parameter, which would end pagination after one page.
start_url = "https://www.eeoc.gov/newsroom?page=0"  # first EEOC newsroom listing page
# Runs the whole pipeline at cell execution time: up to 7 listing pages and 50 articles.
run_pipeline_eeoc(start_url, max_pages=7, max_articles=50)

Device set to use cpu
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


완료 CSV → /content/results/eeoc_2025-10-23.csv
완료 HTML → /content/results/eeoc_summary.html
