In [1]:
import os
import re
import time
import random
import requests
from bs4 import BeautifulSoup

In [9]:
# Download the roadmap.sh PDF for every role listed below into `download_dir`.

# --- Configuration -----------------------------------------------------------
download_dir = "roadmaps_pdfs"
REQUEST_TIMEOUT = 10      # seconds allowed for each HTTP request
DELAY_RANGE = (2, 5)      # random pause (seconds) between two requests
SKIP_EXISTING = True      # set to False to force a fresh download of every PDF

# Create the folder that stores the PDFs; exist_ok avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(download_dir, exist_ok=True)

# roadmap.sh role slugs (56 entries); each slug is used as the PDF file name.
roadmap_names = [
    "frontend", "backend", "devops", "full-stack",
    "ai-engineer", "ai-agents", "ai-red-teaming", "ai-data-scientist", "data-analyst", "mlops", "product-manager",
    "engineering-manager", "qa", "ios", "android", "flutter", "react", "react-native", "vue",
    "angular", "nodejs", "php", "python", "java", "cpp", "golang", "rust", "typescript", "javascript", "spring-boot",
    "graphql", "git-github", "docker", "kubernetes", "aws", "cloudflare", "linux", "terraform", "postgresql-dba", "sql",
    "mongodb", "redis", "blockchain", "aspnet-core", "system-design", "software-design-architecture", "api-design",
    "datastructures-and-algorithms", "ux-design", "design-system", "technical-writer", "devrel", "cyber-security",
    "prompt-engineering", "game-developer", "server-side-game-developer"
]

# Build the download URL of every PDF.
base_pdf_url = "https://roadmap.sh/pdfs/roadmaps/{}.pdf"
pdf_urls = [base_pdf_url.format(name) for name in roadmap_names]

# Fetch each PDF, pausing a random interval between consecutive requests.
last_index = len(pdf_urls) - 1
for index, url in enumerate(pdf_urls):
    filename = url.split("/")[-1]
    filepath = os.path.join(download_dir, filename)

    # Files that are already on disk need neither a request nor a delay,
    # which keeps a full re-run of the notebook cheap.
    if SKIP_EXISTING and os.path.isfile(filepath) and os.path.getsize(filepath) > 0:
        print(f"已存在，略過: {filename}")
        continue

    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        if response.status_code != 200:
            print(f"下載失敗: {url} (狀態碼: {response.status_code})")
        elif b"%PDF" not in response.content[:1024]:
            # A 200 response can still be an HTML page; do not store it as .pdf.
            print(f"下載失敗: {url} (回應內容不是 PDF)")
        else:
            with open(filepath, 'wb') as f:
                f.write(response.content)
            print(f"下載完成: {filename}")
    except (requests.RequestException, OSError) as e:
        # Network problems and file-system errors are reported per file so
        # the remaining downloads still run.
        print(f"下載錯誤: {url} (錯誤: {str(e)})")

    # No need to wait once the last URL has been processed.
    if index < last_index:
        delay = random.uniform(*DELAY_RANGE)
        print(f"等待 {delay:.2f} 秒...")
        time.sleep(delay)

下載完成: frontend.pdf
等待 4.04 秒...
下載完成: backend.pdf
等待 3.52 秒...
下載完成: devops.pdf
等待 3.49 秒...
下載完成: full-stack.pdf
等待 4.31 秒...
下載完成: ai-engineer.pdf
等待 2.57 秒...
下載完成: ai-agents.pdf
等待 3.25 秒...
下載完成: ai-red-teaming.pdf
等待 3.32 秒...
下載完成: ai-data-scientist.pdf
等待 3.19 秒...
下載完成: data-analyst.pdf
等待 2.34 秒...
下載完成: mlops.pdf
等待 3.64 秒...
下載完成: product-manager.pdf
等待 2.18 秒...
下載完成: engineering-manager.pdf
等待 3.79 秒...
下載完成: qa.pdf
等待 3.75 秒...
下載完成: ios.pdf
等待 3.85 秒...
下載完成: android.pdf
等待 2.72 秒...
下載完成: flutter.pdf
等待 3.51 秒...
下載完成: react.pdf
等待 3.97 秒...
下載完成: react-native.pdf
等待 2.38 秒...
下載完成: vue.pdf
等待 2.85 秒...
下載完成: angular.pdf
等待 2.30 秒...
下載完成: nodejs.pdf
等待 3.96 秒...
下載完成: php.pdf
等待 4.88 秒...
下載完成: python.pdf
等待 4.54 秒...
下載完成: java.pdf
等待 4.47 秒...
下載完成: cpp.pdf
等待 2.98 秒...
下載完成: golang.pdf
等待 2.59 秒...
下載完成: rust.pdf
等待 4.45 秒...
下載完成: typescript.pdf
等待 2.59 秒...
下載完成: javascript.pdf
等待 3.72 秒...
下載完成: spring-boot.pdf
等待 4.85 秒...
下載完成: graphql.pdf
等待 3.76 秒...
下載完成: g

In [None]:
!pip install torch pdfplumber transformers
!pip install torch --upgrade
!pip install hf_xet

In [22]:
# Import here so this cell also works on a fresh kernel: torch is otherwise
# only imported by a later cell, which made this cell depend on hidden state.
import torch

# Show the installed torch build (e.g. CPU-only vs CUDA).
print(torch.__version__)

2.7.1+cpu


In [1]:
import os
import pdfplumber
import torch
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Summarize every PDF in `pdf_folder` and write one text file per PDF
# into `output_folder`.

# Relative paths: input PDFs and the folder receiving the summaries.
pdf_folder = os.path.join('.', 'roadmaps_pdfs')
output_folder = os.path.join('.', 'pdf_summaries')
os.makedirs(output_folder, exist_ok=True)

# Only the first MAX_INPUT_CHARS characters of a PDF are summarized.
MAX_INPUT_CHARS = 3500

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
# Sorted so the processing order does not depend on the file system.
pdf_files = sorted(f for f in os.listdir(pdf_folder) if f.lower().endswith('.pdf'))

for pdf_file in pdf_files:
    summary_filename = os.path.splitext(pdf_file)[0] + '_summary.txt'
    summary_path = os.path.join(output_folder, summary_filename)

    # Resume support: an interrupted run can be restarted without
    # recomputing summaries that were already written.
    if os.path.isfile(summary_path) and os.path.getsize(summary_path) > 0:
        print(f"{pdf_file} 已有摘要，略過。")
        continue

    pdf_path = os.path.join(pdf_folder, pdf_file)
    with pdfplumber.open(pdf_path) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]
    # Pages without extractable text yield None/'' and are dropped.
    text = ''.join(page_text + '\n' for page_text in page_texts if page_text)
    text = text[:MAX_INPUT_CHARS]

    # Nothing to summarize, e.g. a PDF that contains only images.
    if not text.strip():
        print(f"{pdf_file} 沒有可擷取的文字，略過。")
        continue

    # truncation=True clips the tokenized input to the model's maximum
    # length; the character cut above does not guarantee that on its own.
    summary = summarizer(
        text,
        max_length=150,
        min_length=40,
        do_sample=False,
        truncation=True,
    )
    summary_text = summary[0]['summary_text']
    with open(summary_path, 'w', encoding='utf-8') as f:
        f.write(summary_text)
    print(f"{pdf_file} 已完成摘要。")

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Device set to use cpu


ai-agents.pdf 已完成摘要。
ai-data-scientist.pdf 已完成摘要。
ai-engineer.pdf 已完成摘要。


KeyboardInterrupt: 