wikipedia

In [20]:
import requests
from bs4 import BeautifulSoup

def crawl_wikipedia(url, output_file):
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers)
    
    if response.status_code != 200:
        print(f"Lỗi khi truy cập {url}: {response.status_code}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    content_div = soup.find('div', {'id': 'bodyContent'})

    if not content_div:
        print("Không tìm thấy nội dung chính.")
        return

    allowed_tags = ['h1', 'h2', 'h3', 'p']

    with open(output_file, 'w', encoding='utf-8') as f:
        for tag in content_div.find_all(allowed_tags):
            # Xoá chú thích trong thẻ <p>
            if tag.name == 'p':
                for sup in tag.find_all('sup'):
                    sup.decompose()

            text = tag.get_text(strip=True)
            if not text:
                continue

            if tag.name == 'h1':
                f.write(f"# {text}\n\n")
            elif tag.name == 'h2':
                f.write(f"## {text}\n\n")
            elif tag.name == 'h3':
                f.write(f"### {text}\n\n")
            else:
                f.write(text + '\n\n')

    print(f"✅ Đã lưu nội dung từ {url} vào '{output_file}'")

# Ví dụ sử dụng:
# crawl_wikipedia("https://en.wikipedia.org/wiki/Pittsburgh", "pittsburgh.txt")


In [21]:
url1 = "https://en.wikipedia.org/wiki/Pittsburgh"
url2 = "https://en.wikipedia.org/wiki/History_of_Pittsburgh"
output_file1 = "pittsburgh.txt"
output_file2 = "history_of_pittsburgh.txt"

crawl_wikipedia(url1, output_file1)
crawl_wikipedia(url2, output_file2)

✅ Đã lưu nội dung từ https://en.wikipedia.org/wiki/Pittsburgh vào 'pittsburgh.txt'
✅ Đã lưu nội dung từ https://en.wikipedia.org/wiki/History_of_Pittsburgh vào 'history_of_pittsburgh.txt'


britannica

In [18]:
import requests
from bs4 import BeautifulSoup

def crawl_britannica(url, output_file):
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers)

    if response.status_code != 200:
        print(f"Lỗi khi truy cập {url}: {response.status_code}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    # Các thẻ muốn giữ lại theo thứ tự xuất hiện
    allowed_tags = ['h1', 'h2', 'h3', 'p']

    with open(output_file, 'w', encoding='utf-8') as f:
        for tag in soup.find_all(allowed_tags):
            text = tag.get_text(strip=True)
            if not text:
                continue
            if tag.name == 'h1':
                f.write(f"# {text}\n\n")
            elif tag.name == 'h2':
                f.write(f"## {text}\n\n")
            elif tag.name == 'h3':
                f.write(f"### {text}\n\n")
            else:
                f.write(text + '\n\n')

    print(f"✅ Đã lưu nội dung từ {url} vào '{output_file}'")

# Ví dụ sử dụng:
# crawl_britannica("https://www.britannica.com/place/Pittsburgh", "pittsburgh_britannica.txt")


In [19]:
url = "https://www.britannica.com/place/Pittsburgh"
output_file = "pittsburgh_britannica.txt"
crawl_britannica(url, output_file)

✅ Đã lưu nội dung từ https://www.britannica.com/place/Pittsburgh vào 'pittsburgh_britannica.txt'


dynamic web

In [27]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import time

def crawl_dynamic_site(url, output_file):
    options = Options()
    options.add_argument('--headless')  # chạy ẩn
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')

    service = Service(r"C:\Users\dongh\Downloads\chromedriver-win32\chromedriver-win32\chromedriver.exe")
    driver = webdriver.Chrome(service=service, options=options)
    driver.get(url)
    time.sleep(5)  # chờ trang load

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    driver.quit()

    allowed_tags = ['h1', 'h2', 'h3', 'p']
    
    with open(output_file, 'w', encoding='utf-8') as f:
        for tag in soup.find_all(allowed_tags):
            text = tag.get_text(strip=True)
            if not text:
                continue
            if tag.name == 'h1':
                f.write(f"# {text}\n\n")
            elif tag.name == 'h2':
                f.write(f"## {text}\n\n")
            elif tag.name == 'h3':
                f.write(f"### {text}\n\n")
            else:
                f.write(text + '\n\n')

    print(f"✅ Đã lưu nội dung từ {url} vào '{output_file}'")

# Ví dụ sử dụng:
# crawl_dynamic_site("https://www.visitpittsburgh.com/", "visitpittsburgh.txt", driver_path="/path/to/chromedriver")


In [29]:
url = "https://www.visitpittsburgh.com/"
output_file = "Pittsburgh webpage.txt"
crawl_dynamic_site(url=url, output_file=output_file)

✅ Đã lưu nội dung từ https://www.visitpittsburgh.com/ vào 'Pittsburgh webpage.txt'


In [37]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import fitz  # PyMuPDF

def crawl_and_extract(url, output_file):
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers)

    if response.status_code != 200:
        print(f"❌ Lỗi truy cập {url}: {response.status_code}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    content_div = soup.find('div', {'class': 'main-container clearfix'})
    if not content_div:
        print("❌ Không tìm thấy vùng nội dung chính.")
        return

    tags = ['h1', 'h2', 'h3', 'h4', 'p', 'a']
    found_regulation = False
    base_path = "pdf_temp"
    os.makedirs(base_path, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        for tag in content_div.find_all(tags):
            if tag.name in ['h1', 'h2', 'h3', 'h4']:
                text = tag.get_text(strip=True)
                if not text:
                    continue
                if tag.name == 'h1':
                    f.write(f"# {text}\n\n")
                elif tag.name == 'h2':
                    f.write(f"## {text}\n\n")
                elif tag.name == 'h3':
                    f.write(f"### {text}\n\n")
                elif tag.name == 'h4':
                    f.write(f"#### {text}\n\n")
                    if 'regulations' in text.lower():
                        found_regulation = True
                    else:
                        found_regulation = False  # reset khi gặp h4 khác

            elif tag.name == 'p':
                text = tag.get_text(strip=True)
                if text:
                    f.write(text + '\n\n')

            elif tag.name == 'a' and found_regulation:
                href = tag.get('href', '')
                if href.endswith('.pdf'):
                    pdf_url = urljoin(url, href)
                    filename = os.path.basename(href)
                    local_pdf = os.path.join(base_path, filename)

                    try:
                        pdf_res = requests.get(pdf_url, headers=headers)
                        with open(local_pdf, 'wb') as pdf_file:
                            pdf_file.write(pdf_res.content)
                        f.write(f"📎 Nội dung từ file PDF: {filename}\n\n")

                        # Trích nội dung PDF
                        doc = fitz.open(local_pdf)
                        for page in doc:
                            text = page.get_text().strip()
                            if text:
                                f.write(text + '\n\n')
                        doc.close()
                        print(f"✅ Đã trích xuất PDF: {filename}")

                    except Exception as e:
                        f.write(f"[Lỗi khi tải hoặc đọc PDF {filename}: {e}]\n\n")

    print(f"\n📄 Đã ghi toàn bộ nội dung vào '{output_file}'")

# Ví dụ sử dụng:
crawl_and_extract(
    "https://www.pittsburghpa.gov/City-Government/Finances-Budget/Taxes/Tax-Forms",
    "pittsburgh_tax.txt"
)


✅ Đã trích xuất PDF: 9622_amusement_tax_regulations.pdf
✅ Đã trích xuất PDF: 9626_payroll_tax_regulations.pdf
✅ Đã trích xuất PDF: 9623_isp_tax_regulations.pdf
✅ Đã trích xuất PDF: 9624_local_services_tax_regulations.pdf
✅ Đã trích xuất PDF: 9625_parking_tax_regulations.pdf
✅ Đã trích xuất PDF: 9627_uf_regulations.pdf
✅ Đã trích xuất PDF: change-in-business-status-form-04.2025.pdf
✅ Đã trích xuất PDF: 6492_2636_10_taxpayers_bill_of_rights_4-26-2018.pdf
✅ Đã trích xuất PDF: 16958_2022_tax_rate_by_tax_type.pdf
✅ Đã trích xuất PDF: 16957_2022_tax_due_date_calendar_.pdf
✅ Đã trích xuất PDF: 8271_facility_usage_fee_information_for_performers_and_contracting_parties.pdf
✅ Đã trích xuất PDF: firesale.pdf
✅ Đã trích xuất PDF: 8398_payroll_expense_tax__et__allocation_schedule_form.pdf
✅ Đã trích xuất PDF: 6825_payroll_expense_tax_allocation_schedule_for_professional_organization_form_instructions8.15.19.pdf
✅ Đã trích xuất PDF: 8397_local_services_tax_ls-1_allocation_schedule_form.pdf
✅ Đã tríc

In [41]:
import requests
import fitz  # PyMuPDF
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)  # Ẩn cảnh báo SSL

def extract_text_from_pdf_url(pdf_url: str, output_txt: str = "output.txt"):
    try:
        response = requests.get(pdf_url, verify=False)  # ⚠️ Bỏ qua SSL verify
        response.raise_for_status()

        with open("temp.pdf", "wb") as f:
            f.write(response.content)

        doc = fitz.open("temp.pdf")
        all_text = []

        for page in doc:
            page_text = page.get_text()
            # Giữ nguyên xuống dòng, nhưng loại khoảng trắng thừa
            cleaned_lines = [line.strip() for line in page_text.splitlines() if line.strip()]
            all_text.extend(cleaned_lines)
            all_text.append("")  # Thêm dòng trống giữa các trang (tùy chọn)

        # Ghi ra file, mỗi dòng là 1 dòng văn bản rõ ràng
        with open(output_txt, "w", encoding="utf-8") as f:
            f.write("\n".join(all_text))

        print(f"✅ Đã trích xuất văn bản vào '{output_txt}'")
    except Exception as e:
        print(f"❌ Lỗi: {e}")

extract_text_from_pdf_url(
     "https://apps.pittsburghpa.gov/redtail/images/23255_2024_Operating_Budget.pdf",
     "2024 Operating Budget.txt"
)


✅ Đã trích xuất văn bản vào '2024 Operating Budget.txt'


In [None]:
import requests
from bs4 import BeautifulSoup

url = ["https://www.cmu.edu/about/",
       "https://www.cmu.edu/academics/interdisciplinary-programs.html",
        "https://www.library.cmu.edu/",
        "https://www.cmu.edu/academics/learning-for-a-lifetime.html",
        "https://www.cmu.edu/admission/student-community-blog",
        "https://www.cmu.edu/graduate/prospective/index.html",
        "https://www.cmu.edu/leadership/",
        "https://www.cmu.edu/about/mission.html",
        "https://www.cmu.edu/about/history.html",
        "https://www.cmu.edu/about/traditions.html",
        "https://www.cmu.edu/inclusive-excellence/",
        "https://www.cmu.edu/about/pittsburgh.html",
        "https://www.cmu.edu/about/rankings.html",
        "https://www.cmu.edu/about/awards.html",
        "https://www.cmu.edu/visit//visitor-information",
        "https://www.cmu.edu/research/centers-and-institutes.html",
        "https://www.cmu.edu/student-experience/index.html",
        ]

def crawl_cmu_about():
    url = "https://www.cmu.edu/about/"
    headers = {
        "User-Agent": "Mozilla/5.0"
    }

    response = requests.get(url, headers=headers)
    response.raise_for_status()  # Báo lỗi nếu không truy cập được

    soup = BeautifulSoup(response.text, "html.parser")

    # Lấy nội dung chính trong phần <main> hoặc vùng chính
    main_content = soup.find("main")
    if not main_content:
        main_content = soup.body  # fallback nếu không có <main>

    # Loại bỏ script, style, nav,...
    for tag in main_content(["script", "style", "nav", "footer"]):
        tag.decompose()

    text = main_content.get_text(separator="\n")
    lines = [line.strip() for line in text.splitlines() if line.strip()]
    clean_text = "\n".join(lines)

    # Ghi vào file
    with open("cmu_about.txt", "w", encoding="utf-8") as f:
        f.write(clean_text)

    print("✅ Đã lưu nội dung vào 'cmu_about.txt'")

crawl_cmu_about()


✅ Đã lưu nội dung vào 'cmu_about.txt'


In [48]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import os

base_url = "https://www.cmu.edu/about/"
visited = set()

def clean_text(html_content):
    soup = BeautifulSoup(html_content, "html.parser")
    main = soup.find("main") or soup.body

    if main is None:
        return ""

    # Loại bỏ tag không cần thiết
    for tag in main.find_all(["script", "style", "nav", "footer", "header"]):
        tag.decompose()

    text = main.get_text(separator="\n")
    lines = [line.strip() for line in text.splitlines() if line.strip()]
    return "\n".join(lines)

def save_text(url, text, folder="cmu_about_pages"):
    os.makedirs(folder, exist_ok=True)
    path = urlparse(url).path.strip("/").replace("/", "_")
    filename = "index" if path == "" else path
    with open(f"{folder}/{filename}.txt", "w", encoding="utf-8") as f:
        f.write(text)

def crawl(url):
    if url in visited or not url.startswith(base_url):
        return
    visited.add(url)

    try:
        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
        response.raise_for_status()
    except Exception as e:
        print(f"❌ Lỗi truy cập {url}: {e}")
        return

    text = clean_text(response.text)
    save_text(url, text)
    print(f"✅ Đã lưu: {url}")

    # Tiếp tục crawl các liên kết nội bộ
    soup = BeautifulSoup(response.text, "html.parser")
    for link in soup.find_all("a", href=True):
        href = link["href"]
        full_url = urljoin(url, href)
        if full_url.startswith(base_url):
            crawl(full_url)

crawl(base_url)


✅ Đã lưu: https://www.cmu.edu/about/
✅ Đã lưu: https://www.cmu.edu/about/cmu-fact-sheet.pdf
✅ Đã lưu: https://www.cmu.edu/about/awards.html
✅ Đã lưu: https://www.cmu.edu/about/index.html
✅ Đã lưu: https://www.cmu.edu/about/rankings.html
✅ Đã lưu: https://www.cmu.edu/about/mission.html
✅ Đã lưu: https://www.cmu.edu/about/history.html
✅ Đã lưu: https://www.cmu.edu/about/traditions.html
✅ Đã lưu: https://www.cmu.edu/about/pittsburgh.html
❌ Lỗi truy cập https://www.cmu.edu/about/pittsburgh-old.html: 404 Client Error: Not Found for url: https://www.cmu.edu/about/pittsburgh-old.html
