wikipedia

In [20]:
import requests
from bs4 import BeautifulSoup

def crawl_wikipedia(url, output_file):
    """Save the headings and paragraphs of a Wikipedia article as Markdown-style text.

    The page is fetched with ``requests``; inside the ``<div id="bodyContent">``
    element every ``h1``/``h2``/``h3``/``p`` is written to ``output_file`` in
    document order. Headings get a ``#``/``##``/``###`` prefix and ``<sup>``
    elements are removed from paragraphs before their text is extracted.

    Args:
        url: Address of the article to download.
        output_file: Path of the UTF-8 text file to create or overwrite.

    Returns:
        None. Progress and errors are printed. Nothing is written when the
        request fails, the status is not 200, or the content container is
        missing.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print(f"Lỗi khi truy cập {url}: {exc}")
        return

    if response.status_code != 200:
        print(f"Lỗi khi truy cập {url}: {response.status_code}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    content_div = soup.find('div', {'id': 'bodyContent'})

    if not content_div:
        print("Không tìm thấy nội dung chính.")
        return

    heading_prefix = {'h1': '# ', 'h2': '## ', 'h3': '### '}

    with open(output_file, 'w', encoding='utf-8') as f:
        for tag in content_div.find_all(['h1', 'h2', 'h3', 'p']):
            # Drop <sup> elements so they do not end up in the paragraph text.
            if tag.name == 'p':
                for sup in tag.find_all('sup'):
                    sup.decompose()

            # get_text(strip=True) strips every text node before concatenating,
            # which glues words together across inline tags ("city inPennsylvania").
            # Collapsing whitespace on the full text keeps the original spacing.
            text = ' '.join(tag.get_text().split())
            if not text:
                continue

            f.write(f"{heading_prefix.get(tag.name, '')}{text}\n\n")

    print(f"✅ Đã lưu nội dung từ {url} vào '{output_file}'")

# Ví dụ sử dụng:
# crawl_wikipedia("https://en.wikipedia.org/wiki/Pittsburgh", "pittsburgh.txt")


In [21]:
# Wikipedia articles to download, each paired with the file it is saved to.
url1 = "https://en.wikipedia.org/wiki/Pittsburgh"
output_file1 = "pittsburgh.txt"
url2 = "https://en.wikipedia.org/wiki/History_of_Pittsburgh"
output_file2 = "history_of_pittsburgh.txt"

for article_url, destination in ((url1, output_file1), (url2, output_file2)):
    crawl_wikipedia(article_url, destination)

✅ Đã lưu nội dung từ https://en.wikipedia.org/wiki/Pittsburgh vào 'pittsburgh.txt'
✅ Đã lưu nội dung từ https://en.wikipedia.org/wiki/History_of_Pittsburgh vào 'history_of_pittsburgh.txt'


britannica

In [18]:
import requests
from bs4 import BeautifulSoup

def crawl_britannica(url, output_file):
    """Save every heading and paragraph of a Britannica page as Markdown-style text.

    All ``h1``/``h2``/``h3``/``p`` elements of the whole document are written to
    ``output_file`` in the order they appear; headings are prefixed with
    ``#``/``##``/``###``.

    Args:
        url: Address of the page to download.
        output_file: Path of the UTF-8 text file to create or overwrite.

    Returns:
        None. Progress and errors are printed. Nothing is written when the
        request fails or the status is not 200.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        # A timeout prevents a hung connection from blocking the notebook.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print(f"Lỗi khi truy cập {url}: {exc}")
        return

    if response.status_code != 200:
        print(f"Lỗi khi truy cập {url}: {response.status_code}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    # Tags to keep, in order of appearance, with the Markdown prefix for each.
    markdown_prefix = {'h1': '# ', 'h2': '## ', 'h3': '### ', 'p': ''}

    with open(output_file, 'w', encoding='utf-8') as f:
        for tag in soup.find_all(list(markdown_prefix)):
            # Normalise whitespace on the full text; get_text(strip=True) would
            # join adjacent text nodes without a space and merge words.
            text = ' '.join(tag.get_text().split())
            if text:
                f.write(f"{markdown_prefix[tag.name]}{text}\n\n")

    print(f"✅ Đã lưu nội dung từ {url} vào '{output_file}'")

# Ví dụ sử dụng:
# crawl_britannica("https://www.britannica.com/place/Pittsburgh", "pittsburgh_britannica.txt")


In [19]:
# Fetch the Britannica entry for Pittsburgh and store it as plain text.
output_file = "pittsburgh_britannica.txt"
url = "https://www.britannica.com/place/Pittsburgh"
crawl_britannica(url=url, output_file=output_file)

✅ Đã lưu nội dung từ https://www.britannica.com/place/Pittsburgh vào 'pittsburgh_britannica.txt'


dynamic web

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import time

def crawl_dynamic_site(url, output_file, driver_path=None, wait_seconds=5):
    """Render a JavaScript-driven page in headless Chrome and save its text.

    The page is loaded with Selenium, given ``wait_seconds`` to finish
    rendering, and every ``h1``/``h2``/``h3``/``p`` of the resulting HTML is
    written to ``output_file`` (headings prefixed with ``#``/``##``/``###``).

    Args:
        url: Address of the page to render.
        output_file: Path of the UTF-8 text file to create or overwrite.
        driver_path: Optional path to a chromedriver executable. When omitted,
            Selenium is left to locate a driver itself, so no machine-specific
            path is baked into the notebook.
        wait_seconds: Seconds to sleep after navigation so scripts can run.

    Returns:
        None. A confirmation message is printed once the file is written.
    """
    options = Options()
    options.add_argument('--headless')  # run without opening a browser window
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')

    if driver_path:
        driver = webdriver.Chrome(service=Service(driver_path), options=options)
    else:
        driver = webdriver.Chrome(options=options)

    try:
        driver.get(url)
        time.sleep(wait_seconds)  # wait for the page to load
        page_source = driver.page_source
    finally:
        # Always shut the browser down, even if navigation fails, so no
        # headless Chrome process is left running.
        driver.quit()

    soup = BeautifulSoup(page_source, 'html.parser')
    heading_prefix = {'h1': '# ', 'h2': '## ', 'h3': '### '}

    with open(output_file, 'w', encoding='utf-8') as f:
        for tag in soup.find_all(['h1', 'h2', 'h3', 'p']):
            # Collapse whitespace on the whole text; get_text(strip=True) would
            # merge words that are split across inline elements.
            text = ' '.join(tag.get_text().split())
            if not text:
                continue
            f.write(f"{heading_prefix.get(tag.name, '')}{text}\n\n")

    print(f"✅ Đã lưu nội dung từ {url} vào '{output_file}'")

# Example usage:
# crawl_dynamic_site("https://www.visitpittsburgh.com/", "visitpittsburgh.txt")


In [29]:
# Render the VisitPittsburgh home page and save its headings and paragraphs.
output_file = "Pittsburgh webpage.txt"
url = "https://www.visitpittsburgh.com/"
crawl_dynamic_site(url, output_file)

✅ Đã lưu nội dung từ https://www.visitpittsburgh.com/ vào 'Pittsburgh webpage.txt'


In [37]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import fitz  # PyMuPDF

def _append_pdf_text(pdf_url, local_pdf, out_file, headers, filename):
    """Download one PDF to ``local_pdf`` and append the text of each page to ``out_file``."""
    pdf_res = requests.get(pdf_url, headers=headers, timeout=60)
    # Fail here rather than saving an HTML error page under a .pdf name.
    pdf_res.raise_for_status()
    with open(local_pdf, 'wb') as pdf_file:
        pdf_file.write(pdf_res.content)
    out_file.write(f"📎 Nội dung từ file PDF: {filename}\n\n")

    # Extract the PDF content page by page.
    doc = fitz.open(local_pdf)
    try:
        for page in doc:
            page_text = page.get_text().strip()
            if page_text:
                out_file.write(page_text + '\n\n')
    finally:
        # Release the document even when a page fails to parse.
        doc.close()


def crawl_and_extract(url, output_file):
    """Save a page's headings and paragraphs, inlining the text of linked regulation PDFs.

    Inside ``<div class="main-container clearfix">`` the function walks every
    ``h1``-``h4``, ``p`` and ``a`` element in document order. Headings and
    paragraphs are written as Markdown-style text. ``<a>`` links whose path ends
    in ``.pdf`` are followed only while the most recent non-empty ``h4`` heading
    contains the word "regulations". Those PDFs are downloaded into the
    ``pdf_temp`` directory and their page text is appended to the output.

    Args:
        url: Address of the page to crawl; also the base for relative PDF links.
        output_file: Path of the UTF-8 text file to create or overwrite.

    Returns:
        None. Progress and errors are printed. A failure on a single PDF is
        recorded in the output file and does not stop the crawl.
    """
    from urllib.parse import urlsplit

    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print(f"❌ Lỗi truy cập {url}: {exc}")
        return

    if response.status_code != 200:
        print(f"❌ Lỗi truy cập {url}: {response.status_code}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    content_div = soup.find('div', {'class': 'main-container clearfix'})
    if not content_div:
        print("❌ Không tìm thấy vùng nội dung chính.")
        return

    heading_levels = {'h1': 1, 'h2': 2, 'h3': 3, 'h4': 4}
    tags = list(heading_levels) + ['p', 'a']
    found_regulation = False
    base_path = "pdf_temp"
    os.makedirs(base_path, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        for tag in content_div.find_all(tags):
            if tag.name in heading_levels:
                # Collapse whitespace on the full text so words separated by
                # inline tags are not glued together.
                text = ' '.join(tag.get_text().split())
                if not text:
                    continue
                f.write(f"{'#' * heading_levels[tag.name]} {text}\n\n")
                if tag.name == 'h4':
                    # Each h4 switches PDF extraction on or off for what follows.
                    found_regulation = 'regulations' in text.lower()

            elif tag.name == 'p':
                text = ' '.join(tag.get_text().split())
                if text:
                    f.write(text + '\n\n')

            elif tag.name == 'a' and found_regulation:
                href = tag.get('href', '')
                # Test the URL path only, case-insensitively, so "x.PDF" and
                # "x.pdf?v=2" are recognised and the query string stays out of
                # the local file name.
                pdf_path = urlsplit(href).path
                if not pdf_path.lower().endswith('.pdf'):
                    continue

                pdf_url = urljoin(url, href)
                filename = os.path.basename(pdf_path)
                local_pdf = os.path.join(base_path, filename)

                try:
                    _append_pdf_text(pdf_url, local_pdf, f, headers, filename)
                    print(f"✅ Đã trích xuất PDF: {filename}")
                except Exception as e:
                    f.write(f"[Lỗi khi tải hoặc đọc PDF {filename}: {e}]\n\n")

    print(f"\n📄 Đã ghi toàn bộ nội dung vào '{output_file}'")

# Example usage: crawl the City of Pittsburgh tax-forms page.
crawl_and_extract(
    url="https://www.pittsburghpa.gov/City-Government/Finances-Budget/Taxes/Tax-Forms",
    output_file="pittsburgh_tax.txt",
)


✅ Đã trích xuất PDF: 9622_amusement_tax_regulations.pdf
✅ Đã trích xuất PDF: 9626_payroll_tax_regulations.pdf
✅ Đã trích xuất PDF: 9623_isp_tax_regulations.pdf
✅ Đã trích xuất PDF: 9624_local_services_tax_regulations.pdf
✅ Đã trích xuất PDF: 9625_parking_tax_regulations.pdf
✅ Đã trích xuất PDF: 9627_uf_regulations.pdf
✅ Đã trích xuất PDF: change-in-business-status-form-04.2025.pdf
✅ Đã trích xuất PDF: 6492_2636_10_taxpayers_bill_of_rights_4-26-2018.pdf
✅ Đã trích xuất PDF: 16958_2022_tax_rate_by_tax_type.pdf
✅ Đã trích xuất PDF: 16957_2022_tax_due_date_calendar_.pdf
✅ Đã trích xuất PDF: 8271_facility_usage_fee_information_for_performers_and_contracting_parties.pdf
✅ Đã trích xuất PDF: firesale.pdf
✅ Đã trích xuất PDF: 8398_payroll_expense_tax__et__allocation_schedule_form.pdf
✅ Đã trích xuất PDF: 6825_payroll_expense_tax_allocation_schedule_for_professional_organization_form_instructions8.15.19.pdf
✅ Đã trích xuất PDF: 8397_local_services_tax_ls-1_allocation_schedule_form.pdf
✅ Đã tríc

In [41]:
import requests
import fitz  # PyMuPDF
import urllib3

# Silence urllib3's InsecureRequestWarning for the whole kernel session.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)  # hide SSL warnings

def extract_text_from_pdf_url(pdf_url: str, output_txt: str = "output.txt",
                              verify_ssl: bool = False, timeout: float = 60):
    """Download a PDF and write its text to ``output_txt``, one cleaned line per row.

    Each page's text is split into lines, surrounding whitespace is trimmed and
    blank lines are dropped; an empty line is inserted after every page.

    The PDF is parsed straight from the downloaded bytes, so no ``temp.pdf``
    scratch file is left in the working directory, and the document is always
    closed.

    Args:
        pdf_url: Address of the PDF to download.
        output_txt: Path of the UTF-8 text file to create or overwrite.
        verify_ssl: Whether to verify the server's TLS certificate. Defaults to
            ``False`` to preserve the original behaviour.
            SECURITY: with verification off the download can be tampered with
            in transit; pass ``True`` whenever the host has a valid certificate.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. Success or the caught error is printed; exceptions are not
        propagated to the caller.
    """
    try:
        response = requests.get(pdf_url, verify=verify_ssl, timeout=timeout)
        response.raise_for_status()

        all_text = []
        doc = fitz.open(stream=response.content, filetype="pdf")
        try:
            for page in doc:
                # Keep the line breaks but drop surrounding whitespace and blank lines.
                all_text.extend(
                    line.strip() for line in page.get_text().splitlines() if line.strip()
                )
                all_text.append("")  # blank line between pages
        finally:
            doc.close()

        # Write one text line per row.
        with open(output_txt, "w", encoding="utf-8") as f:
            f.write("\n".join(all_text))

        print(f"✅ Đã trích xuất văn bản vào '{output_txt}'")
    except Exception as e:
        print(f"❌ Lỗi: {e}")

# Extract the text of the 2024 operating budget PDF.
extract_text_from_pdf_url(
    pdf_url="https://apps.pittsburghpa.gov/redtail/images/23255_2024_Operating_Budget.pdf",
    output_txt="2024 Operating Budget.txt",
)


✅ Đã trích xuất văn bản vào '2024 Operating Budget.txt'


In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import re

# Pages passed to crawl_and_write(); each page with text becomes one section of
# the output file.
# NOTE(review): the "visit//visitor-information" entry contains a double slash —
# confirm whether that is intentional.
urls = [
    "https://www.cmu.edu/about/",
    "https://www.cmu.edu/academics/interdisciplinary-programs.html",
    "https://www.library.cmu.edu/",
    "https://www.cmu.edu/academics/learning-for-a-lifetime.html",
    "https://www.cmu.edu/admission/student-community-blog",
    "https://www.cmu.edu/graduate/prospective/index.html",
    "https://www.cmu.edu/leadership/",
    "https://www.cmu.edu/about/mission.html",
    "https://www.cmu.edu/about/history.html",
    "https://www.cmu.edu/about/traditions.html",
    "https://www.cmu.edu/inclusive-excellence/",
    "https://www.cmu.edu/about/pittsburgh.html",
    "https://www.cmu.edu/about/rankings.html",
    "https://www.cmu.edu/about/awards.html",
    "https://www.cmu.edu/visit//visitor-information",
    "https://www.cmu.edu/research/centers-and-institutes.html",
    "https://www.cmu.edu/student-experience/index.html",
]

def extract_header(url):
    """Derive a ``#slug`` section header from the last meaningful part of a URL path.

    The last non-empty path segment is used. A trailing ``index.html`` is
    skipped in favour of the segment before it, and ``home`` is used when
    nothing is left. A ``.html`` suffix is removed, the slug is lower-cased and
    spaces become hyphens.

    Empty segments produced by doubled slashes are ignored, so
    ``/visit//index.html`` yields ``#visit`` instead of a bare ``#``.

    Args:
        url: The page URL.

    Returns:
        The header string, e.g. ``"#about"`` for ``https://www.cmu.edu/about/``.
    """
    segments = [segment for segment in urlparse(url).path.split("/") if segment]

    # An "index.html" leaf says nothing about the page; name it after its folder.
    if segments and segments[-1] == "index.html":
        segments.pop()

    slug = segments[-1] if segments else "home"
    slug = re.sub(r'\.html$', '', slug)
    return "#" + slug.lower().replace(" ", "-")

def clean_text(html_content):
    """Return the text of an HTML document as newline-separated, non-blank lines.

    The ``<main>`` element is used when present, otherwise ``<body>``; an empty
    string is returned when neither exists. ``script``, ``style``, ``nav``,
    ``footer`` and ``header`` elements are removed before the text is read.

    NOTE(review): a later cell in this notebook defines another ``clean_text``
    that takes plain text; once that cell runs it replaces this function.
    """
    document = BeautifulSoup(html_content, "html.parser")

    root = document.find("main")
    if not root:
        root = document.body
    if root is None:
        return ""

    # Remove elements whose content is not part of the page's own text.
    for unwanted in root.find_all(["script", "style", "nav", "footer", "header"]):
        unwanted.decompose()

    kept_lines = []
    for raw_line in root.get_text(separator="\n").splitlines():
        trimmed = raw_line.strip()
        if trimmed:
            kept_lines.append(trimmed)
    return "\n".join(kept_lines)

def crawl_and_write(urls, output_file):
    """Fetch each URL and append its header and cleaned text to ``output_file``.

    For every URL the page is requested with a 10-second timeout. Pages that
    fail to download are reported and skipped. A page that yields text is
    written as the ``extract_header`` line followed by the ``clean_text``
    result and a blank line; a page that yields no text only triggers a warning.
    """
    with open(output_file, "w", encoding="utf-8") as out:
        for url in urls:
            try:
                page = requests.get(url, timeout=10)
                page.raise_for_status()
                print(f"✅ Crawled: {url}")
            except Exception as err:
                print(f"❌ Lỗi truy cập {url}: {err}")
                continue

            header = extract_header(url)
            body = clean_text(page.text)

            if not body:
                print(f"⚠️ Không tìm thấy nội dung ở {url}")
                continue

            out.write(f"{header}\n{body}\n\n")

# Run the crawl over the CMU pages listed above.
crawl_and_write(urls=urls, output_file="cmu_about.txt")


✅ Crawled: https://www.cmu.edu/about/
✅ Crawled: https://www.cmu.edu/academics/interdisciplinary-programs.html
✅ Crawled: https://www.library.cmu.edu/
✅ Crawled: https://www.cmu.edu/academics/learning-for-a-lifetime.html
✅ Crawled: https://www.cmu.edu/admission/student-community-blog
✅ Crawled: https://www.cmu.edu/graduate/prospective/index.html
✅ Crawled: https://www.cmu.edu/leadership/
✅ Crawled: https://www.cmu.edu/about/mission.html
✅ Crawled: https://www.cmu.edu/about/history.html
✅ Crawled: https://www.cmu.edu/about/traditions.html
✅ Crawled: https://www.cmu.edu/inclusive-excellence/
✅ Crawled: https://www.cmu.edu/about/pittsburgh.html
✅ Crawled: https://www.cmu.edu/about/rankings.html
✅ Crawled: https://www.cmu.edu/about/awards.html
✅ Crawled: https://www.cmu.edu/visit//visitor-information
✅ Crawled: https://www.cmu.edu/research/centers-and-institutes.html
✅ Crawled: https://www.cmu.edu/student-experience/index.html


In [25]:
import requests
from bs4 import BeautifulSoup

def crawl_all_events(output_file, month=5, year=2025):
    """Save the Downtown Pittsburgh event listings for every day of one month.

    One request is made per day to
    ``https://downtownpittsburgh.com/events/?n=<month>&d=<day>&y=<year>``
    (presumably ``n`` = month, ``d`` = day, ``y`` = year, inferred from the
    values the original code used; confirm against the site). Every ``<h1>``
    that contains a link is treated as one event. The siblings after the
    heading are scanned for the date/time and the description until a
    "READ MORE" marker is reached.

    Args:
        output_file: Path of the UTF-8 text file to create or overwrite.
        month: Month number sent as ``n``. Defaults to 5, the original value.
        year: Year sent as ``y``. Defaults to 2025, the original value.

    Returns:
        None. Progress and errors are printed; failed days are skipped.
    """
    import calendar
    from urllib.parse import urljoin

    headers = {'User-Agent': 'Mozilla/5.0'}
    # 31 for the default May 2025, so the default run covers the same days.
    days_in_month = calendar.monthrange(year, month)[1]

    with open(output_file, 'w', encoding='utf-8') as f:
        for day in range(1, days_in_month + 1):
            url = f"https://downtownpittsburgh.com/events/?n={month}&d={day}&y={year}"
            print(f"🔍 Đang crawl trang {day}...")

            try:
                response = requests.get(url, headers=headers, timeout=30)
            except requests.RequestException as exc:
                print(f"❌ Lỗi truy cập {url}: {exc}")
                continue
            if response.status_code != 200:
                print(f"❌ Lỗi truy cập {url}: {response.status_code}")
                continue

            soup = BeautifulSoup(response.text, 'html.parser')
            h1_tags = soup.find_all('h1')

            if not h1_tags:
                print(f"⚠️ Không tìm thấy sự kiện trên trang {day}")
                continue

            for h1 in h1_tags:
                link_tag = h1.find('a')
                if not link_tag:
                    continue
                title = ' '.join(link_tag.get_text().split())
                # urljoin handles absolute, root-relative and relative hrefs;
                # plain concatenation produced broken links for anything but
                # root-relative paths.
                full_url = urljoin(url, link_tag.get('href', ''))

                sibling = h1.next_sibling
                date_text = None
                description_parts = []

                while sibling:
                    if not hasattr(sibling, 'get_text'):
                        sibling = sibling.next_sibling
                        continue

                    # Collapse whitespace on the full text so words split
                    # across inline tags are not merged.
                    text = ' '.join(sibling.get_text().split())
                    if text:
                        # Heuristic: the first sibling that looks like a time
                        # range ("am"/"pm" or a dash) is taken as the date line.
                        # NOTE(review): "am" also matches ordinary words such as
                        # "family" — consider a stricter pattern.
                        if not date_text and ('am' in text.lower() or 'pm' in text.lower() or '-' in text):
                            date_text = text
                        elif text == "READ MORE":
                            break
                        else:
                            description_parts.append(text)
                    sibling = sibling.next_sibling

                if not date_text:
                    date_text = "No date"
                description = " ".join(description_parts).strip() if description_parts else "No description"

                f.write(f"## [{title}]({full_url})\n")
                f.write(f"**Date & Time**: {date_text}\n")
                f.write(f"**Description**: {description}\n\n")

    print(f"\n✅ Đã lưu tất cả sự kiện vào '{output_file}'")


# Crawl the Downtown Pittsburgh events calendar into a single text file.
crawl_all_events(output_file="Downtown Pittsburgh events calendar.txt")


🔍 Đang crawl trang 1...
🔍 Đang crawl trang 2...
🔍 Đang crawl trang 3...
🔍 Đang crawl trang 4...
🔍 Đang crawl trang 5...
🔍 Đang crawl trang 6...
🔍 Đang crawl trang 7...
🔍 Đang crawl trang 8...
🔍 Đang crawl trang 9...
🔍 Đang crawl trang 10...
🔍 Đang crawl trang 11...
🔍 Đang crawl trang 12...
🔍 Đang crawl trang 13...
🔍 Đang crawl trang 14...
🔍 Đang crawl trang 15...
🔍 Đang crawl trang 16...
🔍 Đang crawl trang 17...
🔍 Đang crawl trang 18...
🔍 Đang crawl trang 19...
🔍 Đang crawl trang 20...
🔍 Đang crawl trang 21...
🔍 Đang crawl trang 22...
🔍 Đang crawl trang 23...
🔍 Đang crawl trang 24...
🔍 Đang crawl trang 25...
🔍 Đang crawl trang 26...
🔍 Đang crawl trang 27...
🔍 Đang crawl trang 28...
🔍 Đang crawl trang 29...
🔍 Đang crawl trang 30...
🔍 Đang crawl trang 31...

✅ Đã lưu tất cả sự kiện vào 'Downtown Pittsburgh events calendar.txt'


In [29]:
import requests
from bs4 import BeautifulSoup

def crawl_citypaper_detailed(output_file):
    """Save the events listed on pages 1-10 of the Pittsburgh City Paper event search.

    Every ``<p class="fdn-teaser-headline">`` with a link is treated as one
    event. Its date, venue, address and description are taken from the first
    matching elements that follow the headline's parent in the document.

    Args:
        output_file: Path of the UTF-8 text file to create or overwrite.

    Returns:
        None. Progress and errors are printed; failed pages are skipped.
    """
    from urllib.parse import urljoin

    headers = {'User-Agent': 'Mozilla/5.0'}

    def _text(node, default):
        # Collapse whitespace on the full text; get_text(strip=True) would glue
        # words together across inline tags.
        return ' '.join(node.get_text().split()) if node else default

    with open(output_file, 'w', encoding='utf-8') as f:
        for page in range(1, 11):
            url = f"https://www.pghcitypaper.com/pittsburgh/EventSearch?page={page}&v=d"
            print(f"🔎 Đang xử lý trang {page}...")

            try:
                # A timeout keeps one stalled page from blocking the whole run.
                response = requests.get(url, headers=headers, timeout=30)
            except requests.RequestException as exc:
                print(f"❌ Lỗi truy cập {url}: {exc}")
                continue
            if response.status_code != 200:
                print(f"❌ Lỗi truy cập {url}: {response.status_code}")
                continue

            soup = BeautifulSoup(response.text, 'html.parser')

            for headline in soup.find_all('p', class_='fdn-teaser-headline'):
                # Title and link.
                a_tag = headline.find('a', href=True)
                if not a_tag:
                    continue
                title = _text(a_tag, "")
                # Absolute links pass through unchanged; relative ones are
                # resolved against the page URL.
                link = urljoin(url, a_tag['href'])

                # Remaining fields are looked up starting from the headline's parent.
                # NOTE(review): find_next() keeps searching past the end of this
                # event, so a field missing here is filled from a later event —
                # confirm the markup and scope the search to the event container.
                parent = headline.find_parent()
                date = _text(parent.find_next('p', class_='fdn-teaser-subheadline'), "No date")
                location = _text(parent.find_next('a', class_='fdn-event-teaser-location-link'), "No location")
                address = _text(parent.find_next('p', class_='fdn-inline-split-list'), "No address")
                desc = _text(parent.find_next('div', class_='fdn-teaser-description'), "No description")

                # Write the event to the file.
                f.write(f"## [{title}]({link})\n")
                f.write(f"**Date & Time**: {date}\n")
                f.write(f"**Location**: {location}, {address}\n")
                f.write(f"**Description**: {desc}\n\n")

    print(f"\n✅ Đã lưu sự kiện vào '{output_file}'")

# Run the City Paper crawl.
crawl_citypaper_detailed(output_file="citypaper_events.txt")


🔎 Đang xử lý trang 1...
🔎 Đang xử lý trang 2...
🔎 Đang xử lý trang 3...
🔎 Đang xử lý trang 4...
🔎 Đang xử lý trang 5...
🔎 Đang xử lý trang 6...
🔎 Đang xử lý trang 7...
🔎 Đang xử lý trang 8...
🔎 Đang xử lý trang 9...
🔎 Đang xử lý trang 10...

✅ Đã lưu sự kiện vào 'citypaper_events.txt'


In [5]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import time

def crawl_cmu_events_selenium(start_date, end_date, output_file):
    """Save the CMU events calendar for every day from ``start_date`` to ``end_date``.

    Each day's page (``https://events.cmu.edu/day/date/YYYYMMDD``) is rendered
    in headless Chrome. Every ``<div class="lw_cal_event_info">`` is written as
    one event with its title, link, date, time, location and summary.

    Args:
        start_date: First day to crawl (a ``datetime``; inclusive).
        end_date: Last day to crawl (a ``datetime``; inclusive).
        output_file: Path of the UTF-8 text file to create or overwrite.

    Returns:
        None. Progress is printed for each day.
    """
    chrome_options = Options()
    chrome_options.add_argument('--headless')  # do not open a browser window
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=chrome_options)

    def _text(node, default):
        # Collapse whitespace on the full text; get_text(strip=True) would glue
        # words together across inline tags.
        return ' '.join(node.get_text().split()) if node else default

    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            current_date = start_date
            while current_date <= end_date:
                date_str = current_date.strftime('%Y%m%d')
                url = f"https://events.cmu.edu/day/date/{date_str}"
                print(f"🔎 Đang xử lý: {url}")

                driver.get(url)
                time.sleep(2)  # wait for the page's JavaScript to finish

                soup = BeautifulSoup(driver.page_source, 'html.parser')

                # A day without events simply yields an empty loop.
                for event in soup.find_all('div', class_='lw_cal_event_info'):
                    title_tag = event.find('div', class_='lw_events_title')
                    anchor = title_tag.a if title_tag else None
                    title = _text(anchor, "Không có tiêu đề")
                    link = anchor['href'] if anchor else "#"
                    if not link.startswith('http'):
                        link = 'https://events.cmu.edu' + link

                    location = event.find('div', class_='lw_events_location')
                    time_tag = event.find('div', class_='lw_events_time')
                    summary = event.find('div', class_='lw_events_summary')

                    f.write(f"## [{title}]({link})\n")
                    f.write(f"**Date**: {current_date.strftime('%Y-%m-%d')}\n")
                    f.write(f"**Time**: {_text(time_tag, 'Không rõ')}\n")
                    f.write(f"**Location**: {_text(location, 'Không rõ')}\n")
                    f.write(f"**Summary**: {_text(summary, 'Không có')}\n\n")

                current_date += timedelta(days=1)
    finally:
        # Shut the browser down even if a page load or file write fails, so no
        # headless Chrome process is leaked.
        driver.quit()

    print(f"\n✅ Đã lưu tất cả sự kiện vào '{output_file}'")

# Crawl the CMU events calendar for the chosen date range.
start = datetime(2025, 4, 1)
end = datetime(2025, 5, 13)
crawl_cmu_events_selenium(start_date=start, end_date=end, output_file="cmu_events.txt")


🔎 Đang xử lý: https://events.cmu.edu/day/date/20250401
🔎 Đang xử lý: https://events.cmu.edu/day/date/20250402
🔎 Đang xử lý: https://events.cmu.edu/day/date/20250403
🔎 Đang xử lý: https://events.cmu.edu/day/date/20250404
🔎 Đang xử lý: https://events.cmu.edu/day/date/20250405
🔎 Đang xử lý: https://events.cmu.edu/day/date/20250406
🔎 Đang xử lý: https://events.cmu.edu/day/date/20250407
🔎 Đang xử lý: https://events.cmu.edu/day/date/20250408
🔎 Đang xử lý: https://events.cmu.edu/day/date/20250409
🔎 Đang xử lý: https://events.cmu.edu/day/date/20250410
🔎 Đang xử lý: https://events.cmu.edu/day/date/20250411
🔎 Đang xử lý: https://events.cmu.edu/day/date/20250412
🔎 Đang xử lý: https://events.cmu.edu/day/date/20250413
🔎 Đang xử lý: https://events.cmu.edu/day/date/20250414
🔎 Đang xử lý: https://events.cmu.edu/day/date/20250415
🔎 Đang xử lý: https://events.cmu.edu/day/date/20250416
🔎 Đang xử lý: https://events.cmu.edu/day/date/20250417
🔎 Đang xử lý: https://events.cmu.edu/day/date/20250418
🔎 Đang xử 

In [15]:
import requests
from bs4 import BeautifulSoup
import re

def clean_text(text):
    """Trim ``text`` and collapse every run of whitespace into a single space.

    Only whitespace is normalised; no other characters are removed.

    NOTE(review): an earlier cell defines ``clean_text(html_content)`` with a
    different purpose; running this cell replaces that definition.
    """
    return " ".join(text.split())

def crawl_events(url="https://www.visitpittsburgh.com/events-festivals/?page=11",
                 output_file='pittsburgh_events.txt'):
    """Save the events listed on one VisitPittsburgh events page to a text file.

    Every ``<div class="card card--common card--listing">`` is written as a
    record with its title, date, address and phone (``N/A`` when a field is
    missing), followed by a line of 50 dashes.

    Args:
        url: Listing page to crawl. Defaults to the page the original code used.
        output_file: Path of the UTF-8 text file to create or overwrite.

    Returns:
        None. Success or the caught error is printed; exceptions are not
        propagated to the caller.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    def _field(card, tag_name, css_class):
        # Whitespace-normalised text of the first matching element, or 'N/A'.
        element = card.find(tag_name, class_=css_class)
        return clean_text(element.text) if element else 'N/A'

    try:
        # Send the GET request; the timeout keeps a stalled server from
        # hanging the notebook.
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()

        # Parse the HTML and find every event card.
        soup = BeautifulSoup(response.text, 'html.parser')
        events = soup.find_all('div', class_='card card--common card--listing')

        # Open the output file and write one record per event.
        with open(output_file, 'w', encoding='utf-8') as f:
            for event in events:
                title = _field(event, 'a', 'card_heading')
                date = _field(event, 'p', 'date-heading card_date-heading')
                address = _field(event, 'div', 'card____address')
                phone = _field(event, 'div', 'card____phone')

                f.write(f"Title: {title}\n")
                f.write(f"Date: {date}\n")
                f.write(f"Address: {address}\n")
                f.write(f"Phone: {phone}\n")
                f.write("-" * 50 + "\n")

        print(f"Dữ liệu đã được lưu vào {output_file}")

    except requests.RequestException as e:
        print(f"Lỗi khi truy cập trang web: {e}")
    except Exception as e:
        print(f"Lỗi: {e}")

# Run the crawl only when this code executes as the main module.
if __name__ == "__main__":
    crawl_events()

Dữ liệu đã được lưu vào pittsburgh_events.txt
