**DEMO: Crawl Data từ Tech Crunch: Lấy ra các thẻ href là các đường link dẫn tới trang tin tức từ Homepage**

In [8]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

url = "https://techcrunch.com/latest/"

driver.get(url)
driver.implicitly_wait(5)
soup = BeautifulSoup(driver.page_source, "html.parser")
driver.quit()
articles = soup.find_all("a", class_="loop-card__title-link")
links = [a['href'] for a in articles]

df = pd.DataFrame(links, columns=["Article URL"])
df.to_csv("techcrunch_latest_links.csv", index=False)

**Crawl Data từ 1 trang web nước ngoài + sumamry**

In [10]:
import csv
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from transformers import pipeline
import time

options = Options()
options.headless = True
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

url = "https://techcrunch.com/2025/05/13/aws-enters-into-strategic-partnership-with-saudi-arabia-backed-humain/"
driver.get(url)
time.sleep(3)

soup = BeautifulSoup(driver.page_source, 'html.parser')

title_tag = soup.find('h1', class_='article-hero__title')
title = title_tag.text.strip() if title_tag else "Không tìm thấy tiêu đề"

content_block = soup.find('div', class_='entry-content')
paragraphs = content_block.find_all('p') if content_block else []
content = '\n'.join(p.text.strip() for p in paragraphs if p.text.strip())
images = [img['src'] for img in content_block.find_all('img') if img.get('src')] if content_block else []

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
content_trimmed = content[:3000]
summary = summarizer(content, max_length=1000, min_length=50, do_sample=False)[0]['summary_text']

driver.quit()

print("Tiêu đề:", title)
print("URL:", url)
print("Ảnh:", images if images else "Không có ảnh")
print("\n📰 Nội dung bài viết đầy đủ:\n")
print(content)
print("\n📌 TÓM TẮT:\n")
print(summary)

with open('output.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=['title', 'url', 'summary', 'content', 'images'])
    writer.writeheader()
    writer.writerow({
        'title': title,
        'url': url,
        'summary': summary,
        'content': content,
        'images': ', '.join(images)
    })

print("\n✅ Đã lưu thông tin vào file output.csv")


Device set to use cpu
Your max_length is set to 1000, but your input_length is only 417. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=208)


Tiêu đề: AWS enters into ‘strategic partnership’ with Saudi Arabia-backed Humain
URL: https://techcrunch.com/2025/05/13/aws-enters-into-strategic-partnership-with-saudi-arabia-backed-humain/
Ảnh: Không có ảnh

📰 Nội dung bài viết đầy đủ:

Amazon says it will work with Humain, the AI company recently launched by Saudi Arabia’s ruler, Mohammed bin Salman, to invest “$5 billion-plus” in a strategic partnership to build an “AI Zone” in Saudi Arabia.
The AI Zone will include dedicated Amazon Web Services (AWS) AI infrastructure, servers, networks, and training and certification programs, according to a press release. Humain is pledging to develop AI solutions using AWS technologies and to work with AWS on providing access to tools and programs for Saudi Arabia-based AI startups.
AWS joins tech giants Nvidia, AMD, and others in partnering with Humain, which is funded by Saudi Arabia’s Public Investment Fund (PIF). American tech firms have looked to the PIF as a source of capital. Companies l

**DEMO: Crawl Data trên nhiều web khác nhau + summary bằng model HuggingFace**

In [19]:
import csv
import json
import time
from urllib.parse import urlparse
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from transformers import pipeline

with open("parsers.json", "r", encoding="utf-8") as f:
    PARSERS = json.load(f)

def get_parser_by_domain(domain):
    for name, config in PARSERS.items():
        if config["domain"] in domain:
            return config
    return None

def extract_by_config(soup, config):
    title_tag = soup.select_one(config["title"])
    paragraphs = soup.select(config["content"])
    images = [img["src"] for img in soup.select(config["images"]) if img.get("src")]

    title = title_tag.text.strip() if title_tag else None
    content = "\n".join(p.text.strip() for p in paragraphs if p.text.strip())

    return {
        "title": title,
        "content": content,
        "images": images
    }

def process_article(url, summarizer):
    options = Options()
    options.headless = True
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    driver.get(url)
    time.sleep(3)
    soup = BeautifulSoup(driver.page_source, "html.parser")
    driver.quit()

    domain = urlparse(url).netloc
    config = get_parser_by_domain(domain)

    if not config:
        print(f"❌ Không có parser cho domain: {domain}")
        return None

    article = extract_by_config(soup, config)

    if not article["title"] or not article["content"].strip():
        print(f"⚠️ Bỏ qua vì thiếu tiêu đề hoặc nội dung: {url}")
        return None

    content_trimmed = article["content"][:3000].strip()

    try:
        summary = summarizer(content_trimmed, max_length=200, min_length=50, do_sample=False)[0]["summary_text"]
    except Exception as e:
        print(f"❌ Lỗi khi tóm tắt bài viết: {url}\n{e}")
        return None

    print("\n📰 URL:", url)
    print("📌 Tiêu đề:", article["title"])
    print("🖼️ Ảnh:", article["images"] if article["images"] else "Không có ảnh")
    print("\n📄 Nội dung:\n", article["content"])
    print("\n🧠 Tóm tắt:\n", summary)

    return {
        "title": article["title"],
        "url": url,
        "summary": summary,
        "content": article["content"],
        "images": ", ".join(article["images"])
    }

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

urls = [
    "https://techcrunch.com/2025/05/13/aws-enters-into-strategic-partnership-with-saudi-arabia-backed-humain/",
    "https://vnexpress.net/hon-150-trieu-dong-cho-cac-startup-tranh-tai-tai-pitchfest-2025-4885506.html",
    "https://www.techradar.com/computing/artificial-intelligence/this-new-chatgpt-feature-solves-the-most-annoying-thing-about-deep-research"
]

results = []
for url in urls:
    result = process_article(url, summarizer)
    if result:
        results.append(result)

with open("articles.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=4)

print("\n✅ Đã lưu vào file articles.json")


Device set to use cpu



📰 URL: https://techcrunch.com/2025/05/13/aws-enters-into-strategic-partnership-with-saudi-arabia-backed-humain/
📌 Tiêu đề: AWS enters into ‘strategic partnership’ with Saudi Arabia-backed Humain
🖼️ Ảnh: Không có ảnh

📄 Nội dung:
 Amazon says it will work with Humain, the AI company recently launched by Saudi Arabia’s ruler, Mohammed bin Salman, to invest “$5 billion-plus” in a strategic partnership to build an “AI Zone” in Saudi Arabia.
The AI Zone will include dedicated Amazon Web Services (AWS) AI infrastructure, servers, networks, and training and certification programs, according to a press release. Humain is pledging to develop AI solutions using AWS technologies and to work with AWS on providing access to tools and programs for Saudi Arabia-based AI startups.
AWS joins tech giants Nvidia, AMD, and others in partnering with Humain, which is funded by Saudi Arabia’s Public Investment Fund (PIF). American tech firms have looked to the PIF as a source of capital. Companies like Goog

In [17]:
import json

parser_config = {
    "techcrunch": {
        "domain": "techcrunch.com",
        "title": "h1.article-hero__title",
        "content": "div.entry-content p",
        "images": "div.entry-content img"
    },
    "vnexpress": {
        "domain": "vnexpress.net",
        "title": "h1.title-detail",
        "content": "article.fck_detail p",
        "images": "article.fck_detail img"
    },
    "techradar": {
        "domain": "techradar.com",
        "title": "div.news-article header h1",
        "content": "div.wcp-item-content p",
        "images": "div.wcp-item-content img"
    }
}

with open("parsers.json", "w", encoding="utf-8") as f:
    json.dump(parser_config, f, ensure_ascii=False, indent=4)


**Demo: Crawl Data và summary bằng API Gemini**

In [33]:
import json
import time
import requests
from urllib.parse import urlparse
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup


GEMINI_API_KEY = "AIzaSyA6vrOzFkbeSgn0Ixh2EItOOwCxKFsQqgU"

def summarize_with_gemini(content):
    prompt = (
        "Bạn là một trợ lý AI chuyên tóm tắt tin tức. "
        "Hãy đọc đoạn nội dung sau và tóm tắt lại đầy đủ, rõ ràng các ý chính"
        "Hãy viết dễ hiểu, súc tích và giữ đúng tinh thần bài viết.\n\n"
        f"{content.strip()}"
    )

    body = {
        "contents": [{
            "parts": [{"text": prompt}]
        }]
    }

    # url = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?key=AIzaSyA6vrOzFkbeSgn0Ixh2EItOOwCxKFsQqgU"
    url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key=AIzaSyA6vrOzFkbeSgn0Ixh2EItOOwCxKFsQqgU"

    headers = {"Content-Type": "application/json"}

    try:
        response = requests.post(url, headers=headers, json=body)
        response.raise_for_status()
        result = response.json()
        return result["candidates"][0]["content"]["parts"][0]["text"]
    except Exception as e:
        return None

with open("parsers.json", "r", encoding="utf-8") as f:
    PARSERS = json.load(f)

def get_parser_by_domain(domain):
    for config in PARSERS.values():
        if config["domain"] in domain:
            return config
    return None

def extract_by_config(soup, config):
    title_tag = soup.select_one(config["title"])
    content_tags = soup.select(config["content"])
    image_tags = soup.select(config["images"])

    title = title_tag.text.strip() if title_tag else None
    content = "\n".join(p.text.strip() for p in content_tags if p.text.strip())
    images = [img["src"] for img in image_tags if img.get("src")]

    return {
        "title": title,
        "content": content,
        "images": images
    }

def process_article(url):
    options = Options()
    options.headless = True
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    driver.get(url)
    time.sleep(3)
    soup = BeautifulSoup(driver.page_source, "html.parser")
    driver.quit()

    domain = urlparse(url).netloc
    config = get_parser_by_domain(domain)

    if not config:
        print(f"❌ Không có parser cho domain: {domain}")
        return None

    article = extract_by_config(soup, config)
    if not article["title"] or not article["content"]:
        print(f"⚠️ Bỏ qua vì thiếu tiêu đề hoặc nội dung: {url}")
        return None

    summary = summarize_with_gemini(article["content"][:5000])
    if not summary:
        return None

    print(f"\n📰 URL: {url}")
    print(f"📌 Tiêu đề: {article['title']}")
    # print(f"Nội dung:{article['content']}")
    print(f"🧠 Tóm tắt:\n{summary}\n")

    return {
        "title": article["title"],
        "url": url,
        "summary": summary,
        "content": article["content"],
        "images": ", ".join(article["images"])
    }

if __name__ == "__main__":
    urls = [
        "https://vnexpress.net/hon-150-trieu-dong-cho-cac-startup-tranh-tai-tai-pitchfest-2025-4885506.html",
        "https://www.techradar.com/computing/artificial-intelligence/this-new-chatgpt-feature-solves-the-most-annoying-thing-about-deep-research",
        "https://techcrunch.com/2025/05/13/attend-techcrunch-sessions-ai-with-this-new-limited-time-discount/"
    ]
    urls1 = ["https://techcrunch.com/2025/05/13/anthropic-google-score-win-by-nabbing-openai-backed-harvey-as-a-user/"]

    results = []
    for url in urls:
        result = process_article(url)
        if result:
            results.append(result)

    with open("articles.json", "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=4)

    print("\n✅ Đã lưu xong vào file articles.json")



📰 URL: https://vnexpress.net/hon-150-trieu-dong-cho-cac-startup-tranh-tai-tai-pitchfest-2025-4885506.html
📌 Tiêu đề: Hơn 150 triệu đồng cho các startup tranh tài tại PitchFest 2025
🧠 Tóm tắt:
Cuộc thi PitchFest, hoạt động trọng tâm của Tuần lễ Blockchain & AI Super Vietnam 2025 (4-5/6 tại Đà Nẵng), tìm kiếm các sáng kiến Web3, blockchain và AI. Tổng giải thưởng tiền mặt 50 triệu đồng chia cho Top 5, kèm gói truyền thông 100 triệu đồng và cơ hội tiếp cận hơn 30 quỹ đầu tư. Cuộc thi gồm 3 vòng: đăng ký, bình chọn (20-27/5 - Top 10 vào chung kết) và chung kết (4/6 - pitching 10 phút). Điều kiện tham gia: có MVP thuộc Web3/AI, tăng trưởng người dùng, đội ngũ kinh nghiệm, vốn gọi dưới 2 triệu USD. Super Vietnam 2025 do Orochi Network, FPT Online và DSAC tổ chức, VnExpress bảo trợ, dự kiến thu hút 7.000 khách, bao gồm hội nghị, triển lãm, kết nối giao thương và các hoạt động khác. Orochi Network, đơn vị phát triển hạ tầng kiểm chứng dữ liệu, có kinh nghiệm trong lĩnh vực an ninh mạng và hợp