In [10]:
import json

parser_config = {
    "techcrunch": {
        "domain": "techcrunch.com",
        "title": "div.article-hero__middle",
        "content": "div.entry-content",
        "images": "div.entry-content img",
        "author": "a.wp-block-tc23-author-card-name__link",
        "time": "time",
        "highlight": "a nofollow",
        "topic": "div.tc23-post-relevant-terms__terms a",
        "references": "div.entry-content a"
    },
    "vnexpress": {
        "domain": "vnexpress.net",
        "title": "h1.title-detail",
        "content": "div.sidebar-1",
        "images": "article.fck_detail img",
        "author": "div.sidebar-1 p.Normal",
        "time": "div.sidebar-1 span.date",
        "highlight": "",
        "topic": "ul.breadcrumb",
        "references": "div.width_common box-tinlienquanv2",
    },
    "techradar": {
        "domain": "techradar.com",
        "title": "div.news-article header h1",
        "content": "div.wcp-item-content p",
        "images": "div.wcp-item-content img",
        "author": "",
        "time": "",
        "highlight": "",
        "topic": "",
        "references": "",
    },
    "vietnamnet": {
        "domain": "https://vietnamnet.vn",
        "title": "div.news-article header h1",
        "content": "div.wcp-item-content p",
        "images": "div.wcp-item-content img",
        "author": "",
        "time": "",
        "highlight": "",
        "topic": "",
        "references": "",
    },
}

with open("parsers.json", "w", encoding="utf-8") as f:
    json.dump(parser_config, f, ensure_ascii=False, indent=4)


In [9]:
parser_links = {
    "techcrunch":{
        "domain": "techcrunch.com",
        "links": "a.loop-card__title-link",
    },
    "vnexpress":{
        "domain": "vnexpress.net",
        "links": "wrapper-topstory-folder flexbox width_common wrapper-topstory-folder-v2 no-border a"
    }
}
with open("parsers_links.json", "w", encoding="utf-8") as f:
    json.dump(parser_links, f, ensure_ascii=False, indent = 4)




In [12]:
import json
import time
import requests
from urllib.parse import urlparse
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import schedule

with open("parsers.json", "r", encoding="utf-8") as f:
    PARSERS = json.load(f)

with open("parsers_links.json", "r", encoding="utf-8") as f:
    PARSERS_LINKS = json.load(f)

def get_parser_by_domain(domain):
    for config in PARSERS.values():
        if config["domain"] in domain:
            return config
    return None

def get_parser_link(domain):
    for config in PARSERS_LINKS.values(): 
        if config["domain"] in domain:
            return config
    return None

def get_latest_news(soup, config):
    links_tags = soup.select(config["links"])
    print(links_tags)
    links = [a['href'] for a in links_tags]
    return links

def extract_by_config(soup, config):
    result = {
        "title": "",
        "author": "",
        "time": "",
        "topics": [],
        "content": []
    }
    
    title_tag = soup.select_one(config.get("title", ""))
    if title_tag:
        result["title"] = title_tag.get_text(strip=True)
        
    author_tag = soup.select_one(config.get("author", ""))
    if author_tag:
        result["author"] = author_tag.get_text(strip=True)
        
    time_tag = soup.select_one(config.get("time", ""))
    if time_tag:
        result["time"] = time_tag.get_text(strip=True)
        
    topic_tags = soup.select(config.get("topic", ""))
    if topic_tags:
        result["topics"] = [tag.get_text(strip=True) for tag in topic_tags]
        
    content_container = soup.select_one(config.get("content", ""))
    if content_container:
        all_elements = content_container.find_all(recursive=True)
        for element in all_elements:
            if element.name == "p" and element.get_text(strip=True):
                result["content"].append({"type": "text", "value": element.get_text(strip=True)})
            elif element.name == "img" and element.get("src", ""):
                result["content"].append({"type": "image", "value": element.get("src")})
            elif element.name == "a" and element.get("href", ""):
                result["content"].append({"type": "link", "value": element.get("href")})
                
    return result

def summarize(soup, config):
    try:
        title_tag = soup.select_one(config["title"])
        content_tags = soup.select(config["content"])
        image_tags = soup.select(config["images"])
        author_tag = soup.select_one(config["author"])
        time_tag = soup.select_one(config["time"])
        topics_tag = soup.select(config["topic"])
        references_tags = soup.select(config["references"])
        title = title_tag.text.strip() if title_tag else None
        content = "\n".join(p.text.strip() for p in content_tags if p.text.strip())
        images = [img["src"] for img in image_tags if img.get("src")]
        author = author_tag.text.strip() if author_tag else None
        time = time_tag.text.strip() if time_tag else None
        topics = [topic.get_text(strip=True) for topic in topics_tag]
        references = href_list = [link['href'] for link in references_tags if link.get('href')]
        return {
            "title": title,
            "content": content,
            "images": images,
            "author": author,
            "time": time,
            "topic": topics,
            "references": references,
        }
    except Exception as e:
        print(f"❌ Lỗi khi trích xuất dữ liệu: {e}")
        return None

def parser_seo(soup, config):
    try:
        seo_data = {
            "meta_title": "",
            "meta_description": "",
            "meta_keywords": [],
            "h1": "",
            "h2": [],
            "canonical_url": "",
            "word_count": 0,
            "internal_links": [],
            "external_links": []
        }
        meta_title = soup.find("meta", property="og:title") or soup.find("meta", attrs={"name": "title"})
        if meta_title and meta_title.get("content"):
            seo_data["meta_title"] = meta_title["content"]
        meta_desc = soup.find("meta", property="og:description") or soup.find("meta", attrs={"name": "description"})
        if meta_desc and meta_desc.get("content"):
            seo_data["meta_description"] = meta_desc["content"]
        meta_keywords = soup.find("meta", attrs={"name": "keywords"})
        if meta_keywords and meta_keywords.get("content"):
            seo_data["meta_keywords"] = [kw.strip() for kw in meta_keywords["content"].split(",")]
        h1_tag = soup.find("h1")
        if h1_tag:
            seo_data["h1"] = h1_tag.get_text(strip=True)
        h2_tags = soup.find_all("h2")
        if h2_tags:
            seo_data["h2"] = [h2.get_text(strip=True) for h2 in h2_tags]
        canonical = soup.find("link", rel="canonical")
        if canonical and canonical.get("href"):
            seo_data["canonical_url"] = canonical["href"]
        content_container = soup.select_one(config.get("content", ""))
        if content_container:
            text_content = " ".join(p.get_text(strip=True) for p in content_container.find_all("p"))
            seo_data["word_count"] = len(text_content.split())

        domain = config["domain"]
        links = soup.select(config.get("references", ""))
        for link in links:
            href = link.get("href")
            if href:
                if domain in href or href.startswith("/"):
                    seo_data["internal_links"].append(href)
                else:
                    seo_data["external_links"].append(href)

        return seo_data
    except Exception as e:
        print(f"❌ Lỗi khi phân tích SEO: {e}")
        return None

def get_seo_inf(url):
    try:
        options = Options()
        options.headless = True
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
        driver.get(url)
        time.sleep(3)
        soup = BeautifulSoup(driver.page_source, "html.parser")
        driver.quit()
        domain = urlparse(url).netloc
        config = get_parser_by_domain(domain)

        if not config:
            print(f"❌ Không có parser cho domain: {domain}")
            return None

        seo_data = parser_seo(soup, config)
        if not seo_data:
            print(f"⚠️ Không thể thu thập dữ liệu SEO cho: {url}")
            return None
        output_file = "seo_data.json"
        try:
            with open(output_file, "r", encoding="utf-8") as f:
                existing_data = json.load(f)
        except FileNotFoundError:
            existing_data = []

        existing_data.append({
            "url": url,
            "domain": domain,
            "seo_data": seo_data
        })

        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(existing_data, f, ensure_ascii=False, indent=4)

        return seo_data
    except Exception as e:
        print(f"❌ Lỗi khi xử lý URL {url}: {e}")
        return None

def process_article(url):
    try:
        options = Options()
        options.headless = True
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
        driver.get(url)
        time.sleep(3)
        soup = BeautifulSoup(driver.page_source, "html.parser")
        driver.quit()
        domain = urlparse(url).netloc 
        config = get_parser_by_domain(domain)
        config_links = get_parser_link(domain)
        
        if not config:
            print(f"❌ Không có parser cho domain: {domain}")
            return None

        if not config_links:
            print(f"❌ Không có parser_links cho domain: {domain}")
            return None

        links = get_latest_news(soup, config_links)
        print(links)
        article = extract_by_config(soup, config)
        summarize_article = summarize(soup, config)
        seo_data = parser_seo(soup, config)
        
        if not article or not article["title"] or not article["content"]:
            print(f"⚠️ Bỏ qua vì thiếu tiêu đề hoặc nội dung: {url}")
            return None
            
        return article, summarize_article, seo_data
    except Exception as e:
        print(f"❌ Lỗi khi xử lý URL {url}: {e}")   
        return None

def crawl_multiple_urls():
    try:
        articles = []
        summaries = []
        seo_data_list = []
        domains = [config["domain"] for config in PARSERS.values()]
        for domain in domains:
            config_links = get_parser_link(domain)
            if not config_links:
                print(f"❌ Không có parser_links cho domain: {domain}")
                continue
            options = Options()
            options.headless = True
            driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
            driver.get(f"https://{domain}")
            time.sleep(3)
            soup = BeautifulSoup(driver.page_source, "html.parser")
            driver.quit()
            links = get_latest_news(soup, config_links)
            for url in links[:2]:
                if url.startswith("/"):
                    url = f"https://{domain}{url}"

                result = process_article(url)
                if result:
                    article, summarize_article, seo_data = result
                    articles.append(article)
                    summaries.append(summarize_article)
                    seo_data_list.append({
                        "url": url,
                        "domain": domain,
                        "seo_data": seo_data
                    })
        with open("articles_test.json", "w", encoding="utf-8") as f:
            json.dump(articles, f, ensure_ascii=False, indent=4)
        with open("articles_summarize.json", "w", encoding="utf-8") as f:
            json.dump(summaries, f, ensure_ascii=False, indent=4)
        with open("seo_data.json", "w", encoding="utf-8") as f:
            json.dump(seo_data_list, f, ensure_ascii=False, indent=4)
            
        print(f"✅ Hoàn thành crawl lúc {time.strftime('%Y-%m-%d %H:%M:%S')}")
    except Exception as e:
        print(f"❌ Lỗi khi crawl multiple URLs: {e}")

if __name__ == "__main__":
    url = 'https://vnexpress.net/tau-nang-luong-mat-troi-co-the-cho-7-000-chiec-xe-4910902.html'
    # url = url = "https://techcrunch.com/2025/07/07/threads-is-nearing-xs-daily-app-users-new-data-shows/"
    article, summarize_article, seo_data = process_article(url)
    article = [article] if article else []
    summarize_article = [summarize_article] if summarize_article else []
    seo_data = [seo_data] if seo_data else []

    # print(article)
    # print(summarize_article)
    # print(seo_data)
    
    # with open("articles_test.json", "w", encoding="utf-8") as f:
    #     json.dump(article, f, ensure_ascii=False, indent=4)
    # with open("articles_summarize.json", "w", encoding="utf-8") as f:
    #     json.dump(summarize_article, f, ensure_ascii=False, indent=4)
    # with open("seo_data.json", "w", encoding="utf-8") as f:
    #     json.dump(seo_data, f, ensure_ascii=False, indent=4)

[]
[]


In [None]:
import json
import time
import requests
from urllib.parse import urlparse
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import schedule

with open("parsers.json", "r", encoding="utf-8") as f:
    PARSERS = json.load(f)

with open("parsers_links.json", "r", encoding="utf-8") as f:
    PARSERS_LINKS = json.load(f)

def get_parser_by_domain(domain):
    for config in PARSERS.values():
        if config["domain"] in domain:
            return config
    return None

def get_parser_link(domain):
    for config in PARSERS_LINKS.values():
        if config["domain"] in domain:
            return config
    return None

def get_latest_news(soup, config):
    links_tags = soup.select(config["links"])
    links = [a['href'] for a in links_tags]
    return links

def extract_by_config(soup, config):
    result = {
        "title": "",
        "author": "",
        "time": "",
        "topics": [],
        "content": []
    }
    
    title_tag = soup.select_one(config.get("title", ""))
    if title_tag:
        result["title"] = title_tag.get_text(strip=True)
        
    author_tag = soup.select_one(config.get("author", ""))
    if author_tag:
        result["author"] = author_tag.get_text(strip=True)
        
    time_tag = soup.select_one(config.get("time", ""))
    if time_tag:
        result["time"] = time_tag.get_text(strip=True)
        
    topic_tags = soup.select(config.get("topic", ""))
    if topic_tags:
        result["topics"] = [tag.get_text(strip=True) for tag in topic_tags]
        
    content_container = soup.select_one(config.get("content", ""))
    if content_container:
        all_elements = content_container.find_all(recursive=True)
        for element in all_elements:
            if element.name == "p" and element.get_text(strip=True):
                result["content"].append({"type": "text", "value": element.get_text(strip=True)})
            elif element.name == "img" and element.get("src", ""):
                result["content"].append({"type": "image", "value": element.get("src")})
            elif element.name == "a" and element.get("href", ""):
                result["content"].append({"type": "link", "value": element.get("href")})
                
    return result

def summarize(soup, config):
    try:
        title_tag = soup.select_one(config["title"])
        content_tags = soup.select(config["content"])
        image_tags = soup.select(config["images"])
        author_tag = soup.select_one(config["author"])
        time_tag = soup.select_one(config["time"])
        topics_tag = soup.select(config["topic"])
        references_tags = soup.select(config["references"])
        title = title_tag.text.strip() if title_tag else None
        content = "\n".join(p.text.strip() for p in content_tags if p.text.strip())
        images = [img["src"] for img in image_tags if img.get("src")]
        author = author_tag.text.strip() if author_tag else None
        time = time_tag.text.strip() if time_tag else None
        topics = [topic.get_text(strip=True) for topic in topics_tag]
        references = href_list = [link['href'] for link in references_tags if link.get('href')]
        return {
            "title": title,
            "content": content,
            "images": images,
            "author": author,
            "time": time,
            "topic": topics,
            "references": references,
        }
    except Exception as e:
        print(f"❌ Lỗi khi trích xuất dữ liệu: {e}")
        return None

def parser_seo(soup, config):
    try:
        seo_data = {
            "meta_title": "",
            "meta_description": "",
            "meta_keywords": [],
            "h1": "",
            "h2": [],
            "canonical_url": "",
            "word_count": 0,
            "internal_links": [],
            "external_links": []
        }
        meta_title = soup.find("meta", property="og:title") or soup.find("meta", attrs={"name": "title"})
        if meta_title and meta_title.get("content"):
            seo_data["meta_title"] = meta_title["content"]
        meta_desc = soup.find("meta", property="og:description") or soup.find("meta", attrs={"name": "description"})
        if meta_desc and meta_desc.get("content"):
            seo_data["meta_description"] = meta_desc["content"]
        meta_keywords = soup.find("meta", attrs={"name": "keywords"})
        if meta_keywords and meta_keywords.get("content"):
            seo_data["meta_keywords"] = [kw.strip() for kw in meta_keywords["content"].split(",")]
        h1_tag = soup.find("h1")
        if h1_tag:
            seo_data["h1"] = h1_tag.get_text(strip=True)
        h2_tags = soup.find_all("h2")
        if h2_tags:
            seo_data["h2"] = [h2.get_text(strip=True) for h2 in h2_tags]
        canonical = soup.find("link", rel="canonical")
        if canonical and canonical.get("href"):
            seo_data["canonical_url"] = canonical["href"]
        content_container = soup.select_one(config.get("content", ""))
        if content_container:
            text_content = " ".join(p.get_text(strip=True) for p in content_container.find_all("p"))
            seo_data["word_count"] = len(text_content.split())

        domain = config["domain"]
        links = soup.select(config.get("references", ""))
        for link in links:
            href = link.get("href")
            if href:
                if domain in href or href.startswith("/"):
                    seo_data["internal_links"].append(href)
                else:
                    seo_data["external_links"].append(href)

        return seo_data
    except Exception as e:
        print(f"❌ Lỗi khi phân tích SEO: {e}")
        return None

def get_seo_inf(url):
    try:
        options = Options()
        options.headless = True
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
        driver.get(url)
        time.sleep(3)
        soup = BeautifulSoup(driver.page_source, "html.parser")
        driver.quit()
        domain = urlparse(url).netloc
        config = get_parser_by_domain(domain)

        if not config:
            print(f"❌ Không có parser cho domain: {domain}")
            return None

        seo_data = parser_seo(soup, config)
        if not seo_data:
            print(f"⚠️ Không thể thu thập dữ liệu SEO cho: {url}")
            return None
        output_file = "seo_data.json"
        try:
            with open(output_file, "r", encoding="utf-8") as f:
                existing_data = json.load(f)
        except FileNotFoundError:
            existing_data = []

        existing_data.append({
            "url": url,
            "domain": domain,
            "seo_data": seo_data
        })

        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(existing_data, f, ensure_ascii=False, indent=4)

        return seo_data
    except Exception as e:
        print(f"❌ Lỗi khi xử lý URL {url}: {e}")
        return None

def process_article(url):
    try:
        options = Options()
        options.headless = True
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
        driver.get(url)
        time.sleep(3)
        soup = BeautifulSoup(driver.page_source, "html.parser")
        driver.quit()
        domain = urlparse(url).netloc 
        config = get_parser_by_domain(domain)
        config_links = get_parser_link(domain)
        
        if not config:
            print(f"❌ Không có parser cho domain: {domain}")
            return None

        if not config_links:
            print(f"❌ Không có parser_links cho domain: {domain}")
            return None

        links = get_latest_news(soup, config_links)
        article = extract_by_config(soup, config)
        summarize_article = summarize(soup, config)
        seo_data = parser_seo(soup, config)
        
        if not article or not article["title"] or not article["content"]:
            print(f"⚠️ Bỏ qua vì thiếu tiêu đề hoặc nội dung: {url}")
            return None
            
        return article, summarize_article, seo_data
    except Exception as e:
        print(f"❌ Lỗi khi xử lý URL {url}: {e}")   
        return None

def crawl_multiple_urls():
    try:
        articles = []
        summaries = []
        seo_data_list = []
        domains = [config["domain"] for config in PARSERS.values()]
        for domain in domains:
            config_links = get_parser_link(domain)
            if not config_links:
                print(f"❌ Không có parser_links cho domain: {domain}")
                continue
            options = Options()
            options.headless = True
            driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
            driver.get(f"https://{domain}")
            time.sleep(3)
            soup = BeautifulSoup(driver.page_source, "html.parser")
            driver.quit()
            links = get_latest_news(soup, config_links)
            for url in links[:2]:
                if url.startswith("/"):
                    url = f"https://{domain}{url}"

                result = process_article(url)
                if result:
                    article, summarize_article, seo_data = result
                    articles.append(article)
                    summaries.append(summarize_article)
                    seo_data_list.append({
                        "url": url,
                        "domain": domain,
                        "seo_data": seo_data
                    })
        with open("articles_test.json", "w", encoding="utf-8") as f:
            json.dump(articles, f, ensure_ascii=False, indent=4)
        with open("articles_summarize.json", "w", encoding="utf-8") as f:
            json.dump(summaries, f, ensure_ascii=False, indent=4)
        with open("seo_data.json", "w", encoding="utf-8") as f:
            json.dump(seo_data_list, f, ensure_ascii=False, indent=4)
            
        print(f"✅ Hoàn thành crawl lúc {time.strftime('%Y-%m-%d %H:%M:%S')}")
    except Exception as e:
        print(f"❌ Lỗi khi crawl multiple URLs: {e}")

def start_crawling():
    schedule.every(1).minutes.do(crawl_multiple_urls)
    
    print("🚀 Bắt đầu lịch crawl tự động mỗi 1 phút...")
    while True:
        schedule.run_pending()
        time.sleep(1)

if __name__ == "__main__":
    start_crawling()

🚀 Bắt đầu lịch crawl tự động mỗi 1 phút...


In [None]:
import csv
import json
import time
from urllib.parse import urlparse
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from transformers import pipeline

with open("parsers.json", "r", encoding="utf-8") as f:
    PARSERS = json.load(f)

def get_parser_by_domain(domain):
    for name, config in PARSERS.items():
        if config["domain"] in domain:
            return config
    return None

def extract_by_config(soup, config):
    title_tag = soup.select_one(config["title"])
    paragraphs = soup.select(config["content"])
    images = [img["src"] for img in soup.select(config["images"]) if img.get("src")]

    title = title_tag.text.strip() if title_tag else None
    content = "\n".join(p.text.strip() for p in paragraphs if p.text.strip())

    return {
        "title": title,
        "content": content,
        "images": images
    }

def process_article(url, summarizer):
    options = Options()
    options.headless = True
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    driver.get(url)
    time.sleep(3)
    soup = BeautifulSoup(driver.page_source, "html.parser")
    driver.quit()

    domain = urlparse(url).netloc
    config = get_parser_by_domain(domain)

    if not config:
        print(f"❌ Không có parser cho domain: {domain}")
        return None

    article = extract_by_config(soup, config)

    if not article["title"] or not article["content"].strip():
        print(f"⚠️ Bỏ qua vì thiếu tiêu đề hoặc nội dung: {url}")
        return None

    content_trimmed = article["content"][:3000].strip()

    try:
        summary = summarizer(content_trimmed, max_length=200, min_length=50, do_sample=False)[0]["summary_text"]
    except Exception as e:
        print(f"❌ Lỗi khi tóm tắt bài viết: {url}\n{e}")
        return None

    print("\n📰 URL:", url)
    print("📌 Tiêu đề:", article["title"])
    print("🖼️ Ảnh:", article["images"] if article["images"] else "Không có ảnh")
    print("\n📄 Nội dung:\n", article["content"])
    print("\n🧠 Tóm tắt:\n", summary)

    return {
        "title": article["title"],
        "url": url,
        "summary": summary,
        "content": article["content"],
        "images": ", ".join(article["images"])
    }

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

urls = [
    "https://techcrunch.com/2025/05/13/aws-enters-into-strategic-partnership-with-saudi-arabia-backed-humain/",
    "https://vnexpress.net/hon-150-trieu-dong-cho-cac-startup-tranh-tai-tai-pitchfest-2025-4885506.html",
    "https://www.techradar.com/computing/artificial-intelligence/this-new-chatgpt-feature-solves-the-most-annoying-thing-about-deep-research"
]

results = []
for url in urls:
    result = process_article(url, summarizer)
    if result:
        results.append(result)

with open("articles.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=4)

print("\n✅ Đã lưu vào file articles.json")
