## Kenh 14

In [2]:
import pandas as pd
import requests
import os
from bs4 import BeautifulSoup
from google.colab import drive
drive.mount('/content/drive')

# thi·∫øt l·∫≠p ƒë∆∞·ªùng d·∫´n file
def setup_path():
    """Thi·∫øt l·∫≠p ƒë∆∞·ªùng d·∫´n l∆∞u file CSV."""
    output_path = "/content/drive/MyDrive/Scrapper_Kenh14_update.csv"
    return output_path

# crawl d·ªØ li·ªáu trang b√°o
def fetch_main_page():
  """fetch the main page (kenh 14)"""
  url = 'http://kenh14.vn'
  try:
    response = requests.get(url, timeout = 5)
    response.raise_for_status()
    return response.text, url
  except requests.exceptions.RequestException as e:
    print ({f"Error matching main page: {e}"})
    return None, url

# L·∫•y danh s√°ch chuy√™n m·ª•c
def fetch_menu_links(url, headers):
    """L·∫•y danh s√°ch chuy√™n m·ª•c t·ª´ trang K√™nh 14."""
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    menu = soup.find("ul", class_="kbh-menu-list clearfix fl")
    menu_links = {}

    if menu:
        for item in menu.find_all("li"):
            link_tag = item.find("a")
            if link_tag:
                text = link_tag.get_text(strip=True)
                link = link_tag["href"]
                if link.startswith("/"):
                    link = url + link.lstrip("/")
                menu_links[text] = link
    return menu_links

# L·∫•y b√†i vi·∫øt c·ªßa t·ª´ng chuy√™n m·ª•c
def crawl_category_page(category_name, category_url, headers, url):
    """Tr√≠ch xu·∫•t danh s√°ch b√†i vi·∫øt t·ª´ chuy√™n m·ª•c."""
    print(f"ƒêang crawl: {category_name} - {category_url}")
    article_data = []

    if category_url.startswith("http"):
        response = requests.get(category_url, headers=headers)
        soup = BeautifulSoup(response.text, "html.parser")
        articles = soup.find_all("h3", class_="knswli-title")

        for article in articles:
            a_tag = article.find("a")
            if a_tag:
                article_id = a_tag.get("newsid", None)
                title = a_tag.get_text(strip=True)
                href = a_tag["href"]
                if href.startswith("/"):
                    href = url + href.lstrip("/")
                article_data.append({
                    "ID": article_id,
                    "Title": title,
                    "Href": href,
                    "Category": category_name
                })
    else:
        print(f" - B·ªè qua URL kh√¥ng h·ª£p l·ªá: {category_url}")

    return article_data

# L·∫•y n·ªôi dung c·ªßa t·ª´ng b√†i vi·∫øt trong chuy√™n m·ª•c
def extract_article_content(df, headers):
    """L·∫•y n·ªôi dung b√†i vi·∫øt v√† th·ªùi gian ƒëƒÉng b√†i."""
    df["Content"] = None
    df["Time"] = None

    for index, row in df.iterrows():
        article_url = row['Href']
        try:
            response = requests.get(article_url, headers=headers)
            soup = BeautifulSoup(response.text, "html.parser")

            # L·∫•y n·ªôi dung b√†i vi·∫øt
            content_div = soup.find("div", class_="detail-content afcbc-body")
            content = content_div.get_text(separator="\n", strip=True) if content_div else "Kh√¥ng t√¨m th·∫•y n·ªôi dung"

            # L·∫•y th·ªùi gian ƒëƒÉng b√†i
            time_div = soup.find("div", class_="kbwc-meta")
            time_span = time_div.find("span", class_="kbwcm-time") if time_div else None
            time = time_span.get_text(strip=True) if time_span else "Kh√¥ng t√¨m th·∫•y th·ªùi gian"

            df.at[index, "Content"] = content
            df.at[index, "Time"] = time

        except requests.exceptions.RequestException as e:
            print(f"‚ùå L·ªói khi l·∫•y {article_url}: {e}")

    return df

def main():
    """H√†m ch√≠nh ƒë·ªÉ ch·∫°y to√†n b·ªô qu√° tr√¨nh."""
    output_path = setup_path()
    url = "https://kenh14.vn/"
    headers = {"User-Agent": "Mozilla/5.0"}

    # L·∫•y danh s√°ch chuy√™n m·ª•c
    menu_links = fetch_menu_links(url, headers)

    # Crawl d·ªØ li·ªáu t·ª´ t·ª´ng chuy√™n m·ª•c
    all_articles = []
    for category, link in menu_links.items():
        articles = crawl_category_page(category, link, headers, url)
        all_articles.extend(articles)

    # Chuy·ªÉn d·ªØ li·ªáu th√†nh DataFrame
    df = pd.DataFrame(all_articles)

    # L·∫•y n·ªôi dung v√† th·ªùi gian ƒëƒÉng b√†i
    df = extract_article_content(df, headers)

    # L∆∞u v√†o file CSV
    file_exists = os.path.exists(output_path)
    df.to_csv(output_path, mode="a", header=not file_exists, index=False)

    print("‚úÖ ƒê√£ l∆∞u d·ªØ li·ªáu v√†o:", output_path)
    return df

if __name__ == "__main__":
    data_kenh14 = main()
    if data_kenh14 is not None:
        print(data_kenh14.head(5))

Mounted at /content/drive
ƒêang crawl: TRANG CH·ª¶ - https://kenh14.vn/
ƒêang crawl: Star - https://kenh14.vn/star.chn
ƒêang crawl: Cin√© - https://kenh14.vn/cine.chn
ƒêang crawl: Musik - https://kenh14.vn/musik.chn
ƒêang crawl: Beauty & Fashion - https://kenh14.vn/beauty-fashion.chn
ƒêang crawl: ƒê·ªùi s·ªëng - https://kenh14.vn/doi-song.chn
ƒêang crawl: Money-Z - https://kenh14.vn/money-z.chn
ƒêang crawl: ƒÇn - Qu·∫©y - ƒêi - https://kenh14.vn/an-quay-di.chn
ƒêang crawl: X√£ h·ªôi - https://kenh14.vn/xa-hoi.chn
ƒêang crawl: S·ª©c kh·ªèe - https://kenh14.vn/suc-khoe.chn
ƒêang crawl: Tek-life - https://kenh14.vn/tek-life.chn
ƒêang crawl: H·ªçc ƒë∆∞·ªùng - https://kenh14.vn/hoc-duong.chn
ƒêang crawl: Xem Mua Lu√¥n - https://kenh14.vn/xem-mua-luon.chn
ƒêang crawl: Video - http://video.kenh14.vn/
ƒêang crawl:  - javascript:;
 - B·ªè qua URL kh√¥ng h·ª£p l·ªá: javascript:;
ƒêang crawl: Nh√¢n v·∫≠t - https://kenh14.vn/hoc-duong/nhan-vat.chn
ƒêang crawl: Xem-ƒÇn-Ch∆°i - https://kenh14.vn/xem

In [3]:
data_kenh14

Unnamed: 0,ID,Title,Href,Category,Content,Time
0,215250613121224562,Ta l√† ai trong th·ªùi ƒë·∫°i A.I?,https://kenh14.vn/ta-la-ai-trong-thoi-dai-ai-2...,TRANG CH·ª¶,Kh√¥ng t√¨m th·∫•y n·ªôi dung,Kh√¥ng t√¨m th·∫•y th·ªùi gian
1,,"ƒêƒÉng quang Hoa h·∫≠u Vi·ªát Nam, H√† Tr√∫c Linh s·∫Ω k...",https://kenh14.vn/dang-quang-hoa-hau-viet-nam-...,Star,Kh√¥ng t√¨m th·∫•y n·ªôi dung,14:28 08/07/2025
2,,T√¢n hoa h·∫≠u Vi·ªát v·ª´a ƒëƒÉng quang ƒë√£ v∆∞·ªõng tranh...,https://kenh14.vn/tan-hoa-hau-viet-vua-dang-qu...,Star,"Ch·ªâ trong t·∫ßm 1 th√°ng g·∫ßn ƒë√¢y, showbiz Vi·ªát c√≥...",14:22 08/07/2025
3,,Hai nh√≥c t·ª≥ nh√† Ph∆∞∆°ng Oanh - shark B√¨nh khi·∫øn...,https://kenh14.vn/hai-nhoc-ty-nha-phuong-oanh-...,Star,"Sau th·ªùi gian k√≠n ti·∫øng v·ªÅ ƒë·ªùi t∆∞, di·ªÖn vi√™n ƒê...",14:00 08/07/2025
4,,D√¢u tr∆∞·ªüng b·∫•t tr·ªã c·ªßa Beckham gi·ªù g√¢y chi·∫øn ƒë...,https://kenh14.vn/dau-truong-bat-tri-cua-beckh...,Star,M·ªëi quan h·ªá gi·ªØa v·ª£ ch·ªìng\nBrooklyn Beckham\n-...,12:34 08/07/2025
...,...,...,...,...,...,...
344,,Ba th√≥i quen ‚Äúph√° m·∫°ch m√°u‚Äù nhanh h∆°n ƒÉn th·ªãt ...,https://kenh14.vn/ba-thoi-quen-pha-mach-mau-nh...,C√°c b·ªánh,M·∫°ch m√°u l√† m·ªôt ph·∫ßn quan tr·ªçng c·ªßa h·ªá tu·∫ßn ho...,10:57 07/07/2025
345,,"Sau khi ƒÉn m√≥n kho√°i kh·∫©u, ng∆∞·ªùi ƒë√†n √¥ng 73 tu...",https://kenh14.vn/sau-khi-an-mon-khoai-khau-ng...,C√°c b·ªánh,M·ªôt v·ª• ng·ªô ƒë·ªôc hi h·ªØu v·ª´a x·∫£y ra t·∫°i th√†nh ph·ªë...,10:32 07/07/2025
346,,Nam k·ªπ s∆∞ tr·∫ª ƒëi kh√°m t√¢m th·∫ßn v√¨ tim ƒë·∫≠p nhan...,https://kenh14.vn/nam-ky-su-tre-di-kham-tam-th...,C√°c b·ªánh,"G·∫ßn nƒÉm nay, nam thanh ni√™n li√™n t·ª•c g·∫∑p c√°c t...",09:36 07/07/2025
347,,ƒê·ª´ng ƒÉn 3 th·ª±c ph·∫©m n√†y m·ªói ng√†y k·∫ªo t·ªïn th∆∞∆°n...,https://kenh14.vn/dung-an-3-thuc-pham-nay-moi-...,C√°c b·ªánh,N·ªìng ƒë·ªô axit uric cao kh√¥ng ch·ªâ g√¢y ra b·ªánh g√∫...,17:06 06/07/2025


### KNN

### LogisticRegression

## B√°o tu·ªïi tr·∫ª

In [6]:
import pandas as pd
import requests
import os
from bs4 import BeautifulSoup
from google.colab import drive
drive.mount('/content/drive')

def setup_path():
    """Set up file path for output"""
    ouput_path = "/content/drive/MyDrive/Scrapper_baotuoitre_update.csv"
    return ouput_path

def fetch_main_page():
    """fetch main page form baotuoitre"""
    base_url = "https://tuoitre.vn/"
    try:
      response = requests.get(base_url, timeout=5)
      response.raise_for_status()
      return response.text, base_url
    except requests.exceptions.RequestException as e:
      print ({f"Error fetching main page: {e}"})
      return None, base_url

from urllib.parse import urljoin

def fetch_menu_links(url, headers):
    """L·∫•y danh s√°ch chuy√™n m·ª•c t·ª´ trang B√°o tu·ªïi tr·∫ª."""
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    menu = soup.find("ul", class_="menu-nav")
    menu_links = {}

    if menu:
        for item in menu.find_all("li"):
            link_tag = item.find("a")
            if link_tag:
                text = link_tag.get_text(strip=True)
                link = link_tag["href"]
                # Assign link to href directly instead of using a separate variable
                href = urljoin(url, link)
                menu_links[text] = href  # Use href, which has the joined URL
    return menu_links

def crawl_category_page(category_name, category_url, headers, url):
    """Tr√≠ch xu·∫•t danh s√°ch b√†i vi·∫øt t·ª´ chuy√™n m·ª•c."""
    print(f"üì∞ ƒêang crawl: {category_name} - {category_url}")
    article_data = []

    if category_url.startswith("http"):
        try:
            response = requests.get(category_url, headers=headers)
            response.raise_for_status()
        except Exception as e:
            print(f"‚ùå Kh√¥ng th·ªÉ t·∫£i trang {category_url}: {e}")
            return article_data

        soup = BeautifulSoup(response.text, "html.parser")
        articles = soup.find_all("h3", class_="box-title-text")

        for article in articles:
            a_tag = article.find("a")
            if a_tag:
                article_id = a_tag.get("newsid")
                if not article_id:
                    article_id = hash(a_tag["href"])

                title = a_tag.get_text(strip=True)
                href = a_tag["href"]
                href = urljoin(url, href)

                article_data.append({
                    "ID": str(article_id),
                    "Title": title,
                    "Href": href,
                    "Category": category_name  # ‚úÖ Lu√¥n th√™m
                })

        print(f"  ‚Üí T√¨m th·∫•y {len(article_data)} b√†i vi·∫øt")
    else:
        print(f"‚ö†Ô∏è B·ªè qua URL kh√¥ng h·ª£p l·ªá: {category_url}")

    return article_data


# L·∫•y n·ªôi dung c·ªßa t·ª´ng b√†i vi·∫øt trong chuy√™n m·ª•c
def extract_article_content(df, headers):
    """L·∫•y n·ªôi dung b√†i vi·∫øt v√† th·ªùi gian ƒëƒÉng b√†i."""
    df["Content"] = None
    df["Time"] = None

    for index, row in df.iterrows():
        article_url = row['Href']
        try:
            response = requests.get(article_url, headers=headers)
            soup = BeautifulSoup(response.text, "html.parser")

            # L·∫•y n·ªôi dung b√†i vi·∫øt
            content_div = soup.find("div", class_="detail-content afcbc-body")
            content = content_div.get_text(separator="\n", strip=True) if content_div else "Kh√¥ng t√¨m th·∫•y n·ªôi dung"

            # L·∫•y th·ªùi gian ƒëƒÉng b√†i
            time_div = soup.find("div", class_="detail-time")
            time = time_div.get_text(strip = True) if time_div else "Kh√¥ng t√¨m th·∫•y th·ªùi gian"

            df.at[index, "Content"] = content
            df.at[index, "Time"] = time

        except requests.exceptions.RequestException as e:
            print(f"‚ùå L·ªói khi l·∫•y {article_url}: {e}")

    return df

def main():
    """H√†m ch√≠nh ƒë·ªÉ ch·∫°y to√†n b·ªô qu√° tr√¨nh."""
    output_path = setup_path()
    url = "https://tuoitre.vn/" # Define 'url' here
    headers = {"User-Agent": "Mozilla/5.0"}

    # L·∫•y danh s√°ch chuy√™n m·ª•c
    menu_links = fetch_menu_links(url, headers)

    # Crawl d·ªØ li·ªáu t·ª´ t·ª´ng chuy√™n m·ª•c
    all_articles = []
    for category, link in menu_links.items():
        articles = crawl_category_page(category, link, headers, url) # Pass 'url' to the function
        all_articles.extend(articles)

    # Chuy·ªÉn d·ªØ li·ªáu th√†nh DataFrame
    df = pd.DataFrame(all_articles)
    df["ID"] = df["ID"].astype(str)

    # L·∫•y n·ªôi dung v√† th·ªùi gian ƒëƒÉng b√†i
    df = extract_article_content(df, headers)

    # L∆∞u v√†o file CSV
    file_exists = os.path.exists(output_path)
    df.to_csv(output_path, mode="a", header=not file_exists, index=False)

    print("‚úÖ ƒê√£ l∆∞u d·ªØ li·ªáu v√†o:", output_path)
    return df

if __name__ == "__main__":
    data_baotuoitre = main()
    if data_baotuoitre is not None:
        print(data_baotuoitre.head(5))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
üì∞ ƒêang crawl:  - https://tuoitre.vn/
  ‚Üí T√¨m th·∫•y 109 b√†i vi·∫øt
üì∞ ƒêang crawl: Video - https://tuoitre.vn/video.htm
  ‚Üí T√¨m th·∫•y 74 b√†i vi·∫øt
üì∞ ƒêang crawl: Th·ªùi s·ª± - https://tuoitre.vn/thoi-su.htm
  ‚Üí T√¨m th·∫•y 20 b√†i vi·∫øt
üì∞ ƒêang crawl: Th·∫ø gi·ªõi - https://tuoitre.vn/the-gioi.htm
  ‚Üí T√¨m th·∫•y 20 b√†i vi·∫øt
üì∞ ƒêang crawl: Ph√°p lu·∫≠t - https://tuoitre.vn/phap-luat.htm
  ‚Üí T√¨m th·∫•y 17 b√†i vi·∫øt
üì∞ ƒêang crawl: Kinh doanh - https://tuoitre.vn/kinh-doanh.htm
  ‚Üí T√¨m th·∫•y 20 b√†i vi·∫øt
üì∞ ƒêang crawl: C√¥ng ngh·ªá - https://tuoitre.vn/cong-nghe.htm
  ‚Üí T√¨m th·∫•y 29 b√†i vi·∫øt
üì∞ ƒêang crawl: Xe - https://tuoitre.vn/xe.htm
  ‚Üí T√¨m th·∫•y 29 b√†i vi·∫øt
üì∞ ƒêang crawl: Du l·ªãch - https://tuoitre.vn/du-lich.htm
  ‚Üí T√¨m th·∫•y 20 b√†i vi·∫øt
üì∞ ƒêang crawl: Nh·ªãp s·ªëng tr·∫ª - ht

In [7]:
data_baotuoitre.head(10)

Unnamed: 0,ID,Title,Href,Category,Content,Time
0,-2260756287483343458,TƒÉng tr∆∞·ªüng t√≠n d·ª•ng c·∫£ n∆∞·ªõc g·∫ßn 10%: cao nh·∫•t...,https://tuoitre.vn/tang-truong-tin-dung-ca-nuo...,,Ph√≥ th·ªëng ƒë·ªëc Ng√¢n h√†ng Nh√† n∆∞·ªõc Ph·∫°m Thanh H√†...,08/07/2025 15:15 GMT+7
1,682725038049155215,Lo·∫°t xe ƒëi·ªán ch·ªü kh√°ch du l·ªãch b·ªã thi√™u r·ª•i tr...,https://tuoitre.vn/loat-xe-dien-cho-khach-du-l...,,Ng·ªçn l·ª≠a b·ªëc ch√°y d·ªØ d·ªôi t·∫°i nh√† ƒë·ªÉ xe ƒëi·ªán - ...,08/07/2025 15:12 GMT+7
2,4992644287564071236,T√°c d·ª•ng ƒë·∫∑c bi·ªát √≠t ai bi·∫øt v·ªÅ c·ªß khoai lang,https://tuoitre.vn/tac-dung-dac-biet-it-ai-bie...,,Khoai lang c√≥ nhi·ªÅu t√°c d·ª•ng ƒë·∫∑c bi·ªát - ·∫¢nh mi...,08/07/2025 15:16 GMT+7
3,1068133380818326384,L≈© l·ª•t kinh ho√†ng ·ªü Texas b∆∞·ªõc sang ng√†y th·ª© 5...,https://tuoitre.vn/lu-lut-kinh-hoang-o-texas-b...,,H√¨nh ·∫£nh m·ª±c n∆∞·ªõc con s√¥ng Guadalupe ·ªü h·∫°t Ker...,08/07/2025 15:20 GMT+7
4,1159546553898035122,"C·∫ßn th·ªß v√†o m√πa sƒÉn c√° c√≥c tr√™n s√¥ng H·∫≠u, t·ªânh...",https://tuoitre.vn/can-thu-vao-mua-san-ca-coc-...,,Anh Nguy·ªÖn Thanh B√¨nh c√πng s·ªë c√° c√≥c c√¢u ƒë∆∞·ª£c ...,08/07/2025 15:19 GMT+7
5,1125808989975766984,ƒê∆∞·ªùng d√¢y t·ªëng ti·ªÅn b·∫±ng h√¨nh ·∫£nh nh·∫°y c·∫£m c·ªßa...,https://tuoitre.vn/duong-day-tong-tien-bang-hi...,,Nh·ªØng ng∆∞·ªùi li√™n quan ƒë·∫øn ƒë∆∞·ªùng d√¢y t·ªëng ti·ªÅn ...,08/07/2025 13:53 GMT+7
6,-2142646541881351185,"Ch√°y nhi·ªÅu ph√≤ng tr·ªç ·ªü ph∆∞·ªùng T√¢n Th·ªõi Hi·ªáp, T...",https://tuoitre.vn/chay-nhieu-phong-tro-o-phuo...,,C·∫£nh s√°t ph√≤ng ch√°y ch·ªØa ch√°y ƒë·∫øn hi·ªán tr∆∞·ªùng ...,08/07/2025 11:11 GMT+7
7,6128318560092971323,"Th√°i Lan, Nh·∫≠t B·∫£n v√† nhi·ªÅu n∆∞·ªõc l√™n ti·∫øng v·ªÅ ...",https://tuoitre.vn/thai-lan-nhat-ban-va-nhieu-...,,T·ªïng th·ªëng Donald Trump ng√†y 6-7 - ·∫¢nh: AFP\n...,08/07/2025 13:42 GMT+7
8,-3099726701672378505,B√≠ th∆∞ Th√†nh ·ªßy TP.HCM Nguy·ªÖn VƒÉn N√™n vi·∫øng c√°...,https://tuoitre.vn/bi-thu-thanh-uy-tp-hcm-nguy...,,"B√≠ th∆∞ Nguy·ªÖn VƒÉn N√™n chia s·∫ª, ƒë·ªông vi√™n gia ƒë...",08/07/2025 11:47 GMT+7
9,-3774541804536343103,Kh√¥ng d·ª± ƒë√°m tang nh∆∞ng Cristiano Ronaldo ƒë√£ ƒë...,https://tuoitre.vn/khong-du-dam-tang-nhung-cri...,,H√¨nh ·∫£nh cho th·∫•y c·∫ßu th·ªß Cristiano Ronaldo vi...,08/07/2025 09:22 GMT+7


In [8]:
data_baotuoitre['Category'] = data_baotuoitre['Category'].replace('', 'Trang ch·ªß')
data_baotuoitre

Unnamed: 0,ID,Title,Href,Category,Content,Time
0,-2260756287483343458,TƒÉng tr∆∞·ªüng t√≠n d·ª•ng c·∫£ n∆∞·ªõc g·∫ßn 10%: cao nh·∫•t...,https://tuoitre.vn/tang-truong-tin-dung-ca-nuo...,Trang ch·ªß,Ph√≥ th·ªëng ƒë·ªëc Ng√¢n h√†ng Nh√† n∆∞·ªõc Ph·∫°m Thanh H√†...,08/07/2025 15:15 GMT+7
1,682725038049155215,Lo·∫°t xe ƒëi·ªán ch·ªü kh√°ch du l·ªãch b·ªã thi√™u r·ª•i tr...,https://tuoitre.vn/loat-xe-dien-cho-khach-du-l...,Trang ch·ªß,Ng·ªçn l·ª≠a b·ªëc ch√°y d·ªØ d·ªôi t·∫°i nh√† ƒë·ªÉ xe ƒëi·ªán - ...,08/07/2025 15:12 GMT+7
2,4992644287564071236,T√°c d·ª•ng ƒë·∫∑c bi·ªát √≠t ai bi·∫øt v·ªÅ c·ªß khoai lang,https://tuoitre.vn/tac-dung-dac-biet-it-ai-bie...,Trang ch·ªß,Khoai lang c√≥ nhi·ªÅu t√°c d·ª•ng ƒë·∫∑c bi·ªát - ·∫¢nh mi...,08/07/2025 15:16 GMT+7
3,1068133380818326384,L≈© l·ª•t kinh ho√†ng ·ªü Texas b∆∞·ªõc sang ng√†y th·ª© 5...,https://tuoitre.vn/lu-lut-kinh-hoang-o-texas-b...,Trang ch·ªß,H√¨nh ·∫£nh m·ª±c n∆∞·ªõc con s√¥ng Guadalupe ·ªü h·∫°t Ker...,08/07/2025 15:20 GMT+7
4,1159546553898035122,"C·∫ßn th·ªß v√†o m√πa sƒÉn c√° c√≥c tr√™n s√¥ng H·∫≠u, t·ªânh...",https://tuoitre.vn/can-thu-vao-mua-san-ca-coc-...,Trang ch·ªß,Anh Nguy·ªÖn Thanh B√¨nh c√πng s·ªë c√° c√≥c c√¢u ƒë∆∞·ª£c ...,08/07/2025 15:19 GMT+7
...,...,...,...,...,...,...
552,-5994450664317358316,"B√£o s·ªë 2 m·∫°nh l√™n c·∫•p 9, gi·∫≠t c·∫•p 11",https://tuoitre.vn/bao-so-2-manh-len-cap-9-gia...,TH·ªúI TI·∫æT,D·ª± b√°o v·ªã tr√≠ v√† h∆∞·ªõng di chuy·ªÉn b√£o s·ªë 2 l√∫c ...,05/07/2025 11:40 GMT+7
553,-1966929887162257846,Th·ªùi ti·∫øt ƒë√™m 7 ng√†y 8-7: B·∫Øc B·ªô v√† Trung B·ªô n...,https://tuoitre.vn/video/thoi-tiet-dem-7-ngay-...,TH·ªúI TI·∫æT,,07/07/2025
554,-7442233498374739812,Th·ªùi ti·∫øt ƒë√™m 6 ng√†y 7-7: B√£o s·ªë 2 s∆∞ÃÅc gioÃÅ m...,https://tuoitre.vn/video/thoi-tiet-dem-6-ngay-...,TH·ªúI TI·∫æT,,06/07/2025
555,3663470927944687233,Th·ªùi ti·∫øt ƒë√™m 5 ng√†y 6-7: B√£o s·ªë 2 ∆°Ãâ bƒÉÃÅc Bi√™...,https://tuoitre.vn/video/thoi-tiet-dem-5-ngay-...,TH·ªúI TI·∫æT,,05/07/2025


## VnExppress

In [9]:
import pandas as pd
import requests
import os
from bs4 import BeautifulSoup
from google.colab import drive
import time

drive.mount('/content/drive')

def setup_path():
    """Thi·∫øt l·∫≠p ƒë∆∞·ªùng d·∫´n l∆∞u file CSV."""
    output_path = "/content/drive/MyDrive/Scrapper_vnexpress_update.csv"
    return output_path

def fetch_main_page():
    """Fetch the main page (VnExpress)"""
    url = 'https://vnexpress.net/'
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        return response.text, url
    except requests.exceptions.RequestException as e:
        print(f"Error fetching main page: {e}")
        return None, url

def fetch_menu_links(url, headers):
    """L·∫•y danh s√°ch chuy√™n m·ª•c t·ª´ trang VnExpress."""
    try:
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, "html.parser")
        menu_links = {}

        # T√¨m menu ch√≠nh - c·∫≠p nh·∫≠t selector theo c·∫•u tr√∫c hi·ªán t·∫°i
        main_menu = soup.find("nav", class_="main-nav") or soup.find("div", class_="menu-main")

        if main_menu:
            # L·∫•y c√°c m·ª•c menu ch√≠nh
            for item in main_menu.find_all("a", class_="parent", recursive=True):
                text = item.get_text(strip=True)
                link = item.get("href", "")
                if link.startswith("/"):
                    link = url + link.lstrip("/")
                if text and link:
                    menu_links[text] = link

        # N·∫øu kh√¥ng t√¨m th·∫•y menu ch√≠nh, th·ª≠ c√°ch kh√°c
        if not menu_links:
            for item in soup.select(".main-nav a, .menu-main a"):
                text = item.get_text(strip=True)
                link = item.get("href", "")
                if link.startswith("/"):
                    link = url + link.lstrip("/")
                if text and link and text not in menu_links:
                    menu_links[text] = link

        return menu_links

    except Exception as e:
        print(f"Error fetching menu links: {e}")
        return {}

def crawl_category_page(category_name, category_url, headers, url):
    """Tr√≠ch xu·∫•t danh s√°ch b√†i vi·∫øt t·ª´ chuy√™n m·ª•c."""
    print(f"ƒêang crawl: {category_name} - {category_url}")
    article_data = []

    try:
        if not category_url.startswith("http"):
            print(f" - B·ªè qua URL kh√¥ng h·ª£p l·ªá: {category_url}")
            return article_data

        response = requests.get(category_url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.text, "html.parser")
        articles = soup.select("article.item-news, .list-news-subfolder .item-news, .title-news a")

        for article in articles:
            a_tag = article.find("a") if article.name != "a" else article
            if a_tag:
                title = a_tag.get("title", "") or a_tag.get_text(strip=True)
                href = a_tag.get("href", "")

                if href.startswith("/"):
                    href = url + href.lstrip("/")

                # L·∫•y ID b√†i vi·∫øt t·ª´ URL ho·∫∑c data attribute
                article_id = href.split("-")[-1].split(".")[0] if href else ""

                if title and href:
                    article_data.append({
                        "ID": article_id,
                        "Title": title,
                        "Href": href,
                        "Category": category_name
                    })

        # Gi·ªõi h·∫°n s·ªë b√†i vi·∫øt ƒë·ªÉ tr√°nh qu√° t·∫£i
        return article_data[:50]

    except Exception as e:
        print(f"Error crawling category {category_name}: {e}")
        return article_data

def extract_article_content(df, headers):
    """L·∫•y n·ªôi dung b√†i vi·∫øt v√† th·ªùi gian ƒëƒÉng b√†i."""
    df["Content"] = None
    df["Time"] = None

    for index, row in df.iterrows():
        try:
            url_href = row["Href"]
            response = requests.get(url_href, headers=headers, timeout=10)
            soup_href = BeautifulSoup(response.text, "html.parser")

            # L·∫•y n·ªôi dung b√†i vi·∫øt
            content_div = soup_href.find("article", class_="fck_detail") or soup_href.find("div", class_="content-detail")
            paragraphs = content_div.find_all("p") if content_div else []
            content = "\n".join([p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True)])

            # L·∫•y th·ªùi gian ƒëƒÉng b√†i
            time_element = soup_href.find("span", class_="date") or soup_href.find("span", class_="time")
            article_time = time_element.get_text(strip=True) if time_element else "Kh√¥ng t√¨m th·∫•y th·ªùi gian"

            # C·∫≠p nh·∫≠t DataFrame
            df.at[index, "Time"] = article_time
            df.at[index, "Content"] = content

            # Tr√°nh request qu√° nhanh
            time.sleep(1)

        except Exception as e:
            print(f"Error scraping article {row['Href']}: {e}")

    return df

def main():
    """H√†m ch√≠nh ƒë·ªÉ ch·∫°y to√†n b·ªô qu√° tr√¨nh."""
    output_path = setup_path()
    url = "https://vnexpress.net/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept-Language": "vi-VN,vi;q=0.9,en-US;q=0.8,en;q=0.7"
    }

    # L·∫•y danh s√°ch chuy√™n m·ª•c
    print("ƒêang l·∫•y danh s√°ch chuy√™n m·ª•c...")
    menu_links = fetch_menu_links(url, headers)
    print(f"T√¨m th·∫•y {len(menu_links)} chuy√™n m·ª•c")

    # Crawl d·ªØ li·ªáu t·ª´ t·ª´ng chuy√™n m·ª•c
    all_articles = []
    for category, link in menu_links.items():
        articles = crawl_category_page(category, link, headers, url)
        all_articles.extend(articles)
        print(f" - ƒê√£ thu th·∫≠p {len(articles)} b√†i vi·∫øt t·ª´ chuy√™n m·ª•c {category}")

    # Chuy·ªÉn d·ªØ li·ªáu th√†nh DataFrame
    df = pd.DataFrame(all_articles)

    if not df.empty:
        # L·∫•y n·ªôi dung v√† th·ªùi gian ƒëƒÉng b√†i
        print("ƒêang tr√≠ch xu·∫•t n·ªôi dung b√†i vi·∫øt...")
        df = extract_article_content(df, headers)

        # L∆∞u v√†o file CSV
        file_exists = os.path.exists(output_path)
        df.to_csv(output_path, mode="a", header=not file_exists, index=False, encoding="utf-8-sig")

        print("‚úÖ ƒê√£ l∆∞u d·ªØ li·ªáu v√†o:", output_path)
        print(f"T·ªïng s·ªë b√†i vi·∫øt: {len(df)}")
        return df
    else:
        print("Kh√¥ng t√¨m th·∫•y b√†i vi·∫øt n√†o")
        return None

if __name__ == "__main__":
    data_vnexpress = main()
    if data_vnexpress is not None:
        print("\n5 b√†i vi·∫øt ƒë·∫ßu ti√™n:")
        print(data_vnexpress.head())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
ƒêang l·∫•y danh s√°ch chuy√™n m·ª•c...
T√¨m th·∫•y 20 chuy√™n m·ª•c
ƒêang crawl: M·ªõi nh·∫•t - https://vnexpress.net/tin-tuc-24h
 - ƒê√£ thu th·∫≠p 50 b√†i vi·∫øt t·ª´ chuy√™n m·ª•c M·ªõi nh·∫•t
ƒêang crawl: Th·ªùi s·ª± - https://vnexpress.net/thoi-su
 - ƒê√£ thu th·∫≠p 50 b√†i vi·∫øt t·ª´ chuy√™n m·ª•c Th·ªùi s·ª±
ƒêang crawl: Th·∫ø gi·ªõi - https://vnexpress.net/the-gioi
 - ƒê√£ thu th·∫≠p 50 b√†i vi·∫øt t·ª´ chuy√™n m·ª•c Th·∫ø gi·ªõi
ƒêang crawl: Kinh doanh - https://vnexpress.net/kinh-doanh
 - ƒê√£ thu th·∫≠p 50 b√†i vi·∫øt t·ª´ chuy√™n m·ª•c Kinh doanh
ƒêang crawl: Ph√°p lu·∫≠t - https://vnexpress.net/phap-luat
 - ƒê√£ thu th·∫≠p 50 b√†i vi·∫øt t·ª´ chuy√™n m·ª•c Ph√°p lu·∫≠t
ƒêang crawl: Khoa h·ªçc c√¥ng ngh·ªá - https://vnexpress.net/khoa-hoc-cong-nghe
 - ƒê√£ thu th·∫≠p 50 b√†i vi·∫øt t·ª´ chuy√™n m·ª•c Khoa h·ªçc c√¥ng ngh·ªá
ƒêang crawl: Th·ªÉ th

In [10]:
data_vnexpress.head(20)

Unnamed: 0,ID,Title,Href,Category,Content,Time
0,4911600,C·ª±u ph√≥ t·ªïng tham m∆∞u tr∆∞·ªüng Nga b·ªã k·∫øt √°n t√π ...,https://vnexpress.net/cuu-pho-tong-tham-muu-tr...,M·ªõi nh·∫•t,,Kh√¥ng t√¨m th·∫•y th·ªùi gian
1,4911600,C·ª±u ph√≥ t·ªïng tham m∆∞u tr∆∞·ªüng Nga b·ªã k·∫øt √°n t√π ...,https://vnexpress.net/cuu-pho-tong-tham-muu-tr...,M·ªõi nh·∫•t,,Kh√¥ng t√¨m th·∫•y th·ªùi gian
2,4911596,200 tr·∫ª c√≥ d·∫•u hi·ªáu nhi·ªÖm ƒë·ªôc ch√¨ v√¨ m√≥n b√°nh ...,https://vnexpress.net/200-tre-co-dau-hieu-nhie...,M·ªõi nh·∫•t,,Kh√¥ng t√¨m th·∫•y th·ªùi gian
3,4911596,200 tr·∫ª c√≥ d·∫•u hi·ªáu nhi·ªÖm ƒë·ªôc ch√¨ v√¨ m√≥n b√°nh ...,https://vnexpress.net/200-tre-co-dau-hieu-nhie...,M·ªõi nh·∫•t,,Kh√¥ng t√¨m th·∫•y th·ªùi gian
4,4911584,T√†i t·ª≠ 75 tu·ªïi ƒë√≥n b·∫°n g√°i ra t√π,https://vnexpress.net/tai-tu-75-tuoi-don-ban-g...,M·ªõi nh·∫•t,,Kh√¥ng t√¨m th·∫•y th·ªùi gian
5,4911584,T√†i t·ª≠ 75 tu·ªïi ƒë√≥n b·∫°n g√°i ra t√π,https://vnexpress.net/tai-tu-75-tuoi-don-ban-g...,M·ªõi nh·∫•t,,Kh√¥ng t√¨m th·∫•y th·ªùi gian
6,4911630,TP HCM ph√¢n c·∫•p h∆°n 2.100 th·ªß t·ª•c h√†nh ch√≠nh v...,https://vnexpress.net/tp-hcm-phan-cap-hon-2-10...,M·ªõi nh·∫•t,,Kh√¥ng t√¨m th·∫•y th·ªùi gian
7,4911630,TP HCM ph√¢n c·∫•p h∆°n 2.100 th·ªß t·ª•c h√†nh ch√≠nh v...,https://vnexpress.net/tp-hcm-phan-cap-hon-2-10...,M·ªõi nh·∫•t,,Kh√¥ng t√¨m th·∫•y th·ªùi gian
8,4911661,Ch√°y c∆∞ x√° 8 ng∆∞·ªùi t·ª≠ vong do 'ch·∫≠p ƒë∆∞·ªùng d√¢y ...,https://vnexpress.net/chay-cu-xa-8-nguoi-tu-vo...,M·ªõi nh·∫•t,,Kh√¥ng t√¨m th·∫•y th·ªùi gian
9,4911661,Ch√°y c∆∞ x√° 8 ng∆∞·ªùi t·ª≠ vong do 'ch·∫≠p ƒë∆∞·ªùng d√¢y ...,https://vnexpress.net/chay-cu-xa-8-nguoi-tu-vo...,M·ªõi nh·∫•t,,Kh√¥ng t√¨m th·∫•y th·ªùi gian


In [11]:
data_vnexpress.drop_duplicates(subset=["ID", "Title", "Href"], keep="first", inplace=True)

In [12]:
data_vnexpress

Unnamed: 0,ID,Title,Href,Category,Content,Time
0,4911600,C·ª±u ph√≥ t·ªïng tham m∆∞u tr∆∞·ªüng Nga b·ªã k·∫øt √°n t√π ...,https://vnexpress.net/cuu-pho-tong-tham-muu-tr...,M·ªõi nh·∫•t,,Kh√¥ng t√¨m th·∫•y th·ªùi gian
2,4911596,200 tr·∫ª c√≥ d·∫•u hi·ªáu nhi·ªÖm ƒë·ªôc ch√¨ v√¨ m√≥n b√°nh ...,https://vnexpress.net/200-tre-co-dau-hieu-nhie...,M·ªõi nh·∫•t,,Kh√¥ng t√¨m th·∫•y th·ªùi gian
4,4911584,T√†i t·ª≠ 75 tu·ªïi ƒë√≥n b·∫°n g√°i ra t√π,https://vnexpress.net/tai-tu-75-tuoi-don-ban-g...,M·ªõi nh·∫•t,,Kh√¥ng t√¨m th·∫•y th·ªùi gian
6,4911630,TP HCM ph√¢n c·∫•p h∆°n 2.100 th·ªß t·ª•c h√†nh ch√≠nh v...,https://vnexpress.net/tp-hcm-phan-cap-hon-2-10...,M·ªõi nh·∫•t,,Kh√¥ng t√¨m th·∫•y th·ªùi gian
8,4911661,Ch√°y c∆∞ x√° 8 ng∆∞·ªùi t·ª≠ vong do 'ch·∫≠p ƒë∆∞·ªùng d√¢y ...,https://vnexpress.net/chay-cu-xa-8-nguoi-tu-vo...,M·ªõi nh·∫•t,,Kh√¥ng t√¨m th·∫•y th·ªùi gian
...,...,...,...,...,...,...
851,4910202,"Mua ƒëi·ªÅu khi·ªÉn tivi Shopee 45 ngh√¨n, c√¥ng th·ª£ ...",https://vnexpress.net/mua-dieu-khien-tivi-shop...,√ù ki·∫øn,,Kh√¥ng t√¨m th·∫•y th·ªùi gian
853,4909690,Vi√™n g·∫°ch 'd·∫±n m·∫∑t' khi t√¥i ƒë·ªó xe ch·∫Øn c·ª≠a nh√†...,https://vnexpress.net/vien-gach-dan-mat-khi-to...,√ù ki·∫øn,,Kh√¥ng t√¨m th·∫•y th·ªùi gian
855,4909889,H√†nh tr√¨nh t√¥i 'tr∆∞·ªüng th√†nh' sau 5 nƒÉm chuy·ªÉn...,https://vnexpress.net/sap-nhap-tinh-thanh-hanh...,√ù ki·∫øn,,Kh√¥ng t√¨m th·∫•y th·ªùi gian
856,4909893,H√†nh tr√¨nh sang M·ªπ t√¨m ƒëam m√™ c·ªßa m·ªôt ƒë·ª©a tr·∫ª ...,https://vnexpress.net/hanh-trinh-sang-my-tim-d...,√ù ki·∫øn,,Kh√¥ng t√¨m th·∫•y th·ªùi gian
