# Kết nối với Drive cá nhân


In [None]:
# Mount the user's Google Drive at /content/drive (only works inside Google Colab).
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


Vì việc kết nối này sử dụng Drive cá nhân nên tôi chưa tìm được cách chia sẻ dữ liệu để sử dụng chung. Đoạn code dưới đây sẽ tạo một bản sao của thư mục vào Drive cá nhân của mỗi người và tải dữ liệu vào đó.
<br/>
Việc code thì có thể làm chung trong file này cũng được

In [None]:
cd "/content/drive/MyDrive/"

/content/drive/MyDrive


In [None]:
import os
from pathlib import Path


# Create the project directory if it does not exist yet.
# exist_ok=True makes the cell idempotent and avoids the race between a
# separate is_dir() check and the following mkdir call.
Path('./BTL_Data_Science').mkdir(exist_ok=True)

In [None]:
cd "./BTL_Data_Science/"

/content/drive/MyDrive/BTL_Data_Science


In [None]:
!pwd

/content/drive/MyDrive/BTL_Data_Science


# Crawler

### Chuẩn bị

In [None]:
!pip install -r requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import csv
import os
import pathlib
import sys

import pandas as pd
import requests
import tqdm
from bs4 import BeautifulSoup


### Định nghĩa các hàm

In [None]:
def get_links(topic_link=None, title_tag='h2'):
    """Get news URLs from one topic listing page.

    Args:
        topic_link: URL of the listing page to scan.
        title_tag: Name of the HTML tag (with class ``title-news``) that
            wraps each headline anchor.

    Returns:
        list: The ``href`` of every headline anchor found on the page. An
        empty list is returned when the page cannot be fetched or parsed, so
        callers can always extend another list with the result.
    """
    try:
        # The timeout keeps one stalled connection from hanging the crawl.
        response = requests.get(topic_link, timeout=30)
        soup = BeautifulSoup(response.content, "html.parser")
        titles = soup.find_all(title_tag, class_='title-news')

        links = []
        for title in titles:
            anchor = title.find('a')
            # Skip headline blocks without a usable anchor instead of
            # throwing away every other link found on the same page.
            if anchor is not None and anchor.attrs.get("href"):
                links.append(anchor.attrs["href"])

        return links
    except Exception as e:
        print('Exception {} has happened when get links from URL: {}'.format(type(e).__name__, topic_link))
        return []


In [None]:
def find_topic(soup=None):
    """Return the topic of an article, read from its breadcrumb.

    The first link inside the ``ul.breadcrumb`` element carries a
    ``data-medium`` attribute; its first five characters are dropped and the
    remainder is returned (e.g. ``KhoaHoc``).
    """
    breadcrumb = soup.find('ul', class_='breadcrumb')
    first_anchor = breadcrumb.find('a')
    medium = first_anchor.attrs["data-medium"]
    return medium[5:]


In [None]:
def find_date(soup=None):
    """Return the date part of the article's publication line.

    The text of the ``date`` element inside ``header-content`` is split on
    ``", "`` and the second piece is returned.
    """
    header_block = soup.find(class_='header-content')
    date_text = header_block.find(class_="date").get_text()
    pieces = date_text.split(", ")
    return pieces[1]
    

In [None]:
def find_header(soup=None):
    """Return the text of the element with class ``title-detail``."""
    title_node = soup.find(class_='title-detail')
    return title_node.get_text()
    

In [None]:
def find_tag(link):
    """Download ``link`` and return the text of its ``title-detail`` element.

    NOTE(review): despite the name, this returns the headline text (the same
    lookup ``find_header`` performs), not a tag.
    """
    page = requests.get(link)
    parsed = BeautifulSoup(page.content, "html.parser")
    return parsed.find(class_='title-detail').get_text()
   

In [None]:
# Smoke test: fetch one article over the network and print its headline.
print(find_tag('https://vnexpress.net/tam-quan-trong-cua-tiem-vaccine-cho-phu-nu-truoc-trong-khi-mang-thai-4543330.html'))

Tầm quan trọng của tiêm vaccine cho phụ nữ trước, trong khi mang thai 


In [None]:
def crawl_data(link=None, data_dir=Path('./data')):
    """Download one article and save its text under ``data_dir/<topic>/``.

    The article's title, description and body paragraphs are written to
    ``news<N>.txt`` and a row ``[N, file name, URL, date]`` is appended to the
    topic's ``_Index.csv``. A URL already listed in that index is skipped.

    Args:
        link: URL of the article to download.
        data_dir: Root output directory (``str`` or ``Path``); it is created,
            together with missing parents, when absent.

    Returns:
        None. Any failure is reported on stdout and swallowed so that a batch
        crawl can continue with the next URL.
    """
    try:
        data_dir = Path(data_dir)

        # Get HTML source code from link; the timeout keeps one stalled
        # connection from hanging the whole crawl.
        response = requests.get(link, timeout=30)
        soup = BeautifulSoup(response.content, "html.parser")

        # Extract everything up front: a page with an unexpected layout raises
        # here, before anything is written, so no index row or half-written
        # text file is left behind.
        topic = find_topic(soup)
        date = find_date(soup)
        title = soup.find('h1', class_='title-detail').text
        description = soup.find('p', class_='description').text
        # The last ``p.Normal`` element is not written
        # (presumably it is the author signature -- TODO confirm).
        paragraphs = [p.text for p in soup.find_all('p', class_='Normal')[:-1]]

        # Create new directory if this topic is not crawled
        topic_dir = data_dir / topic
        topic_dir.mkdir(parents=True, exist_ok=True)

        # Index file which stores URL of all crawled news in the same topic
        index_filepath = topic_dir / '_Index.csv'

        if index_filepath.is_file():
            df = pd.read_csv(index_filepath, header=None, index_col=None)

            # Stop if this link is crawled (column 2 holds the URLs)
            if (df.iloc[:, 2] == link).any():
                return None

            # Continue numbering after the last recorded article
            index = int(df.iloc[-1, 0]) + 1
        else:
            # If Index file is not exist
            index = 1

        # Text file which stores content of the news in link
        text_filename = 'news{}.txt'.format(index)
        text_path = topic_dir / text_filename

        # Write content to file first ...
        with open(text_path, 'w', encoding='UTF-8') as f:
            f.write(title + '\n\n')
            f.write(description)
            for paragraph in paragraphs:
                f.write('\n' + paragraph)

        # ... and register the URL only once the text file is complete.
        # newline='' is what the csv module expects; without it some platforms
        # emit an extra blank line after every row.
        with open(index_filepath, mode='a', newline='', encoding='UTF-8') as file:
            writer = csv.writer(file)
            writer.writerow([index, text_filename, str(link), date])

    except Exception as e:
        print('Exception {} has happened when crawl data from URL: {}'.format(type(e).__name__, link))


In [None]:
# URL = 'https://vnexpress.net/tam-quan-trong-cua-tiem-vaccine-cho-phu-nu-truoc-trong-khi-mang-thai-4543330.html'
# crawl_data(URL, DATA_DIR)


### Tiến hành crawl dữ liệu

In [None]:
# a = 0  # News ID
# tl = []  # Crawled titles

# Directory the notebook is running from (differs between Google Colab and a local run).
CURRENT_DIR = Path.cwd()
# Root folder that receives the crawled articles.
DATA_DIR = CURRENT_DIR.joinpath('data3')
# First and last listing page (both inclusive) fetched for every topic.
MIN_PAGE, MAX_PAGE = 1, 40


In [None]:
# Make sure the output root exists (idempotent; also creates missing parents).
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Topic pages scanned for <h2 class="title-news"> headlines.
links2 = [
    'https://vnexpress.net/suc-khoe',
    'https://vnexpress.net/du-lich',
    'https://vnexpress.net/so-hoa',
    'https://vnexpress.net/kinh-doanh',
    'https://vnexpress.net/giai-tri',
    'https://vnexpress.net/the-thao'
]

# Topic pages scanned for <h3 class="title-news"> headlines.
links3 = [
    'https://vnexpress.net/thoi-su',
    'https://vnexpress.net/goc-nhin',
    'https://vnexpress.net/the-gioi',
    'https://vnexpress.net/khoa-hoc',
    'https://vnexpress.net/phap-luat',
    'https://vnexpress.net/giao-duc',
    'https://vnexpress.net/oto-xe-may',
    'https://vnexpress.net/hai',
]

for topic_links, title_tag in ((links2, 'h2'), (links3, 'h3')):
    for topic_link in topic_links:
        # Errors are handled per topic so that one failing topic does not
        # abort the remaining ones.
        try:
            news_links = []

            for page in tqdm.tqdm(range(MIN_PAGE, MAX_PAGE + 1), desc=topic_link):
                sub_link = topic_link + '-p' + str(page)  # Topic link with page number
                # Guard against a None result so a failed page simply
                # contributes no links instead of raising TypeError.
                news_links += get_links(sub_link, title_tag) or []

            for news_link in tqdm.tqdm(news_links, desc='Crawling'):
                crawl_data(news_link, DATA_DIR)
        except Exception as e:
            # Report the exception type and message; printing the bare
            # traceback object only shows its memory address.
            print('Exception {} has happened when crawl topic: {} ({})'.format(type(e).__name__, topic_link, e))


https://vnexpress.net/suc-khoe: 100%|██████████| 40/40 [01:38<00:00,  2.45s/it]
Crawling: 100%|██████████| 570/570 [17:14<00:00,  1.82s/it]
https://vnexpress.net/du-lich: 100%|██████████| 40/40 [01:37<00:00,  2.44s/it]
Crawling:   2%|▏         | 13/570 [00:23<15:27,  1.66s/it]

Exception AttributeError has happened when crawl data from URL: https://video.vnexpress.net/tin-tuc/du-lich/du-khach-do-xo-chup-hinh-voi-mai-anh-dao-o-da-lat-4563608.html


Crawling:   3%|▎         | 16/570 [00:29<17:18,  1.87s/it]

Exception AttributeError has happened when crawl data from URL: https://video.vnexpress.net/tin-tuc/thoi-su/chua-tam-chuc-dong-nghit-nguoi-du-xuan-4563339.html


Crawling:   9%|▉         | 50/570 [01:28<14:55,  1.72s/it]

Exception AttributeError has happened when crawl data from URL: https://video.vnexpress.net/tin-tuc/food/quan-banh-truyen-thong-thu-250-trieu-ngay-sat-tet-4561902.html


Crawling:  12%|█▏        | 71/570 [02:05<14:03,  1.69s/it]

Exception AttributeError has happened when crawl data from URL: https://vnexpress.net/an-gi-choi-dau-cung-gia-dinh-o-singapore-tet-quy-mao-4559003.html


Crawling:  24%|██▎       | 135/570 [03:55<12:47,  1.76s/it]

Exception AttributeError has happened when crawl data from URL: https://video.vnexpress.net/tin-tuc/du-lich/du-khach-trai-nghiem-phim-truong-nhat-ban-tai-moc-chau-4555435.html


Crawling:  29%|██▉       | 164/570 [04:50<12:41,  1.88s/it]

Exception AttributeError has happened when crawl data from URL: https://video.vnexpress.net/tin-tuc/food/quan-ca-dung-6-tan-muoi-de-nuong-moi-thang-4551520.html


Crawling:  32%|███▏      | 182/570 [05:24<11:09,  1.73s/it]

Exception AttributeError has happened when crawl data from URL: https://vnexpress.net/sa-pa-xu-suong-mu-tinh-giac-4545243.html


Crawling:  33%|███▎      | 186/570 [05:31<11:01,  1.72s/it]

Exception AttributeError has happened when crawl data from URL: https://video.vnexpress.net/tin-tuc/du-lich/khach-tay-bat-ngo-voi-khong-khi-giang-sinh-o-viet-nam-4552319.html


Crawling:  34%|███▍      | 193/570 [05:44<11:49,  1.88s/it]

Exception AttributeError has happened when crawl data from URL: https://video.vnexpress.net/tin-tuc/thoi-su/trai-nghiem-pho-am-thuc-ngu-xa-dem-dau-tien-mo-cua-4552045.html


Crawling:  35%|███▌      | 201/570 [06:00<11:24,  1.85s/it]

Exception AttributeError has happened when crawl data from URL: https://vnexpress.net/tan-huong-bua-tiec-giang-sinh-ruc-ro-tai-singapore-4545367.html


Crawling:  37%|███▋      | 213/570 [06:23<10:19,  1.74s/it]

Exception AttributeError has happened when crawl data from URL: https://video.vnexpress.net/tin-tuc/du-lich/trai-nghiem-hang-cay-hong-moc-chau-triu-qua-o-thu-do-4549743.html


Crawling:  42%|████▏     | 241/570 [07:17<10:28,  1.91s/it]

Exception AttributeError has happened when crawl data from URL: https://video.vnexpress.net/tin-tuc/food/quan-thit-xien-nuong-tap-nap-khach-mua-lanh-4547954.html


Crawling:  45%|████▍     | 254/570 [07:41<08:55,  1.69s/it]

Exception AttributeError has happened when crawl data from URL: https://video.vnexpress.net/tin-tuc/nhip-song/quan-luon-nghe-an-60-nam-giu-huong-vi-co-4544772.html


Crawling:  50%|████▉     | 283/570 [08:34<08:11,  1.71s/it]

Exception AttributeError has happened when crawl data from URL: https://video.vnexpress.net/tin-tuc/nhip-song/pho-tom-hum-bat-da-ban-500-nghin-dong-4544289.html


Crawling:  60%|█████▉    | 341/570 [10:23<06:42,  1.76s/it]

Exception AttributeError has happened when crawl data from URL: https://vnexpress.net/cach-sun-world-ke-chuyen-o-moi-vung-dat-4531862.html


Crawling:  65%|██████▌   | 371/570 [11:20<06:22,  1.92s/it]

Exception AttributeError has happened when crawl data from URL: https://video.vnexpress.net/tin-tuc/food/quan-mi-bay-kieu-trung-quoc-ban-700-suat-mot-ngay-4538832.html


Crawling:  71%|███████   | 404/570 [12:22<05:09,  1.86s/it]

Exception AttributeError has happened when crawl data from URL: https://video.vnexpress.net/tin-tuc/du-lich/du-khach-trai-nghiem-san-bong-o-lung-chung-nui-4536614.html


Crawling:  76%|███████▌  | 434/570 [13:17<04:12,  1.85s/it]

Exception AttributeError has happened when crawl data from URL: https://video.vnexpress.net/tin-tuc/du-lich/tam-giac-mach-no-ro-o-cao-nguyen-da-dong-van-4533475.html


Crawling:  82%|████████▏ | 470/570 [14:26<03:03,  1.83s/it]

Exception AttributeError has happened when crawl data from URL: https://vnexpress.net/nhung-dieu-thu-vi-ve-italy-4530962.html


Crawling:  91%|█████████ | 517/570 [15:53<01:39,  1.88s/it]

Exception AttributeError has happened when crawl data from URL: https://video.vnexpress.net/tin-tuc/food/quan-banh-ran-ban-5-000-chiec-moi-ngay-4528818.html


Crawling:  92%|█████████▏| 527/570 [16:12<01:16,  1.79s/it]

Exception AttributeError has happened when crawl data from URL: https://video.vnexpress.net/tin-tuc/du-lich/chua-dat-vang-giua-canh-dong-lua-4526050.html


Crawling:  93%|█████████▎| 531/570 [16:19<01:08,  1.75s/it]

Exception AttributeError has happened when crawl data from URL: https://video.vnexpress.net/tin-tuc/food/quan-thit-nuong-dap-chan-ban-gan-2-000-chiec-moi-ngay-4528360.html


Crawling:  97%|█████████▋| 551/570 [16:59<00:33,  1.75s/it]

Exception AttributeError has happened when crawl data from URL: https://video.vnexpress.net/tin-tuc/food/quan-thit-nuong-mac-khen-ban-tram-can-moi-ngay-4526206.html


Crawling: 100%|██████████| 570/570 [17:35<00:00,  1.85s/it]
https://vnexpress.net/so-hoa: 100%|██████████| 40/40 [01:39<00:00,  2.48s/it]
Crawling:  29%|██▉       | 165/570 [04:48<11:17,  1.67s/it]

Exception AttributeError has happened when crawl data from URL: https://vnexpress.net/ceo-vinai-nang-luc-phat-trien-ai-viet-nam-ngang-hang-the-gioi-4548503.html


Crawling:  39%|███▉      | 225/570 [06:37<10:06,  1.76s/it]

Exception AttributeError has happened when crawl data from URL: https://vnexpress.net/hanh-trinh-10-nam-tech-awards-4557007.html


Crawling:  47%|████▋     | 269/570 [07:57<09:33,  1.90s/it]

Exception AttributeError has happened when crawl data from URL: https://vnexpress.net/xu-huong-cong-nghe-2023-4554586.html


Crawling:  50%|█████     | 287/570 [08:29<07:58,  1.69s/it]

Exception AttributeError has happened when crawl data from URL: https://vnexpress.net/25-nam-kien-tao-cuoc-song-so-cua-fpt-telecom-4553761.html


Crawling:  52%|█████▏    | 299/570 [08:52<08:34,  1.90s/it]

Exception AttributeError has happened when crawl data from URL: https://vnexpress.net/cong-nghe-giup-dvm-s2-manh-me-nhat-dai-dieu-hoa-trung-tam-samsung-4547800.html


Crawling:  53%|█████▎    | 302/570 [08:57<08:33,  1.92s/it]

Exception AttributeError has happened when crawl data from URL: https://vnexpress.net/6-diem-nhan-cong-nghe-the-gioi-2022-4551586.html


Crawling:  56%|█████▌    | 320/570 [09:31<07:40,  1.84s/it]

Exception AttributeError has happened when crawl data from URL: https://vnexpress.net/danh-gia-iphone-14-plus-su-thuc-dung-bi-lang-quen-4552522.html


Crawling:  56%|█████▋    | 321/570 [09:33<07:29,  1.81s/it]

Exception AttributeError has happened when crawl data from URL: https://vnexpress.net/diem-nhan-khoa-hoc-cong-nghe-2022-4550297.html


Crawling:  69%|██████▉   | 395/570 [11:47<05:18,  1.82s/it]

Exception AttributeError has happened when crawl data from URL: https://vnexpress.net/nam-chao-dao-cua-thi-truong-tien-so-4547707.html


Crawling:  81%|████████  | 459/570 [13:45<03:41,  1.99s/it]

Exception AttributeError has happened when crawl data from URL: https://startup.vnexpress.net/tin-tuc/startup-2022/goi-von-thanh-cong-chi-chung-minh-startup-tiem-nang-4544363.html


Crawling:  87%|████████▋ | 494/570 [14:50<02:19,  1.84s/it]

Exception AttributeError has happened when crawl data from URL: https://vnexpress.net/ly-do-z-flip-series-thong-tri-nganh-hang-smartphone-gap-4540529.html


Crawling: 100%|██████████| 570/570 [17:12<00:00,  1.81s/it]
https://vnexpress.net/kinh-doanh: 100%|██████████| 40/40 [01:36<00:00,  2.41s/it]
Crawling:   2%|▏         | 13/570 [00:20<14:29,  1.56s/it]

Exception AttributeError has happened when crawl data from URL: https://vnexpress.net/co-phieu-gia-tri-la-gi-4565925.html


Crawling:  38%|███▊      | 214/570 [06:05<10:44,  1.81s/it]

Exception AttributeError has happened when crawl data from URL: https://video.vnexpress.net/tin-tuc/thoi-su/dao-nhat-tan-tang-gia-ngay-cuoi-nam-4562552.html


Crawling:  42%|████▏     | 241/570 [06:50<10:03,  1.83s/it]

Exception AttributeError has happened when crawl data from URL: https://vnexpress.net/ba-mai-kieu-lien-ke-ve-lon-sua-bot-made-in-vietnam-dau-tien-4550976.html


Crawling:  43%|████▎     | 244/570 [06:55<09:56,  1.83s/it]

Exception AttributeError has happened when crawl data from URL: https://vnexpress.net/phu-my-hung-to-chuc-hoi-hoa-xuan-tao-diem-nhan-khu-trung-tam-4560835.html


Crawling:  46%|████▌     | 260/570 [07:24<09:10,  1.77s/it]

Exception AttributeError has happened when crawl data from URL: https://vnexpress.net/co-phieu-dau-co-la-gi-4553622.html


Crawling: 100%|██████████| 570/570 [16:36<00:00,  1.75s/it]
https://vnexpress.net/giai-tri: 100%|██████████| 40/40 [01:36<00:00,  2.42s/it]
Crawling:   6%|▌         | 32/570 [00:55<14:47,  1.65s/it]

Exception AttributeError has happened when crawl data from URL: https://vnexpress.net/giai-tri/phim/thu-vien-phim/vong-nhi-546


Crawling:   8%|▊         | 45/570 [01:18<14:15,  1.63s/it]

Exception AttributeError has happened when crawl data from URL: https://vnexpress.net/giai-tri/phim/thu-vien-phim/babylon-545


Crawling:   9%|▉         | 51/570 [01:28<14:21,  1.66s/it]

Exception AttributeError has happened when crawl data from URL: https://vnexpress.net/giai-tri/phim/thu-vien-phim/chi-chi-em-em-2-543


Crawling:  12%|█▏        | 69/570 [02:00<14:42,  1.76s/it]

Exception AttributeError has happened when crawl data from URL: https://vnexpress.net/giai-tri/phim/thu-vien-phim/the-fabelmans-542


Crawling:  20%|█▉        | 113/570 [03:21<14:51,  1.95s/it]

Exception AttributeError has happened when crawl data from URL: https://vnexpress.net/giai-tri/phim/thu-vien-phim/thien-long-bat-bo-kieu-phong-truyen-541


Crawling:  21%|██        | 118/570 [03:30<13:40,  1.82s/it]

Exception AttributeError has happened when crawl data from URL: https://video.vnexpress.net/tin-tuc/nhip-song/gioi-tre-trai-nghiem-thu-vui-voi-hai-doc-thoai-4555250.html


Crawling:  24%|██▍       | 139/570 [04:10<12:51,  1.79s/it]

Exception AttributeError has happened when crawl data from URL: https://vnexpress.net/giai-tri/phim/thu-vien-phim/tar-540


Crawling:  28%|██▊       | 161/570 [04:52<13:21,  1.96s/it]

Exception AttributeError has happened when crawl data from URL: https://vnexpress.net/giai-tri/phim/thu-vien-phim/nha-ba-nu-539


Crawling:  34%|███▍      | 196/570 [05:58<12:25,  1.99s/it]

Exception AttributeError has happened when crawl data from URL: https://vnexpress.net/giai-tri/phim/thu-vien-phim/knives-out-2-glass-onion-538


Crawling:  48%|████▊     | 271/570 [08:21<09:13,  1.85s/it]

Exception AttributeError has happened when crawl data from URL: https://vnexpress.net/giai-tri/phim/thu-vien-phim/puss-in-boots-the-last-wish-537


Crawling:  51%|█████     | 289/570 [08:56<08:54,  1.90s/it]

Exception AttributeError has happened when crawl data from URL: https://vnexpress.net/giai-tri/phim/thu-vien-phim/the-pale-blue-eye-536


Crawling:  74%|███████▍  | 423/570 [13:18<04:54,  2.01s/it]

Exception AttributeError has happened when crawl data from URL: https://vnexpress.net/giai-tri/phim/thu-vien-phim/the-banshees-of-inisherin-534


Crawling:  84%|████████▍ | 478/570 [15:09<03:11,  2.08s/it]

Exception AttributeError has happened when crawl data from URL: https://vnexpress.net/giai-tri/phim/thu-vien-phim/wednesday-533


Crawling:  85%|████████▍ | 482/570 [15:18<02:58,  2.03s/it]

Exception AttributeError has happened when crawl data from URL: https://vnexpress.net/giai-tri/phim/thu-vien-phim/huyen-thoai-pele-532


Crawling:  88%|████████▊ | 503/570 [15:59<02:10,  1.95s/it]

Exception AttributeError has happened when crawl data from URL: https://vnexpress.net/giai-tri/phim/thu-vien-phim/alice-in-borderland-50


Crawling:  89%|████████▉ | 507/570 [16:07<02:02,  1.95s/it]

Exception AttributeError has happened when crawl data from URL: https://vnexpress.net/giai-tri/phim/thu-vien-phim/emily-in-paris-104


Crawling:  90%|█████████ | 513/570 [16:18<01:51,  1.96s/it]

Exception AttributeError has happened when crawl data from URL: https://vnexpress.net/giai-tri/phim/thu-vien-phim/spy-x-family-531


Crawling:  99%|█████████▊| 562/570 [18:25<00:15,  1.96s/it]

Exception AttributeError has happened when crawl data from URL: https://vnexpress.net/giai-tri/phim/thu-vien-phim/thanh-soi-530


Crawling: 100%|██████████| 570/570 [18:41<00:00,  1.97s/it]
https://vnexpress.net/the-thao: 100%|██████████| 40/40 [01:39<00:00,  2.49s/it]
Crawling:  11%|█         | 62/570 [01:43<13:53,  1.64s/it]

Exception AttributeError has happened when crawl data from URL: https://vnexpress.net/loat-ngoi-sao-tranh-vo-dich-giai-chay-dem-tp-hcm-4567241.html


Crawling:  16%|█▌        | 91/570 [02:33<15:05,  1.89s/it]

Exception AttributeError has happened when crawl data from URL: https://vnexpress.net/nhung-dieu-can-biet-ve-vm-ho-chi-minh-city-midnight-4566013.html


Crawling:  31%|███       | 175/570 [05:07<11:58,  1.82s/it]

Exception AttributeError has happened when crawl data from URL: https://vnexpress.net/ky-chuyen-nhuong-cach-mang-cua-chelsea-4565744.html


Crawling:  40%|███▉      | 227/570 [06:54<10:52,  1.90s/it]

Exception AttributeError has happened when crawl data from URL: https://video.vnexpress.net/tin-tuc/the-thao/hlv-park-trong-long-cau-thu-va-nguoi-ham-mo-4565104.html


Crawling:  41%|████      | 231/570 [07:00<09:26,  1.67s/it]

Exception AttributeError has happened when crawl data from URL: https://video.vnexpress.net/tin-tuc/the-thao/hlv-park-va-ket-thuc-chua-tron-voi-bong-da-viet-4562354.html


Crawling:  69%|██████▉   | 393/570 [12:03<05:40,  1.92s/it]

Exception AttributeError has happened when crawl data from URL: https://vnexpress.net/bi-quyet-thanh-cong-cua-haaland-4562320.html


Crawling:  83%|████████▎ | 471/570 [14:27<03:07,  1.90s/it]

Exception AttributeError has happened when crawl data from URL: https://vnexpress.net/hlv-park-da-vuc-day-bong-da-viet-nam-the-nao-4560385.html


Crawling:  99%|█████████▉| 563/570 [17:21<00:13,  1.89s/it]

Exception AttributeError has happened when crawl data from URL: https://video.vnexpress.net/tin-tuc/the-thao/tien-dao-so-mot-thai-lan-tap-rieng-truoc-tran-viet-nam-4559437.html


Crawling:  99%|█████████▉| 564/570 [17:22<00:10,  1.75s/it]

Exception AttributeError has happened when crawl data from URL: https://video.vnexpress.net/tin-tuc/the-thao/buoi-tap-cuoi-cung-cua-hlv-park-tren-san-my-dinh-4559436.html


Crawling: 100%|█████████▉| 569/570 [17:33<00:01,  1.90s/it]

Exception AttributeError has happened when crawl data from URL: https://video.vnexpress.net/tin-tuc/the-thao/5-nam-ong-park-giup-viet-nam-pha-dop-so-thai-lan-4559609.html


Crawling: 100%|██████████| 570/570 [17:35<00:00,  1.85s/it]
https://vnexpress.net/thoi-su: 100%|██████████| 40/40 [01:39<00:00,  2.50s/it]
Crawling:  22%|██▏       | 131/589 [03:35<12:54,  1.69s/it]

### Crawl bằng thư viện *newspaper*

In [None]:
import yaml
from yaml.loader import SafeLoader


# Read every YAML document from the link configuration file; the list is
# fully built while the file is still open.
with open('newspaper_link.yaml', 'r') as f:
    data = [document for document in yaml.load_all(f, Loader=SafeLoader)]

# The 'Vietnamese' entry of the first document holds the links used below.
vietnamese_link = data[0]['Vietnamese']

print(vietnamese_link)


In [None]:
import newspaper


# Sample article used to try out the newspaper library.
url = 'https://vnexpress.net/kich-ban-nao-cho-duc-o-vong-cuoi-world-cup-2022-4541291.html'

# Download and parse the page, then show the publish date newspaper extracted.
article = newspaper.Article(url)
article.download()
article.parse()

print(article.publish_date)


In [None]:
import newspaper
import yaml
import os


# NOTE(review): this rebinds DATA_DIR to 'data' while the crawler section
# above uses 'data3' -- confirm that two separate output folders are intended.
CURRENT_DIR = Path(os.getcwd())
DATA_DIR = CURRENT_DIR / 'data'
# Upper bound on the number of articles processed per news site.
MAX_ARTICLE_COUNT = 100

for link in vietnamese_link:
  # Folder name: the text between '//' and the first '.' of the site URL.
  news_name = link.split('//')[1].split('.')[0]

  news_paper = newspaper.build(link)
  # parents=True because DATA_DIR itself may not exist yet; exist_ok=True
  # keeps the cell re-runnable.
  (DATA_DIR / news_name).mkdir(parents=True, exist_ok=True)

  # Slicing caps the number of articles without a manual counter.
  for source_article in news_paper.articles[:MAX_ARTICLE_COUNT]:
    url = source_article.url
    article = newspaper.Article(url)

    # A single article that cannot be downloaded or parsed is reported and
    # skipped instead of aborting the whole run.
    try:
      article.download()
      article.parse()
    except Exception as e:
      print('Exception {} has happened when crawl data from URL: {}'.format(type(e).__name__, url))
      continue

    print(article.url)
