In [1]:
from utils import get_urls_of_type
from bs4 import BeautifulSoup, NavigableString, Tag
from datetime import datetime
import requests
import tqdm
import json
import os
import re

In [2]:
# Section slug -> list of sub-section slugs on vnexpress.net.
# The crawl cell combines them into listing URLs of the form
# https://vnexpress.net/<section>/<sub-section>-p<page>.
type_dict = {
    'thoi-su': [
        'chinh-tri',
        'dan-sinh',
        'lao-dong-viec-lam',
        'giao-thong',
        'mekong',
        'quy-hy-vong',
    ],
    'goc-nhin': [
        'binh-luan-nhieu',
        'chinh-tri-chinh-sach',
        'y-te-suc-khoe',
        'kinh-doanh-quan-tri',
        'giao-duc-tri-thuc',
        'moi-truong',
        'van-hoa-doi-song',
        'covid-19',
        'tac-gia',
    ],
    'the-gioi': [
        'tu-lieu',
        'phan-tich',
        'nguoi-viet-5-chau',
        'cuoc-song-do-day',
        'quan-su',
    ],
    'kinh-doanh': [
        'net-zero',
        'quoc-te',
        'doanh-nghiep',
        'chung-khoan',
        'ebank',
        'vi-mo',
        'tien-cua-toi',
        'hang-hoa',
    ],
    'bat-dong-san': [
        'chinh-sach',
        'thi-truong',
        'khong-gian-song',
        'tu-van',
    ],
    'khoa-hoc': [
        'khoa-hoc-trong-nuoc',
        'pii-doi-moi-sang-tao',
        'tin-tuc',
        'phat-minh',
        'ung-dung',
        'the-gioi-tu-nhien',
        'thuong-thuc',
    ],
    'giai-tri': [
        'gioi-sao',
        'sach',
        'video',
        'phim',
        'nhac',
        'thoi-trang',
        'lam-dep',
        'san-khau-my-thuat',
    ],
    'the-thao': [
        'bong-da',
        'du-lieu-bong-da',
        'marathon',
        'tennis',
        'cac-mon-khac',
        'hau-truong',
    ],
    'phap-luat': [
        'ho-so-pha-an',
        'tu-van',
    ],
    'giao-duc': [
        'tin-tuc',
        'tuyen-sinh',
        'chan-dung',
        'du-hoc',
        'thao-luan',
        'hoc-tieng-anh',
        'giao-duc-40',
    ],
    'suc-khoe': [
        'tin-tuc',
        'cac-benh',
        'song-khoe',
        'vaccine',
    ],
    'doi-song': [
        'nhip-song',
        'to-am',
        'bai-hoc-song',
        'cooking',
        'tieu-dung',
    ],
    'du-lich': [
        'diem-den',
        'am-thuc',
        'dau-chan',
        'tu-van',
        'cam-nang',
    ],
    'so-hoa': [
        'cong-nghe',
        'san-pham',
        'blockchain',
    ],
    'xe': [
        'thi-truong',
        'dien-dan',
    ],
    'y-kien': [
        'thoi-su',
        'doi-song',
    ],
}

In [3]:
# Make sure the output directory for the crawl results exists.
# exist_ok=True keeps this cell safe to re-run when the directory is already there.
os.makedirs("data/urls", exist_ok=True)

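DESCRIPTION: crawl the VnExpress listing pages for every section/sub-section in `type_dict` and save each article's URL, title, description, date, word count and body text to `data/urls/surl_list.jsonl`.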

In [4]:
# Vietnamese weekday name -> English weekday name, used to make scraped
# date strings parseable with an English "%A" weekday directive.
vietnamese_days = {
    "Chủ nhật": "Sunday",
    "Thứ hai": "Monday",
    "Thứ ba": "Tuesday",
    "Thứ tư": "Wednesday",
    "Thứ năm": "Thursday",
    "Thứ sáu": "Friday",
    "Thứ bảy": "Saturday",
}


def convert_vietnamese_date(date_str):
    """Translate the weekday in a date string to English and drop the GMT suffix.

    The first Vietnamese weekday name (in ``vietnamese_days`` order) found in
    ``date_str`` is replaced by its English equivalent; no other weekday name
    is touched. Any " (GMT+N)" / " (GMT-N)" marker is then removed.

    Args:
        date_str: Date text, e.g. "<weekday>, 1/1/2024, 10:00 (GMT+7)".

    Returns:
        The converted string, e.g. "Monday, 1/1/2024, 10:00". Strings without
        a known weekday or GMT marker are returned unchanged.
    """
    # Only the first matching weekday name is translated, mirroring a
    # "replace then stop" scan over the mapping.
    matched_day = next((vn for vn in vietnamese_days if vn in date_str), None)
    if matched_day is not None:
        date_str = date_str.replace(matched_day, vietnamese_days[matched_day])

    return re.sub(r"\s\(GMT[+-]\d{1,2}\)", "", date_str)

# --- Crawl configuration -----------------------------------------------------
OUTPUT_PATH = "data/urls/surl_list.jsonl"
REQUEST_TIMEOUT = 30       # seconds; without a timeout requests.get can block forever
MIN_ARTICLE_YEAR = 2022    # keep only articles published in this year or later
MAX_ARTICLE_WORDS = 1000   # keep only articles with 0 < word count < this value
target_count = 33          # number of articles to collect per sub-section

crawl_url_list = []  # not used in this cell; kept in case other cells reference it

total_count = 0   # articles written so far, across all sections
numlost = 0       # consecutive listing entries without a link
max_numlost = 3   # stop the current section once numlost reaches this value


def _fetch_article(article_url):
    """Download one article page and extract its date and body text.

    Args:
        article_url: Absolute URL of the article page.

    Returns:
        ``(date_text, year, content, word_count)`` where ``date_text`` is the
        converted date string, or ``None`` when the page has no
        ``<span class="date">`` element (or it is empty).

    Raises:
        requests.RequestException: On network errors or timeout.
        ValueError: If the date text does not match "%A, %d/%m/%Y, %H:%M".
    """
    response = requests.get(article_url, timeout=REQUEST_TIMEOUT)
    page_soup = BeautifulSoup(response.content, "html.parser")

    date_tag = page_soup.find("span", class_="date")
    date_text = date_tag.text.strip() if date_tag else None
    if not date_text:
        return None

    date_text = convert_vietnamese_date(date_text)
    # NOTE: "%A" matches the weekday names of the current locale, so this
    # relies on an English (or C) locale.
    published = datetime.strptime(date_text, "%A, %d/%m/%Y, %H:%M")

    # Body paragraphs are the <p class="Normal"> elements; each is followed by a space.
    content = "".join(
        p_tag.get_text(" ", strip=True) + " "
        for p_tag in page_soup.find_all("p", class_="Normal")
    )
    return date_text, published.year, content, len(content.split())


# The context manager guarantees the file is flushed and closed even when the
# (long-running) cell is interrupted from the keyboard.
with open(OUTPUT_PATH, "w", encoding="utf-8") as f_urls:
    # `category` instead of `type`, so the builtin is not shadowed.
    for category in tqdm.tqdm(type_dict, desc="Processing types"):
        for sub_type in tqdm.tqdm(type_dict[category], desc=f"Processing subtypes for {category}"):
            page = 1
            current_count = 0

            while current_count < target_count:
                url = f"https://vnexpress.net/{category}/{sub_type}-p{page}"
                try:
                    response = requests.get(url, timeout=REQUEST_TIMEOUT)
                except requests.RequestException as e:
                    # Treat a failed listing request like a bad status:
                    # give up on this sub-section, keep crawling the rest.
                    print(f"Request failed {category}|{sub_type}: {e}")
                    break

                if response.status_code != 200:
                    print(f"{response.status_code} Error {category}|{sub_type}")
                    break

                soup = BeautifulSoup(response.content, "html.parser")
                title_tags = soup.find_all(class_="title-news")
                if not title_tags:
                    print(f"No article {category}|{sub_type}.")
                    break

                for title in title_tags:
                    try:
                        anchors = title.find_all("a")
                        if not anchors:
                            numlost += 1
                            print(f"Error {category}|{sub_type}. Continue...")
                            if numlost >= max_numlost:
                                print(f"Error {category}|{sub_type}. Exiting...")
                                break
                            continue

                        article_url = anchors[0].get("href")
                        article_title = anchors[0].get("title")
                        if not (article_url and article_title):
                            continue

                        description = title.find_next("p", class_="description")
                        if not description:
                            continue

                        parsed = _fetch_article(article_url)
                        if parsed is None:
                            continue
                        article_date, article_year, article_content, word_num = parsed

                        if article_year < MIN_ARTICLE_YEAR:
                            continue
                        if not (0 < word_num < MAX_ARTICLE_WORDS):
                            continue

                        sample = {
                            "url": article_url,
                            "title": article_title,
                            "description": description.get_text(strip=True),
                            "date": article_date,
                            "type": f"{category}|{sub_type}",
                            "words": word_num,
                            "content": article_content,
                        }
                        # One compact JSON object per line: an indented
                        # (multi-line) dump is not valid JSON Lines.
                        f_urls.write(json.dumps(sample, ensure_ascii=False) + "\n")
                        current_count += 1
                        total_count += 1
                        print(f"Total: {total_count}")
                        if current_count >= target_count:
                            break
                        numlost = 0

                    except Exception as e:
                        # Best effort: a single broken article must not stop the crawl.
                        print(f"Error: {e}")

                if numlost >= max_numlost:
                    print(f"Error {category}|{sub_type}. Exiting....")
                    break
                page += 1

            # NOTE(review): numlost is only reset after a successful write and
            # the outermost loop has no matching break, so after this abort the
            # next section starts with the counter still at the limit. Control
            # flow kept as it was - confirm whether the whole crawl should stop.
            if numlost >= max_numlost:
                print(f"Error {category}|{sub_type}. Exiting.....")
                break


Processing types:   0%|          | 0/16 [00:00<?, ?it/s]

Total: 1
Total: 2
Total: 3
Total: 4
Total: 5
Total: 6
Total: 7
Total: 8
Total: 9
Total: 10
Total: 11
Total: 12
Total: 13
Total: 14
Total: 15
Total: 16
Total: 17
Total: 18
Total: 19
Total: 20
Total: 21
Total: 22
Total: 23
Total: 24
Total: 25
Total: 26
Total: 27
Total: 28
Total: 29
Total: 30
Total: 31
Total: 32




Total: 33
Total: 34
Total: 35
Total: 36
Total: 37
Total: 38
Total: 39
Total: 40
Total: 41
Total: 42
Total: 43
Total: 44
Total: 45
Total: 46
Total: 47
Total: 48
Total: 49
Total: 50
Total: 51
Total: 52
Total: 53
Total: 54
Total: 55
Total: 56
Total: 57
Total: 58
Total: 59
Total: 60
Total: 61
Total: 62
Total: 63
Total: 64
Total: 65




Total: 66
Total: 67
Total: 68
Total: 69
Total: 70
Total: 71
Total: 72
Total: 73
Total: 74
Total: 75
Total: 76
Total: 77
Total: 78
Total: 79
Total: 80
Total: 81
Total: 82
Total: 83
Total: 84
Total: 85
Total: 86
Total: 87
Total: 88
Total: 89
Total: 90
Total: 91
Total: 92
Total: 93
Total: 94
Total: 95
Total: 96
Total: 97
Total: 98




Total: 99
Total: 100
Total: 101
Total: 102
Total: 103
Total: 104
Total: 105
Total: 106
Total: 107
Total: 108
Total: 109
Total: 110
Total: 111
Total: 112
Total: 113
Total: 114
Total: 115
Total: 116
Total: 117
Total: 118
Total: 119
Total: 120
Total: 121
Total: 122
Total: 123
Total: 124
Total: 125
Total: 126
Total: 127
Total: 128
Total: 129
Total: 130
Total: 131




Total: 132
Total: 133
Total: 134
Total: 135
Total: 136
Total: 137
Total: 138
Total: 139
Total: 140
Total: 141
Total: 142
Total: 143
Total: 144
Total: 145
Total: 146
Total: 147
Total: 148
Total: 149
Total: 150
Total: 151
Total: 152
Total: 153
Total: 154
Total: 155
Total: 156
Total: 157
Total: 158
Total: 159
Total: 160
Total: 161
Total: 162
Total: 163
Total: 164




Total: 165
Total: 166
Total: 167
Total: 168
Total: 169
Total: 170
Total: 171
Total: 172
Total: 173
Total: 174
Total: 175
Total: 176
Total: 177
Total: 178
Total: 179
Total: 180
Total: 181
Total: 182
Total: 183
Total: 184
Total: 185
Total: 186
Total: 187
Total: 188
Total: 189
Total: 190
Total: 191
Total: 192
Total: 193
Total: 194
Total: 195
Total: 196
Total: 197


Processing subtypes for thoi-su: 100%|██████████| 6/6 [03:08<00:00, 31.40s/it]
Processing types:   6%|▋         | 1/16 [03:08<47:05, 188.39s/it]

Total: 198




Total: 199
Total: 200
Total: 201
Total: 202
Total: 203
Total: 204
Total: 205
Total: 206
Total: 207
Total: 208
Total: 209
Total: 210
Total: 211
Total: 212
Total: 213
Total: 214
Total: 215
Total: 216
Total: 217
Total: 218
Total: 219
Total: 220
Total: 221
Total: 222
Total: 223
Total: 224
Total: 225
Total: 226
Total: 227
Total: 228
Total: 229
Total: 230




Total: 231
Total: 232
Total: 233
Total: 234
Total: 235
Total: 236
Total: 237
Total: 238
Total: 239
Total: 240
Total: 241
Total: 242
Total: 243
Total: 244
Total: 245
Total: 246
Total: 247
Total: 248
Total: 249
Total: 250
Total: 251
Total: 252
Total: 253
Total: 254
Total: 255
Total: 256
Total: 257
Total: 258


Processing subtypes for goc-nhin:  11%|█         | 1/9 [01:52<15:03, 112.88s/it]
Processing types:   6%|▋         | 1/16 [05:01<1:15:19, 301.27s/it]


KeyboardInterrupt: 