### Scraping News URLs from Detik.com

In [223]:
import re
import bs4
import json
import time
import timeit
import threading
import requests
import numpy as np
import pandas as pd
import concurrent.futures
from datetime import date, timedelta, datetime

In [259]:
def datelist_generator(from_date, to_date, format_output="%d/%m/%Y"):
    """Return every day from ``from_date`` to ``to_date`` (inclusive) as strings.

    ``from_date`` and ``to_date`` must be comparable with each other and
    support ``+ timedelta`` and ``strftime`` (for example ``datetime.date``).
    Each day is rendered with ``format_output``. The result is an empty list
    when ``from_date`` is later than ``to_date``.
    """
    one_day = timedelta(days=1)

    def _days(current):
        # Step forward one day at a time until the end date has been passed.
        while current <= to_date:
            yield current
            current += one_day

    return [day.strftime(format_output) for day in _days(from_date)]

def reformat_date_to_str(tobe_reformat_date, format_output):
    """Render ``tobe_reformat_date`` as text using the ``strftime`` pattern ``format_output``."""
    formatted = tobe_reformat_date.strftime(format_output)
    return formatted


def get_soup(link_url, timeout=30):
    """Download ``link_url`` and return its HTML parsed into a BeautifulSoup tree.

    Args:
        link_url: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up; handed
            straight to ``requests.get``. Defaults to 30.

    Returns:
        A ``bs4.BeautifulSoup`` object built from the response body with the
        ``html.parser`` backend.

    Raises:
        requests.exceptions.RequestException: if the request fails, including
            when ``timeout`` expires.
    """
    # requests waits indefinitely when no timeout is given, which would park
    # a worker thread forever on a stalled connection.
    response = requests.get(link_url, timeout=timeout)
    return bs4.BeautifulSoup(response.text, 'html.parser')

def get_max_page(soup):
    """Return the highest page number shown in the pagination links of ``soup``.

    Looks at every ``<a class="pagination__item itp-pagination">`` element and
    keeps the largest label that consists only of digits (surrounding
    whitespace is ignored). Non-numeric labels are skipped. The result is
    also printed.

    Returns:
        The largest page number found, or 0 when there is no numeric
        pagination link.
    """
    max_page = 0
    for element_pagination in soup.find_all(name="a", attrs={"class": "pagination__item itp-pagination"}):
        label = element_pagination.string
        # BeautifulSoup gives None for ``.string`` when the tag does not wrap
        # exactly one text node; re.match(None) would raise TypeError.
        if label is None:
            continue
        label = label.strip()
        # fullmatch instead of match: a prefix-only match lets labels such as
        # "12abc" through and int() then raises ValueError.
        if re.fullmatch(r"\d+", label):
            max_page = max(max_page, int(label))
    print(f"max_page: {max_page}")
    return max_page

def get_n_news_perpage(soup):
    """Count the ``<article>`` elements that ``soup.find_all`` returns."""
    article_tags = soup.find_all("article")
    return len(article_tags)

def collect_urls_perpage(soup, n_news_perpage):
    """Return the first link of each of the first ``n_news_perpage`` articles.

    Args:
        soup: Parsed page; must offer ``find_all("article")``.
        n_news_perpage: Maximum number of ``<article>`` elements to inspect.
            Values larger than the number of articles present are clamped;
            values below zero are treated as zero.

    Returns:
        List of ``href`` values, in page order, taken from the first ``<a>``
        inside each inspected article. Articles without an ``<a>`` or without
        a non-empty ``href`` are skipped.
    """
    # Query the tree once; re-running find_all for every index made the loop
    # quadratic in the number of articles.
    articles = soup.find_all("article")
    urls = []
    for article in articles[:max(n_news_perpage, 0)]:
        anchor = article.find('a')
        if anchor is None:
            # No link in this article; calling .get on None would raise.
            continue
        href = anchor.get("href")
        if href:
            urls.append(href)
    return urls

def get_urls_task(page_i, keyword, from_date, to_date):
    """Fetch one search-result page and return its article links without duplicates.

    The page is requested through ``get_soup``; links are extracted with
    ``get_n_news_perpage`` and ``collect_urls_perpage``. ``page_i``,
    ``keyword``, ``from_date`` and ``to_date`` are inserted into the search
    URL unchanged. The order of the returned list is not defined because the
    links pass through a ``set``.
    """
    search_url = f"https://www.detik.com/search/searchnews?query={keyword}&page={page_i}&result_type=relevansi&siteid=3&fromdatex={from_date}&todatex={to_date}"
    page_soup = get_soup(search_url)
    article_count = get_n_news_perpage(page_soup)
    page_links = collect_urls_perpage(page_soup, article_count)
    return list(set(page_links))

def detik_page_url_generator(from_date:str,to_date:str,keyword:str) -> list:
    """Collect the unique article URLs returned by a detik.com news search.

    Page 1 is downloaded first to read the pagination; its links are taken
    from that same download, and the remaining pages are fetched by a pool of
    4 worker threads through ``get_urls_task``. The total count and the
    elapsed time are printed.

    Args:
        from_date: Start of the date range, as a 'dd/mm/yyyy' string.
        to_date: End of the date range, as a 'dd/mm/yyyy' string.
        keyword: One search keyword; inserted into the query string as is.

    Returns:
        List of unique URLs gathered from all result pages. The order is not
        defined.
    """
    page = 1
    template = f"https://www.detik.com/search/searchnews?query={keyword}&page={page}&result_type=relevansi&siteid=3&fromdatex={from_date}&todatex={to_date}"
    soup = get_soup(template)
    max_page = get_max_page(soup)

    start_time = timeit.default_timer()
    # Page 1 is already in hand, so harvest it directly instead of requesting
    # it again. This also covers result sets that show no pagination links
    # (max_page == 0), which previously produced an empty result.
    unique_urls = set(collect_urls_perpage(soup, get_n_news_perpage(soup)))
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        future_task = [
            executor.submit(get_urls_task, page_i=page_i, keyword=keyword, from_date=from_date, to_date=to_date)
            for page_i in range(2, max_page + 1)
        ]
        for future in concurrent.futures.as_completed(future_task):
            unique_urls.update(future.result())
    end_time = timeit.default_timer()

    urls_all = list(unique_urls)
    print(f"Finished collect all url, total : {len(urls_all)}, {end_time - start_time:.2f}s")
    return urls_all
    
    

In [260]:
# Trial run for a single keyword over the 01/01/2024 - 29/08/2024 range.
urls_all = detik_page_url_generator(
    keyword="ekonomi",
    from_date="01/01/2024",
    to_date="29/08/2024",
)

max_page: 356
Finished collect all url, total : 3234, 78.58s


In [267]:
# Search terms to crawl, one detik.com search per keyword.
keywords = [
    'demo', 'anies', 'gempa', 'ekonomi', 'politik',
    'teknologi', 'olahraga', 'wisata', 'musik', 'film',
]

# Flat list of every URL found (duplicates across keywords included) and the
# same URLs grouped by the keyword that produced them.
all_keywords_urls_list = []
all_keywords_urls_dict = {}
for keyword in keywords:
    print(f"/============/{keyword}/============/")
    urls_keyword = detik_page_url_generator(
        from_date="01/01/2024",
        to_date="29/08/2024",
        keyword=keyword,
    )
    all_keywords_urls_dict[keyword] = urls_keyword
    all_keywords_urls_list += urls_keyword

print(f"Total URL: {len(all_keywords_urls_list)}")

# The same article can match several keywords, so de-duplicate across them.
uniq_urls = list(set(all_keywords_urls_list))
print(f"Unique url collected: {len(uniq_urls)}")

max_page: 88
Finished collect all url, total : 796, 17.32s
max_page: 454
Finished collect all url, total : 3998, 96.35s
max_page: 82
Finished collect all url, total : 799, 14.52s
max_page: 356
Finished collect all url, total : 3248, 77.18s
max_page: 841
Finished collect all url, total : 6891, 168.57s
max_page: 196
Finished collect all url, total : 1837, 34.74s
max_page: 70
Finished collect all url, total : 657, 14.31s
max_page: 85
Finished collect all url, total : 820, 16.44s
max_page: 42
Finished collect all url, total : 416, 7.35s
max_page: 43
Finished collect all url, total : 422, 8.04s
Total URL: 19884
Unique url collected: 16643


In [265]:
from newspaper import Article
from newspaper.utils import BeautifulSoup

def get_news(url):
    """Download one article with newspaper and return its main fields as a dict.

    Args:
        url: Address of the article page.

    Returns:
        Dict with the keys ``authors`` and ``meta_keywords`` (comma-separated
        strings), ``title``, ``publish_date`` (formatted ``YYYY-MM-DD HH:MM``,
        or ``None`` when the article has no detected date),
        ``meta_site_name``, ``meta_description`` and ``text`` (the title, a
        newline, then the article body).
    """
    # The language has to be given by keyword: in newspaper's Article
    # signature the second positional parameter is the title, so the former
    # Article(url, 'id') never selected Indonesian.
    article = Article(str(url), language='id')
    article.download()
    article.parse()

    title = article.title
    # publish_date is None when no date could be extracted from the page;
    # calling strftime on it would raise AttributeError.
    published = article.publish_date
    publish_date = published.strftime("%Y-%m-%d %H:%M") if published is not None else None

    return {
            "authors" : ", ".join(article.authors),
            "title" : title,
            "publish_date" : publish_date,
            "meta_site_name" : article.meta_site_name,
            "meta_description" : article.meta_description,
            "meta_keywords" : ", ".join(article.meta_keywords),
            "text" : title+'\n'+article.text,
        }

def get_all_news(urls):
    """Download every article in ``urls`` and return the parsed records.

    Each URL is handed to ``get_news`` on a pool of 4 worker threads. A URL
    whose download or parsing raises is reported with ``print`` and left out
    of the result, so one bad page does not discard the whole batch. The
    number of collected records and the elapsed time are printed at the end.

    Args:
        urls: Iterable of article URLs.

    Returns:
        List with one dict per successfully processed URL, in completion
        order (not input order).
    """
    start_time = timeit.default_timer()
    news_all = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        # Remember which URL each future belongs to so failures can be named.
        future_to_url = {executor.submit(get_news, url=url): url for url in urls}
        for future in concurrent.futures.as_completed(future_to_url):
            try:
                record = future.result()
            except Exception as error:
                # Batch boundary: report the failing URL and keep going.
                print(f"Skipped {future_to_url[future]}: {error!r}")
                continue
            # append, not extend: extending a list with a dict adds only the
            # dict's keys, which reduced the output to the seven field names.
            news_all.append(record)
    end_time = timeit.default_timer()
    print(f"Finished collect all news, total : {len(news_all)}, {end_time - start_time:.2f}s")
    return news_all

In [275]:
# Scrape every unique URL, store the table as Parquet and print its summary.
df_news = pd.DataFrame(data=get_all_news(uniq_urls))
df_news.to_parquet(path='detiknews.parquet', engine='fastparquet')
df_news.info()