In [1]:
import mysql.connector

In [2]:
import httpx
import asyncio
import datetime
import pandas as pd
import streamlit as st
from selectolax.parser import HTMLParser
from typing import List, Dict, Union
import mysql.connector

In [4]:
async def fetch_page(url: str, params: dict, headers: dict) -> Union[str, None]:
    """Fetch a webpage content with error handling."""
    async with httpx.AsyncClient() as client:
        try:
            response = await client.get(url, params=params, headers=headers, timeout=10.0)
            return response.text
        except httpx.TimeoutException:
            st.error(f"Timeout: unable to connect to {url}. Please try again.")
            return None
        except httpx.HTTPStatusError as e:
            st.error(f"HTTP error: {str(e)}")
            return None

In [5]:
async def parse_content(url: str) -> str:
    """Extract content from a given URL."""
    html = await fetch_page(url, {}, {})
    # if not html:
    #     return "Error fetching content."

    parser = HTMLParser(html)
    paragraphs = [p.text() for p in parser.css('div.detail__body-text > p')]
    return "\n".join(paragraphs) if paragraphs else "No content available."

In [28]:
# url = "https://lampung.tribunnews.com/search?q=&cx=partner-pub-016364a8b29784e5c&cof=FORID%3A10&ie=UTF-8&siteurl=www.tribunnews.com#gsc.tab=0"
# url = "https://www.detik.com/search/searchall?"
url = "https://lampung.tribunnews.com/search?q=dtsen&cx=partner-pub-016364a8b29784e5c&cof=FORID%3A10&ie=UTF-8&siteurl=www.tribunnews.com#gsc.tab=0&gsc.q=dtsen&gsc.page=1"
headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/244.178.44.111 Safari/537.36",
    }


In [29]:
# html = await fetch_page(url, {}, {})
html = await fetch_page(url, 'dtsen', headers)
html

'<!DOCTYPE html>\n<html lang="id-ID" itemscope="itemscope" itemtype="https://schema.org/WebPage">\n<head>\n<title>Hasil pencarian untuk  - Tribunlampung.co.id</title>\n<link href="//securepubads.g.doubleclick.net" rel="dns-prefetch">\n<link href="//asset-1.tribunnews.com" rel="dns-prefetch">\n<link href="//asset-2.tribunnews.com" rel="dns-prefetch">\n<link href="//asset-3.tribunnews.com" rel="dns-prefetch">\n<link href="//tpc.googlesyndication.com" rel="dns-prefetch">\n<link href="//cm.g.doubleclick.net" rel="dns-prefetch">\n<link href="//lampung.tribunnews.com" rel="dns-prefetch">\n<link rel="dns-prefetch" href="https://www.facebook.com">\n<link rel="preconnect" href="https://www.googletagmanager.com">\n<link rel="preconnect" href="https://www.google-analytics.com">\n<link rel="preload" href="https://asset-1.tribunnews.com/css/theme25/network/web-daerah-250721.min.css" as="style">\n<link rel="preload" href="//maxcdn.bootstrapcdn.com/font-awesome/4.5.0/css/font-awesome.min.css" as="sty

In [25]:
parser = HTMLParser(html)
parser

<HTMLParser chars=91726>

In [22]:
search_results = parser.css('article')
search_results

[]

In [23]:
paragraphs = [p.text() for p in parser.css('div.detail__body-text > p')]
paragraphs

[]

In [7]:
async def parse_item(result) -> Dict[str, str]:
    """Extract information from a single search result."""
    title = result.css_first('h3.media__title').text()
    date = result.css_first('.media__date > span').attrs['title']
    url = result.css_first('a').attrs['href']
    desc_element = result.css_first('div.media__desc')
    desc = desc_element.text() if desc_element else "No description"

    # Fetch content for each item
    content = await parse_content(url)

    return {
        'title': title,
        'url': url,
        'date': date,
        'desc': desc,
        'content': content
    }


In [8]:
async def parse(url: str, params: dict, headers: dict) -> List[Dict[str, str]]:
    """Parse search results from the page and extract details."""
    html = await fetch_page(url, params, headers)
    if not html:
        return []

    parser = HTMLParser(html)
    search_results = parser.css('article')

    # Parse each result concurrently
    return await asyncio.gather(*[parse_item(result) for result in search_results])


In [9]:
async def fetch_json(url: str, headers: dict = None) -> Union[Dict, None]:
    """Fetch JSON data from the provided URL with error handling."""
    async with httpx.AsyncClient() as client:
        try:
            response = await client.get(url, headers=headers, timeout=10.0)
            return response.json()
        except httpx.TimeoutException:
            st.error(f"Timeout: unable to connect to {url}. Please try again.")
            return None
        except httpx.HTTPStatusError as e:
            st.error(f"HTTP error: {str(e)}")
            return None


In [11]:
def insert_news_to_db(data: Dict[str, str]):
    """Insert scraped news item into MySQL database."""
    try:
        # Koneksi ke database
        conn = mysql.connector.connect(
            host="localhost",        # ganti sesuai server
            user="root",             # ganti username MySQL
            password="",             # ganti password MySQL
            database="siger-lampung" # database tujuan
        )
        cursor = conn.cursor()

        # Query insert
        query = """
        INSERT INTO news (nama, tanggal_berita, tanggal_update, link)
        VALUES (%s, %s, %s, %s)
        """

        # Konversi tanggal_berita dari string ke format DATE MySQL
        try:
            tanggal_berita = datetime.datetime.strptime(data["date"], "%d %B %Y").date()
        except ValueError:
            # fallback kalau format tanggal beda
            tanggal_berita = datetime.date.today()

        tanggal_update = datetime.date.today()

        values = (
            data["title"],           # nama
            tanggal_berita,          # tanggal_berita
            tanggal_update,          # tanggal_update
            data["url"]              # link
        )

        # Eksekusi query
        cursor.execute(query, values)
        conn.commit()

        print(f"Berhasil insert: {data['title']}")
    except mysql.connector.Error as err:
        print(f"Error: {err}")
    finally:
        if conn.is_connected():
            cursor.close()
            conn.close()

In [31]:
async def parse(url: str, params: dict, headers: dict) -> List[Dict[str, str]]:
    """Parse search results from the page and extract details."""
    html = await fetch_page(url, params, headers)
    if not html:
        return []

    parser = HTMLParser(html)
    search_results = parser.css('article')

    # Parse each result concurrently
    results = await asyncio.gather(*[parse_item(result) for result in search_results])

    # Setelah semua selesai, masukkan ke DB
    for item in results:
        insert_news_to_db(item)

    return results


In [27]:
search_url = "https://www.detik.com/search/searchall?"
headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/244.178.44.111 Safari/537.36",
    }
params = {"query": "dtsen", "page": 1, }

In [None]:
async def main():
    search_url = "https://www.detik.com/search/searchall?"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/244.178.44.111 Safari/537.36",
    }
    params = {"query": "dtsen", "page": 1}

    # Jalankan parsing
    results = await parse(search_url, params, headers)

    print(f"{len(results)} berita berhasil diproses dan dimasukkan ke DB.")


In [32]:
search_url = "https://www.detik.com/search/searchall?"
headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/244.178.44.111 Safari/537.36",
}
params = {"query": "dtsen", "page": 1}

# Jalankan parsing
results = await parse(search_url, params, headers)

print(f"{len(results)} berita berhasil diproses dan dimasukkan ke DB.")

Berhasil insert: 
            Rakor Akselerasi Pembangunan NTT, Gus Ipul Tekankan Pedoman DTSEN
         
Berhasil insert: 
            Apa Bedanya DTKS dan DTSEN? Simak Juga Cara Daftarnya
         
Berhasil insert: 
            Cara Cek Status Pendaftaran DTKS, Kini Sudah Berganti Jadi DTSEN
         
Berhasil insert: 
            Video: Menag Pastikan Dana Zakat akan Tepat Sasaran Lewat DTSEN
         
Berhasil insert: 
            Kemendagri Dukung Akselerasi Program 3 Juta Rumah Lewat Integrasi DTSEN
         
Berhasil insert: 
            Wamensos Ajak Pemda Jadikan DTSEN Landasan Program Pengentasan Kemiskinan
         
Berhasil insert: 
            Bansos Tahap II Siap Disalurkan, Mengacu pada DTSEN
         
Berhasil insert: 
            DTSEN Rampung. Mensos Sebut Kolaborasi Banyak Kementerian
         
Berhasil insert: 
            Kemensos & BPS Mutakhirkan DTSEN untuk Penyaluran Bansos Triwulan II
         
Berhasil insert: 
            DTKS Resmi Diganti DTSEN, Penyaluran

In [14]:
items = await parse(search_url, params, headers)

In [20]:
items[0]

{'title': '\n            Rakor Akselerasi Pembangunan NTT, Gus Ipul Tekankan Pedoman DTSEN\n         ',
 'url': 'https://news.detik.com/berita/d-8047705/rakor-akselerasi-pembangunan-ntt-gus-ipul-tekankan-pedoman-dtsen',
 'date': 'Rabu, 06 Agu 2025 14:10 WIB',
 'desc': '\n            Gus Ipul menekankan pentingnya penyusunan program yang berpedoman pada Data Tunggal Sosial Ekonomi Nasional (DTSEN).         ',
 'content': 'Menteri Sosial Saifullah Yusuf (Gus Ipul) menekankan pentingnya penyusunan program yang berpedoman pada Data Tunggal Sosial Ekonomi Nasional (DTSEN) serta perubahan paradigma dari pemberian bantuan sosial ke pemberdayaan.\nHal ini disampaikan Gus Ipul saat menghadiri secara virtual Rapat Koordinasi Percepatan Implementasi Program Pemerintah Pusat di Provinsi Nusa Tenggara Timur (NTT). Rapat yang digelar secara hybrid ini dihadiri oleh berbagai perwakilan Kementerian dan Lembaga serta seluruh jajaran Pemerintah Kabupaten/Kota di Provinsi NTT.\n"Kami ingin mengajak Bapak

In [None]:
async def main():
    """Main Streamlit application function."""
    search_url = "https://www.detik.com/search/searchall?"

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/244.178.44.111 Safari/537.36",
    }

    # Streamlit container to group the input elements
    with st.container():
        st.title("Scraping Berita DTSEN")

        keyword = st.text_input("Search keyword")

        # Total pages and export options
        pages = int(st.text_input("Total Pages", value="1"))
        export_format = st.selectbox("Export to", ["CSV", "XLSX", "JSON"])

        # Disable Scrape button when keyword is empty
        scrape_button = st.button("Scrape", disabled=not keyword)

    if scrape_button and keyword:
        with st.spinner(f"Scraping results for '{keyword}'..."):
            now = datetime.datetime.now()
            formatted_time = now.strftime("%Y%m%d_%H%M%S")

            params = {
                "query": keyword,
                "page": 1,  # Placeholder, will be adjusted in loop
            }

            # Collect data for all pages asynchronously
            all_items = []
            for page in range(1, pages + 1):
                params['page'] = page
                items = await parse(search_url, params, headers)
                all_items.extend(items)

            if all_items:
                data = pd.DataFrame(all_items)
                data.index += 1
                st.dataframe(data)

                file_name = f"{formatted_time}_{keyword}_{pages}.{export_format.lower()}"
                
                if export_format == "CSV":
                    csv_data = data.to_csv(index=False)
                    st.download_button("Download CSV", data=csv_data, file_name=file_name, mime="text/csv")
                elif export_format == "XLSX":
                    xlsx_data = data.to_excel(index=False, engine='openpyxl')
                    st.download_button("Download XLSX", data=xlsx_data, file_name=file_name, mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
                else:  # JSON
                    json_data = data.to_json(orient='records')
                    st.download_button("Download JSON", data=json_data, file_name=file_name, mime="application/json")
                
                st.success("Scraping completed!")
            else:
                st.error("No data scraped.")


In [22]:
if __name__ == '__main__':
    asyncio.run(main())

RuntimeError: asyncio.run() cannot be called from a running event loop

In [31]:
pip install selenium

Collecting selenium
  Downloading selenium-4.34.2-py3-none-any.whl.metadata (7.5 kB)
Collecting trio~=0.30.0 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting websocket-client~=1.8.0 (from selenium)
  Downloading websocket_client-1.8.0-py3-none-any.whl.metadata (8.0 kB)
Collecting sortedcontainers (from trio~=0.30.0->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.30.0->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting cffi>=1.14 (from trio~=0.30.0->selenium)
  Downloading cffi-1.17.1-cp310-cp310-win_amd64.whl.metadata (1.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.12.2->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting pysocks!=1.5.7,<2.0,>=1.5.6 (from urllib3[socks]~=2.5.

In [33]:
import re
import string
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from random import randint
from time import sleep
import pandas as pd

In [35]:
pip install xmltodict

Collecting xmltodict
  Downloading xmltodict-0.14.2-py2.py3-none-any.whl.metadata (8.0 kB)
Downloading xmltodict-0.14.2-py2.py3-none-any.whl (10.0 kB)
Installing collected packages: xmltodict
Successfully installed xmltodict-0.14.2
Note: you may need to restart the kernel to use updated packages.


In [36]:
import urllib3
import pandas as pd
import xmltodict

In [38]:
http = urllib3.PoolManager()

In [40]:
url = "https://lampung.tribunnews.com/search?q=dtsen&cx=partner-pub-016364a8b29784e5c&cof=FORID%3A10&ie=UTF-8&siteurl=www.tribunnews.com#gsc.tab=0&gsc.q=dtsen&gsc.page=1" 
    
resp = http.request('GET', url)


In [43]:
resp.data

b'\n<!DOCTYPE html>\n<html lang=en>\n  <meta charset="UTF-8">\n  <meta name="viewport" content="width=device-width">\n  <title>TribunNews.com - 403</title>\n  <link rel="shortcut icon" href="https://asset-1.tstatic.net/img/icon/tribun-icon_32.png" />\n  <link rel="apple-touch-icon-precomposed" href="https://asset-1.tstatic.net/img/icon/tribun-icon_128.png"/>\n  <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.5.0/css/font-awesome.min.css">\n  <style>\n@import url("https://fonts.googleapis.com/css?family=Oswald:400,700");* {box-sizing: border-box;margin: 0;padding: 0;}html {height: 100%;}body {background: #fff;font-family: "Oswald", sans-serif;}a {color: #016fba;text-decoration: none;}.container {position: absolute;top: 50%;left: 50%;transform: translate(-50%, -50%);display: flex;flex-direction: column;align-items: center;}@media (max-width: 650px) {.container {width: 85%;}}.container .header {color: #fb3958;font-size: 4em;font-weight: 700;text-align: center;t

In [45]:
parser = HTMLParser(resp.data)

In [None]:
parser.data

<HTMLParser chars=6600>

In [47]:
paragraphs = [p.text() for p in parser.css('div.detail__body-text > p')]

In [51]:
parser.css('div.detail__body-text > p')

[]

In [42]:
data = xmltodict.parse(resp.data)

ExpatError: not well-formed (invalid token): line 3, column 11

In [52]:
pip install beautifulsoup4

Collecting beautifulsoup4
  Downloading beautifulsoup4-4.13.4-py3-none-any.whl.metadata (3.8 kB)
Collecting soupsieve>1.2 (from beautifulsoup4)
  Downloading soupsieve-2.7-py3-none-any.whl.metadata (4.6 kB)
Downloading beautifulsoup4-4.13.4-py3-none-any.whl (187 kB)
Downloading soupsieve-2.7-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, beautifulsoup4

   -------------------- ------------------- 1/2 [beautifulsoup4]
   -------------------- ------------------- 1/2 [beautifulsoup4]
   -------------------- ------------------- 1/2 [beautifulsoup4]
   -------------------- ------------------- 1/2 [beautifulsoup4]
   ---------------------------------------- 2/2 [beautifulsoup4]

Successfully installed beautifulsoup4-4.13.4 soupsieve-2.7
Note: you may need to restart the kernel to use updated packages.


In [53]:
import requests
from bs4 import BeautifulSoup

In [None]:
url = "https://lampung.tribunnews.com/search?q=dtsen&cx=partner-pub-016364a8b29784e5c&cof=FORID%3A10&ie=UTF-8&siteurl=www.tribunnews.com#gsc.tab=0&gsc.q=dtsen&gsc.page=1"
res = requests.get(url)
html = BeautifulSoup(res.content, "html.parser")
# divs = rows.findAll("div", class_ = "bsh ovh ")

NameError: name 'rows' is not defined

In [56]:
html


<!DOCTYPE html>

<html lang="en">
<meta charset="utf-8"/>
<meta content="width=device-width" name="viewport"/>
<title>TribunNews.com - 403</title>
<link href="https://asset-1.tstatic.net/img/icon/tribun-icon_32.png" rel="shortcut icon"/>
<link href="https://asset-1.tstatic.net/img/icon/tribun-icon_128.png" rel="apple-touch-icon-precomposed"/>
<link href="https://maxcdn.bootstrapcdn.com/font-awesome/4.5.0/css/font-awesome.min.css" rel="stylesheet"/>
<style>
@import url("https://fonts.googleapis.com/css?family=Oswald:400,700");* {box-sizing: border-box;margin: 0;padding: 0;}html {height: 100%;}body {background: #fff;font-family: "Oswald", sans-serif;}a {color: #016fba;text-decoration: none;}.container {position: absolute;top: 50%;left: 50%;transform: translate(-50%, -50%);display: flex;flex-direction: column;align-items: center;}@media (max-width: 650px) {.container {width: 85%;}}.container .header {color: #fb3958;font-size: 4em;font-weight: 700;text-align: center;text-shadow: 2px 2px 5