In [570]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import pandas as pd
import time
import feedparser
from tqdm import tqdm
import random
import numpy as np
import re
from urllib.parse import urlparse
import sqlite3

import warnings
warnings.filterwarnings("ignore")

In [571]:
db_path = '../data/db/stock_news.db'
conn = sqlite3.connect(db_path)

# CNBC

In [560]:
last_date = pd.read_sql("SELECT MAX(published) AS last_date FROM news WHERE SOURCE = 'cnbc'", conn)["last_date"].iloc[0]

In [562]:
last_date = pd.to_datetime(last_date)

In [563]:
last_date

Timestamp('2025-08-08 20:20:10+0700', tz='UTC+07:00')

In [564]:
rss_url = "https://www.cnbcindonesia.com/market/rss"
feed = feedparser.parse(rss_url)
data = []
for entry in feed.entries:
    data.append({
        "title": entry.title,
        "link": entry.link,
        "published": entry.published
    })

df = pd.DataFrame(data)
df["published"] = pd.to_datetime(df["published"])

In [568]:
df = df[df['published']>last_date]

In [569]:
df.head()

Unnamed: 0,title,link,published
0,"Perkara Modal Minim, OJK Pantau Ketat 11 Pindar",https://www.cnbcindonesia.com/market/202508100...,2025-08-10 08:45:17+07:00
1,Cek! Daftar 22 Bank Perkreditan Rakyat RI yang...,https://www.cnbcindonesia.com/market/202508100...,2025-08-10 08:15:38+07:00
2,"Cek 5 Orang Terkaya RI per Agustus 2025, Ada M...",https://www.cnbcindonesia.com/market/202508100...,2025-08-10 07:40:40+07:00
3,"Perusahaan Segera IPO, JK Bakal Raup Rp 162 Tr...",https://www.cnbcindonesia.com/market/202508100...,2025-08-10 07:20:34+07:00
4,"Setelah Rekening Bank, PPATK Pantau E-Wallet N...",https://www.cnbcindonesia.com/market/202508100...,2025-08-10 06:40:32+07:00


In [120]:
for i, row in tqdm(df.iterrows(), total=df.shape[0]):
    link= row['link']
    resp = requests.get(link)
    soup = BeautifulSoup(resp.text, "html.parser")
    detail_div = soup.find("div", class_="detail-text")
    paragraphs = detail_div.find_all("p")
    content = " ".join([p.get_text() for p in paragraphs])
    df.at[i, 'content'] = content

    wait_time = random.uniform(1, 25)
    time.sleep(wait_time)

100%|██████████| 100/100 [22:28<00:00, 13.49s/it]


In [393]:
def clean_invisible_spaces(text):
    return re.sub(r'[\u00A0\u2000-\u200B\u202F\u205F\u3000]', ' ', text).strip()

def clean_text(text):
    text = clean_invisible_spaces(text)
    text = re.sub('Jakarta, CNBC Indonesia', '', text)
    text = re.sub(r'- |— |IDXChannel—|KONTAN.CO.ID JAKARTA.|Bisnis.com , JAKARTA|JAKARTA, investor.id', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

exclude_phrases = [
    'Reporter', 
    'Editor', 
    'Baca Juga', 
    'Cek Berita dan Artikel yang lain', 
    'Menarik Dibaca', 
    'Selanjutnya:'
]

In [143]:
df['content'] = df['content'].apply(clean_text)

In [144]:
df['source'] = 'CNBC'

In [145]:
df.to_csv("cnbc_news.csv", index=False)

# KONTAN

In [587]:
conn = sqlite3.connect(db_path)
query = f"""
            SELECT MAX(published) AS last_date
            FROM news
            WHERE source = 'kontan'
        """
last_date = pd.read_sql(query, conn)["last_date"].iloc[0]
last_date = pd.to_datetime(last_date)
last_date = last_date.strftime('%Y-%m-%d 00:00:00')
query = f"""
            SELECT DISTINCT link
            FROM news
            WHERE DATETIME(published) >= ?
            AND source = 'kontan'
        """
df = pd.read_sql(query, conn, params=(last_date,))

In [588]:
last_date

'2025-08-09 00:00:00'

In [589]:
df

Unnamed: 0,link
0,https://investasi.kontan.co.id/news/ihsg-mengu...
1,https://investasi.kontan.co.id/news/harga-emas...
2,https://investasi.kontan.co.id/news/ihsg-ditut...
3,https://investasi.kontan.co.id/news/makmur-ber...
4,https://investasi.kontan.co.id/news/18-agustus...
5,https://investasi.kontan.co.id/news/ihsg-melem...
6,https://investasi.kontan.co.id/news/cek-saham-...
7,https://investasi.kontan.co.id/news/emiten-yan...
8,https://investasi.kontan.co.id/news/wall-stree...
9,https://investasi.kontan.co.id/news/rupiah-men...


In [577]:
last_date

Timestamp('2025-08-09 00:00:00')

In [159]:
base_url = "https://www.kontan.co.id/search/indeks?kanal=investasi&tanggal={day}&bulan={month}&tahun={year}&pos=indeks&per_page={per_page}"
def get_kontan_news(day, month, year, per_page=''):
    url = base_url.format(day=day, month=month, year=year, per_page=per_page)
    resp = requests.get(url)
    soup = BeautifulSoup(resp.text, "html.parser")
    return soup

In [None]:
dates = pd.date_range(start='2025-08-01', end='2025-08-09', freq='D')
dates_links = {}
for date in dates:
    links = []
    day = date.strftime('%d')
    month = date.strftime('%m')
    year = date.strftime('%Y')
    per_page = 0
    while True:
        soup = get_kontan_news(day, month, year, per_page=str(per_page))
        if not soup.find('div', class_='list-berita'):
            break
        if not soup.find('div', class_='list-berita').find_all('div', class_='sp-hl linkto-black'):
            break
        for soup_link in soup.find('div', class_='list-berita').find_all('div', class_='sp-hl linkto-black'):
            link = soup_link.find('a')['href']
            links.append(link)
        per_page += 20
    dates_links[date] = links

In [181]:
df = pd.DataFrame(columns=['date', 'published'])
for date, links in dates_links.items():
    df = pd.concat([df, pd.DataFrame({'date': date, 'published': links})], ignore_index=True)

  df = pd.concat([df, pd.DataFrame({'date': date, 'published': links})], ignore_index=True)


In [222]:

for i, row in tqdm(df.iterrows(), total=df.shape[0]):
    link = row['published']
    resp = requests.get(link)
    soup = BeautifulSoup(resp.text, "html.parser")
    df.loc[i, 'title'] = soup.find('title').get_text()
    element = soup.find(attrs={"itemprop": 'articleBody'})
    if element:
        paragraphs = element.find_all('p')
        if paragraphs:
            content = " ".join(
                p.get_text()
                for p in paragraphs
                if not any(phrase in p.get_text() for phrase in exclude_phrases)
            )
            content = clean_text(content)
            df.loc[i, 'content'] = content
    
    wait_time = random.uniform(1, 25)
    time.sleep(wait_time)

100%|██████████| 447/447 [1:59:07<00:00, 15.99s/it]    


In [223]:
df.isna().sum()

date         0
published    0
title        0
content      0
dtype: int64

In [224]:
df.head()

Unnamed: 0,date,published,title,content
0,2025-08-01,https://investasi.kontan.co.id/news/asing-net-...,"Asing Net Sell Saat IHSG Menguat, Cek Saham ya...",KONTAN.CO.ID JAKARTA. Indeks Harga Saham Gabun...
1,2025-08-01,https://investasi.kontan.co.id/news/ihsg-mengh...,"IHSG Menghijau, Cermati Saham yang Banyak Dibo...",KONTAN.CO.ID JAKARTA. Indeks Harga Saham Gabun...
2,2025-08-01,https://investasi.kontan.co.id/news/saham-coco...,Saham COCO dan BUVA Disuspensi pada Jumat (1/8...,KONTAN.CO.ID JAKARTA. PT Bursa Efek Indonesia ...
3,2025-08-01,https://investasi.kontan.co.id/news/wall-stree...,"Wall Street Anjlok, Dipicu Tarif Trump dan Kin...",KONTAN.CO.ID NEW YORK. Indeks utama Wall Stree...
4,2025-08-01,https://investasi.kontan.co.id/news/komisaris-...,"Komisaris Tak Dapat Tantiem dan Bonus, Begini ...",KONTAN.CO.ID JAKARTA. Kebijakan Badan Pengelol...


In [225]:
df.columns = ['published', 'link', 'title', 'content']

In [226]:
df.to_csv('kontan_news.csv', index=False)

# BISNIS.COM

In [279]:
base_url = 'https://www.bisnis.com/index?categoryId=194&date={date}&type=indeks&page={page}'
def get_bisnis_news(date, page=1):
    url = base_url.format(date=date, page=page)
    resp = requests.get(url)
    soup = BeautifulSoup(resp.text, "html.parser")
    return soup

def extract_link_and_title(soup):
    links = []
    titles = []
    container = soup.find("div", id="indeksListView")
    if not container:
        return links, titles
    elements = container.find_all('div', class_='artContent')
    if not elements:
        return links, titles
    for element in elements:
        link = element.find('a',class_='artLink')['href']
        title = element.find('a',class_='artLink').find(class_='artTitle').get_text(strip=True)
        links.append(link)
        titles.append(title)
    return links, titles

df_list = []
dates = pd.date_range(start='2025-08-01', end='2025-08-09', freq='D')
for date in tqdm(dates):
    page = 1
    df_current_date = pd.DataFrame(columns=['published', 'link', 'title'])
    while True:
        soup = get_bisnis_news(date.strftime('%Y-%m-%d'), page)
        if not soup:
            break
        links, titles = extract_link_and_title(soup)
        if not links or not titles:
            break
        df_current_page = pd.DataFrame({'published': date, 'link': links, 'title': titles})
        if df_current_page['link'].isin(df_current_date['link']).any():
            break
        df_current_date = pd.concat([df_current_date, df_current_page], ignore_index=True)
        page += 1
    df_list.append(df_current_date)
df = pd.concat(df_list, ignore_index=True)

  df_current_date = pd.concat([df_current_date, df_current_page], ignore_index=True)
  df_current_date = pd.concat([df_current_date, df_current_page], ignore_index=True)
  df_current_date = pd.concat([df_current_date, df_current_page], ignore_index=True)
  df_current_date = pd.concat([df_current_date, df_current_page], ignore_index=True)
  df_current_date = pd.concat([df_current_date, df_current_page], ignore_index=True)
  df_current_date = pd.concat([df_current_date, df_current_page], ignore_index=True)
  df_current_date = pd.concat([df_current_date, df_current_page], ignore_index=True)
  df_current_date = pd.concat([df_current_date, df_current_page], ignore_index=True)
  df_current_date = pd.concat([df_current_date, df_current_page], ignore_index=True)
100%|██████████| 9/9 [00:19<00:00,  2.19s/it]


In [292]:
for i, row in tqdm(df.iterrows(), total=df.shape[0]):
    link = row['link']
    resp = requests.get(link)
    soup = BeautifulSoup(resp.text, "html.parser")
    paragraphs  = soup.find('article', class_='detailsContent').find_all('p')
    content = clean_text(' '.join([
        clean_text(p.get_text(separator=" ", strip=True))
        for p in paragraphs
        if p.get_text(strip=True) and not p.get_text(strip=True).startswith('#')
    ]))
    df.at[i, 'content'] = content
    wait_time = random.uniform(1, 25)
    time.sleep(wait_time)

100%|██████████| 223/223 [51:02<00:00, 13.73s/it]


In [293]:
df.head()

Unnamed: 0,published,link,title,content
0,2025-08-01,https://market.bisnis.com/read/20250801/192/18...,Rahasia Nasabah Binaan PNM Mekaar Dilirik Bran...,"Bisnis.com , JAKARTA Kisah Ibu Umi Haryanti, n..."
1,2025-08-01,https://market.bisnis.com/read/20250801/93/189...,"Dolar AS Perkasa, Euro Jatuh Imbas Kesepakatan...","Bisnis.com , JAKARTA Mata uang euro mengalami ..."
2,2025-08-01,https://market.bisnis.com/read/20250801/192/18...,"Intiland (DILD) Cetak Marketing Sales Rp673,4 ...","Bisnis.com , JAKARTA Emiten properti PT Intila..."
3,2025-08-01,https://market.bisnis.com/read/20250801/192/18...,"Bumi Resources (BUMI) Produksi 35,9 Juta Ton B...","Bisnis.com , JAKARTA Emiten tambang batu bara ..."
4,2025-08-01,https://market.bisnis.com/read/20250801/192/18...,"Intiland (DILD) Bukukan Pendapatan Rp1,21 Tril...","Bisnis.com , JAKARTA Emiten properti PT Intila..."


In [294]:
df.isna().sum()

published    0
link         0
title        0
content      0
dtype: int64

In [295]:
df.to_csv('bisnis_news.csv', index=False)

# IDX CHANNEL

In [443]:
rss_url = "https://www.idxchannel.com/rss"
feed = feedparser.parse(rss_url)
data = []
for entry in feed.entries:
    data.append({
        "title": entry.title,
        "link": entry.link,
        "published": entry.published
    })

df = pd.DataFrame(data)
df["published"] = pd.to_datetime(df["published"])
df['category'] = df['link'].apply(lambda u: urlparse(u).path.strip("/").split("/")[0])

In [444]:
for i, row in tqdm(df.iterrows(), total=df.shape[0]):
    if row['category'] not in ['market-news']:
        continue
    link= row['link']
    page = 1
    whole_content = ""
    while True:
        link_with_page = f"{link}/{page}"
        resp = requests.get(link_with_page)
        soup = BeautifulSoup(resp.text, "html.parser")
        container1 = soup.find('div', class_='article--content')
        if not container1:
            break
        container2 = container1.find('div', class_='content')
        if not container2:
            break
        paragraphs = container2.find_all('p')
        content = " ".join([p.get_text() for p in paragraphs])
        content = clean_text(content)
        whole_content += " " + content
        page += 1
        
    df.at[i, 'content'] = whole_content.strip()

100%|██████████| 10/10 [00:01<00:00,  7.97it/s]


In [446]:
df.to_csv("idxchannel_news2.csv", index=False)

# PASARDANA

In [461]:
rss_url = "https://pasardana.id/rss"
feed = feedparser.parse(rss_url)
data = []
for entry in feed.entries:
    data.append({
        "title": entry.title,
        "link": entry.link,
        "published": entry.published
    })

df = pd.DataFrame(data)
df["published"] = pd.to_datetime(df["published"])

In [471]:
for i, row in tqdm(df.iterrows(), total=df.shape[0]):
    link = row['link']
    resp = requests.get(link)
    soup = BeautifulSoup(resp.text, "html.parser")
    paragraphs = soup.find('section', class_='entry-content').find_all('p')
    content = " ".join([p.get_text() for p in paragraphs])
    df.at[i, 'content'] = content
    
    wait_time = random.uniform(1, 15)
    time.sleep(wait_time)

100%|██████████| 30/30 [05:17<00:00, 10.57s/it]


In [473]:
df.to_csv('investor_id_news.csv', index=False)

# IQPLUS

In [474]:
base_url = "http://www.iqplus.info/news/stock_news/go-to-page,{page}.html"

In [496]:
headers

{'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36',
 'Accept-Language': 'en-US,en;q=0.9',
 'Referer': 'https://www.idxchannel.com/',
 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'}

In [497]:
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36',
 'Accept-Language': 'en-US,en;q=0.9',
 'Referer': 'http://www.iqplus.info',
 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'}

In [526]:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Referer': 'http://www.iqplus.info'
}
df = pd.DataFrame(columns=['published', 'link', 'title'])
ajax_url = "http://www.iqplus.info/box_listnews_more.php?csection=stock_news&id={page}"
page = 1
while True:
    url = ajax_url.format(page=page)
    resp = requests.get(url, headers=headers)
    if resp.status_code != 200:
        break
    soup = BeautifulSoup(resp.text, "html.parser")
    lis = soup.find_all('li', style='text-transform:capitalize;')
    if not lis:
        break
    df_current_page = {'published' : [], 'link': [], 'title': []}
    for li in lis:
        published = li.find("b").get_text(strip=True)
        a_tag = li.find("a")
        link = a_tag["href"]
        title = a_tag.get_text(strip=True)
        df_current_page['published'].append(published)
        df_current_page['link'].append(link)
        df_current_page['title'].append(title)
    df_current_page = pd.DataFrame(df_current_page)
    df_current_page['published'] = df_current_page['published'].apply(lambda x: datetime.strptime(x, "%d/%m/%y - %H:%M"))
    if df_current_page['published'].min().normalize() < datetime(2025, 8, 1):
        break
    df = pd.concat([df, df_current_page], ignore_index=True)
    wait_time = random.uniform(1, 10)
    time.sleep(wait_time)
    page += 1


In [528]:
df.to_csv('iqplus_news.csv', index=False)

In [540]:
for i, row in tqdm(df.iterrows(), total=df.shape[0]):
    link = row['link']
    resp = requests.get(link, headers=headers)
    soup = BeautifulSoup(resp.text, "html.parser")
    container1 = soup.find('div', class_='midcol')
    if not container1:
        continue
    zoom_div = soup.find("div", id="zoomthis")
    if not zoom_div:
        continue
    for tag in zoom_div.find_all(["small", "h3"]):
        tag.decompose()  # menghapus dari tree
    news_content = zoom_div.get_text(separator="\n", strip=True)
    news_content = clean_text(news_content)
    df.at[i, 'content'] = news_content
    wait_time = random.uniform(1, 20)
    time.sleep(wait_time)

100%|██████████| 300/300 [1:00:32<00:00, 12.11s/it]


In [541]:
df.head()

Unnamed: 0,published,link,title,content
0,2025-08-08 04:43:00,http://www.iqplus.info/news/stock_news/bjtm-ba...,BJTM: BANK JATIM-BANK LAMPUNG TEKEN MOU INTEGR...,"IQPlus, (8/8) Bank Jatim dan Bank Lampung mene..."
1,2025-08-08 04:39:00,http://www.iqplus.info/news/stock_news/batr-ri...,BATR: RIDWAN TAMBAH KEPEMILIKAN SAHAM BATR,"IQPlus, (8/8) Ridwan selaku Direktur utama PT ..."
2,2025-08-08 04:36:00,http://www.iqplus.info/news/stock_news/widi-wi...,WIDI: WIDI LAKUKAN KEGIATAN CSR DI JAKARTA BARAT,"IQPlus, (8/8) PT Widiant Jaya Krenindo Tbk mel..."
3,2025-08-08 04:23:00,http://www.iqplus.info/news/stock_news/tebe-te...,TEBE: TEBE PEROLEH HAK KONSESI UNTUK JASA KEPE...,"IQPlus, (8/8) PT Dana Brata Luhur Tbk (TEBE) t..."
4,2025-08-08 03:49:00,http://www.iqplus.info/news/stock_news/pipa-mo...,"PIPA: MORRIS CAPITAL AMBIL ALIH 5,1% SAHAM PIPA","IQPlus, (8/8) PT Morris Capital Indonesia tela..."


In [542]:
df.isna().sum()

published    0
link         0
title        0
content      0
dtype: int64

In [555]:
df.sample(5)

Unnamed: 0,published,link,title,content
169,2025-08-05 09:35:00,http://www.iqplus.info/news/stock_news/bris-bs...,BRIS: BSI MASLAHAT DUKUNG PEMBANGUNAN SOSIAL Y...,"IQPlus, (5/8) BSI Maslahat terus berupaya dala..."
86,2025-08-07 08:14:00,http://www.iqplus.info/news/stock_news/irsx-be...,IRSX: BEI BUKA KEMBALI PERDAGANGAN SAHAM IRSX,"IQPlus, (7/8) Bursa Efek Indonesia (BEI) membu..."
75,2025-08-07 09:24:00,http://www.iqplus.info/news/stock_news/aman-pr...,AMAN: PROGRES PEMBANGUNAN PABRIK AMAN MASIH BE...,"IQPlus, (7/8) PT Makmur Berkah Amanda Tbk (AMA..."
63,2025-08-07 11:43:00,http://www.iqplus.info/news/stock_news/jsmr-ja...,JSMR: JASA MARGA RAIH PENGHARGAAN INDONESIA BE...,"IQPlus, (7/8) PT Jasa Marga (Persero) Tbk yang..."
77,2025-08-07 09:05:00,http://www.iqplus.info/news/stock_news/aman-am...,AMAN: AMAN KLAIM MILIKI 894 UNIT PROPERTI DI K...,"IQPlus, (7/8) PT Makmur Berkah Amanda Tbk (AMA..."


In [556]:
df.to_csv('iqplus_news.csv', index=False)

In [537]:
zoom_div = soup.find('div', class_='midcol').find("div", id="zoomthis")

# Ambil teks bersih tanpa tag HTML
news_content = zoom_div.get_text(separator="\n", strip=True)

In [539]:
clean_text(news_content)

'Friday 08/Aug/2025 at 16:43 BANK JATIM-BANK LAMPUNG TEKEN MOU INTEGRASI KUB DUKUNG EKONOMI DAERAH. IQPlus, (8/8) Bank Jatim dan Bank Lampung meneken nota kesepahaman (MoU) terkait perjanjian penyertaan dan pengambilalihan saham bersyarat Conditional Shares Subscription and Acquisition Agreement (CSSA) sebagai bagian dari integrasi Kelompok Usaha Bank (KUB) yang disaksikan gubernur kedua provinsi itu. "Yang menjadi penguat misi dagang kali ini karena ada MoU KUB antara Bank Jatim dan Bank Lampung. Saya rasa ini akan membangun konektivitas dunia perbankan, keuangan dan ekonomi antar daerah,. kata Gubernur Jatim Khofifah Indar Parawansa dalam keterangan diterima di Surabaya, Jawa Timur, Jumat. Penandatanganan dilakukan oleh Plt. Direktur Utama Bank Jatim Arif Suhirman dan Direktur Utama Bank Lampung Mahdi Yusuf dalam rangkaian Misi Dagang dan Investasi Jawa Timur.Lampung. Bank Jatim akan memberikan penyertaan modal senilai Rp100 miliar kepada Bank Lampung. Khofifah menegaskan penandatang