Code History:
1. Version 1.0 (2023/03/09):
    - Base version, working as expected

<strong>Features:</strong>
- Scrape corporate and government bonds summary and details

Data is scraped <strong>every weekday on 6PM GMT+7</strong>, few hours after the market has closed for the day. So the data you see before 6PM is previous trading day data.

In [1]:
import json
from json.decoder import JSONDecodeError
import numpy as np
import pandas as pd
import concurrent.futures
import time
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import queue
import threading
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import dateparser

# Chrome Selenium Starter

Why Selenium? Because I need it to bypass cloudfare restriction

In [2]:
# Initialize the Chrome driver
options = Options()
options.add_argument("--headless=new")
driver = webdriver.Chrome(options=options)

# Scrape Bond Summary

## BEI Bonds List

In [3]:
urls = {
    'Corporate Bond':'https://www.idx.co.id/secondary/get/BondSukuk/bond?pageSize=10000&indexFrom=1&bondType=1',
    'Goverment Bond':'https://www.idx.co.id/secondary/get/BondSukuk/bond?pageSize=10000&indexFrom=1&bondType=2'  
}

In [4]:
BEIBondsListDF = pd.DataFrame()
for issuer_type in urls:
    print(issuer_type)
    driver.get(urls[issuer_type])
    WebDriverWait(driver, timeout=10).until(lambda d: d.find_element(By.TAG_NAME, 'body'))
    BEIBondsListContent = driver.find_element(By.TAG_NAME, value='body').text
    BEIBondsTypeListDF = pd.DataFrame(json.loads(BEIBondsListContent)['Results']).drop(columns='Nomor')
    BEIBondsTypeListDF['IssuerType'] = issuer_type
    
    BEIBondsListDF = pd.concat([BEIBondsListDF, BEIBondsTypeListDF])
    
BEIBondsListDF['MatureDate'] = pd.to_datetime(BEIBondsListDF['MatureDate']).dt.normalize()

Corporate Bond
Goverment Bond


In [5]:
BEIBondsListDF

Unnamed: 0,BondId,BondName,IssuerCode,MatureDate,Rating,Outstanding,IssuerType
0,ABLS01XXMF,MTN Asian Bulk Logistics I Tahun 2022,ABLS-M,2027-06-21,,1.000000e+12,Corporate Bond
1,ABSM01C,Obligasi AB Sinar Mas Multifinance I Tahun 202...,ABSM,2023-09-04,irA-,3.500000e+10,Corporate Bond
2,ADCP01B,Obligasi I Adhi Cummuter Properti Tahun 2021 S...,ADCP,2024-05-20,idBBB,9.000000e+09,Corporate Bond
3,ADCP02A,Obligasi II Adhi Commuter Properti Tahun 2022 ...,ADCP,2023-05-31,idBBB,2.055000e+11,Corporate Bond
4,ADCP02B,Obligasi II Adhi Commuter Properti Tahun 2022 ...,ADCP,2025-05-24,idBBB,1.020000e+11,Corporate Bond
...,...,...,...,...,...,...,...
182,VR0091,Obligasi Negara Republik Indonesia Seri VR0091,GOVT,2030-12-29,,2.334195e+12,Goverment Bond
183,VR0092,Obligasi Negara Republik Indonesia Seri VR0092,GOVT,2027-12-29,,2.981024e+13,Goverment Bond
184,VR0093,Obligasi Negara Republik Indonesia Seri VR0093,GOVT,2028-12-29,,2.981024e+13,Goverment Bond
185,VR0094,Obligasi Negara Republik Indonesia Seri VR0094,GOVT,2029-12-29,,2.981024e+13,Goverment Bond


## Close and Quit Driver

In [6]:
driver.quit()

# Scrape Bond Details

## Get Bond Details Function

In [7]:
## Well, the website has a weird issue, i can access medium term notes with url intended for corporate / govt bonds
## MTN example: https://www.ksei.co.id/services/registered-securities/medium-term-notes/lc/ABLS01XXMF
## Different URL example: https://www.ksei.co.id/services/registered-securities/corporate-bonds/lc/ABLS01XXMF
## Try it and you can still access the medium term notes
# 'https://www.ksei.co.id/services/registered-securities/medium-term-notes/lc/ABLS01XXMF'
# 'https://www.ksei.co.id/services/registered-securities/government-bonds/lc/FR0037'

def get_bond_details(BondId):
    while True:
        try:
            url = 'https://www.ksei.co.id/services/registered-securities/corporate-bonds/lc/' + BondId
            response = requests.get(url)
            soup = BeautifulSoup(response.content, 'html.parser')

            data = {}

            # Find the dl tag with class="deflist deflist--with-colon"
            dl_tag = soup.find('dl', class_='deflist deflist--with-colon')

            # Loop through all dt tags within the dl tag and get their text values
            dt_tags = dl_tag.find_all('dt')
            for dt in dt_tags:
                # Get the text value of the dt tag
                dt_text = dt.get_text(strip=True)
                # Get the corresponding dd tag and its text value
                # find_next_sibling is actually an important function and it's new for me xD
                dd_text = dt.find_next_sibling('dd').get_text(strip=True)
                # Add the dd_text to the data dictionary with the dt_text as the key
                data[dt_text] = dd_text
            break
        except AttributeError:
            print(url, 'attribute error, will retry!')
    
    time.sleep(2)

    return data

## Multithreading with Progress Bar

In [8]:
df_list = []

with concurrent.futures.ThreadPoolExecutor(max_workers=7) as executor:
    futures = []
    for BondId in BEIBondsListDF['BondId']:
        future = executor.submit(get_bond_details, BondId)
        futures.append(future)
    
    # Use tqdm to add a progress bar to the multithreading process
    for future in tqdm(concurrent.futures.as_completed(futures), total=len(BEIBondsListDF['BondId'])):
        df_list.append(future.result())

100%|██████████████████████████████████████████████████████████████████████████████| 1334/1334 [08:45<00:00,  2.54it/s]


## Join All Bond Details and Cleaning

### Join Bond Details

In [9]:
BondDetailsDF = pd.DataFrame(df_list)
BondDetailsDF

Unnamed: 0,Security name,Issuer,ISIN Code,Short Code,Type,Listing Date,Stock Exchange,Status,Nominal,Current Amount,...,Interest/Disc Rate,Interest Type,Interest Frequency,Currency,Form,Effective Date ISIN,Day Count Basis,Activity Sector,Number of Securities,Exercise Price
0,OBLIGASI BERKELANJUTAN II ADHI KARYA TAHAP II ...,"ADHI KARYA (PERSERO) Tbk, PT",IDA0000984B5,ADHI02BCN2,Straight Bonds,26 Juni 2019,IDX,Active,473500000000.00,473500000000.00,...,9.75%,FIXED,3 MONTHS,IDR,Electronic,-,-,BUILDING CONSTRUCTION,0 (Total),
1,OBLIGASI II ADHI COMMUTER PROPERTI TAHUN 2022 ...,"ADHI COMMUTER PROPERTI Tbk, PT",IDA0001241A1,ADCP02A,Straight Bonds,25 Mei 2022,IDX,Active,205500000000.00,0.00,...,10%,FIXED,3 MONTHS,IDR,Electronic,-,-,PROPERTY AND REAL ESTATE,0 (Total),
2,OBLIGASI BERKELANJUTAN III ADHI KARYA TAHAP II...,"ADHI KARYA (PERSERO) Tbk, PT",IDA0001239A5,ADHI03ACN3,Straight Bonds,25 Mei 2022,IDX,Active,1286200000000.00,0.00,...,8.25%,FIXED,3 MONTHS,IDR,Electronic,-,-,BUILDING CONSTRUCTION,0 (Total),
3,MTN ASIAN BULK LOGISTICS I TAHUN 2022,"ASIAN BULK LOGISTICS, PT",IDH000071307,ABLS01XXMF,MTN,-,-,Active,1000000000000.00,1000000000000.00,...,9%,Fixed,3 MONTHS,IDR,Electronic,-,-,TRANSPORTATION,0 (Total),
4,OBLIGASI AB SINAR MAS MULTIFINANCE I TAHUN 202...,"AB SINAR MAS MULTIFINANCE, PT",IDA0001097C3,ABSM01C,Straight Bonds,07 September 2020,IDX,Active,35000000000.00,0.00,...,11%,FIXED,3 MONTHS,IDR,Electronic,-,-,FINANCIAL INSTITUTION,0 (Total),
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1329,OBLIGASI NEGARA REPUBLIK INDONESIA SERI VR0090,PEMERINTAH REPUBLIK INDONESIA,IDG000022500,VR0090,Government Bonds,30 Desember 2022,IDX,Active,2334195000000.00,2334195000000.00,...,6.25682%,Floating/Variable,3 MONTHS,IDR,Electronic,-,-,,0 (Total),
1330,OBLIGASI NEGARA REPUBLIK INDONESIA SERI VR0091,PEMERINTAH REPUBLIK INDONESIA,IDG000022609,VR0091,Government Bonds,30 Desember 2022,IDX,Active,2334195000000.00,2334195000000.00,...,6.25682%,Floating/Variable,3 MONTHS,IDR,Electronic,-,-,,0 (Total),
1331,OBLIGASI NEGARA REPUBLIK INDONESIA SERI VR0093,PEMERINTAH REPUBLIK INDONESIA,IDG000022807,VR0093,Government Bonds,30 Desember 2022,IDX,Active,29810235000000.00,29810235000000.00,...,6.25682%,Floating/Variable,3 MONTHS,IDR,Electronic,-,-,,0 (Total),
1332,OBLIGASI NEGARA REPUBLIK INDONESIA SERI VR0094,PEMERINTAH REPUBLIK INDONESIA,IDG000022906,VR0094,Government Bonds,30 Desember 2022,IDX,Active,29810235000000.00,29810235000000.00,...,6.25682%,Floating/Variable,3 MONTHS,IDR,Electronic,-,-,,0 (Total),


In [10]:
BondDetailsDF.columns

Index(['Security name', 'Issuer', 'ISIN Code', 'Short Code', 'Type',
       'Listing Date', 'Stock Exchange', 'Status', 'Nominal', 'Current Amount',
       'Mature Date', 'Interest/Disc Rate', 'Interest Type',
       'Interest Frequency', 'Currency', 'Form', 'Effective Date ISIN',
       'Day Count Basis', 'Activity Sector', 'Number of Securities',
       'Exercise Price'],
      dtype='object')

### Data Transformation

1. Some dates are written in Indonesian format string, 'May' is written as 'Mei', so I use dateparser library to tackle this issue and convert it to pandas datetime column
2. Interest rate format is string, convert it to float32
3. Replace '-' string with NaN

In [11]:
BondDetailsDF['Listing Date'] = BondDetailsDF['Listing Date'].apply(lambda x: dateparser.parse(x) if x != '-' else np.nan)
BondDetailsDF['Mature Date'] = BondDetailsDF['Mature Date'].apply(lambda x: dateparser.parse(x) if x != '-' else np.nan)
BondDetailsDF['Effective Date ISIN'] = BondDetailsDF['Effective Date ISIN'].apply(lambda x: dateparser.parse(x) if x != '-' else np.nan)
BondDetailsDF['Interest/Disc Rate'] = BondDetailsDF['Interest/Disc Rate'].replace('%', '', regex=True).apply('float32')
BondDetailsDF = BondDetailsDF.replace('-', np.nan)

TypeError: Input type must be str

In [None]:
BondDetailsDF.describe(include='all')

### Drop Unnecessary Columns

1. Every column dropped has only a few value

In [None]:
BondDetailsDF = BondDetailsDF.drop(columns=['Current Amount', 'Effective Date ISIN', 'Day Count Basis', 'Exercise Price'])

## Export to Excel

In [None]:
BondDetailsDF.to_excel('bonds.xlsx', index=False)