Code History:
1. Version 1.0 (2023/03/09):
    - Base version, working as expected

<strong>Features:</strong>
- Scrape IDX stock sectoral summary
- Scrape IDX stock index summary

Plan: Data is scraped <strong>every weekday at 6 PM GMT+7</strong>, a few hours after the market has closed for the day, so any data you see before 6 PM is from the previous trading day.

In [1]:
import json
from json.decoder import JSONDecodeError
import pandas as pd
import time
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import threading
import concurrent.futures
from tqdm import tqdm

# Chrome Selenium Starter

Why Selenium? Because I need it to bypass the Cloudflare restriction.

In [2]:
# Initialize the Chrome driver
# A single browser session is created here and reused by the scraping cells
# below; it is released by the driver.quit() cell near the end of the notebook.
options = Options()
# "--headless=new" asks Chrome to run in its new headless mode (no visible window).
options.add_argument("--headless=new")
driver = webdriver.Chrome(options=options)

# Scrape Summary URL

## URL List

In [3]:
# IDX endpoints requested by the scraping cells, keyed by the summary each one
# is used for.
urls = dict(
    BEISectoralSummary='https://www.idx.co.id/primary/StockData/GetIndexIC',
    BEIIndexSummary='https://www.idx.co.id/primary/StockData/GetConstituent',
)

## BEI Sectoral Summary

In [4]:
def _sectoral_json_text(d):
    """Return the page's <body> text once it parses as JSON, otherwise False.

    Intended as a WebDriverWait condition: a falsy return value makes the wait
    poll again, a truthy one is handed back by ``until``.
    """
    body_text = d.find_element(By.TAG_NAME, 'body').text
    try:
        json.loads(body_text)
    except JSONDecodeError:
        return False
    return body_text


driver.get(urls['BEISectoralSummary'])
# A <body> element exists on any page (including an interstitial served before
# the real response), so its mere presence does not prove the payload arrived.
# Wait until the body actually holds parseable JSON; if that never happens the
# wait raises TimeoutException here instead of failing later in json.loads.
BEISectoralSummaryContent = WebDriverWait(driver, timeout=10).until(_sectoral_json_text)
# NOTE(review): presumably a short pause to space out requests to the site — confirm.
time.sleep(2)

In [5]:
# Parse the captured JSON and build today's sectoral snapshot from its 'data' records.
sectoral_payload = json.loads(BEISectoralSummaryContent)
BEISectoralSummaryDF = (
    pd.DataFrame(sectoral_payload['data'])
    .drop(columns='IntRow')
    .assign(
        # Keep only the calendar date (time component set to midnight).
        DTCreate=lambda frame: pd.to_datetime(frame['DTCreate']).dt.normalize(),
        # Stamp every row with the moment this scrape was processed.
        LastScraped=datetime.now(),
    )
)
BEISectoralSummaryDF

Unnamed: 0,IndexCode,PrevVal,HighVal,LowVal,LastVal,ChgVal,ChgPct,DTCreate,LastScraped
0,IDXENERGY,1946.869,2013.924,1946.869,2013.924,67.06,3.44,2023-03-17,2023-03-19 22:34:48.166764
1,IDXBASIC,1106.896,1127.315,1106.896,1125.892,19.0,1.72,2023-03-17,2023-03-19 22:34:48.166764
2,IDXINDUST,1130.866,1151.763,1130.866,1151.763,20.9,1.85,2023-03-17,2023-03-19 22:34:48.166764
3,IDXNONCYC,710.956,724.142,710.956,717.836,6.88,0.97,2023-03-17,2023-03-19 22:34:48.166764
4,IDXCYCLIC,789.999,801.053,789.73,795.524,5.53,0.7,2023-03-17,2023-03-19 22:34:48.166764
5,IDXHEALTH,1534.586,1562.97,1534.586,1562.97,28.38,1.85,2023-03-17,2023-03-19 22:34:48.166764
6,IDXFINANCE,1339.677,1365.554,1339.677,1364.895,25.22,1.88,2023-03-17,2023-03-19 22:34:48.166764
7,IDXPROPERT,664.515,667.583,663.032,664.827,0.31,0.05,2023-03-17,2023-03-19 22:34:48.166764
8,IDXTECHNO,5028.5,5130.004,4971.998,5023.903,-4.6,-0.09,2023-03-17,2023-03-19 22:34:48.166764
9,IDXINFRA,795.948,811.947,795.948,808.279,12.33,1.55,2023-03-17,2023-03-19 22:34:48.166764


In [6]:
# Merge today's sectoral snapshot into the history stored in the workbook.
try:
    PrevSectoralSummary = pd.read_excel('stock_index_sectoral.xlsx', sheet_name='Sectoral Summary')
except FileNotFoundError:
    # First run: the workbook is only created by the export cell at the end of
    # the notebook, so start from an empty history with the same columns and
    # dtypes as the fresh scrape.
    PrevSectoralSummary = BEISectoralSummaryDF.iloc[:0]

# Rows are ordered by trading date and scrape time, then one row is kept per
# (IndexCode, DTCreate).
# NOTE(review): keep='first' after an ascending sort retains the EARLIEST scrape
# for a trading date, so re-scraping the same date never replaces stored values
# — confirm this is intended (keep='last' would refresh them).
BEISectoralSummaryDF = pd.concat(
    [BEISectoralSummaryDF, PrevSectoralSummary]
).sort_values(
    by=['DTCreate', 'LastScraped']
).drop_duplicates(
    subset=['IndexCode', 'DTCreate'],
    keep='first'
)
BEISectoralSummaryDF

Unnamed: 0,IndexCode,PrevVal,HighVal,LowVal,LastVal,ChgVal,ChgPct,DTCreate,LastScraped
0,IDXENERGY,1946.869,2013.924,1946.869,2013.924,67.06,3.44,2023-03-17,2023-03-19 20:07:53.489
1,IDXBASIC,1106.896,1127.315,1106.896,1125.892,19.0,1.72,2023-03-17,2023-03-19 20:07:53.489
2,IDXINDUST,1130.866,1151.763,1130.866,1151.763,20.9,1.85,2023-03-17,2023-03-19 20:07:53.489
3,IDXNONCYC,710.956,724.142,710.956,717.836,6.88,0.97,2023-03-17,2023-03-19 20:07:53.489
4,IDXCYCLIC,789.999,801.053,789.73,795.524,5.53,0.7,2023-03-17,2023-03-19 20:07:53.489
5,IDXHEALTH,1534.586,1562.97,1534.586,1562.97,28.38,1.85,2023-03-17,2023-03-19 20:07:53.489
6,IDXFINANCE,1339.677,1365.554,1339.677,1364.895,25.22,1.88,2023-03-17,2023-03-19 20:07:53.489
7,IDXPROPERT,664.515,667.583,663.032,664.827,0.31,0.05,2023-03-17,2023-03-19 20:07:53.489
8,IDXTECHNO,5028.5,5130.004,4971.998,5023.903,-4.6,-0.09,2023-03-17,2023-03-19 20:07:53.489
9,IDXINFRA,795.948,811.947,795.948,808.279,12.33,1.55,2023-03-17,2023-03-19 20:07:53.489


## BEI Index Summary

In [7]:
def _index_json_text(d):
    """Return the page's <body> text once it parses as JSON, otherwise False.

    Intended as a WebDriverWait condition: a falsy return value makes the wait
    poll again, a truthy one is handed back by ``until``.
    """
    body_text = d.find_element(By.TAG_NAME, 'body').text
    try:
        json.loads(body_text)
    except JSONDecodeError:
        return False
    return body_text


driver.get(urls['BEIIndexSummary'])
# A <body> element exists on any page (including an interstitial served before
# the real response), so its mere presence does not prove the payload arrived.
# Wait until the body actually holds parseable JSON; if that never happens the
# wait raises TimeoutException here instead of failing later in json.loads.
BEIIndexSummaryContent = WebDriverWait(driver, timeout=10).until(_index_json_text)
# NOTE(review): presumably a short pause to space out requests to the site — confirm.
time.sleep(2)

In [8]:
# Parse the captured JSON and build today's index snapshot from its 'Items' records.
index_payload = json.loads(BEIIndexSummaryContent)
BEIIndexSummaryDF = (
    pd.DataFrame(index_payload['Items'])
    .drop(columns='Links')
    # This endpoint spells the date column 'DtCreate'; rename it to 'DTCreate',
    # the name the merge and de-duplication step below relies on.
    .rename(columns={'DtCreate': 'DTCreate'})
    .assign(
        # Keep only the calendar date (time component set to midnight).
        DTCreate=lambda frame: pd.to_datetime(frame['DTCreate']).dt.normalize(),
        # Stamp every row with the moment this scrape was processed.
        LastScraped=datetime.now(),
    )
)
BEIIndexSummaryDF

Unnamed: 0,IndexCode,PrevVal,HighVal,LowVal,LastVal,ChgVal,ChgPct,DTCreate,LastScraped
0,COMPOSITE,6565.728,6683.094,6565.728,6678.237,112.51,1.71,2023-03-17,2023-03-19 22:34:50.462419
1,MBX,1812.606,1847.457,1812.606,1845.111,32.51,1.79,2023-03-17,2023-03-19 22:34:50.462419
2,DBX,1568.22,1583.489,1568.22,1583.489,15.27,0.97,2023-03-17,2023-03-19 22:34:50.462419
3,KOMPAS100,1118.593,1140.513,1118.593,1138.755,20.16,1.8,2023-03-17,2023-03-19 22:34:50.462419
4,INFOBANK15,1096.891,1123.753,1104.197,1118.152,21.26,1.94,2023-03-17,2023-03-19 22:34:50.462419
5,LQ45,907.501,926.465,907.501,924.254,16.75,1.85,2023-03-17,2023-03-19 22:34:50.462419
6,BISNIS-27,566.974,579.335,566.974,577.761,10.79,1.9,2023-03-17,2023-03-19 22:34:50.462419
7,JII,546.038,560.2,546.038,560.2,14.16,2.59,2023-03-17,2023-03-19 22:34:50.462419
8,IDXHIDIV20,547.47,559.034,547.47,558.395,10.93,2.0,2023-03-17,2023-03-19 22:34:50.462419
9,IDX30,473.932,483.53,473.932,482.45,8.52,1.8,2023-03-17,2023-03-19 22:34:50.462419


In [9]:
# Merge today's index snapshot into the history stored in the workbook.
try:
    PrevIndexSummary = pd.read_excel('stock_index_sectoral.xlsx', sheet_name='Index Summary')
except FileNotFoundError:
    # First run: the workbook is only created by the export cell at the end of
    # the notebook, so start from an empty history with the same columns and
    # dtypes as the fresh scrape.
    PrevIndexSummary = BEIIndexSummaryDF.iloc[:0]

# Rows are ordered by trading date and scrape time, then one row is kept per
# (IndexCode, DTCreate).
# NOTE(review): keep='first' after an ascending sort retains the EARLIEST scrape
# for a trading date, so re-scraping the same date never replaces stored values
# — confirm this is intended (keep='last' would refresh them).
BEIIndexSummaryDF = pd.concat(
    [BEIIndexSummaryDF, PrevIndexSummary]
).sort_values(
    by=['DTCreate', 'LastScraped']
).drop_duplicates(
    subset=['IndexCode', 'DTCreate'],
    keep='first'
)
BEIIndexSummaryDF

Unnamed: 0,IndexCode,PrevVal,HighVal,LowVal,LastVal,ChgVal,ChgPct,DTCreate,LastScraped
0,COMPOSITE,6565.728,6683.094,6565.728,6678.237,112.51,1.71,2023-03-17,2023-03-19 22:34:04.429
1,MBX,1812.606,1847.457,1812.606,1845.111,32.51,1.79,2023-03-17,2023-03-19 22:34:04.429
2,DBX,1568.22,1583.489,1568.22,1583.489,15.27,0.97,2023-03-17,2023-03-19 22:34:04.429
3,KOMPAS100,1118.593,1140.513,1118.593,1138.755,20.16,1.8,2023-03-17,2023-03-19 22:34:04.429
4,INFOBANK15,1096.891,1123.753,1104.197,1118.152,21.26,1.94,2023-03-17,2023-03-19 22:34:04.429
5,LQ45,907.501,926.465,907.501,924.254,16.75,1.85,2023-03-17,2023-03-19 22:34:04.429
6,BISNIS-27,566.974,579.335,566.974,577.761,10.79,1.9,2023-03-17,2023-03-19 22:34:04.429
7,JII,546.038,560.2,546.038,560.2,14.16,2.59,2023-03-17,2023-03-19 22:34:04.429
8,IDXHIDIV20,547.47,559.034,547.47,558.395,10.93,2.0,2023-03-17,2023-03-19 22:34:04.429
9,IDX30,473.932,483.53,473.932,482.45,8.52,1.8,2023-03-17,2023-03-19 22:34:04.429


## Close and Quit Driver

In [10]:
# End the Selenium session created by webdriver.Chrome(...) now that both
# endpoints have been scraped.
# NOTE(review): this cell only runs if every cell above succeeded; after a
# failed run the browser process may still be alive — confirm and quit it manually.
driver.quit()

## Export to Excel

In [11]:
# Write both merged histories back to the workbook they were loaded from,
# one sheet per table, without the DataFrame index column.
sheets = {
    'Sectoral Summary': BEISectoralSummaryDF,
    'Index Summary': BEIIndexSummaryDF,
}
with pd.ExcelWriter('stock_index_sectoral.xlsx') as writer:
    for sheet_name, frame in sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)