Code History:<br>
1. Version 1.0 (2023/03/09):<br>
    - Base version, working as expected

<strong>Features:</strong><br>
- Scrape IDX stock sectoral summary<br>
- Scrape IDX stock index summary<br>
<br>
Plan: Data is scraped <strong>every weekday on 6PM GMT+7</strong>, few hours after the market has closed for the day. So the data you see before 6PM is previous trading day data.

In [1]:
import json
from json.decoder import JSONDecodeError
import pandas as pd
import time
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import threading
import concurrent.futures
from tqdm import tqdm
import os
import sqlalchemy
from sqlalchemy import create_engine

# Chrome Selenium Starter<br>
<br>
Why Selenium? Because I need it to bypass cloudfare restriction

Initialize the Chrome driver

In [2]:
options = Options()
options.add_argument("--headless=new")
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)

# Scrape Summary URL

## URL List

In [3]:
urls = {
    'BEISectoralSummary':'https://www.idx.co.id/primary/StockData/GetIndexIC',
    'BEIIndexSummary':'https://www.idx.co.id/primary/StockData/GetConstituent',
}

## BEI Sectoral Summary

In [4]:
driver.get(urls['BEISectoralSummary'])
WebDriverWait(driver, timeout=10).until(lambda d: d.find_element(By.TAG_NAME, 'body'))
BEISectoralSummaryContent = driver.find_element(By.TAG_NAME, value='body').text
time.sleep(2)

In [5]:
while True:
    try:
        BEISectoralSummaryDF = pd.DataFrame(json.loads(BEISectoralSummaryContent)['data']).drop(columns='IntRow')
        BEISectoralSummaryDF['DTCreate'] = pd.to_datetime(BEISectoralSummaryDF['DTCreate']).dt.normalize()
        BEISectoralSummaryDF['LastScraped'] = datetime.now()
        BEISectoralSummaryDF
        break
    except JSONDecodeError as e:
        time.sleep(1.5)

In [6]:
PrevSectoralSummary = pd.read_excel('stock_index_sectoral.xlsx', sheet_name='Sectoral Summary')
BEISectoralSummaryDF = pd.concat(
    [BEISectoralSummaryDF, PrevSectoralSummary]
).sort_values(
    by=['DTCreate', 'LastScraped']
).drop_duplicates(
    subset=['IndexCode', 'DTCreate'],
    keep='first'
)
BEISectoralSummaryDF

Unnamed: 0,IndexCode,PrevVal,HighVal,LowVal,LastVal,ChgVal,ChgPct,DTCreate,LastScraped
0,IDXENERGY,1946.869,2013.924,1946.869,2013.924,67.06,3.44,2023-03-17,2023-03-19 20:07:53.489000
1,IDXBASIC,1106.896,1127.315,1106.896,1125.892,19.0,1.72,2023-03-17,2023-03-19 20:07:53.489000
2,IDXINDUST,1130.866,1151.763,1130.866,1151.763,20.9,1.85,2023-03-17,2023-03-19 20:07:53.489000
3,IDXNONCYC,710.956,724.142,710.956,717.836,6.88,0.97,2023-03-17,2023-03-19 20:07:53.489000
4,IDXCYCLIC,789.999,801.053,789.73,795.524,5.53,0.7,2023-03-17,2023-03-19 20:07:53.489000
5,IDXHEALTH,1534.586,1562.97,1534.586,1562.97,28.38,1.85,2023-03-17,2023-03-19 20:07:53.489000
6,IDXFINANCE,1339.677,1365.554,1339.677,1364.895,25.22,1.88,2023-03-17,2023-03-19 20:07:53.489000
7,IDXPROPERT,664.515,667.583,663.032,664.827,0.31,0.05,2023-03-17,2023-03-19 20:07:53.489000
8,IDXTECHNO,5028.5,5130.004,4971.998,5023.903,-4.6,-0.09,2023-03-17,2023-03-19 20:07:53.489000
9,IDXINFRA,795.948,811.947,795.948,808.279,12.33,1.55,2023-03-17,2023-03-19 20:07:53.489000


## BEI Index Summary

In [7]:
driver.get(urls['BEIIndexSummary'])
WebDriverWait(driver, timeout=10).until(lambda d: d.find_element(By.TAG_NAME, 'body'))
BEIIndexSummaryContent = driver.find_element(By.TAG_NAME, value='body').text
time.sleep(2)

In [8]:
while True:
    try:
        BEIIndexSummaryDF = pd.DataFrame(json.loads(BEIIndexSummaryContent)['Items']).drop(columns='Links')
        BEIIndexSummaryDF['DtCreate'] = pd.to_datetime(BEIIndexSummaryDF['DtCreate']).dt.normalize()
        BEIIndexSummaryDF = BEIIndexSummaryDF.rename(columns={'DtCreate':'DTCreate'})
        BEIIndexSummaryDF['LastScraped'] = datetime.now()
        BEIIndexSummaryDF
        break
    except JSONDecodeError as e:
        time.sleep(1.5)

In [9]:
PrevIndexSummary = pd.read_excel('stock_index_sectoral.xlsx', sheet_name='Index Summary')
BEIIndexSummaryDF = pd.concat(
    [BEIIndexSummaryDF, PrevIndexSummary]
).sort_values(
    by=['DTCreate', 'LastScraped']
).drop_duplicates(
    subset=['IndexCode', 'DTCreate'],
    keep='first'
)
BEIIndexSummaryDF

Unnamed: 0,IndexCode,PrevVal,HighVal,LowVal,LastVal,ChgVal,ChgPct,DTCreate,LastScraped
0,COMPOSITE,6565.728,6683.094,6565.728,6678.237,112.51,1.71,2023-03-17,2023-03-19 22:34:04.429000
1,MBX,1812.606,1847.457,1812.606,1845.111,32.51,1.79,2023-03-17,2023-03-19 22:34:04.429000
2,DBX,1568.220,1583.489,1568.220,1583.489,15.27,0.97,2023-03-17,2023-03-19 22:34:04.429000
3,KOMPAS100,1118.593,1140.513,1118.593,1138.755,20.16,1.80,2023-03-17,2023-03-19 22:34:04.429000
4,INFOBANK15,1096.891,1123.753,1104.197,1118.152,21.26,1.94,2023-03-17,2023-03-19 22:34:04.429000
...,...,...,...,...,...,...,...,...,...
26,ESGSKEHATI,134.709,134.709,133.219,133.945,-0.76,-0.57,2023-03-20,2023-03-20 17:57:21.513161
27,IDX80,128.631,128.631,126.572,127.185,-1.45,-1.12,2023-03-20,2023-03-20 17:57:21.513161
28,IDXLQ45LCL,127.880,127.985,126.205,126.966,-0.91,-0.72,2023-03-20,2023-03-20 17:57:21.513161
29,IDXSHAGROW,105.145,105.145,103.437,103.895,-1.25,-1.19,2023-03-20,2023-03-20 17:57:21.513161


## Close and Quit Driver

In [10]:
driver.quit()

# Export Result

## Export to Excel

In [12]:
# with pd.ExcelWriter('stock_index_sectoral.xlsx') as writer:
#     BEISectoralSummaryDF.to_excel(writer, sheet_name='Sectoral Summary', index=False)
#     BEIIndexSummaryDF.to_excel(writer, sheet_name='Index Summary', index=False)

## Export to DB

In [13]:
engine = create_engine(
    "postgresql://{}:{}@{}/{}".format(
        os.getenv('POSTGRE_USER'), os.getenv('POSTGRE_PW'), os.getenv('POSTGRE_HOST'), os.getenv('POSTGRE_DB')
    )
)
conn = engine.connect()

OperationalError: (psycopg2.OperationalError) could not translate host name "None" to address: Unknown host

(Background on this error at: https://sqlalche.me/e/14/e3q8)

In [None]:
BEISectoralSummaryDF.to_sql('BEISectoralSummary', con=conn, if_exists='replace', index=False)
BEIIndexSummaryDF.to_sql('BEIIndexSummary', con=conn, if_exists='replace', index=False)