Code History:
1. Version 1.0 (2023/03/09):
    - Base version, working as expected

<strong>Features:</strong>
- Scrape IDX individual stock summary and details
- Scrape IDX sectoral stock summary and its components
- Scrape IDX stock index summary

Plan: Data is scraped <strong>every weekday on 6PM GMT+7</strong>, few hours after the market has closed for the day. So the data you see before 6PM is previous trading day data.

In [1]:
import json
from json.decoder import JSONDecodeError
import pandas as pd
import time
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import threading
import concurrent.futures
from tqdm import tqdm

# Chrome Selenium Starter

Why Selenium? Because I need it to bypass cloudfare restriction

In [2]:
# Initialize the Chrome driver
options = Options()
options.add_argument("--headless=new")
driver = webdriver.Chrome(options=options)

# Scrape Summary URL

## URL List

In [3]:
urls = {
    'BEIStockSummary':'https://www.idx.co.id/primary/TradingSummary/GetStockSummary?length=9999&start=0',
    'BEISectoralSummary':'https://www.idx.co.id/primary/StockData/GetIndexIC',
    'BEIIndexSummary':'https://www.idx.co.id/primary/StockData/GetConstituent'
}

## BEI Stock Summary

In [4]:
driver.get(urls['BEIStockSummary'])
WebDriverWait(driver, timeout=10).until(lambda d: d.find_element(By.TAG_NAME, 'body'))
BEIStockSummaryContent = driver.find_element(By.TAG_NAME, value='body').text
time.sleep(2)

In [5]:
BEIStockSummaryDF = pd.DataFrame(json.loads(BEIStockSummaryContent)['data']).drop(columns=['No'])
BEIStockSummaryDF

Unnamed: 0,IDStockSummary,Date,StockCode,StockName,Remarks,Previous,OpenPrice,FirstTrade,High,Low,...,TradebleShares,WeightForIndex,ForeignSell,ForeignBuy,DelistingDate,NonRegularVolume,NonRegularValue,NonRegularFrequency,persen,percentage
0,3292988,2023-03-09T00:00:00,AALI,Astra Agro Lestari Tbk.,--M-18AEM16000D232------------,8025.0,0.0,0.0,8150.0,8000.0,...,1.924688e+09,3.907117e+08,103300.0,85700.0,,0.0,0.0,0.0,,
1,3292989,2023-03-09T00:00:00,ABBA,Mahaka Media Tbk.,--U-2100000000E614------------,120.0,0.0,0.0,124.0,119.0,...,3.935893e+09,1.342139e+09,0.0,0.0,,0.0,0.0,0.0,,
2,3292990,2023-03-09T00:00:00,ABDA,Asuransi Bina Dana Arta Tbk.,--U-2105000000G412------------,6500.0,0.0,0.0,6550.0,6550.0,...,6.208067e+08,3.147490e+07,0.0,0.0,,0.0,0.0,0.0,,
3,3292991,2023-03-09T00:00:00,ABMM,ABM Investama Tbk.,--M-1835000000C311------------,2700.0,0.0,0.0,2820.0,2700.0,...,2.753165e+09,5.688039e+08,262300.0,217900.0,,0.0,0.0,0.0,,
4,3292992,2023-03-09T00:00:00,ACES,Ace Hardware Indonesia Tbk.,--MO1S35MD0000E743------------,486.0,488.0,488.0,505.0,486.0,...,1.715000e+10,6.832560e+09,8958300.0,14957000.0,,8.0,3888.0,1.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
849,3293837,2023-03-09T00:00:00,ZATA,Bersama Zatta Jaya Tbk.,--U-1130000000E741------------,70.0,0.0,0.0,72.0,68.0,...,1.700000e+09,1.099382e+09,513500.0,640200.0,,0.0,0.0,0.0,,
850,3293838,2023-03-09T00:00:00,ZBRA,Dosni Roha Indonesia Tbk.,--U-2135000000C311------------,498.0,0.0,0.0,500.0,498.0,...,2.510706e+09,4.245604e+08,200.0,105000.0,,0.0,0.0,0.0,,
851,3293839,2023-03-09T00:00:00,ZINC,Kapuas Prima Coal Tbk.,--U-2105000000B146------------,50.0,0.0,0.0,50.0,50.0,...,2.525000e+10,7.456325e+09,0.0,21200.0,,203300.0,9148500.0,1.0,,
852,3293840,2023-03-09T00:00:00,ZONE,Mega Perintis Tbk.,--M-2135000000E741------------,1100.0,0.0,0.0,1115.0,1100.0,...,8.701715e+08,1.161679e+08,0.0,0.0,,0.0,0.0,0.0,,


## Close and Quit Driver

In [12]:
driver.quit()

# Scrape Stock Details URL

## Company Profiles

In [13]:
def get_company_profiles(driver, stock):
    while True:
        try:
            company_profiles_url = 'https://www.idx.co.id/primary/ListedCompany/GetCompanyProfilesDetail?KodeEmiten=' + stock
            driver.get(company_profiles_url)

            WebDriverWait(driver, timeout=10).until(lambda d: d.find_element(By.TAG_NAME, 'body'))

            CompanyProfilesContent = driver.find_element(By.TAG_NAME, value='body').text
            CompanyProfilesRow = pd.DataFrame(json.loads(CompanyProfilesContent)['Profiles'])
            CompanyProfilesRow.insert(0, 'StockCode', stock)
            
            break
        except JSONDecodeError as e:
            time.sleep(1.5)
#             print(stock, 'Company Profiles JSON is not available!', 'Retrying!')
    
    time.sleep(1)    
    
    return CompanyProfilesRow

## Today Trading Info

In [14]:
def get_today_trading_info(driver, stock):
    while True:
        try:
            trading_info_url = 'https://www.idx.co.id/primary/ListedCompany/GetTradingInfoDaily?code=' + stock
            driver.get(trading_info_url)

            WebDriverWait(driver, timeout=10).until(lambda d: d.find_element(By.TAG_NAME, 'body'))

            TradingInfoContent = driver.find_element(By.TAG_NAME, value='body').text
            TradingInfoRow = pd.DataFrame.from_dict(json.loads(TradingInfoContent), orient='index').T
            TradingInfoRow.insert(0, 'StockCode', stock)
            
            break
        except JSONDecodeError as e:
            time.sleep(1.5)
#             print(stock, 'Trading Info JSON is not available!', 'Retrying!')
    
    time.sleep(1)
    
    return TradingInfoRow

## Financial Reports File Links
Maximum last 3 years (Current: 2023, Min: 2021)

In [15]:
def get_financial_report_file_links(driver, stock):
    current_year = datetime.now().year
    # last 2 years
    years = [current_year, current_year-1]
    periods = ['TW1', 'TW2', 'TW3', 'Audit']
    
    FinancialReportRows = pd.DataFrame()

    for year in years:
        for period in periods:              
            while True:
                try:
                    financial_report_url = 'https://www.idx.co.id/primary/ListedCompany/GetFinancialReport?periode={}&year={}&indexFrom=0&pageSize=1000&reportType=rdf&kodeEmiten={}'.format(period, year, stock)
                    driver.get(financial_report_url)

                    WebDriverWait(driver, timeout=10).until(lambda d: d.find_element(By.TAG_NAME, 'body'))

                    FinancialReportContent = driver.find_element(By.TAG_NAME, value='body').text

                    if json.loads(FinancialReportContent)['ResultCount'] > 0:
                        FinancialReportRow = pd.DataFrame(json.loads(FinancialReportContent)['Results'][0]['Attachments'])
                        FinancialReportRow = FinancialReportRow.rename(columns={'Emiten_Code':'StockCode'})
                        FinancialReportRows = pd.concat([FinancialReportRows, FinancialReportRow])
                    
                    break
                except JSONDecodeError as e:
                    time.sleep(1.5)
#                     print(stock, year, period, 'JSON is not available!', 'Retrying!')

        time.sleep(2)

    return FinancialReportRows

## Multithreading Scrape

### Worker Function

In [16]:
# Define a worker function that takes stock codes from the queue and loads them in parallel
def load_stocks(stock):
    options = Options()
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=options)

    company_profiles = get_company_profiles(driver, stock)
    today_trading_info = get_today_trading_info(driver, stock)
    financial_report_links = get_financial_report_file_links(driver, stock)
    
    return company_profiles, today_trading_info, financial_report_links

### Create list to store scraped data

In [17]:
results = {
    'CompanyProfiles':[],
    'TodayTradingInfo':[],
    'FinancialReportLinks':[]
}

In [19]:
with concurrent.futures.ThreadPoolExecutor(max_workers=7) as executor:
    futures = []
    
    for StockCode in BEIStockSummaryDF['StockCode']:
        future = executor.submit(load_stocks, StockCode)
        futures.append(future)
    
    # Use tqdm to add a progress bar to the multithreading process
    for future in tqdm(concurrent.futures.as_completed(futures), total=len(BEIStockSummaryDF['StockCode'])):
        company_profiles, today_trading_info, financial_report_links = future.result()
        results['CompanyProfiles'].append(company_profiles)
        results['TodayTradingInfo'].append(today_trading_info)
        results['FinancialReportLinks'].append(financial_report_links)

ThreadPoolExecutor


100%|█████████████████████████████████████████████████████████████████████████████| 854/854 [00:00<00:00, 90505.49it/s]


Append return value to dict list


100%|████████████████████████████████████████████████████████████████████████████████| 854/854 [26:00<00:00,  1.83s/it]


## Join All Stock Details

In [20]:
CompanyProfilesDF = pd.concat(results['CompanyProfiles']).reset_index(drop=True).drop(
    columns=[
        'DataID', 'Divisi', 'EfekEmiten_EBA', 'EfekEmiten_ETF', 
        'EfekEmiten_Obligasi', 'EfekEmiten_SPEI', 'EfekEmiten_Saham',
        'id', 'KodeDivisi', 'JenisEmiten', 'KodeEmiten', 'Status'
    ]
)
CompanyProfilesDF['TanggalPencatatan'] = pd.to_datetime(CompanyProfilesDF['TanggalPencatatan']).dt.normalize()
CompanyProfilesDF['Logo'] = ['https://www.idx.co.id' + logo for logo in CompanyProfilesDF['Logo']]
CompanyProfilesDF

Unnamed: 0,StockCode,Alamat,BAE,Industri,SubIndustri,Email,Fax,id,KegiatanUsahaUtama,NamaEmiten,NPKP,NPWP,PapanPencatatan,Sektor,SubSektor,TanggalPencatatan,Telepon,Website,Logo
0,ACES,Gedung Kawan Lama Lantai 5. Jl. Puri Kencana N...,PT. EDI Indonesia,Ritel Khusus,Ritel Barang Rumah Tangga,\ttoto@acehardware.co.id,\t021 - 5824022,0.0,Perdagangan,Ace Hardware Indonesia Tbk,,\t01.721.123.6-054.000,Utama,Barang Konsumen Non-Primer,Perdagangan Ritel,2007-11-06,\t021 - 5822222,www.acehardware.co.id,https://www.idx.co.id/Portals/0/StaticData/Lis...
1,ADCP,"Jl. Pengantin Ali No. 88, Ciracas\r\nKota Jaka...",,Pengelola & Pengembang Real Estat,Pengembang & Operator Real Estat,corsec@adcp.co.id,(021) 228 220 81,0.0,Perhotelan dan Real Estate,PT Adhi Commuter Properti Tbk,,85.227.029.7-093.000,Utama,Properti & Real Estat,Properti & Real Estat,2021-05-21,(021) 228 229 80,www.adcp.co.id,https://www.idx.co.id/Portals/0/StaticData/Lis...
2,ABDA,Plaza Asia Lt. 27 Jl. Jend. Sudirman Kav. 59 J...,PT. Raya Saham Registra,Asuransi,Asuransi Umum,contactus@abda.co.id,5140 1698,0.0,Asuransi Kerugian,Asuransi Bina Dana Arta Tbk,,01.362.014.1-054.000,Pengembangan,Keuangan,Asuransi,1989-07-06,5140 1688,www.abda.co.id,https://www.idx.co.id/Portals/0/StaticData/Lis...
3,ABBA,"Sahid Sudirman Centre Lt. 10, Jl. Jend. Sudirm...",PT. Adimitra Jasa Korpora,Media,Penerbitan,corsec@mahakax.com,(021) 573 9210,0.0,Media dan Percetakan,Mahaka Media Tbk,0,01.609.052.4-054.000,Pengembangan,Barang Konsumen Non-Primer,Media & Hiburan,2002-04-03,(021) 573 9203,www.mahakax.com,https://www.idx.co.id/Portals/0/StaticData/Lis...
4,ABMM,Gedung TMT 1 Lantai 18 Jl. Cilandak KKO No. 1 ...,PT. Datindo Entrycom,Perusahaan Holding Multi-sektor,Perusahaan Holding Multi-sektor,corporate.secretary@abm-investama.co.id,021-29976768,0.0,berupa jasa penyewaan namun tidak terbatas pad...,ABM Investama Tbk,PEM-01535/WPJ.04/KP.1003/2008,02.504.191.4-054.000,Utama,Perindustrian,Perusahaan Holding Multi Sektor,2011-12-06,021-29976767,www.abm-investama.com,https://www.idx.co.id/Portals/0/StaticData/Lis...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
846,ZATA,"Komplek Industri Prapanca No.24, Cigondewah Ka...",PT. Adimitra Jasa Korpora,Ritel Khusus,Ritel Pakaian & Tekstil,corporate@elcorps.com,,0.0,Aktivitas Perusahaan Holding dan aktivitas Usa...,PT Bersama Zatta Jaya Tbk,,31.569.863.9-428.000,Utama,Barang Konsumen Non-Primer,Perdagangan Ritel,2022-11-10,(022) 86017900,www.elcorps.com,https://www.idx.co.id/Portals/0/StaticData/Lis...
847,ZBRA,"Satrio Tower Lt. 23, Jl. Prof Dr. Satrio Kav. ...",PT BSR Indonesia,Perusahaan Holding Multi-sektor,Perusahaan Holding Multi-sektor,legal@zebranusantara.co.id,(021) 2788 3914,0.0,Perusahaan Holding dan Angkutan Bermotor untuk...,PT Dosni Roha Indonesia Tbk,,01.451.869.0-054.000,Pengembangan,Perindustrian,Perusahaan Holding Multi Sektor,1991-08-01,(021) 2788 3900,www.zebranusantara.co.id,https://www.idx.co.id/Portals/0/StaticData/Lis...
848,ZINC,"Jl. Pantai Indah Selatan I, Elang Laut Blok A ...",,Logam & Mineral,Logam & Mineral Lainnya,corsec@ptkpc.com,021 - 29676234,0.0,Pertambangan dan Perdagangan,PT Kapuas Prima Coal Tbk,,02.386.972.0-091.000,Pengembangan,Barang Baku,Barang Baku,2017-10-16,021 - 29676236,www.ptkpc.com,https://www.idx.co.id/Portals/0/StaticData/Lis...
849,ZONE,"Jalan Karet Pedurenan No. 240, Karet Kuningan,...",PT. Bima Registra,Ritel Khusus,Ritel Pakaian & Tekstil,corpsec@megaperintis.co.id,(021) 5290 5103,0.0,Perdagangan retail,PT Mega Perintis Tbk.,,02.433.917.8-011.000,Pengembangan,Barang Konsumen Non-Primer,Perdagangan Ritel,2018-12-12,(021) 5733 888; (021) 5290 4379,www.megaperintis.co.id,https://www.idx.co.id/Portals/0/StaticData/Lis...


In [21]:
TodayTradingInfoDF = pd.concat(results['TodayTradingInfo']).reset_index(drop=True)[[
    'StockCode', 'PreviousPrice', 'OpeningPrice', 
    'HighestPrice', 'LowestPrice', 'ClosingPrice', 
    'Change', 'TradedVolume', 'TradedFrequency'
]]
TodayTradingInfoDF.insert(1, 'TradingDate', pd.to_datetime(datetime.today()).normalize())
TodayTradingInfoDF

Unnamed: 0,StockCode,TradingDate,PreviousPrice,OpeningPrice,HighestPrice,LowestPrice,ClosingPrice,Change,TradedVolume,TradedFrequency
0,ACES,2023-03-09,486.0,488.0,505.0,486.0,500.0,14.0,56735500.0,4631.0
1,ADCP,2023-03-09,65.0,0.0,66.0,64.0,65.0,0.0,1308300.0,169.0
2,ABDA,2023-03-09,6500.0,0.0,6550.0,6550.0,6550.0,50.0,100.0,1.0
3,ABBA,2023-03-09,120.0,0.0,124.0,119.0,123.0,3.0,3553600.0,393.0
4,ABMM,2023-03-09,2700.0,0.0,2820.0,2700.0,2790.0,90.0,1575200.0,605.0
...,...,...,...,...,...,...,...,...,...,...
849,ZATA,2023-03-09,70.0,0.0,72.0,68.0,69.0,-1.0,38973200.0,2538.0
850,ZBRA,2023-03-09,540.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
851,ZINC,2023-03-09,40.0,0.0,45.0,45.0,45.0,5.0,203300.0,1.0
852,ZONE,2023-03-09,1100.0,0.0,1115.0,1100.0,1115.0,15.0,106000.0,56.0


In [25]:
FinancialReportLinksDF = pd.concat(results['FinancialReportLinks']).reset_index(drop=True).drop(
    columns=['File_ID', 'File_Size', 'File_Type']
)
FinancialReportLinksDF['File_Modified'] = pd.to_datetime(FinancialReportLinksDF['File_Modified']).dt.normalize()
FinancialReportLinksDF['File_Path'] = 'https://www.idx.co.id/' + FinancialReportLinksDF['File_Path']
FinancialReportLinksDF

Unnamed: 0,StockCode,File_Modified,File_Name,File_Path,Report_Period,Report_Type,Report_Year,NamaEmiten
0,ACES,2022-04-29,inlineXBRL.zip,https://www.idx.co.id//Portals/0/StaticData/Li...,TW1,rdf,2022,Ace Hardware Indonesia Tbk
1,ACES,2022-04-29,LK ACES 1Q22.pdf,https://www.idx.co.id//Portals/0/StaticData/Li...,TW1,rdf,2022,Ace Hardware Indonesia Tbk
2,ACES,2022-04-29,FinancialStatement-2022-I-ACES.xlsx,https://www.idx.co.id//Portals/0/StaticData/Li...,TW1,rdf,2022,Ace Hardware Indonesia Tbk
3,ACES,2022-04-29,instance.zip,https://www.idx.co.id//Portals/0/StaticData/Li...,TW1,rdf,2022,Ace Hardware Indonesia Tbk
4,ACES,2022-04-29,SPD ACES 1Q22.pdf,https://www.idx.co.id//Portals/0/StaticData/Li...,TW1,rdf,2022,Ace Hardware Indonesia Tbk
...,...,...,...,...,...,...,...,...
15187,ZYRX,2022-12-22,SDP LK Q3 2022.pdf,https://www.idx.co.id//Portals/0/StaticData/Li...,TW3,rdf,2022,PT Zyrexindo Mandiri Buana Tbk
15188,ZYRX,2022-12-22,Q3 Report PT Zyrexindo Mandiri Buana Tbk.pdf,https://www.idx.co.id//Portals/0/StaticData/Li...,TW3,rdf,2022,PT Zyrexindo Mandiri Buana Tbk
15189,ZYRX,2022-12-22,Penjelasan Perubahan Aset - Liabilitas 30 Sep ...,https://www.idx.co.id//Portals/0/StaticData/Li...,TW3,rdf,2022,PT Zyrexindo Mandiri Buana Tbk
15190,ZYRX,2022-12-22,instance.zip,https://www.idx.co.id//Portals/0/StaticData/Li...,TW3,rdf,2022,PT Zyrexindo Mandiri Buana Tbk


## Export to Excel

In [28]:
with pd.ExcelWriter('stocks.xlsx') as writer:
    CompanyProfilesDF.to_excel(writer, sheet_name='Company Profiles', index=False)
    TodayTradingInfoDF.to_excel(writer, sheet_name='Today Trading Info', index=False)
    FinancialReportLinksDF.to_excel(writer, sheet_name='Financial Reports', index=False)