Code History:
1. Version 1.0 (2023/03/09):
    - Base version, working as expected
2. Version 1.1 (2023/03/19):
    - Every time the **Financial Report Links** code runs, it no longer overwrites previous data; instead, it appends new data to the previously scraped data.

<strong>Features:</strong>
- Scrape IDX individual stock summary and details
- Scrape IDX sectoral stock summary and its components
- Scrape IDX stock index summary

Plan: Data is scraped <strong>every weekday at 6 PM GMT+7</strong>, a few hours after the market has closed for the day. So the data you see before 6 PM is the previous trading day's data.

In [1]:
import json
from json.decoder import JSONDecodeError
import pandas as pd
import time
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import threading
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# Chrome Selenium Starter

Why Selenium? Because it is needed to get past the Cloudflare restriction.

In [2]:
# Initialize the Chrome driver used for the summary request further down.
# The "--headless=new" argument is passed to Chrome so it runs without a window.
options = Options()
options.add_argument("--headless=new")
driver = webdriver.Chrome(options=options)

# Scrape Summary URL

## URL List

In [3]:
# IDX endpoints keyed by a short label.
# Only 'BEIStockSummary' is requested by the code visible in this notebook;
# the other two entries are not referenced in the visible cells.
urls = {
    'BEIStockSummary':'https://www.idx.co.id/primary/TradingSummary/GetStockSummary?length=9999&start=0',
    'BEISectoralSummary':'https://www.idx.co.id/primary/StockData/GetIndexIC',
    'BEIIndexSummary':'https://www.idx.co.id/primary/StockData/GetConstituent'
}

## BEI Stock Summary

In [4]:
# Open the stock-summary endpoint and wait (up to 10 s) until a <body> element is found.
driver.get(urls['BEIStockSummary'])
WebDriverWait(driver, timeout=10).until(lambda d: d.find_element(By.TAG_NAME, 'body'))
# Keep the body's text; it is parsed as JSON in the next cell.
BEIStockSummaryContent = driver.find_element(By.TAG_NAME, value='body').text
# Short pause after the request.
time.sleep(2)

In [5]:
# Parse the response text as JSON, build a DataFrame from its 'data' entry
# and drop the 'No' column.
BEIStockSummaryDF = pd.DataFrame(json.loads(BEIStockSummaryContent)['data']).drop(columns=['No'])
BEIStockSummaryDF

Unnamed: 0,IDStockSummary,Date,StockCode,StockName,Remarks,Previous,OpenPrice,FirstTrade,High,Low,...,TradebleShares,WeightForIndex,ForeignSell,ForeignBuy,DelistingDate,NonRegularVolume,NonRegularValue,NonRegularFrequency,persen,percentage
0,3298117,2023-03-17T00:00:00,AALI,Astra Agro Lestari Tbk.,--M-18AEM16000D232------------,7875.0,0.0,0.0,8000.0,7875.0,...,1.924688e+09,3.907117e+08,869300.0,262900.0,,958.0,7621500.0,11.0,,
1,3298118,2023-03-17T00:00:00,ABBA,Mahaka Media Tbk.,--U-2100000000E614------------,92.0,0.0,0.0,101.0,87.0,...,3.935893e+09,1.342139e+09,0.0,0.0,,0.0,0.0,0.0,,
2,3298119,2023-03-17T00:00:00,ABDA,Asuransi Bina Dana Arta Tbk.,--U-2105000000G412------------,6475.0,0.0,0.0,0.0,0.0,...,6.208067e+08,3.147490e+07,0.0,0.0,,0.0,0.0,0.0,,
3,3298120,2023-03-17T00:00:00,ABMM,ABM Investama Tbk.,--M-1835000000C311------------,2640.0,0.0,0.0,2740.0,2650.0,...,2.753165e+09,5.688039e+08,160800.0,72100.0,,0.0,0.0,0.0,,
4,3298121,2023-03-17T00:00:00,ACES,Ace Hardware Indonesia Tbk.,--MO1S35MD0000E743------------,440.0,442.0,444.0,454.0,442.0,...,1.715000e+10,6.832560e+09,12230400.0,26713300.0,,0.0,0.0,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
850,3298967,2023-03-17T00:00:00,ZATA,Bersama Zatta Jaya Tbk.,--U-1130000000E741------------,60.0,0.0,0.0,63.0,56.0,...,1.700000e+09,1.099382e+09,40000.0,133200.0,,0.0,0.0,0.0,,
851,3298968,2023-03-17T00:00:00,ZBRA,Dosni Roha Indonesia Tbk.,--U-2135000000C311------------,492.0,0.0,0.0,496.0,488.0,...,2.510706e+09,4.245604e+08,0.0,195300.0,,0.0,0.0,0.0,,
852,3298969,2023-03-17T00:00:00,ZINC,Kapuas Prima Coal Tbk.,--U-2105000000B146------------,50.0,0.0,0.0,50.0,50.0,...,2.525000e+10,7.456325e+09,0.0,0.0,,0.0,0.0,0.0,,
853,3298970,2023-03-17T00:00:00,ZONE,Mega Perintis Tbk.,--M-2135000000E741------------,1150.0,0.0,0.0,1160.0,1145.0,...,8.701715e+08,1.161679e+08,0.0,0.0,,0.0,0.0,0.0,,


## Close and Quit Driver

In [6]:
# Shut down the driver used for the summary request; load_stock creates its own driver.
driver.quit()

# Scrape Stock Details URL

## Company Profiles

In [7]:
def get_company_profiles(driver, stock):
    """Fetch the company profile of one stock as a DataFrame.

    The profile endpoint is loaded through ``driver`` and its body text is
    parsed as JSON. While the body is not valid JSON the request is repeated
    after a 1.5 s pause, with no upper bound on the number of attempts.

    Args:
        driver: Selenium WebDriver used to load the endpoint.
        stock: Stock code appended to the endpoint URL.

    Returns:
        DataFrame built from the response's 'Profiles' entry, with the stock
        code inserted as the first column ('StockCode').
    """
    profile_url = 'https://www.idx.co.id/primary/ListedCompany/GetCompanyProfilesDetail?KodeEmiten=' + stock

    profile_frame = None
    while profile_frame is None:
        try:
            driver.get(profile_url)
            WebDriverWait(driver, timeout=10).until(lambda d: d.find_element(By.TAG_NAME, 'body'))
            body_text = driver.find_element(By.TAG_NAME, value='body').text
            payload = json.loads(body_text)
        except JSONDecodeError:
            # Body was not JSON yet; wait briefly and request again.
            time.sleep(1.5)
            continue

        profile_frame = pd.DataFrame(payload['Profiles'])
        profile_frame.insert(0, 'StockCode', stock)

    # Pause before the caller issues its next request.
    time.sleep(1)

    return profile_frame

## Today Trading Info

In [8]:
def get_today_trading_info(driver, stock):
    """Fetch the trading-info rows of one stock as a DataFrame.

    The trading-info endpoint is loaded through ``driver`` and its body text
    is parsed as JSON. While the body is not valid JSON the request is
    repeated after a 1.5 s pause, with no upper bound on the number of attempts.

    Args:
        driver: Selenium WebDriver used to load the endpoint.
        stock: Stock code placed in the ``code`` query parameter.

    Returns:
        DataFrame built from the response's 'replies' entry.
    """
    info_url = f'https://www.idx.co.id/primary/ListedCompany/GetTradingInfoSS?code={stock}&start=0&length=10000'

    trading_frame = None
    while trading_frame is None:
        try:
            driver.get(info_url)
            WebDriverWait(driver, timeout=10).until(lambda d: d.find_element(By.TAG_NAME, 'body'))
            body_text = driver.find_element(By.TAG_NAME, value='body').text
            payload = json.loads(body_text)
        except JSONDecodeError:
            # Body was not JSON yet; wait briefly and request again.
            time.sleep(1.5)
            continue

        trading_frame = pd.DataFrame(payload['replies'])

    # Pause before the caller issues its next request.
    time.sleep(1)

    return trading_frame

## Financial Reports File Links
Covers at most the last 2 years (current: 2023, earliest: 2022).

The code only fetches data that is still missing; previously saved data won't be overwritten.

In [9]:
def get_financial_report_file_links(driver, stock, prev_financial_report):
    """Collect financial-report attachment rows for *stock* that are not stored yet.

    The current and the previous calendar year are checked for the periods
    'TW1', 'TW2', 'TW3' and 'Audit'. A (period, year) pair is skipped when
    ``prev_financial_report`` already holds at least one row for it. Every
    other pair is requested through ``driver``, and the request is repeated
    after a 1.5 s pause for as long as the body is not valid JSON.

    Args:
        driver: Selenium WebDriver used to load the endpoint. It is not touched
            when every pair is already present.
        stock: Stock code placed in the ``kodeEmiten`` query parameter.
        prev_financial_report: DataFrame of previously saved rows for this
            stock, with 'Report_Period' and 'Report_Year' columns.

    Returns:
        DataFrame of the attachments of the first result of each fetched
        report, with 'Emiten_Code' renamed to 'StockCode'. Empty when nothing
        new was found.
    """
    current_year = datetime.now().year
    # last 2 years
    years = [current_year, current_year - 1]
    periods = ['TW1', 'TW2', 'TW3', 'Audit']

    new_frames = []

    for year in years:
        requested_any = False

        for period in periods:
            # Use the rows passed in for this stock. A module-level frame may be
            # rebound by the submitting loop while this worker is running.
            already_saved = (
                (prev_financial_report['Report_Period'] == period)
                & (prev_financial_report['Report_Year'] == year)
            ).any()
            if already_saved:
                continue

            requested_any = True
            financial_report_url = 'https://www.idx.co.id/primary/ListedCompany/GetFinancialReport?periode={}&year={}&indexFrom=0&pageSize=1000&reportType=rdf&kodeEmiten={}'.format(period, year, stock)

            while True:
                try:
                    driver.get(financial_report_url)
                    WebDriverWait(driver, timeout=10).until(lambda d: d.find_element(By.TAG_NAME, 'body'))
                    # Parse once and reuse the decoded payload below.
                    payload = json.loads(driver.find_element(By.TAG_NAME, value='body').text)
                    break
                except JSONDecodeError:
                    # Body was not JSON yet; wait briefly and request again.
                    time.sleep(1.5)

            if payload['ResultCount'] > 0:
                # Only the first result's attachments are kept.
                attachments = pd.DataFrame(payload['Results'][0]['Attachments'])
                new_frames.append(attachments.rename(columns={'Emiten_Code': 'StockCode'}))

        # The pause spaces out requests, so it is skipped when this year needed none.
        if requested_any:
            time.sleep(2)

    if not new_frames:
        return pd.DataFrame()

    return pd.concat(new_frames)

## Multithreading Scrape

### Worker Function

In [10]:
# Worker submitted to the thread pool: scrapes all detail endpoints for one stock
def load_stock(stock, prev_financial_report_stock):
    """Scrape profile, trading info and new financial-report links for one stock.

    A dedicated headless Chrome driver is started for the call and is always
    quit afterwards, including when one of the scraping steps raises, so that
    no browser process is left running per stock.

    Args:
        stock: Stock code to scrape.
        prev_financial_report_stock: Previously saved financial-report rows for
            this stock, forwarded to get_financial_report_file_links.

    Returns:
        Tuple ``(company_profiles, today_trading_info, financial_report_links)``
        holding the three values returned by the scraping functions.
    """
    options = Options()
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=options)

    try:
        company_profiles = get_company_profiles(driver, stock)
        today_trading_info = get_today_trading_info(driver, stock)
        financial_report_links = get_financial_report_file_links(driver, stock, prev_financial_report_stock)
    finally:
        # Release the browser even if a request or parse step failed.
        driver.quit()

    return company_profiles, today_trading_info, financial_report_links

### Create list to store scraped data

In [11]:
# Load the previously saved financial-report links from the workbook.
# The same sheet is read again at the start of the scraping cell.
prev_financial_report_df = pd.read_excel('stocks.xlsx', sheet_name='Financial Reports')

In [12]:
# Accumulators for the per-stock DataFrames returned by load_stock;
# the scraping cell appends one entry per stock to each list.
results = {
    'CompanyProfiles':[],
    'TodayTradingInfo':[],
    'FinancialReportLinks':[]
}

In [13]:
# Reload the previously saved report links; each worker receives the rows of its own stock.
prev_financial_report_df = pd.read_excel('stocks.xlsx', sheet_name='Financial Reports')

with tqdm(total=len(BEIStockSummaryDF['StockCode'])) as pbar:
    # At most 7 load_stock calls run at the same time.
    with ThreadPoolExecutor(max_workers=7) as executor:
        futures = []

        for StockCode in BEIStockSummaryDF['StockCode']:
            # NOTE(review): this is a module-level name that is rebound on every
            # iteration while earlier workers may still be running, so worker code
            # must use the argument it was given rather than this global.
            prev_financial_report_stock = prev_financial_report_df[prev_financial_report_df['StockCode'] == StockCode]
            future = executor.submit(load_stock, StockCode, prev_financial_report_stock)
            futures.append(future)
            
        # Results are collected in completion order, not in submission order.
        for future in as_completed(futures):
            # The bar advances before result() is called; result() re-raises
            # any exception raised inside the worker.
            pbar.update(1)
            company_profiles, today_trading_info, financial_report_links = future.result()
            results['CompanyProfiles'].append(company_profiles)
            results['TodayTradingInfo'].append(today_trading_info)
            results['FinancialReportLinks'].append(financial_report_links)

100%|████████████████████████████████████████████████████████████████████████████████| 855/855 [26:43<00:00,  1.88s/it]


## Join All Stock Details

In [14]:
# Stack the per-stock profile frames, renumber the rows and drop the columns
# that are not kept in the output.
CompanyProfilesDF = pd.concat(results['CompanyProfiles']).reset_index(drop=True).drop(
    columns=[
        'DataID', 'Divisi', 'EfekEmiten_EBA', 'EfekEmiten_ETF', 
        'EfekEmiten_Obligasi', 'EfekEmiten_SPEI', 'EfekEmiten_Saham',
        'id', 'KodeDivisi', 'JenisEmiten', 'KodeEmiten', 'Status'
    ]
)
# Parse to datetime and set the time part to midnight.
CompanyProfilesDF['TanggalPencatatan'] = pd.to_datetime(CompanyProfilesDF['TanggalPencatatan']).dt.normalize()
# Prefix the site host to each logo value.
# NOTE(review): assumes every Logo value is a string; a missing value would raise TypeError here — confirm.
CompanyProfilesDF['Logo'] = ['https://www.idx.co.id' + logo for logo in CompanyProfilesDF['Logo']]
# Stamp every row with the time this cell ran.
CompanyProfilesDF['LastScraped'] = datetime.now()
CompanyProfilesDF

Unnamed: 0,StockCode,Alamat,BAE,Industri,SubIndustri,Email,Fax,KegiatanUsahaUtama,NamaEmiten,NPKP,NPWP,PapanPencatatan,Sektor,SubSektor,TanggalPencatatan,Telepon,Website,Logo,LastScraped
0,ADCP,"Jl. Pengantin Ali No. 88, Ciracas\r\nKota Jaka...",,Pengelola & Pengembang Real Estat,Pengembang & Operator Real Estat,corsec@adcp.co.id,(021) 228 220 81,Perhotelan dan Real Estate,PT Adhi Commuter Properti Tbk,,85.227.029.7-093.000,Utama,Properti & Real Estat,Properti & Real Estat,2021-05-21,(021) 228 229 80,www.adcp.co.id,https://www.idx.co.id/Portals/0/StaticData/Lis...,2023-03-19 20:14:57.589306
1,ADES,"Jl. Let. Jend. TB Simatupang No. 89, RT 01 RW ...",PT. Raya Saham Registra,Minuman,Minuman Ringan,wisnu.adji@akashainternational.com,021 - 78845549; 78845547,Minuman dan Makanan Ringan,Akasha Wira International Tbk Tbk,-,01.371.491.0-054.000,Pengembangan,Barang Konsumen Primer,Makanan & Minuman,1994-06-13,081119345000,www.akashainternational.com,https://www.idx.co.id/Portals/0/StaticData/Lis...,2023-03-19 20:14:57.589306
2,ABMM,Gedung TMT 1 Lantai 18 Jl. Cilandak KKO No. 1 ...,PT. Datindo Entrycom,Perusahaan Holding Multi-sektor,Perusahaan Holding Multi-sektor,corporate.secretary@abm-investama.co.id,021-29976768,berupa jasa penyewaan namun tidak terbatas pad...,ABM Investama Tbk,PEM-01535/WPJ.04/KP.1003/2008,02.504.191.4-054.000,Utama,Perindustrian,Perusahaan Holding Multi Sektor,2011-12-06,021-29976767,www.abm-investama.com,https://www.idx.co.id/Portals/0/StaticData/Lis...,2023-03-19 20:14:57.589306
3,ABBA,"Sahid Sudirman Centre Lt. 10, Jl. Jend. Sudirm...",PT. Adimitra Jasa Korpora,Media,Penerbitan,corsec@mahakax.com,(021) 573 9210,Media dan Percetakan,Mahaka Media Tbk,0,01.609.052.4-054.000,Pengembangan,Barang Konsumen Non-Primer,Media & Hiburan,2002-04-03,(021) 573 9203,www.mahakax.com,https://www.idx.co.id/Portals/0/StaticData/Lis...,2023-03-19 20:14:57.589306
4,AALI,Jl Pulo Ayang Raya Blok OR No. 1\r\nKawasan In...,PT. Raya Saham Registra,Produk Makanan Pertanian,Perkebunan & Tanaman Pangan,Investor@astra-agro.co.id,"461-6655, 461-6677, 461-6688",Agriculture Plantation,Astra Agro Lestari Tbk,,01.334.427.0-054.000,Utama,Barang Konsumen Primer,Makanan & Minuman,1997-12-09,461-65-55,http://www.astra-agro.co.id,https://www.idx.co.id/Portals/0/StaticData/Lis...,2023-03-19 20:14:57.589306
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
847,ZATA,"Komplek Industri Prapanca No.24, Cigondewah Ka...",PT. Adimitra Jasa Korpora,Ritel Khusus,Ritel Pakaian & Tekstil,corporate@elcorps.com,,Aktivitas Perusahaan Holding dan aktivitas Usa...,PT Bersama Zatta Jaya Tbk,,31.569.863.9-428.000,Utama,Barang Konsumen Non-Primer,Perdagangan Ritel,2022-11-10,(022) 86017900,www.elcorps.com,https://www.idx.co.id/Portals/0/StaticData/Lis...,2023-03-19 20:14:57.589306
848,ZBRA,"Satrio Tower Lt. 23, Jl. Prof Dr. Satrio Kav. ...",PT BSR Indonesia,Perusahaan Holding Multi-sektor,Perusahaan Holding Multi-sektor,legal@zebranusantara.co.id,(021) 2788 3914,Perusahaan Holding dan Angkutan Bermotor untuk...,PT Dosni Roha Indonesia Tbk,,01.451.869.0-054.000,Pengembangan,Perindustrian,Perusahaan Holding Multi Sektor,1991-08-01,(021) 2788 3900,www.zebranusantara.co.id,https://www.idx.co.id/Portals/0/StaticData/Lis...,2023-03-19 20:14:57.589306
849,ZONE,"Jalan Karet Pedurenan No. 240, Karet Kuningan,...",PT. Bima Registra,Ritel Khusus,Ritel Pakaian & Tekstil,corpsec@megaperintis.co.id,(021) 5290 5103,Perdagangan retail,PT Mega Perintis Tbk.,,02.433.917.8-011.000,Pengembangan,Barang Konsumen Non-Primer,Perdagangan Ritel,2018-12-12,(021) 5733 888; (021) 5290 4379,www.megaperintis.co.id,https://www.idx.co.id/Portals/0/StaticData/Lis...,2023-03-19 20:14:57.589306
850,ZINC,"Jl. Pantai Indah Selatan I, Elang Laut Blok A ...",,Logam & Mineral,Logam & Mineral Lainnya,corsec@ptkpc.com,021 - 29676234,Pertambangan dan Perdagangan,PT Kapuas Prima Coal Tbk,,02.386.972.0-091.000,Pengembangan,Barang Baku,Barang Baku,2017-10-16,021 - 29676236,www.ptkpc.com,https://www.idx.co.id/Portals/0/StaticData/Lis...,2023-03-19 20:14:57.589306


In [15]:
# Merge the freshly scraped trading rows with the history stored in the workbook.
prev_trading_info = pd.read_excel('stocks.xlsx', sheet_name='Trading Info')
TradingInfoDF = pd.concat(results['TodayTradingInfo']).drop(columns=['No', 'Remarks']).reset_index(drop=True)
TradingInfoDF['Date'] = pd.to_datetime(TradingInfoDF['Date'])
# Stamp the fresh rows with the time this cell ran.
TradingInfoDF['LastScraped'] = datetime.now()
# Fresh rows are concatenated first. The sort must be stable so that, for equal
# dates, they stay ahead of the stored rows and keep='first' retains the fresh
# row of each (StockCode, Date) pair; the default sort kind does not guarantee this.
TradingInfoDF = (
    pd.concat([TradingInfoDF, prev_trading_info])
    .sort_values(by='Date', kind='stable')
    .drop_duplicates(subset=['StockCode', 'Date'], keep='first')
    .reset_index(drop=True)
)
TradingInfoDF

Unnamed: 0,IDStockSummary,Date,StockCode,StockName,Previous,OpenPrice,FirstTrade,High,Low,Close,...,ListedShares,TradebleShares,WeightForIndex,ForeignSell,ForeignBuy,DelistingDate,NonRegularVolume,NonRegularValue,NonRegularFrequency,LastScraped
0,2676848,2020-01-02,RMBA,Bentoel Internasional Investama Tbk.,330.0,0.0,0.0,350.0,316.0,316.0,...,3.640114e+10,3.640114e+10,3.640114e+10,0.0,0.0,,0.0,0.000000e+00,0.0,2023-03-19 20:14:58.477489
1,2676705,2020-01-02,MAPI,Mitra Adiperkasa Tbk.,1055.0,0.0,0.0,1070.0,1035.0,1065.0,...,1.660000e+10,1.660000e+10,1.660000e+10,3216100.0,1185700.0,,478050.0,4.962172e+08,2.0,2023-03-19 20:14:58.477489
2,2676978,2020-01-02,WSKT,Waskita Karya (Persero) Tbk.,1485.0,1485.0,1485.0,1515.0,1480.0,1505.0,...,1.357395e+10,1.357395e+10,1.357395e+10,554100.0,770300.0,,3281075.0,5.137233e+09,5.0,2023-03-19 20:14:58.477489
3,2676465,2020-01-02,CFIN,Clipan Finance Indonesia Tbk.,290.0,0.0,0.0,280.0,270.0,276.0,...,3.984520e+09,3.984520e+09,3.984520e+09,86700.0,500.0,,0.0,0.000000e+00,0.0,2023-03-19 20:14:58.477489
4,2676518,2020-01-02,ELSA,Elnusa Tbk.,306.0,0.0,0.0,310.0,300.0,302.0,...,7.298500e+09,7.298500e+09,7.298500e+09,13400.0,29300.0,,10.0,3.020000e+03,1.0,2023-03-19 20:14:58.477489
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593110,3298603,2023-03-17,MARI,Mahaka Radio Integra Tbk.,76.0,0.0,0.0,88.0,71.0,78.0,...,5.252644e+09,5.252644e+09,1.758060e+09,1105900.0,42300.0,,0.0,0.000000e+00,0.0,2023-03-19 20:14:58.477489
593111,3298248,2023-03-17,BMRI,Bank Mandiri (Persero) Tbk.,9850.0,9950.0,9975.0,10125.0,9925.0,10100.0,...,4.620000e+10,4.620000e+10,1.844766e+10,60233600.0,67946400.0,,1740757.0,1.721379e+10,13.0,2023-03-19 20:14:58.477489
593112,3298809,2023-03-17,SDRA,Bank Woori Saudara Indonesia 1906 Tbk.,570.0,0.0,0.0,580.0,570.0,575.0,...,8.482552e+09,8.482552e+09,7.091413e+08,0.0,2300.0,,0.0,0.000000e+00,0.0,2023-03-19 20:14:58.477489
593113,3298600,2023-03-17,MAPA,Map Aktif Adiperkasa Tbk.,4520.0,0.0,0.0,4600.0,4510.0,4600.0,...,2.850400e+09,2.850400e+09,6.627180e+08,191900.0,78300.0,,26.0,1.094600e+05,1.0,2023-03-19 20:14:58.477489


In [16]:
# Combine newly scraped report links with the links already stored in the workbook.
# Workers return an empty, column-less frame when a stock had nothing new. If that
# is true for every stock there are no columns to drop or convert, so the stored
# rows are carried over unchanged instead of failing with a KeyError.
_new_report_frames = [frame for frame in results['FinancialReportLinks'] if not frame.empty]

if _new_report_frames:
    FinancialReportLinksDF = pd.concat(_new_report_frames).reset_index(drop=True).drop(
        columns=['File_ID', 'File_Size', 'File_Type']
    )
    # Parse to datetime and set the time part to midnight.
    FinancialReportLinksDF['File_Modified'] = pd.to_datetime(FinancialReportLinksDF['File_Modified']).dt.normalize()
    # The stored links show a double slash after the host; the prefix is left
    # unchanged so new rows match the rows already saved.
    FinancialReportLinksDF['File_Path'] = 'https://www.idx.co.id/' + FinancialReportLinksDF['File_Path']
    # Stamp only the new rows; stored rows keep their original LastScraped value.
    FinancialReportLinksDF['LastScraped'] = datetime.now()
    FinancialReportLinksDF = pd.concat([FinancialReportLinksDF, prev_financial_report_df]).reset_index(drop=True)
else:
    FinancialReportLinksDF = prev_financial_report_df.reset_index(drop=True)

FinancialReportLinksDF

Unnamed: 0,StockCode,File_Modified,File_Name,File_Path,Report_Period,Report_Type,Report_Year,NamaEmiten,LastScraped
0,ADCP,2022-07-29,FinancialStatement-2022-II-ADCP.pdf,https://www.idx.co.id//Portals/0/StaticData/Li...,TW2,rdf,2022,PT Adhi Commuter Properti Tbk,2023-03-19 20:14:59.187510
1,ADCP,2022-07-29,LK ADCP Q2 2022 - Final_Signed.pdf,https://www.idx.co.id//Portals/0/StaticData/Li...,TW2,rdf,2022,PT Adhi Commuter Properti Tbk,2023-03-19 20:14:59.187510
2,ADCP,2022-07-29,instance.zip,https://www.idx.co.id//Portals/0/StaticData/Li...,TW2,rdf,2022,PT Adhi Commuter Properti Tbk,2023-03-19 20:14:59.187510
3,ADCP,2022-07-29,FinancialStatement-2022-II-ADCP.xlsx,https://www.idx.co.id//Portals/0/StaticData/Li...,TW2,rdf,2022,PT Adhi Commuter Properti Tbk,2023-03-19 20:14:59.187510
4,ADCP,2022-07-29,inlineXBRL.zip,https://www.idx.co.id//Portals/0/StaticData/Li...,TW2,rdf,2022,PT Adhi Commuter Properti Tbk,2023-03-19 20:14:59.187510
...,...,...,...,...,...,...,...,...,...
27683,ZYRX,2022-12-22,SDP LK Q3 2022.pdf,https://www.idx.co.id//Portals/0/StaticData/Li...,TW3,rdf,2022,PT Zyrexindo Mandiri Buana Tbk,2023-03-19 00:00:00.000000
27684,ZYRX,2022-12-22,Q3 Report PT Zyrexindo Mandiri Buana Tbk.pdf,https://www.idx.co.id//Portals/0/StaticData/Li...,TW3,rdf,2022,PT Zyrexindo Mandiri Buana Tbk,2023-03-19 00:00:00.000000
27685,ZYRX,2022-12-22,Penjelasan Perubahan Aset - Liabilitas 30 Sep ...,https://www.idx.co.id//Portals/0/StaticData/Li...,TW3,rdf,2022,PT Zyrexindo Mandiri Buana Tbk,2023-03-19 00:00:00.000000
27686,ZYRX,2022-12-22,instance.zip,https://www.idx.co.id//Portals/0/StaticData/Li...,TW3,rdf,2022,PT Zyrexindo Mandiri Buana Tbk,2023-03-19 00:00:00.000000


## Append Previous Trading Day Data

## Export to Excel

In [18]:
# Write the three result tables to stocks.xlsx, one sheet each, without the index column.
# NOTE(review): the writer is opened with its default mode, which presumably replaces
# the existing workbook — confirm no other sheets need to be preserved.
with pd.ExcelWriter('stocks.xlsx') as writer:
    CompanyProfilesDF.to_excel(writer, sheet_name='Company Profiles', index=False)
    TradingInfoDF.to_excel(writer, sheet_name='Trading Info', index=False)
    FinancialReportLinksDF.to_excel(writer, sheet_name='Financial Reports', index=False)