# Bursa Malaysia Scraper

- Author: Kelvin You
- Date: 2025-01-19
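- Description: This notebook scrapes the list of Main Market companies (names and stock codes) from the Bursa Malaysia website, collects each company's quarterly financial results from KLSE Screener, and computes quarter-on-quarter and year-on-year growth rates.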

In [4]:
import numpy as np, pandas as pd
import os
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from time import time, sleep
from tqdm import tqdm

### Get all listed company names and stock codes from Bursa Malaysia

In [66]:
# Start a Chrome session that reports a fixed desktop user-agent string
# (presumably so the site serves its regular desktop page -- TODO confirm).
options = Options()
options.add_argument(
    'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0 Safari/537.36'
)
driver = webdriver.Chrome(options=options)
driver.maximize_window()

# Navigate to the Main Market company listing
driver.get('http://www.bursamalaysia.com/market/listed-companies/list-of-companies/main-market/')

# Block (up to 20 seconds) until the listing table is present in the DOM
wait = WebDriverWait(driver, timeout=20)
table = wait.until(EC.presence_of_element_located((By.ID, 'DataTables_Table_0')))

# Accumulators filled by the scraping loop: one entry per listed company
codes = []
names = []

def get_last_page_number():
    """Return the number shown on the last numeric pagination button.

    Waits for the pagination control to become visible, collects every
    element with class ``paginate_button`` and keeps those whose stripped
    text consists only of digits. Relies on the module-level ``wait`` and
    ``driver`` objects.

    Returns:
        int | None: The value of the last numeric button in the order the
        elements were returned, or ``None`` when no button is numeric.
    """
    # The pagination bar must be visible before its buttons are read
    wait.until(EC.visibility_of_element_located((By.CLASS_NAME, 'pagination')))

    buttons = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, 'paginate_button'))
    )

    # Non-numeric buttons (their text is not all digits) are ignored
    labels = (button.text.strip() for button in buttons)
    page_numbers = [int(label) for label in labels if label.isdigit()]

    return page_numbers[-1] if page_numbers else None

# Determine how many pages the listing has so the loop below knows when to stop
last_page_number = get_last_page_number()
print(f"Last page number is: {last_page_number}")

# Running count of table rows visited across all pages. It is incremented
# before a row is parsed, so rows that fail to parse are counted as well.
record_counter = 0

# Walk the paginated table: scrape the current page, then click "Next" until
# the active page equals last_page_number or an error occurs.
while True:
    try:
        # Wait for the table body, then collect its rows
        tbody = wait.until(EC.presence_of_element_located((By.TAG_NAME, 'tbody')))
        rows = tbody.find_elements(By.TAG_NAME, 'tr')
        
        # Iterate over each row and extract data
        for row in rows:
            try:
                record_counter += 1
                stock_link = row.find_element(By.CLASS_NAME, 'company-announcement-link')
                name = stock_link.text.strip()

                # NOTE(review): the 367th row visited is given a hard-coded
                # code instead of one parsed from its link. This is tied to
                # the row's position in the listing, so it will hit a
                # different company if the listing order or size changes.
                if record_counter == 367:
                    stock_code = "5235SS"
                else:
                    # The code is taken as the text after the last '=' in the link URL
                    stock_code = stock_link.get_attribute('href').split('=')[-1]

                # Store the code and name together so both lists stay aligned


                codes.append(stock_code)
                names.append(name)
            except Exception as e:
                # A row that cannot be parsed is reported and skipped
                print(f"Error processing row: {e}")

        # Wait for the "Next" button to be clickable
        next_button = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, 'Next')))
        
        # Scroll to the "Next" button to ensure it's in view
        driver.execute_script("arguments[0].scrollIntoView(true);", next_button)
        sleep(1)  # Small delay to ensure visibility

        # Read the page number from the first element marked 'active' inside
        # the pagination control
        pagination = driver.find_element(By.CLASS_NAME, 'pagination')
        current_page_link = pagination.find_elements(By.CLASS_NAME, 'active')[0]
        current_page = int(current_page_link.text)
        # Stop once the active page is the last page found earlier
        if current_page == last_page_number:
            print("Reached the last page.")
            break  # Exit loop if it's the last page
        else:
            # Trigger a click using JavaScript
            driver.execute_script("arguments[0].click();", next_button)
            sleep(2)  # Fixed pause to let the next page load

    except Exception as e:
        # Any page-level failure (e.g. a wait timing out) ends the scrape;
        # rows collected so far are kept
        print(f"Error during scraping: {e}")
        break

# Close the browser
driver.quit()

# Print results
print("Stock Codes:", codes)
print("Company Names:", names)


Last page number is: 28
Reached the last page.
Stock Codes: ['5250', '5326', '7214', '7167', '7086', '5198', '7131', '1481', '5281', '7191', '7146', '6599', '5139', '5185', '7315', '7078', '5238', '7219', '2658', '7609', '5116', '5269', '5115', '2674', '2488', '1163', '5127', '5293', '5307', '5120', '1015', '7031', '6351', '7083', '4758', '6556', '5082', '5568', '5088', '7090', '5015', '6432', '7181', '7007', '7218', '7722', '7129', '4057', '7020', '7162', '7054', '6399', '8176', '7048', '5130', '5302', '5025', '5182', '8885', '5204', '7579', '6888', '5106', '7120', '2305', '5021', '5329', '7005', '5258', '7251', '1899', '6602', '5190', '3239', '3395', '5196', '4219', '5248', '9814', '7668', '6173', '5932', '7195', '6998', '5032', '5069', '0168', '9288', '7036', '8133', '6297', '5100', '9938', '4162', '7221', '7188', '5210', '1818', '2828', '7174', '2852', '7128', '5105', '5311', '5099', '5180', '0163', '5257', '2836', '7076', '7035', '6947', '5195', '8052', '8982', '7209', '5273', '71

In [67]:
# Combine the scraped lists into a two-column table, one row per company.
stock_code = pd.DataFrame(
    data={
        'code': codes,
        'name': names,
    }
)
# NOTE: an earlier manual correction (set row 370's code to '5235SS', drop
# row 371, reset the index) was left disabled here:
# stock_code.loc[370,'code'] = '5235SS'
# stock_code.drop(index=371,inplace=True)
# stock_code.reset_index(drop=True)

# Report (rows, columns) as a quick sanity check
print(stock_code.shape)

(811, 2)


In [68]:
# Make sure the output folder exists. exist_ok=True makes this a no-op when
# the folder is already there, replacing the separate exists() check (which
# could race with another process creating the folder in between).
output_directory = 'data'
os.makedirs(output_directory, exist_ok=True)

# Save the CSV file (without the DataFrame index column)
file_path = os.path.join(output_directory, '1_stock_code.csv')
stock_code.to_csv(file_path, index=False)

TODO:
1. Scrape company financial data
2. Scrape company announcement data
3. Scrape company stock price data

In [54]:
# Load the stock codes saved earlier. The code column is read as text so that
# codes with leading zeros (e.g. '0168') are not turned into integers.
stock_code_data = pd.read_csv('data/1_stock_code.csv', dtype={'code': str})

start_time = time()

options = Options()
driver = webdriver.Chrome(options=options)
driver.switch_to.window(driver.current_window_handle)
driver.maximize_window()

# Created once for the session; not used by the lookups below, which query
# the page immediately after driver.get() returns.
wait = WebDriverWait(driver, 10)

total_fy_dates = 10  # number of distinct financial years to keep per stock

# error: stock codes whose page could not be scraped
# result: one [code, category, financial_year, revenue, pl, quarter,
#         quarter_date] list per quarterly report row
error, result = [], []

try:
    for i in tqdm(stock_code_data.code):
        url = 'https://www.klsescreener.com/v2/stocks/view/' + str(i)

        last_fy_dates = []  # distinct financial years seen so far for this stock

        try:
            # Navigation is inside the try so that one failing page is logged
            # and skipped instead of aborting the whole run.
            driver.get(url)

            # Category of the stock, taken from the first 'text-muted' element,
            # e.g. <span class="text-muted">Main Market : Banking</span>
            # gives 'Banking'.
            cat = driver.find_element(By.CLASS_NAME, 'text-muted').text.split(' : ')[-1]

            x = driver.find_element(By.CLASS_NAME, 'financial_reports')
            tbody = x.find_element(By.TAG_NAME, 'tbody')
            rows = tbody.find_elements(By.TAG_NAME, 'tr')

            for row in rows:
                # Skip the header row
                if 'strong' in row.get_attribute('innerHTML'):
                    continue

                # Rows without the full set of cells carry no report data.
                # Skipping them prevents re-appending the previous row's values.
                columns = row.find_elements(By.TAG_NAME, 'td')
                if len(columns) < 12:
                    continue

                revenue = columns[3].text  # Revenue
                pl = columns[4].text  # P/L
                quarter = columns[5].text  # Quarter
                quarter_date = columns[6].text  # Quarter Date
                financial_year = columns[7].text  # Financial Year

                if financial_year not in last_fy_dates:
                    # Stop when a new year would exceed the limit, so that
                    # exactly total_fy_dates years are collected.
                    if len(last_fy_dates) == total_fy_dates:
                        break
                    last_fy_dates.append(financial_year)

                result.append([i, cat, financial_year, revenue, pl, quarter, quarter_date])

        except Exception as e:
            # Remember which stock failed so it can be inspected or retried
            error.append(i)
            print(f"Error retrieving items: {e}")
finally:
    # Always release the browser, even if the run is interrupted
    driver.quit()

print(result)

print('Time taken: ',time()-start_time)




100%|██████████| 811/811 [24:18<00:00,  1.80s/it]


[['5250', 'Retailers', '31 Dec, 2024', '744.0m', '10.9m', '3', '2024-09-30'], ['5250', 'Retailers', '31 Dec, 2024', '751.8m', '20.5m', '2', '2024-06-30'], ['5250', 'Retailers', '31 Dec, 2024', '684.2m', '12.8m', '1', '2024-03-31'], ['5250', 'Retailers', '31 Dec, 2023', '694.5m', '221.1m', '4', '2023-12-31'], ['5250', 'Retailers', '31 Dec, 2023', '705.3m', '14.4m', '3', '2023-09-30'], ['5250', 'Retailers', '31 Dec, 2023', '1.1b', '25.4m', '2', '2023-06-30'], ['5250', 'Retailers', '31 Dec, 2023', '976.9m', '15.7m', '1', '2023-03-31'], ['5250', 'Retailers', '31 Dec, 2022', '992.4m', '2.7m', '4', '2022-12-31'], ['5250', 'Retailers', '31 Dec, 2022', '988.2m', '13.7m', '3', '2022-09-30'], ['5250', 'Retailers', '31 Dec, 2022', '943.7m', '25.8m', '2', '2022-06-30'], ['5250', 'Retailers', '31 Dec, 2022', '840.0m', '24.4m', '1', '2022-03-31'], ['5250', 'Retailers', '31 Dec, 2021', '795.1m', '29.2m', '4', '2021-12-31'], ['5250', 'Retailers', '31 Dec, 2021', '680.2m', '1.4m', '3', '2021-09-30'], [

In [55]:
# Turn the scraped report rows into a table and persist it for the growth
# calculation step (the DataFrame index is not written).
data = pd.DataFrame(
    result,
    columns=[
        'code',
        'category',
        'financial_year',
        'revenue',
        'profit_loss',
        'quarter',
        'quarter_date',
    ],
)
data.to_csv('data/2_financial_data.csv', index=False)

In [56]:
# Multipliers for the magnitude suffixes used in the scraped figures
MILLION = 1e6
BILLION = 1e9
THOUSAND = 1e3

def convert_to_number(value: str) -> float:
    """
    Convert a figure with an optional 'k', 'm' or 'b' suffix to a float.

    Thousands separators (',') and surrounding whitespace are ignored and the
    suffix is matched case-insensitively. Values that are already numeric
    (including NaN, which pandas uses for empty CSV cells) are passed through
    as floats instead of raising, and ``None`` or an empty string yields NaN.

    Args:
        value (str): The value to convert (e.g., '32.1m', '1.2b', '850k',
            '1,234'). Numbers and ``None`` are tolerated as described above.

    Returns:
        float: The numeric value, or NaN when there is nothing to convert.

    Raises:
        ValueError: If the text is not a number with an optional suffix.
    """
    if value is None:
        return float('nan')
    # Cells parsed by pandas as numbers (or NaN) have no string methods
    if isinstance(value, (int, float)):
        return float(value)

    text = str(value).replace(',', '').strip()
    if not text:
        return float('nan')

    multipliers = {'k': THOUSAND, 'm': MILLION, 'b': BILLION}
    suffix = text[-1].lower()
    if suffix in multipliers:
        return float(text[:-1]) * multipliers[suffix]
    return float(text)  # If no suffix, assume it's already a number

def calculate_growth(data: pd.DataFrame, column: str, groupby_columns: list = None) -> pd.Series:
    """
    Return the row-over-row percentage change of one column.

    The change is computed between consecutive rows in the frame's current
    order, so the caller decides (by sorting and grouping) whether the result
    represents QoQ or YoY growth.

    Args:
        data (pd.DataFrame): The DataFrame containing the data.
        column (str): The column to calculate growth for.
        groupby_columns (list, optional): When given and non-empty, changes
            are computed separately within each group of these columns.

    Returns:
        pd.Series: The growth rates as percentages.
    """
    # Use the grouped view when grouping columns are given, else the whole frame
    source = data.groupby(groupby_columns) if groupby_columns else data
    return source[column].pct_change() * 100

def main():
    """
    Compute QoQ and YoY growth for the scraped financial data.

    Reads 'data/2_financial_data.csv', converts the revenue and profit/loss
    figures to numbers, adds four growth columns (percentages) and writes the
    result to 'data/3_financial_data_growth.csv', newest periods first.
    """
    # Load data; stock codes are kept as text so leading zeros are preserved
    financial_data = pd.read_csv('data/2_financial_data.csv', dtype={'code': str})

    # Convert revenue and profit/loss columns to numeric values
    financial_data['revenue'] = financial_data['revenue'].apply(convert_to_number)
    financial_data['profit_loss'] = financial_data['profit_loss'].apply(convert_to_number)
    # financial_data['quarter'] = financial_data['quarter'].astype(int)

    # 'financial_year' holds text such as '31 Dec, 2024'. Sorting that text
    # orders rows by day and month name first, so it is parsed into a
    # temporary date column used only for ordering. Unparseable values
    # become NaT and sort last.
    financial_data['_fy_end'] = pd.to_datetime(
        financial_data['financial_year'], format='%d %b, %Y', errors='coerce'
    )

    # Sort chronologically within each stock: financial year, then quarter
    financial_data = financial_data.sort_values(by=['code', '_fy_end', 'quarter'], ascending=[True, True, True])

    # Calculate QoQ Growth for Revenue and Profit/Loss (grouped by stock code)
    financial_data['revenue_qoq_growth'] = calculate_growth(financial_data, 'revenue', groupby_columns=['code'])
    financial_data['profit_loss_qoq_growth'] = calculate_growth(financial_data, 'profit_loss', groupby_columns=['code'])

    # Calculate YoY Growth for Revenue and Profit/Loss (grouped by stock code and quarter).
    # NOTE(review): this compares consecutive rows of the same quarter number, so a
    # missing year in the data makes the comparison span more than one year.
    financial_data['revenue_yoy_growth'] = calculate_growth(financial_data, 'revenue', groupby_columns=['code', 'quarter'])
    financial_data['profit_loss_yoy_growth'] = calculate_growth(financial_data, 'profit_loss', groupby_columns=['code', 'quarter'])

    # Sort by financial year and quarter in descending order for final output,
    # then drop the helper column so the CSV keeps its original columns
    financial_data = financial_data.sort_values(by=['_fy_end', 'quarter'], ascending=[False, False])
    financial_data = financial_data.drop(columns=['_fy_end'])

    # Save the results to a new CSV file
    financial_data.to_csv('data/3_financial_data_growth.csv', index=False)

if __name__ == "__main__":
    main()