# Bursa Malaysia Scraper

- Author: Kelvin You
- Date: 2025-01-19
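- Description: This notebook scrapes the list of Main Market companies (stock codes and names) from the Bursa Malaysia website, then collects each company's quarterly financial results from KLSE Screener and computes growth figures.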

In [4]:
import numpy as np, pandas as pd
import os
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from time import time, sleep
from tqdm import tqdm

### Get all listed company names and stock codes from Bursa Malaysia

In [27]:
# Initialize WebDriver
# An explicit desktop-Chrome user-agent string is passed through the browser options.
options = webdriver.ChromeOptions()
options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0 Safari/537.36')
driver = webdriver.Chrome(options=options)
driver.maximize_window()

# Open target website
driver.get('http://www.bursamalaysia.com/market/listed-companies/list-of-companies/main-market/')

# Wait for table to load
# `wait` (20 second timeout) is a module-level object that get_last_page_number()
# and the scraping loop further down both reuse.
wait = WebDriverWait(driver, 20)
table = wait.until(EC.presence_of_element_located((By.ID, 'DataTables_Table_0')))

# Parallel lists filled by the scraping loop: codes[i] belongs to names[i].
codes, names = [], []  # Store stock codes and company names

def get_last_page_number():
    """Return the highest page number shown in the pagination bar.

    Uses the module-level ``driver`` and ``wait`` objects created when the
    browser session was opened.

    Returns:
        The largest numeric label found among the ``paginate_button``
        elements as an ``int``, or ``None`` when no button has a purely
        numeric label.

    Raises:
        selenium.common.exceptions.TimeoutException: if the pagination bar or
            its buttons do not appear within the wait timeouts.
    """
    # Wait for the pagination to be visible
    wait.until(EC.visibility_of_element_located((By.CLASS_NAME, 'pagination')))

    # Wait for the page items to be loaded
    page_items = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, 'paginate_button'))
    )

    # Read every label exactly once: each `.text` access is a round-trip to
    # the browser, so avoid fetching it twice per button.
    labels = (item.text.strip() for item in page_items)
    page_numbers = [int(label) for label in labels if label.isdigit()]

    # Non-numeric buttons (e.g. previous/next) are ignored; the largest number
    # is the last page regardless of the order the buttons are returned in.
    return max(page_numbers, default=None)

# Walk the paginated company table and collect stock codes and names.
# Everything that talks to the browser sits inside try/finally so the Chrome
# session is always closed, even when the run is interrupted (for example by
# KeyboardInterrupt) or the page-count lookup raises.
try:
    # Get the last page number
    last_page_number = get_last_page_number()
    print(f"Last page number is: {last_page_number}")

    while True:
        try:
            # Wait for rows to load
            tbody = wait.until(EC.presence_of_element_located((By.TAG_NAME, 'tbody')))
            rows = tbody.find_elements(By.TAG_NAME, 'tr')

            # Iterate over each row and extract data
            for row in rows:
                try:
                    # The stock code is whatever follows the last '=' in the
                    # link's href; the link text is the company name.
                    stock_link = row.find_element(By.CLASS_NAME, 'company-announcement-link')
                    stock_code = stock_link.get_attribute('href').split('=')[-1]
                    name = stock_link.text.strip()

                    codes.append(stock_code)
                    names.append(name)
                except Exception as e:
                    # Best effort: report the row that failed and keep going.
                    print(f"Error processing row: {e}")

            # Wait for the "Next" button to be clickable
            next_button = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, 'Next')))

            # Scroll to the "Next" button to ensure it's in view
            driver.execute_script("arguments[0].scrollIntoView(true);", next_button)
            sleep(1)  # Small delay to ensure visibility

            # Stop once the highlighted ("active") page is the last page.
            pagination = driver.find_element(By.CLASS_NAME, 'pagination')
            current_page_link = pagination.find_elements(By.CLASS_NAME, 'active')[0]
            current_page = int(current_page_link.text)
            if current_page == last_page_number:
                print("Reached the last page.")
                break  # Exit loop if it's the last page

            # Trigger a click using JavaScript
            driver.execute_script("arguments[0].click();", next_button)
            sleep(2)  # Wait for the next page to load

        except Exception as e:
            # Any page-level failure ends the crawl; rows collected so far are kept.
            print(f"Error during scraping: {e}")
            break
finally:
    # Close the browser
    driver.quit()

# Print results
print("Stock Codes:", codes)
print("Company Names:", names)


KeyboardInterrupt: 

In [6]:
# Combine the two scraped lists into a table with one row per listed company.
listing_columns = {'code': codes, 'name': names}
stock_code = pd.DataFrame(listing_columns)
# Manual fix-ups from an earlier run, left disabled:
# stock_code.loc[370,'code'] = '5235SS'
# stock_code.drop(index=371,inplace=True)
# stock_code.reset_index(drop=True)
# Show (rows, columns) as a quick sanity check of the scrape.
print(stock_code.shape)

(811, 2)


Unnamed: 0,code,name
0,5250,7-ELEVEN MALAYSIA HOLDINGS BERHAD


In [9]:
# Create the output folder if it is missing. exist_ok=True replaces the
# check-then-create pattern, so an already existing folder is not an error.
output_directory = 'data'
os.makedirs(output_directory, exist_ok=True)

# Save the CSV file (index=False keeps the row index out of the file)
file_path = os.path.join(output_directory, '1_stock_code.csv')
stock_code.to_csv(file_path, index=False)

TODO:
1. Scrape company financial data
2. Scrape company announcement data
3. Scrape company stock price data

In [34]:
# Load the stock code data from CSV.
# Codes are read as text so that leading zeros and alphanumeric codes survive
# the round trip through the CSV file and still produce the right URL.
stock_code_data = pd.read_csv('data/1_stock_code.csv', dtype={'code': str})

start_time = time()

total_fy_dates = 10  # number of financial years to scrape per stock

options = Options()
driver = webdriver.Chrome(options=options)
# `result` collects one list per quarterly report row;
# `error` collects the stock codes whose page could not be processed.
error, result = [], []
try:
    driver.switch_to.window(driver.current_window_handle)
    driver.maximize_window()
    wait = WebDriverWait(driver, 10)

    for i in tqdm(stock_code_data.code):
        url = 'https://www.klsescreener.com/v2/stocks/view/' + str(i)
        driver.get(url)

        last_fy_dates = []  # distinct financial years already seen for this stock

        try:
            # Category of the stock: the text after " : " in the first
            # 'text-muted' element, e.g. "Main Market : Banking" -> "Banking".
            cat = driver.find_element(By.CLASS_NAME, 'text-muted').text.split(' : ')[-1]

            x = driver.find_element(By.CLASS_NAME, 'financial_reports')
            tbody = x.find_element(By.TAG_NAME, 'tbody')
            rows = tbody.find_elements(By.TAG_NAME, 'tr')

            for row in rows:
                # Skip the header row
                if 'strong' in row.get_attribute('innerHTML'):
                    continue

                # Rows without the full set of cells carry no report data;
                # skipping them avoids re-appending the previous row's values.
                columns = row.find_elements(By.TAG_NAME, 'td')
                if len(columns) < 12:
                    continue

                # Extract the data from each column
                revenue = columns[3].text  # Revenue
                pl = columns[4].text  # P/L
                quarter = columns[5].text  # Quarter
                quarter_date = columns[6].text  # Quarter Date
                financial_year = columns[7].text  # Financial Year

                if financial_year not in last_fy_dates:
                    # Stop only when a financial year beyond the limit shows up,
                    # so all `total_fy_dates` years are kept (breaking as soon as
                    # the limit was reached dropped the last year entirely).
                    if len(last_fy_dates) == total_fy_dates:
                        break
                    last_fy_dates.append(financial_year)

                result.append([i, cat, financial_year, revenue, pl, quarter, quarter_date])

        except Exception as e:
            # Best effort: report the failure, remember the code, move on.
            print(f"Error retrieving items: {e}")
            error.append(i)
finally:
    # Always close the browser, even if the run is interrupted part-way.
    driver.quit()

print(result)

print('Time taken: ',time()-start_time)




 33%|███▎      | 269/811 [08:35<17:18,  1.92s/it]


KeyboardInterrupt: 

In [33]:
# Tabulate the scraped report rows; the column order mirrors the order in
# which each row's values were appended to `result`.
financial_columns = [
    'code',
    'category',
    'financial_year',
    'revenue',
    'profit_loss',
    'quarter',
    'quarter_date',
]
data = pd.DataFrame(result, columns=financial_columns)

# Write the table without the row index.
data.to_csv('data/2_financial_data.csv', index=False)

In [46]:
# Read stock codes as text: letting pandas infer an integer type would strip
# leading zeros and make codes inconsistent with the scraped values.
financial_data = pd.read_csv('data/2_financial_data.csv', dtype={'code': str})

# YoY and QoQ growth calculation
def convert_to_number(value):
    """Convert a scraped amount such as ``'1.5m'`` or ``'2b'`` to a float.

    Args:
        value: The cell to convert. Strings may carry a magnitude suffix
            (``k`` = thousands, ``m`` = millions, ``b`` = billions, in either
            case), thousands separators and surrounding whitespace. Values
            that are already numeric are passed through as floats.

    Returns:
        float: The plain number, or NaN for ``None``, NaN and empty strings.

    Raises:
        ValueError: if a non-empty string is not a recognisable number.
    """
    if value is None:
        return float('nan')
    if not isinstance(value, str):
        # pandas hands back floats/ints (NaN for missing cells) when a CSV
        # column holds no suffixed text; `'m' in value` would raise on those.
        return float(value)

    text = value.strip().replace(',', '').lower()
    if not text:
        return float('nan')

    # Only a trailing suffix counts as a magnitude marker.
    multipliers = {'k': 1e3, 'm': 1e6, 'b': 1e9}
    scale = multipliers.get(text[-1])
    if scale is not None:
        return float(text[:-1]) * scale
    return float(text)  # No suffix: already a plain number
# Turn the scraped text amounts into plain numbers and the quarter into an int.
financial_data['revenue'] = financial_data['revenue'].apply(convert_to_number)
financial_data['profit_loss'] = financial_data['profit_loss'].apply(convert_to_number)
financial_data['quarter'] = financial_data['quarter'].astype(int)

# Sort by company first, then by financial_year and quarter, so that each
# company's reports are contiguous and in ascending order.
# NOTE(review): financial_year is ordered exactly as read from the CSV; if it
# is stored as date text, lexical order may not be chronological - confirm.
financial_data = financial_data.sort_values(
    by=['code', 'financial_year', 'quarter'], ascending=[True, True, True]
)

# Calculate QoQ Growth for Revenue and Profit/Loss.
# Grouping by company keeps the first report of one company from being
# compared against the last report of the previous company in the table.
financial_data['revenue_qoq_growth'] = financial_data.groupby('code')['revenue'].pct_change() * 100
financial_data['profit_loss_qoq_growth'] = financial_data.groupby('code')['profit_loss'].pct_change() * 100

# Calculate YoY Growth for Revenue and Profit/Loss: compare each quarter with
# the same quarter number in the previous row of the same company.
financial_data['revenue_yoy_growth'] = financial_data.groupby(['code', 'quarter'])['revenue'].pct_change() * 100
financial_data['profit_loss_yoy_growth'] = financial_data.groupby(['code', 'quarter'])['profit_loss'].pct_change() * 100

# Output order is unchanged: newest financial year / quarter first.
financial_data = financial_data.sort_values(by=['financial_year', 'quarter'], ascending=[False, False])
financial_data.to_csv('data/3_financial_data_growth.csv', index=False)