In [10]:
import pandas as pd
import time
from bs4 import BeautifulSoup
import re
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import requests

In [9]:
# Base URL of the PSE Edge company directory
base_url = "https://edge.pse.com.ph/companyDirectory/search.ax"

# Directory pages to request (1-based; currently pages 1 through 6)
directory_pages = range(1, 7)

# Seconds to wait for each HTTP response before giving up, so a stalled
# connection cannot hang the notebook indefinitely
request_timeout = 30

# Column order of the resulting frame; also guarantees the columns exist
# when no rows were scraped, so downstream column lookups do not KeyError
directory_columns = ["Company Name", "Stock Symbol", "Sector", "Subsector", "Listing Date"]

# Empty list to hold results (one dict per company row)
all_data = []

# Get data from all pages
for page in directory_pages:
    payload = {
        'pageNo': page,
        'keyword': '',
        'sortType': '',
        'dateSortType': 'DESC',
        'cmpySortType': 'ASC',
        'symbolSortType': 'ASC',
        'sector': 'ALL',
        'subsector': ''
    }

    # Send POST request (the site loads data via AJAX)
    response = requests.post(base_url, data=payload, timeout=request_timeout)
    # Fail loudly on HTTP errors instead of parsing an error page as "no rows"
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all company rows inside the main table
    for row in soup.select('table.list tbody tr'):
        cols = row.find_all('td')
        # Only rows with exactly five cells are company entries
        if len(cols) != 5:
            continue

        company_name, stock_symbol, sector, subsector, listing_date = (
            col.text.strip() for col in cols
        )
        all_data.append({
            "Company Name": company_name,
            "Stock Symbol": stock_symbol,
            "Sector": sector,
            "Subsector": subsector,
            "Listing Date": listing_date
        })

    # Take break from requests
    time.sleep(1)

# Convert to df
df_pse = pd.DataFrame(all_data, columns=directory_columns)

In [11]:
# Sectors and subsectors of interest; a company is kept when it matches
# either list
target_sectors = ["Industrial", "Holding Firms"]
target_subsectors = [
    "Electricity, Energy, Power, & Water",
    "Construction, Infra. & Allied Services",
    "Holding Firms",
]

# Boolean masks over df_pse, one per matching criterion
sector_hit = df_pse['Sector'].isin(target_sectors)
subsector_hit = df_pse['Subsector'].isin(target_subsectors)

# Keep matching rows, only the identifying columns, with a fresh 0..n-1 index
df_pse_final = (
    df_pse
    .loc[sector_hit | subsector_hit, ['Company Name', 'Stock Symbol']]
    .reset_index(drop=True)
)

# Persist the filtered list for the scraping step
df_pse_final.to_csv('filtered_pse_companies.csv', index=False)

---

In [12]:
# Load the company name / stock symbol list from filtered_pse_companies.csv
# (the file written from df_pse_final), so this part of the notebook can start
# from the saved CSV.
df_companies = pd.read_csv("filtered_pse_companies.csv")

In [13]:
class PSEEdgeScraper:
    """Scrape Balance Sheet / Income Statement figures from PSE Edge.

    Company identifiers are resolved over plain HTTP (``requests``); the
    financial report page is loaded through a headless Chrome driver and
    parsed with BeautifulSoup.

    The instance owns a Chrome process. Call :meth:`close` when finished,
    or use the scraper as a context manager::

        with PSEEdgeScraper() as scraper:
            df = scraper.scrape_all_companies(companies)
    """

    # Seconds to wait for any single HTTP response
    REQUEST_TIMEOUT = 30
    # Seconds to wait for the Balance Sheet table to show up in the browser
    PAGE_LOAD_TIMEOUT = 15
    # Number of directory pages inspected by the fallback symbol search
    DIRECTORY_MAX_PAGES = 9
    # Seconds to pause between companies
    REQUEST_DELAY = 2

    def __init__(self):
        """Create the HTTP session and start a headless Chrome driver."""
        self.base_url = "https://edge.pse.com.ph"
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        # Selenium setup
        options = Options()
        # Pass the headless switch as a browser argument: the old
        # ``options.headless = True`` setter is deprecated/removed in current
        # Selenium releases, where assigning it has no effect.
        options.add_argument("--headless=new")
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")
        self.driver = webdriver.Chrome(options=options)

    def close(self):
        """Quit the Chrome driver and close the HTTP session (safe to repeat)."""
        driver = getattr(self, 'driver', None)
        if driver is not None:
            try:
                driver.quit()
            finally:
                self.driver = None
        session = getattr(self, 'session', None)
        if session is not None:
            session.close()

    def __enter__(self):
        """Return the scraper itself for use in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Release browser/session resources; never suppresses exceptions."""
        self.close()
        return False

    def _lookup_via_autocomplete(self, symbol):
        """Return ``(cmpyId, securityId)`` from the autocomplete endpoint, or None."""
        search_url = f"{self.base_url}/autoComplete/searchCompanyNameSymbol.ax"
        response = self.session.get(
            search_url, params={'term': symbol}, timeout=self.REQUEST_TIMEOUT
        )
        if response.status_code != 200:
            return None
        for company in response.json():
            # The endpoint is a search; only an exact symbol match counts
            if company.get('symbol') == symbol:
                print(f"  Found {symbol}: company_id={company.get('cmpyId')}")
                return company.get('cmpyId'), company.get('securityId')
        return None

    def _lookup_via_directory(self, symbol):
        """Return ``(company_id, security_id)`` from the directory pages, or None."""
        directory_url = f"{self.base_url}/companyDirectory/search.ax"
        for page in range(1, self.DIRECTORY_MAX_PAGES + 1):
            data = {
                'pageNo': str(page),
                'keyword': symbol,
                'sector': 'ALL',
                'subsector': 'ALL'
            }
            response = self.session.post(
                directory_url, data=data, timeout=self.REQUEST_TIMEOUT
            )
            if response.status_code != 200:
                continue
            soup = BeautifulSoup(response.content, 'html.parser')
            # Links whose text is exactly the symbol carry the ids in onclick
            for link in soup.find_all('a', string=symbol):
                onclick = link.get('onclick', '')
                match = re.search(r"cmDetail\('(\d+)','(\d+)'\)", onclick)
                if match:
                    company_id = match.group(1)
                    security_id = match.group(2)
                    print(f"  Found {symbol} on page {page}: company_id={company_id}, security_id={security_id}")
                    return company_id, security_id
        return None

    def get_company_ids(self, symbol):
        """Get company_id and security_id with multiple search methods.

        Tries the autocomplete endpoint first and falls back to the company
        directory search. A failure of the first method no longer prevents
        the fallback from running.

        Args:
            symbol: Stock symbol to resolve (matched exactly).

        Returns:
            ``(company_id, security_id)``, or ``(None, None)`` when the symbol
            cannot be resolved or an error occurs. Never raises.
        """
        print(f"  Searching for {symbol}...")
        try:
            found = self._lookup_via_autocomplete(symbol)
            if found is not None:
                return found
        except Exception as e:
            # e.g. network error or a non-JSON body; still try the directory
            print(f"  Autocomplete lookup failed for {symbol}: {e}")

        try:
            print(f"  Trying directory search for {symbol}...")
            found = self._lookup_via_directory(symbol)
            if found is not None:
                return found
            print(f"  Could not find {symbol} in directory")
            return None, None
        except Exception as e:
            print(f"  Error getting company IDs for {symbol}: {e}")
            return None, None

    def get_financial_reports(self, company_id, symbol, company_name):
        """Get financial reports using Selenium.

        Loads the company's financial report page, waits for a table
        captioned ``Balance Sheet`` and parses every ``table.view`` whose
        caption mentions Balance Sheet or Income Statement.

        Args:
            company_id: PSE Edge company id used in the page URL.
            symbol: Stock symbol, stored in the result and used in messages.
            company_name: Company name, stored in the result.

        Returns:
            A dict with ``Symbol``, ``Company_Name``, ``Company_ID``,
            ``Scraped_Date`` plus one entry per parsed figure, or ``None`` when
            the table never loads, nothing could be extracted, or an error
            occurs. Never raises.
        """
        # Imported here so the class carries its own dependency; selenium is
        # already a requirement of this class.
        from selenium.common.exceptions import TimeoutException

        try:
            print(f"  Fetching financial reports for {symbol}...")
            url = f"{self.base_url}/companyPage/financial_reports_view.do?cmpy_id={company_id}"
            self.driver.get(url)

            # Wait for the first Balance Sheet table to appear
            try:
                WebDriverWait(self.driver, self.PAGE_LOAD_TIMEOUT).until(
                    EC.presence_of_element_located((By.XPATH, "//table[caption[text()='Balance Sheet']]"))
                )
            except TimeoutException:
                print(f"  Table did not load for {symbol}")
                return None

            soup = BeautifulSoup(self.driver.page_source, 'html.parser')

            financial_data = {
                'Symbol': symbol,
                'Company_Name': company_name,
                'Company_ID': company_id,
                'Scraped_Date': datetime.now().strftime('%Y-%m-%d')
            }
            metadata_keys = set(financial_data)

            # Parse tables
            # NOTE(review): every matching table is written under the same
            # prefix, so if the page holds several Balance Sheet tables later
            # ones overwrite earlier ones -- confirm against the page layout.
            for table in soup.find_all('table', class_='view'):
                caption = table.find('caption')
                if not caption:
                    continue
                if 'Balance Sheet' in caption.text:
                    financial_data.update(self.parse_financial_table(table, 'Annual_BS'))
                elif 'Income Statement' in caption.text:
                    financial_data.update(self.parse_financial_table(table, 'Annual_IS'))

            data_keys = [k for k in financial_data if k not in metadata_keys]
            if not data_keys:
                # Only now consult the "no data" notice: checking it before
                # parsing discarded reports whenever the notice appeared
                # anywhere on a page that also contained populated tables.
                no_data_msg = soup.find('p', class_='textCont')
                if no_data_msg and "will become available upon submission" in no_data_msg.text:
                    print(f"  No financial data available for {symbol}")
                else:
                    print(f"  No financial data extracted for {symbol}")
                return None

            print(f"  Successfully extracted {len(data_keys)} data points for {symbol}")
            return financial_data

        except Exception as e:
            print(f"  Error getting financial reports for {symbol}: {e}")
            return None

    @staticmethod
    def _parse_number(text):
        """Convert a displayed figure to float; ``None`` if it is not numeric.

        Thousands separators are removed and accounting-style negatives in
        parentheses, e.g. ``(1,234)``, become negative numbers.
        """
        cleaned = text.replace(",", "").strip()
        negative = len(cleaned) > 2 and cleaned.startswith("(") and cleaned.endswith(")")
        if negative:
            cleaned = cleaned[1:-1]
        try:
            value = float(cleaned)
        except ValueError:
            return None
        return -value if negative else value

    def parse_financial_table(self, table, prefix):
        """Parse Balance Sheet or Income Statement table into dictionary.

        The first row is treated as the header and skipped. For every later
        row with at least three cells, the first cell is the item name and the
        next two are the current and previous values.

        Args:
            table: Parsed table element exposing ``find_all``.
            prefix: Key prefix, e.g. ``'Annual_BS'``.

        Returns:
            Dict mapping ``{prefix}_{Item_Name}_Current`` / ``_Previous`` to a
            float (or ``None`` if the cell text is not numeric). Blank and
            ``'-'`` cells are omitted. On error the entries parsed so far are
            returned.
        """
        data = {}
        try:
            for row in table.find_all('tr')[1:]:
                cells = row.find_all(['th', 'td'])
                if len(cells) < 3:
                    continue
                item_name = cells[0].get_text(strip=True)
                current_val = cells[1].get_text(strip=True)
                previous_val = cells[2].get_text(strip=True)

                # Drop punctuation, then turn spaces into underscores
                col_name = re.sub(r"[^\w\s]", "", item_name).replace(" ", "_")

                if current_val and current_val != '-':
                    data[f"{prefix}_{col_name}_Current"] = self._parse_number(current_val)
                if previous_val and previous_val != '-':
                    data[f"{prefix}_{col_name}_Previous"] = self._parse_number(previous_val)
            return data
        except Exception as e:
            print(f"Error parsing table: {e}")
            return data

    def scrape_all_companies(self, df_companies):
        """Scrape every company listed in ``df_companies``.

        Args:
            df_companies: DataFrame with ``'Stock Symbol'`` and
                ``'Company Name'`` columns.

        Returns:
            DataFrame with one row per company for which data was extracted
            (empty when none succeeded).
        """
        all_financials = []
        for _, row in df_companies.iterrows():
            symbol = row['Stock Symbol']
            company_name = row['Company Name']
            print(f"Scraping {company_name} ({symbol})...")
            company_id, _security_id = self.get_company_ids(symbol)
            if company_id:
                financial_data = self.get_financial_reports(company_id, symbol, company_name)
                if financial_data:
                    all_financials.append(financial_data)
                    print(f"✓ Successfully scraped {symbol}")
                else:
                    print(f"✗ No financial data available for {symbol}")
            else:
                print(f"✗ Could not find company IDs for {symbol}")
            # Be polite to the server between companies
            time.sleep(self.REQUEST_DELAY)
        return pd.DataFrame(all_financials)

# --- Test the scraper ---
# Smoke test against four well-known symbols; results are written to
# pse_financial_data.csv when at least one company yields data.
if __name__ == "__main__":
    test_companies = pd.DataFrame({
        'Company Name': ['ACEN CORPORATION','Ayala Corporation','Manila Electric Company','BDO Unibank, Inc.'],
        'Stock Symbol': ['ACEN','AC','MER','BDO']
    })
    scraper = PSEEdgeScraper()
    try:
        df = scraper.scrape_all_companies(test_companies)
    finally:
        # Always shut the browser down, even if scraping raised; otherwise
        # each run leaves a Chrome process behind.
        scraper.driver.quit()
        scraper.session.close()
    if not df.empty:
        print(df.head())
        df.to_csv('pse_financial_data.csv', index=False)
        print("Saved to pse_financial_data.csv")

Scraping ACEN CORPORATION (ACEN)...
  Searching for ACEN...
  Found ACEN: company_id=233
  Fetching financial reports for ACEN...
  No financial data available for ACEN
✗ No financial data available for ACEN
Scraping Ayala Corporation (AC)...
  Searching for AC...
  Found AC: company_id=57
  Fetching financial reports for AC...
  No financial data available for AC
✗ No financial data available for AC
Scraping Manila Electric Company (MER)...
  Searching for MER...
  Found MER: company_id=118
  Fetching financial reports for MER...
  No financial data available for MER
✗ No financial data available for MER
Scraping BDO Unibank, Inc. (BDO)...
  Searching for BDO...
  Found BDO: company_id=260
  Fetching financial reports for BDO...
  No financial data available for BDO
✗ No financial data available for BDO
