In [16]:
import time
import pandas as pd
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager   # ✅ NEW

In [17]:
companies = pd.read_csv("filtered_pse_companies.csv")
companies

Unnamed: 0,Company Name,Stock Symbol
0,Asia Amalgamated Holdings Corporation,AAA
1,"AbaCore Capital Holdings, Inc.",ABA
2,Asiabest Group International Inc.,ABG
3,Ayala Corporation,AC
4,ACEN CORPORATION,ACEN
...,...,...
104,Vitarich Corporation,VITA
105,"Victorias Milling Company, Inc.",VMC
106,Vivant Corporation,VVT
107,"Wellex Industries, Incorporated",WIN


In [18]:
# Prepare target companies
company_targets = set(companies["Company Name"].str.strip().str.upper())
results = []

# Set up Chrome WebDriver
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
wait = WebDriverWait(driver, 10)

# Open PSE company directory
driver.get("https://edge.pse.com.ph/companyDirectory/form.do")

# Determine total pages
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(1)
pagination_elems = driver.find_elements(By.XPATH, "//div[@class='paging']//a | //div[@class='paging']//span")

page_numbers = [int(elem.text.strip()) for elem in pagination_elems if elem.text.strip().isdigit()]
if not page_numbers:
    raise Exception("Pagination numbers not detected — check the page structure or XPath.")
total_pages = max(page_numbers)
print(f"Total pages detected: {total_pages}")

# Loop through pages
for page_number in range(1, total_pages + 1):
    print(f"Processing page {page_number} of {total_pages}")

    rows = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//table[@class='list']//tbody//tr")))

    for row in rows:
        try:
            company_name_elem = row.find_element(By.XPATH, "./td[1]/a")
            company_name = company_name_elem.text.strip().upper()

            if company_name in company_targets:
                onclick_attr = company_name_elem.get_attribute("onclick")
                cmpy_id = onclick_attr.split("'")[1]
                financial_url = f"https://edge.pse.com.ph/companyPage/financial_reports_view.do?cmpy_id={cmpy_id}"

                print(f"Match found: {company_name} → cmpy_id={cmpy_id}")
                results.append({
                    "Company Name": company_name,
                    "Company ID": cmpy_id,
                    "Financial URL": financial_url
                })
        except:
            continue

    # Navigate to next page
    if page_number < total_pages:
        next_page = wait.until(
            EC.presence_of_element_located((By.XPATH, f"//div[@class='paging']/span/a[text()='{page_number + 1}']"))
        )
        driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_page)
        driver.execute_script("arguments[0].click();", next_page)
        time.sleep(1.2)

driver.quit()

# Save results to Excel
pd.DataFrame(results).to_excel("company_urls.xlsx", index=False)

Total pages detected: 6
Processing page 1 of 6
Match found: ASIA AMALGAMATED HOLDINGS CORPORATION → cmpy_id=55
Match found: ABACORE CAPITAL HOLDINGS, INC. → cmpy_id=174
Match found: ASIABEST GROUP INTERNATIONAL INC. → cmpy_id=176
Match found: AYALA CORPORATION → cmpy_id=57
Match found: ACEN CORPORATION → cmpy_id=233
Match found: ALSONS CONSOLIDATED RESOURCES, INC. → cmpy_id=121
Match found: ABOITIZ EQUITY VENTURES, INC. → cmpy_id=16
Match found: ALLIANCE GLOBAL GROUP, INC. → cmpy_id=212
Match found: ALTERNERGY HOLDINGS CORPORATION → cmpy_id=701
Match found: AGRINURTURE, INC. → cmpy_id=619
Match found: A. SORIANO CORPORATION → cmpy_id=14
Match found: ABOITIZ POWER CORPORATION → cmpy_id=609
Match found: ANGLO PHILIPPINE HOLDINGS CORPORATION → cmpy_id=52
Match found: RASLAG CORP. → cmpy_id=694
Match found: ATN HOLDINGS, INC. → cmpy_id=56
Match found: AXELUM RESOURCES CORP. → cmpy_id=673
Match found: BHI HOLDINGS, INC. → cmpy_id=62
Match found: BOGO-MEDELLIN MILLING COMPANY, INC. → cmpy_id

In [None]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

# Load company URLs
df = pd.read_excel("company_urls.xlsx")  # columns: Company Name, Company ID, Financial URL
results = []

# Set up Selenium
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
wait = WebDriverWait(driver, 10)

# Scrape annual financial reports
for _, row in df.iterrows():
    company_name = row["Company Name"]
    url = row["Financial URL"]
    print(f"Processing: {company_name}")
    driver.get(url)

    try:
        wait.until(EC.presence_of_element_located((By.XPATH, "//table[@class='view']")))
        annual_header = driver.find_element(By.XPATH, "//h3[text()='Annual']")
    except:
        print(f"No Annual data for {company_name}")
        continue

    # Extract fiscal period and currency
    try:
        header_text = annual_header.find_element(By.XPATH, "following-sibling::p[1]").text.replace("\n", " ")
        fiscal_period = header_text.split("ended :")[1].split("Currency")[0].strip()
        currency = header_text.split("Currency")[1].replace("(and units, if applicable)", "").replace(":", "").strip()
    except:
        fiscal_period = currency = "N/A"

    # Scrape tables between Annual header and next section
    tables = annual_header.find_elements(By.XPATH, "following-sibling::table[preceding-sibling::h3[1][text()='Annual']]")
    for tbl in tables:
        caption = tbl.find_element(By.TAG_NAME, "caption").text.strip()
        rows = tbl.find_elements(By.XPATH, ".//tbody/tr")[1:]  # skip header

        for r in rows:
            th = r.find_elements(By.TAG_NAME, "th")
            cols = r.find_elements(By.TAG_NAME, "td")
            if not th or len(cols) != 2:
                continue

            results.append({
                "Company Name": company_name,
                "Report Type": "Annual",
                "Statement Type": caption,
                "Fiscal Period": fiscal_period,
                "Currency": currency,
                "Item": th[0].text.strip(),
                "Current Year": cols[0].text.strip(),
                "Previous Year": cols[1].text.strip()
            })

driver.quit()

# Save results
pd.DataFrame(results).to_excel("Annual Financial Reports.xlsx", index=False)

Processing: ASIA AMALGAMATED HOLDINGS CORPORATION
Processing: ABACORE CAPITAL HOLDINGS, INC.
Processing: ASIABEST GROUP INTERNATIONAL INC.
Processing: AYALA CORPORATION
Processing: ACEN CORPORATION
Processing: ALSONS CONSOLIDATED RESOURCES, INC.
Processing: ABOITIZ EQUITY VENTURES, INC.
Processing: ALLIANCE GLOBAL GROUP, INC.
Processing: ALTERNERGY HOLDINGS CORPORATION
Processing: AGRINURTURE, INC.
Processing: A. SORIANO CORPORATION
Processing: ABOITIZ POWER CORPORATION
Processing: ANGLO PHILIPPINE HOLDINGS CORPORATION
Processing: RASLAG CORP.
Processing: ATN HOLDINGS, INC.
Processing: AXELUM RESOURCES CORP.
Processing: BHI HOLDINGS, INC.
Processing: BOGO-MEDELLIN MILLING COMPANY, INC.
Processing: BASIC ENERGY CORPORATION
Processing: CONCRETE AGGREGATES CORPORATION
Processing: CENTRAL AZUCARERA DE TARLAC, INC.
Processing: CONCREAT HOLDINGS PHILIPPINES, INC.
Processing: CONCEPCION INDUSTRIAL CORPORATION
Processing: CENTURY PACIFIC FOOD, INC.
Processing: COSCO CAPITAL, INC.
Processing: C

In [None]:
# Load company URLs
df = pd.read_excel("company_urls.xlsx")  # columns: Company Name, Company ID, Financial URL
results = []

# Set up Selenium
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
wait = WebDriverWait(driver, 10)

# Scrape quarterly financial reports
for _, row in df.iterrows():
    company_name = row["Company Name"]
    url = row["Financial URL"]
    print(f"Processing: {company_name}")
    driver.get(url)

    try:
        wait.until(EC.presence_of_element_located((By.XPATH, "//table[@class='view']")))
        quarterly_header = driver.find_element(By.XPATH, "//h3[text()='Quarterly']")
    except:
        print(f"No Quarterly data for {company_name}")
        continue

    # Extract fiscal period and currency
    try:
        header_text = quarterly_header.find_element(By.XPATH, "following-sibling::p[1]").text.replace("\n", " ")
        fiscal_period = header_text.split("ended :")[1].split("Currency")[0].strip()
        currency = header_text.split("Currency")[1].replace("(and units, if applicable)", "").replace(":", "").strip()
    except:
        fiscal_period = currency = "N/A"

    # Scrape tables following the Quarterly header
    tables = quarterly_header.find_elements(By.XPATH, "following-sibling::table[preceding-sibling::h3[1][text()='Quarterly']]")

    for tbl in tables:
        caption = tbl.find_element(By.TAG_NAME, "caption").text.strip()
        rows = tbl.find_elements(By.XPATH, ".//tbody/tr")[1:]  # skip header row

        for r in rows:
            th = r.find_elements(By.TAG_NAME, "th")
            cols = r.find_elements(By.TAG_NAME, "td")
            if not th or len(cols) not in [2, 4]:
                continue

            item = th[0].text.strip() or "N/A"
            col_texts = [c.text.strip() or "N/A" for c in cols]

            # 4-column Quarterly Income Statement
            if "income" in caption.lower() and len(col_texts) == 4:
                results.append({
                    "Company Name": company_name,
                    "Report Type": "Quarterly",
                    "Statement Type": caption,
                    "Fiscal Period": fiscal_period,
                    "Currency": currency,
                    "Item": item,
                    "Current 3 Months": col_texts[0],
                    "Previous 3 Months": col_texts[1],
                    "Current YTD": col_texts[2],
                    "Previous YTD": col_texts[3]
                })
            # 2-column Quarterly tables (e.g., balance sheet)
            elif len(col_texts) == 2:
                results.append({
                    "Company Name": company_name,
                    "Report Type": "Quarterly",
                    "Statement Type": caption,
                    "Fiscal Period": fiscal_period,
                    "Currency": currency,
                    "Item": item,
                    "Current Period": col_texts[0],
                    "Previous Period": col_texts[1]
                })

driver.quit()

# Save results
pd.DataFrame(results).to_excel("Quarterly Financial Reports.xlsx", index=False)