In [1]:
# Import the libraries and modules
import os
import time

import pandas as pd

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [None]:
# Listing URL prefix; the scraping loop appends the page number to it
BASE_URL = "https://www.cityu.edu.hk/hktech300/start-ups/all-start-ups?page="

# Chrome options
OPTIONS = Options()
OPTIONS.add_argument("--start-maximized")  # Added for visual monitoring

# Path to ChromeDriver. Set the CHROMEDRIVER_PATH environment variable to run the
# notebook on a machine where the driver lives elsewhere; when it is unset the
# original Homebrew location is used, so existing setups behave as before.
SERVICE = Service(os.environ.get("CHROMEDRIVER_PATH", "/opt/homebrew/bin/chromedriver"))

In [None]:
# Number used by the scraping loop to decide how many listing pages to request
total_pages = 42

# Accumulator for the scraped start-ups: one dict per company, appended by the
# listing scrape and later turned into a DataFrame
startup_data = list()

### Get Details - Company Name and CityU URL

In [None]:
# Walk the listing pages and record each start-up's name and CityU profile URL.
#
# Start from an empty list so the cell is idempotent: re-running it (for example
# after an interrupted scrape) must not append a second copy of every company.
startup_data.clear()

# Initialize WebDriver
driver = webdriver.Chrome(service=SERVICE, options=OPTIONS)

try:
    # NOTE(review): this requests pages 0..total_pages inclusive, i.e.
    # total_pages + 1 pages. Confirm the last page really exists; if it does
    # not, the only cost is one full wait timeout before it is skipped.
    for page in range(total_pages + 1):
        driver.get(f"{BASE_URL}{page}")

        try:
            # Block for up to 10 s until at least one start-up card is in the DOM
            WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".card.fund.team"))
            )
        except TimeoutException:
            # Only a genuine timeout means "no cards on this page". Anything else
            # (browser closed, Ctrl-C, ...) must propagate instead of being
            # reported as a timeout.
            print(f'"Timeout: Cards not loaded for Page {page}. Skipping..."')
            continue

        cards = driver.find_elements(By.CSS_SELECTOR, ".card.fund.team")

        for card in cards:
            try:
                links = card.find_elements(By.TAG_NAME, "a")
                if len(links) >= 2:
                    # The second <a> in the card is the one read for both the
                    # company name (its text) and the profile URL (its href)
                    company_name = links[1].text.strip()
                    company_url = links[1].get_attribute("href").strip()
                    startup_data.append({
                        "Company Name": company_name,
                        "CityU URL": company_url
                    })
                else:
                    print("Incomplete card: <a> tags missing")
            except Exception as e:
                # Best effort: one malformed card must not abort the whole page
                print(f"Error parsing card: {e}")

        time.sleep(1)  # Pause between page requests

finally:
    # Always close the browser, even if the scrape fails part-way through
    driver.quit()

In [None]:
# Create a DataFrame from the collected list of records. The columns are pinned
# explicitly so that an empty scrape still yields a frame (and an Excel sheet)
# with the "Company Name" / "CityU URL" columns that the later cells read.
df = pd.DataFrame(startup_data, columns=["Company Name", "CityU URL"])

In [6]:
# Persist the listing results; later cells read the workbook back via excel_file
excel_file = "hktech300_startups.xlsx"
df.to_excel(excel_file, index=False)  # index=False: do not write the row index
print(f"{df.shape[0]} startups saved to {excel_file}")

756 startups saved to hktech300_startups.xlsx


### Get Details - Company Website and Email

In [None]:
# Reload the listing that was saved to the workbook
df = pd.read_excel(excel_file)

# Make sure the detail columns exist; a column that is already present is left
# untouched, a missing one is created filled with empty strings
for extra_column in ("Company Website", "Email"):
    if extra_column not in df.columns:
        df[extra_column] = ""

In [8]:
def _first_link_href(browser, css_selector):
    """Return the stripped href of the first element matching css_selector.

    Returns "" when no element matches or when the matched element has no
    href, so callers can store the result directly as a blank cell.
    """
    try:
        href = browser.find_element(By.CSS_SELECTOR, css_selector).get_attribute("href")
    except NoSuchElementException:
        # The field is simply absent on this profile page
        return ""
    return (href or "").strip()


driver = webdriver.Chrome(service=SERVICE, options=OPTIONS)

try:
    # Visit each CityU URL and extract details
    for index, row in df.iterrows():
        try:
            driver.get(row["CityU URL"])
            time.sleep(1.5)  # Allow page to load

            # Extract Company Website
            df.at[index, "Company Website"] = _first_link_href(
                driver, ".field--name-field-client a"
            )

            # Extract Email: the link is a mailto: href, keep only the address
            email_href = _first_link_href(driver, ".field--name-field-team-members-email a")
            df.at[index, "Email"] = email_href.replace("mailto:", "").strip()

        except Exception as e:
            # Best effort per company: report the failure and move on
            print(f"Error for {row['Company Name']}: {e}")

        time.sleep(1)  # Pause between profile requests
finally:
    # Close the browser even if the loop is interrupted or fails
    driver.quit()

In [9]:
# Save updated data back to the same Excel file. excel_file is reused (instead of
# repeating the literal name) so this write always targets the workbook that the
# read cell loaded.
df.to_excel(excel_file, index=False)
print(f"Additional Data saved to {excel_file}")

Additional Data saved to hktech300_startups.xlsx
