In [3]:
from bs4 import BeautifulSoup
import requests
from typing import Dict, List
import re
import time
from urllib.parse import urljoin


In [7]:
class EAHNCompanyExtractor:
    """Scraper for the company directory published at https://eahn.obio.ca.

    The main listing page is parsed for company links and logos, then each
    company's own page is fetched and parsed for a title, description
    paragraphs, an external website link and bullet points.
    """

    def __init__(self):
        """Create a ``requests`` session with a browser-like User-Agent."""
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        self.session = requests.Session()
        self.session.headers.update(self.headers)
        self.base_url = "https://eahn.obio.ca"
        self.companies_url = f"{self.base_url}/companies/"

    def fetch_page(self, url: str, retries: int = 3, delay: int = 1) -> str:
        """Fetch the HTML body of ``url``, retrying on request errors.

        Args:
            url: Address to download.
            retries: Maximum number of attempts.
            delay: Seconds to sleep between failed attempts.

        Returns:
            The response text. An empty string is returned only when
            ``retries`` is less than 1 (no request is made in that case).

        Raises:
            Exception: When the final attempt fails; the underlying
                ``requests`` error is attached as ``__cause__``.
        """
        for attempt in range(retries):
            try:
                response = self.session.get(url, timeout=10)
                response.raise_for_status()
                return response.text
            except requests.RequestException as e:
                if attempt == retries - 1:
                    # Chain the original error so the traceback keeps the root cause.
                    raise Exception(f"Failed to fetch {url} after {retries} attempts: {str(e)}") from e
                print(f"Attempt {attempt + 1} failed, retrying...")
                time.sleep(delay)
        return ""

    @staticmethod
    def _name_from_url(url: str) -> str:
        """Build a display name from the last path segment of ``url`` ('' if none)."""
        # Strip "scheme://host" and any query string / fragment so only the path remains.
        path = re.sub(r'^[A-Za-z][A-Za-z0-9+.-]*://[^/?#]*', '', url)
        path = re.split(r'[?#]', path, maxsplit=1)[0]
        segments = [segment for segment in path.split('/') if segment]
        if not segments:
            return ''
        return segments[-1].replace('-', ' ').title()

    def extract_main_page_companies(self) -> List[Dict]:
        """Extract company entries from the main companies page.

        Returns:
            One dict per usable entry with the keys ``url`` and ``name``
            (always present) and ``logo_url`` (when the link wraps an image
            with a ``src``). Wrapper divs without a link target are skipped.
        """
        print(f"Fetching main company list from {self.companies_url}")
        html_content = self.fetch_page(self.companies_url)
        soup = BeautifulSoup(html_content, 'html.parser')
        companies = []

        # Each company is rendered inside a div carrying this grid class.
        company_divs = soup.find_all('div', class_='vc_col-sm-3')
        print(f"Found {len(company_divs)} company entries")

        for div in company_divs:
            company_link = div.find('a')
            if not company_link:
                continue

            href = (company_link.get('href') or '').strip()
            if not href:
                # An anchor without a target cannot be followed to a subpage.
                continue

            url = urljoin(self.base_url, href)
            # Always provide a name: downstream code uses it for progress and
            # error messages, so fall back to the URL when no slug is found.
            company_info = {'url': url, 'name': self._name_from_url(url) or url}

            logo_img = company_link.find('img')
            logo_src = logo_img.get('src', '') if logo_img else ''
            if logo_src:
                company_info['logo_url'] = urljoin(self.base_url, logo_src)

            companies.append(company_info)

        return companies

    def extract_subpage_info(self, html_content: str) -> Dict:
        """Extract detailed information from a company subpage.

        Args:
            html_content: Raw HTML of the company page.

        Returns:
            Dict with ``points`` (list of list-item texts) and ``description``
            (paragraph texts joined by newlines), plus ``title`` and
            ``website`` when they are found on the page.
        """
        soup = BeautifulSoup(html_content, 'html.parser')
        company_details = {}

        # The company heading is an <h2 class="blue">.
        title = soup.find('h2', class_='blue')
        if title:
            company_details['title'] = title.text.strip()

        content_sections = soup.find_all('div', class_='wpb_text_column')
        description_parts = []
        company_details['points'] = []

        for section in content_sections:
            for p in section.find_all('p'):
                text = p.text.strip()
                if not text:
                    continue
                description_parts.append(text)

                # Keep the last absolute http(s) link seen; a plain substring
                # test would also accept hrefs that merely contain "http".
                for link in p.find_all('a'):
                    href = (link.get('href') or '').strip()
                    if href.lower().startswith(('http://', 'https://')):
                        company_details['website'] = href

            # find_all is recursive, so iterating the <li> elements directly
            # visits every item exactly once even when lists are nested
            # (walking each <ul>/<ol> separately would duplicate inner items).
            for item in section.find_all('li'):
                point = item.text.strip()
                if point:
                    company_details['points'].append(point)

        company_details['description'] = '\n'.join(description_parts)
        return company_details

    def process_companies(self, limit: int = None) -> List[Dict]:
        """Process companies and fetch their subpage information.

        Args:
            limit (int, optional): Maximum number of companies to process.
                If None, process all. Zero or a negative value processes none.

        Returns:
            The company dicts from the main page, each updated in place with
            whatever its subpage yielded. Entries whose subpage could not be
            fetched or parsed are kept with the main-page fields only.
        """
        companies = self.extract_main_page_companies()
        if limit is not None:
            # "is not None" so that limit=0 really means "no companies".
            companies = companies[:max(limit, 0)]
        total_companies = len(companies)

        print(f"\nStarting to process {total_companies} companies...")

        for index, company in enumerate(companies, 1):
            # Resolved outside the try block so the error handler can never
            # raise a KeyError of its own.
            label = company.get('name') or company.get('url', '<unknown>')
            try:
                print(f"\nProcessing {label} ({index}/{total_companies})...")
                html_content = self.fetch_page(company['url'])
                subpage_info = self.extract_subpage_info(html_content)
                company.update(subpage_info)
                # Add small delay between requests
                time.sleep(1)
            except Exception as e:
                # Best effort: report the failure and continue with the next company.
                print(f"Error processing {label}: {str(e)}")

        return companies

def export_to_csv(companies: List[Dict], output_file: str = 'eahn_companies.csv'):
    """Write the scraped company records to a CSV file.

    Args:
        companies: Company dicts; missing keys are written as empty cells and
            keys outside the fixed column set are ignored.
        output_file: Destination path; the file is overwritten.

    The ``points`` list of each company is flattened into a single cell with
    ``;`` as the separator.
    """
    import csv

    # Columns copied verbatim from each record, in output order.
    scalar_columns = ('name', 'url', 'logo_url', 'title', 'description', 'website')
    fieldnames = [*scalar_columns, 'points']

    # utf-8-sig prepends a BOM so that Excel recognises the encoding.
    with open(output_file, 'w', newline='', encoding='utf-8-sig') as handle:
        writer = csv.DictWriter(handle, fieldnames=fieldnames)
        writer.writeheader()

        for record in companies:
            row = {column: record.get(column, '') for column in scalar_columns}
            row['points'] = ';'.join(record.get('points', []))
            writer.writerow(row)

In [9]:
def main(limit: int = 54):
    """Scrape the EAHN company directory and export it to ``eahn_companies.csv``.

    Args:
        limit: Maximum number of companies to process; ``None`` processes
            every company found. Defaults to 54, the value that was
            previously hard-coded in the call.

    Any exception is reported on stdout instead of being propagated, so a
    failed run does not leave a traceback in the notebook output.
    """
    try:
        # Derive the banner from the actual limit so the log cannot drift
        # from what the run really does.
        scope = "all companies" if limit is None else f"up to {limit} companies"
        print(f"Starting EAHN company information extraction ({scope})...")
        extractor = EAHNCompanyExtractor()
        companies = extractor.process_companies(limit=limit)

        # Export to CSV
        output_file = 'eahn_companies.csv'
        export_to_csv(companies, output_file)
        print(f"\nSuccessfully exported {len(companies)} companies to {output_file}")

    except Exception as e:
        print(f"An error occurred: {str(e)}")

if __name__ == "__main__":
    main()

Starting EAHN company information extraction (test run with 10 companies)...
Fetching main company list from https://eahn.obio.ca/companies/


Found 110 company entries

Starting to process 54 companies...

Processing Alethea (1/54)...

Processing Bodiometer (2/54)...

Processing Nanotess (3/54)...

Processing Rostrum Medical Innovations (4/54)...

Processing Tochtech Technologies (5/54)...

Processing Zamplo (6/54)...

Processing A4I (7/54)...

Processing Able Innovations (8/54)...

Processing Ai Vali (9/54)...

Processing Amacathera (10/54)...

Processing Awake Labs (11/54)...

Processing Braze Mobility (12/54)...

Processing Cosm Medical (13/54)...

Processing Eapoc Evidence At The Point Of Care (14/54)...

Processing Enhanced Medical Nutrition (15/54)...

Processing Flosonics Medical (16/54)...

Processing Fluidai Medical (17/54)...

Processing Focal Healthcare (18/54)...

Processing Frontline (19/54)...

Processing Goji Technology Systems (20/54)...

Processing Gotcare (21/54)...

Processing Huron Digital Pathology (22/54)...

Processing Hyivy Health (23/54)...

Processing Hypercare (24/54)...

Processing Inventorr Md (2