In [None]:
#install selenium
pip install selenium

In [13]:
#import libraries
from bs4 import BeautifulSoup
import requests
import time
import re
import json
from selenium import webdriver
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from tqdm import tqdm

In [None]:
#setting up headless chrome webdriver
def make_driver():
    options=webdriver.ChromeOptions()
    options.add_argument("--headless")
    return webdriver.Chrome(options=options)

driver=make_driver()
url="https://www.ycombinator.com/companies"
#opens the specified URL's in the chrome webdriver
def get_url():
    driver.get(url)
#function to click the see all options button in batch
def click_see_all_options():
    time.sleep(5)
    see_all_options= driver.find_element(By.CSS_SELECTOR, 'a._showMoreLess_99gj3_241')
    see_all_options.click()
#function to filter checkboxes representing batches
def compile_batches():
    pattern=re.compile(r'^(|W|S|IK)[012]')
    filter_checkbox=driver.find_elements(By.XPATH,'//label')
    for checkbox in filter_checkbox:
        if pattern.match(checkbox.text):
            yield checkbox
#function to scroll down the webpage until it reaches the bottom
def scroll_to_bottom(driver):
    last_height=driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
        time.sleep(5)
        new_height=driver.execute_script("return document.body.scrollHeight")
        if new_height==last_height:
            break
        last_height=new_height
#function to extract company URL's from webpage        
def fetch_url_paths():
    elements = driver.find_elements(
        By.XPATH, ('//a[contains(@href,"/companies/") and not(contains(@href,"founders"))]'))
    for url in elements:
        yield url.get_attribute('href')
#function to write extracted url's to a file
def write_urls_to_file(url_list):
    with open('company_urls.txt', 'w') as f:
        for url in url_list:
            f.write(url + '\n')
#function to read url's from the file
def read_urls_from_file():
    url_list = [] 
    with open('company_urls.txt', 'r') as f:
        for line in f:
            url_list.append(line.strip())
    return url_list   
#main function to extract company url's and save company url's
def yc_links_extractor():
    try:
        get_url()
        click_see_all_options()
        batches=compile_batches()
        all_company_urls=[]
    
        for b in tqdm(list(batches)):
            b.click()
            scroll_to_bottom(driver)
            urls = fetch_url_paths()
            all_company_urls.extend(urls)
            b.click()
        write_urls_to_file(all_company_urls)
        print(f"Total company URLs extracted: {len(all_company_urls)}")
    except Exception as e:
        print(f"An error occurred: {str(e)}")
    finally:
        driver.quit()

if __name__=='__main__':
    yc_links_extractor()

In [15]:
#function to scrape company website and extract company details
def scrape_company_website(url):
    try:
        response=requests.get(url)   #send an ghttp get request to the url
        response.raise_for_status()  #raise an exception for unsuccessful response
    except requests.exceptions.RequestException as e:
        print(f"An error occurred while requesting {url}: {e}")  # prints error message if request fails
        return None
    #parse the html content using BeautifulSoup
    if response.status_code==200:
        soup=BeautifulSoup(response.content,'html.parser')
        #extract company name
        company_name=soup.find('h1',class_="font-extralight").text.strip()
        #extract tagline
        tagline=soup.find('h3',class_="sm:block md:hidden").text.strip()
        #extract description of the company
        description=[]
        description_tags=soup.find('p',class_="whitespace-pre-line")
        if description_tags:
            descriptions=description_tags.get_text(separator="",strip=True)
            description=descriptions.split('\n')[0]  #gets the first line
        else:
            print("description not found")
        #extracts batch, industry tags and company type
        batches= soup.find('a', href=lambda href: href and 'batch'in href)
        batch=batches.find('span').text.strip() if batches else None
        industry_tags = [a.text.strip() for a in soup.find_all('a', href=lambda href: href and 'industry' in href)]
        company_type = soup.find('div', class_='flex flex-row items-center justify-between').text.strip()
        #extract location,website link,founded year and team size
        location=soup.find('span',string="Location:")
        locations=location.find_next_sibling('span').text.strip()  (#text after "Location:" span)
        
        website_link=soup.find('div',class_="inline-block group-hover:underline")
        website=website_link.text.strip() if website_link else None

        founded=soup.find('span',string="Founded:")
        founded_year=founded.find_next_sibling('span').text.strip()   (#text after "Founded:" span)
        
        team_size=soup.find('span',string="Team Size:")
        team_size_value=team_size.find_next_sibling('span').text.strip()   (#text after "Team Size:" span)
        #extract social media profiles of company
        social_media=soup.find('div',class_="space-x-2")
        social_media_url={}
        if social_media:
            links=social_media.find_all('a')
            for link in links:
                if 'linkedin' in link['href']:
                    social_media_url['linkedin']=link['href']
                elif 'twitter' in link['href']:
                    social_media_url['twitter']=link['href']
                elif 'facebook' in link['href']:
                    social_media_url['facebook']=link['href']
                elif 'crunchbase' in link['href']:
                    social_media_url['crunchbase']=link['href']
        #extracts founder name
        founder_name=[]
        founder_tags=soup.find('h3',class_="text-lg font-bold")
        if founder_tags:
            tags=founder_tags.text.strip()
            name=tags.split(',',1)[0] if ',' in tags else tags.strip()
            founder_name.append(name)
        #extracts founder biography
        biography=[]
        biographies=soup.find('p',class_="prose max-w-full whitespace-pre-line")
        if biographies:
            biography=[bio.text.strip().split('\n')[:1] for bio in biographies]  #extracts first line from biography
        else:
            print("bio not found")
        #extracts social media profiles of founder   
        social_media_profile=soup.find('div',class_="mt-1 space-x-2")
        social_media_profile_data={}
        if social_media_profile:
            links=social_media_profile.find_all('a')
            for link in links:
                if 'linkedin' in link['href']:
                    social_media_profile_data['linkedin']=link['href']
                elif 'twitter' in link['href']:
                    social_media_profile_data['twitter']=link['href']
        #  create dictionaries for founder information and company data
        founders_info=[]
        founder_data = {
                    "Name":founder_name,
                    "Biography": biography,
                    "LinkedIn Profile": social_media_profile_data.get('linkedin'),
                    "Twitter Profile": social_media_profile_data.get('twitter')
                }
        founders_info.append(founder_data)
                    
        company_data={
           "company":{
              "Company Name": company_name,
              "Tagline": tagline,
              "Description":description,
              "batch":batch,
              "company type":company_type,
              "industry tags":industry_tags,
              "Location": locations,
              "website":website,
              "Founded": founded_year,
              "Team Size": team_size_value,
              "social media profiles":{
                    "LinkedIn": social_media_url.get('linkedin'),
                    "Twitter": social_media_url.get('twitter'),
                    "Facebook": social_media_url.get('facebook'),
                    "Crunchbase": social_media_url.get('crunchbase')
              }
           },
            "founders":founders_info
        
    }  
        return company_data  #return the dictionary containing all the scraped company data
#function to save data to json file   
def save_to_json(company_data,filename):
    with open(filename, 'a') as f:
        json.dump(company_data,f, indent=4)
        f.write('\n')
            
if __name__ == '__main__':
    all_company_urls=read_urls_from_file()  #read company urls from file
    total_urls = len(all_company_urls)
    with tqdm(total=total_urls,desc="scraping")as pbar: #progress bar for tracking completion
        for url in all_company_urls:
            company_data=scrape_company_website(url)
            save_to_json(company_data,"company_data.json")  #save all scraped data to json file
            pbar.update(1)
       
    #prints confirmation message        
    print("Scraping completed. Data saved to 'company_data.json'.")

SyntaxError: invalid syntax. Perhaps you forgot a comma? (201942027.py, line 33)