## Preparation

In [23]:
import concurrent.futures
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

## Site: levels.fyi

In [22]:
def _parse_salary_row(row, company_id: str, job_title_id: str, size: str) -> dict:
    """Turn one salary-table ``<tr>`` element into a flat record dict.

    Raises whatever the element lookups or list indexing raise when the row
    does not have the expected cell layout; the caller treats that as
    "skip this row".
    """
    cells = row.find_elements(by=By.TAG_NAME, value="td")

    # Cell 0: nested <span><span> whose text is "<location> | <something>";
    # only the part before the first '|' is kept.
    location = (
        cells[0]
        .find_element(by=By.TAG_NAME, value="span")
        .find_element(by=By.TAG_NAME, value="span")
        .text.split('|')[0]
        .strip()
    )
    # Cells 1-3 each carry a primary value in <p> and a secondary one in <span>.
    level = cells[1].find_element(by=By.TAG_NAME, value="p").text
    field = cells[1].find_element(by=By.TAG_NAME, value="span").text
    yoe_total = cells[2].find_element(by=By.TAG_NAME, value="p").text
    yoe_at_company = cells[2].find_element(by=By.TAG_NAME, value="span").text
    total_compensation = cells[3].find_element(by=By.TAG_NAME, value="p").text
    # The compensation <span> is split on '|' into base, stock and bonus;
    # fewer than three parts raises IndexError and the row is skipped.
    compensation_details = cells[3].find_element(by=By.TAG_NAME, value="span").text.split('|')

    return {
        'company': company_id.replace('-', ' ').title(),
        'company_size': size,
        'job_title': job_title_id.replace('-', ' ').title(),
        'level': level,
        'domain': field,
        'yoe_total': yoe_total,
        'yoe_at_company': yoe_at_company,
        'base': compensation_details[0].strip(),
        'stock': compensation_details[1].strip(),
        'bonus': compensation_details[2].strip(),
        'total_compensation': total_compensation,
        'location': location,
    }


def scrape_job_data(company_id: str, job_title_id: str, size: str):
    """Scrape the salary table for one company/job-title pair on levels.fyi.

    Opens ``https://www.levels.fyi/company/{company_id}/salaries/{job_title_id}``
    in a fresh Chrome WebDriver, reads the rows of the second
    ``tbody.MuiTableBody-root`` on the page and returns them as a DataFrame.

    Args:
        company_id: Company slug used in the URL (e.g. ``'jane-street'``).
        job_title_id: Job-title slug used in the URL (e.g. ``'software-engineer'``).
        size: Value copied verbatim into the ``company_size`` column.

    Returns:
        A ``pandas.DataFrame`` with one row per successfully parsed table row
        (all values are strings as shown on the page). The frame is empty when
        the table is missing or no row could be parsed.

    Raises:
        Any exception raised while starting Chrome or loading the page. The
        browser session is always shut down before the exception propagates.
    """
    url = f'https://www.levels.fyi/company/{company_id}/salaries/{job_title_id}'
    driver = webdriver.Chrome()
    records = []

    try:
        driver.get(url)
        print(f'--> crawling {company_id} {job_title_id}...')

        # Best effort: click two buttons if they are present, in this order.
        # NOTE(review): presumably these dismiss a prompt / expand the table;
        # the selectors use generated CSS class names and may go stale.
        button_selectors = (
            "button.MuiButton-root.MuiButton-text.MuiButton-textPrimary.MuiButton-sizeMedium.MuiButton-textSizeMedium.MuiButtonBase-root.css-um5318",
            "button.MuiButton-root.MuiButton-text.MuiButton-textPrimary.MuiButton-sizeMedium.MuiButton-textSizeMedium.MuiButtonBase-root.css-g9gvkf",
        )
        try:
            for selector in button_selectors:
                driver.find_element(by=By.CSS_SELECTOR, value=selector).click()
        except Exception:
            # Missing buttons are expected on some pages; carry on without them.
            pass

        try:
            # The salary rows live in the second matching <tbody>.
            tbody = driver.find_elements(by=By.CSS_SELECTOR, value="tbody.MuiTableBody-root")[1]
            rows = tbody.find_elements(by=By.TAG_NAME, value="tr")
        except Exception as exc:
            print(f'--> no salary table for {company_id} {job_title_id}: {exc!r}')
            rows = []

        skipped = 0
        for row in rows:
            try:
                records.append(_parse_salary_row(row, company_id, job_title_id, size))
            except Exception:
                # Rows that do not match the expected layout are skipped.
                skipped += 1
        if skipped:
            print(f'--> skipped {skipped} unparseable row(s) for {company_id} {job_title_id}')
    finally:
        # quit() ends the whole WebDriver session (close() only closes the
        # current window), and running it in finally avoids leaking browsers
        # when the page load or scraping raises.
        driver.quit()

    return pd.DataFrame(records)


def crawl_company(company_id: str):
    """Crawl every job title listed for a company and merge the salary rows.

    Fetches ``/companies/{company_id}`` to read the company size, then
    ``/companies/{company_id}/salaries`` to discover job titles, and finally
    runs ``scrape_job_data`` for each title in a small thread pool.

    Args:
        company_id: Company slug as used in levels.fyi URLs.

    Returns:
        A ``pandas.DataFrame`` with the concatenated rows of every job title
        that was scraped successfully. Empty when no job title was found or
        every scrape failed.

    Raises:
        IndexError: The company page does not contain the expected size
            heading (typically because the slug does not exist).
        requests.RequestException: A page could not be fetched in time.
    """
    # Without a timeout, requests can block forever on a stalled connection.
    request_timeout = 30

    page = requests.get(
        f'https://www.levels.fyi/companies/{company_id}', timeout=request_timeout)
    soup = BeautifulSoup(page.content, 'html.parser')
    size_headings = soup.find_all(
        'h6', class_='MuiTypography-root MuiTypography-subtitle1 css-idrr7q')
    # The second matching heading is used as the company size.
    if len(size_headings) < 2:
        raise IndexError(
            f"could not find the company size heading for '{company_id}'; "
            "check that the company page exists")
    size = size_headings[1].text

    page = requests.get(
        f'https://www.levels.fyi/companies/{company_id}/salaries', timeout=request_timeout)
    soup = BeautifulSoup(page.content, 'html.parser')
    job_titles_container = soup.find_all(
        'h6', class_='MuiTypography-root MuiTypography-h6 css-jv9qtm')
    # "Software Engineer" -> "software-engineer" (URL slug form).
    job_title_ids = [
        job_title.text.strip().lower().replace(' ', '-')
        for job_title in job_titles_container
    ]

    # Scrape each job title in parallel. Leaving the `with` block waits for
    # every submitted job to finish, so no explicit wait() is needed.
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        futures = [
            executor.submit(scrape_job_data, company_id, job_title_id, size)
            for job_title_id in job_title_ids
        ]

    # Collect results in submission order; a failed job is reported and skipped.
    scraped_dfs = []
    for future in futures:
        try:
            scraped_dfs.append(future.result())
        except Exception as e:
            print(f"Error retrieving result from thread: {e}")

    # pd.concat raises ValueError on an empty list, so handle "nothing scraped".
    if not scraped_dfs:
        return pd.DataFrame()

    return pd.concat(scraped_dfs, ignore_index=True)

### Popular companies (which, in my view, are also the top-paying ones)

In [12]:

# Download the levels.fyi company index and collect the text of every h6
# heading that carries the CSS class below.
COMPANIES_INDEX_URL = 'https://www.levels.fyi/companies'
POPULAR_COMPANY_HEADING_CLASS = 'MuiTypography-root MuiTypography-h6 css-1v6gvkr'

page = requests.get(COMPANIES_INDEX_URL)
soup = BeautifulSoup(page.content, 'html.parser')
popular_companies_container = soup.find_all('h6', class_=POPULAR_COMPANY_HEADING_CLASS)
popular_companies = [heading.text for heading in popular_companies_container]

# Bare expression so the notebook renders the list as the cell output.
popular_companies

['Google',
 'Amazon',
 'Apple',
 'Lyft',
 'Facebook',
 'Microsoft',
 'Uber',
 'Stripe',
 'Roblox',
 'Coinbase',
 'Databricks',
 'Snap',
 'Netflix',
 'LinkedIn',
 'Salesforce',
 'Hudson River Trading',
 'Jane Street',
 'Citadel',
 'Two Sigma',
 'JPMorgan Chase',
 'Capital One',
 'Oracle',
 'Bytedance',
 'Intel']

### List the companies we want to crawl here

Crawled companies:
- Logitech
- Microsoft
- Netflix
- Uber
- Visa
- Google
- Apple
- JPMorgan Chase
- Shopee
- Tiki
- Grab
- Gojek Tech
- Riot Games
- LinkedIn
- Intel
- MongoDB
- Roblox
- Oracle
- Stripe
- Facebook
- Amazon
- Snap
- Axon
- AMD

In [None]:
# Make sure each company URL exists first: https://www.levels.fyi/company/{company-name}
desired_companies = ['company-name']

# Crawl every company, then concatenate once. Growing a DataFrame with
# pd.concat inside the loop re-copies all previously collected rows each time.
desired_company_dfs = []
for desired_company in desired_companies:
    desired_company_dfs.append(crawl_company(desired_company))

# pd.concat raises ValueError on an empty list, so fall back to an empty frame.
if desired_company_dfs:
    desired_companies_df = pd.concat(desired_company_dfs, ignore_index=True)
else:
    desired_companies_df = pd.DataFrame()

# DataFrame.to_csv does not create missing parent directories.
output_path = Path('./data/desired_companies.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
desired_companies_df.to_csv(output_path, index=False)

desired_companies_df

### Observation