## Preparation

In [22]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

In [24]:
# Empty, column-only frame describing the shape of one scraped salary record.
df = pd.DataFrame(
	data=None,
	columns=[
		# who is paying
		'company', 'company_size',
		# what the role is
		'job_title', 'level', 'domain',
		# experience of the person reporting
		'yoe_total', 'yoe_at_company',
		# how much, and where
		'total_compensation', 'location',
	],
)

## Site: levels.fyi

In [17]:
def _parse_salary_row(row):
	"""Read the scraped fields out of one `<tr>` of the salary table.

	The row is expected to expose at least four `<td>` cells:
	cell 0 holds the location (nested `<span>` text, part before `'|'`),
	cell 1 the level (`<p>`) and domain (`<span>`), cell 2 the total years
	of experience (`<p>`) and years at the company (`<span>`), and cell 3
	the total compensation (`<p>`).

	Returns a dict with the keys `level`, `domain`, `yoe_total`,
	`yoe_at_company`, `total_compensation` and `location`; all values are
	the raw element text. Raises whatever Selenium raises for a missing
	element, or `IndexError` when the row has too few cells.
	"""
	cells = row.find_elements(by=By.TAG_NAME, value="td")

	location_wrapper = cells[0].find_element(by=By.TAG_NAME, value="span")
	location = location_wrapper.find_element(
		by=By.TAG_NAME, value="span"
	).text.split('|')[0]
	level = cells[1].find_element(by=By.TAG_NAME, value="p").text
	field = cells[1].find_element(by=By.TAG_NAME, value="span").text
	yoe_total = cells[2].find_element(by=By.TAG_NAME, value="p").text
	yoe_at_company = cells[2].find_element(by=By.TAG_NAME, value="span").text
	total_compensation = cells[3].find_element(by=By.TAG_NAME, value="p").text

	return {
		'level': level,
		'domain': field,
		'yoe_total': yoe_total,
		'yoe_at_company': yoe_at_company,
		'total_compensation': total_compensation,
		'location': location,
	}


def crawl_company(df: pd.DataFrame, company_id: str):
	"""Scrape every salary row listed for one company and append it to `df`.

	The company overview page and the salaries index are fetched with
	`requests`; each per-job-title salary page is then opened in a Chrome
	WebDriver session and its table rows are appended to `df` in place.

	Args:
		df: Frame that receives one new row per scraped salary entry. It is
			mutated in place and must carry the record columns (`company`,
			`company_size`, `job_title`, `level`, `domain`, `yoe_total`,
			`yoe_at_company`, `total_compensation`, `location`).
		company_id: Identifier interpolated verbatim into the levels.fyi URLs.

	Returns:
		None. Results are delivered through `df`.

	Raises:
		Exceptions from `requests.get` (including timeouts) and from starting
		the WebDriver are not caught here. Failures while scraping individual
		pages or rows are skipped on a best-effort basis.
	"""
	# Without a timeout a stalled connection would block the crawl forever.
	request_timeout = 30

	page = requests.get(
		f'https://www.levels.fyi/companies/{company_id}', timeout=request_timeout)
	soup = BeautifulSoup(page.content, 'html.parser')
	size_candidates = soup.find_all(
		'h6', class_='MuiTypography-root MuiTypography-subtitle1 css-idrr7q')
	# The second matching heading is taken as the company size. If the page
	# does not have it, record None instead of aborting the caller's crawl loop.
	if len(size_candidates) > 1:
		size = size_candidates[1].text
	else:
		size = None
		print(f'--> company size not found for {company_id}')

	page = requests.get(
		f'https://www.levels.fyi/companies/{company_id}/salaries', timeout=request_timeout)
	soup = BeautifulSoup(page.content, 'html.parser')
	job_titles_container = soup.find_all('h6', class_='MuiTypography-root MuiTypography-h6 css-jv9qtm')
	# Display titles become URL slugs: trimmed, lower-cased, spaces -> hyphens.
	job_title_ids = [
		job_title.text.strip().lower().replace(' ', '-')
		for job_title in job_titles_container
	]

	# Start the browser only once the static pages have been read, and always
	# shut it down again so a failure cannot leave a Chrome process behind.
	driver = webdriver.Chrome()
	try:
		for job_title_id in job_title_ids:
			# NOTE(review): this URL uses `/company/` while the requests above
			# use `/companies/` -- confirm that the difference is intended.
			driver.get(f'https://www.levels.fyi/company/{company_id}/salaries/{job_title_id}')

			print(f'--> crawling {company_id} {job_title_id}...')

			# Click two buttons in order if they are present; what they do is
			# not visible from this code (presumably they reveal or expand the
			# table -- TODO confirm). A failure on the first skips the second.
			try:
				for button_selector in (
					"button.MuiButton-root.MuiButton-text.MuiButton-textPrimary.MuiButton-sizeMedium.MuiButton-textSizeMedium.MuiButtonBase-root.css-um5318",
					"button.MuiButton-root.MuiButton-text.MuiButton-textPrimary.MuiButton-sizeMedium.MuiButton-textSizeMedium.MuiButtonBase-root.css-g9gvkf",
				):
					driver.find_element(by=By.CSS_SELECTOR, value=button_selector).click()
			except Exception:
				# `Exception` (not a bare except) keeps Ctrl-C / kernel
				# interrupt working during a long crawl.
				pass

			try:
				tbody = driver.find_element(
					by=By.CSS_SELECTOR, value="tbody.MuiTableBody-root.css-6l3v2e")
				rows = tbody.find_elements(by=By.TAG_NAME, value="tr")
			except Exception:
				print(f'--> no salary table found for {company_id} {job_title_id}')
				continue

			for row in rows:
				try:
					new_record = {
						'company': company_id,
						'company_size': size,
						'job_title': job_title_id,
						**_parse_salary_row(row),
					}

					print(new_record)

					df.loc[len(df)] = new_record
				except Exception:
					# Rows that do not match the expected layout are skipped.
					continue
	finally:
		# quit() ends the whole WebDriver session; close() only closes the
		# current window.
		driver.quit()

### Popular companies (which, in my view, are also the top-paying ones)

In [None]:

# Collect the company names shown as h6 headings on the company index page.
# The timeout keeps a stalled connection from blocking the cell forever.
page = requests.get('https://www.levels.fyi/companies', timeout=30)
soup = BeautifulSoup(page.content, 'html.parser')
popular_companies_container = soup.find_all(
    'h6', class_='MuiTypography-root MuiTypography-h6 css-1v6gvkr')
popular_companies = [company.text for company in popular_companies_container]
# A bare expression in the middle of a cell is not displayed, so print it.
print(popular_companies)

# crawl_company appends its rows to this frame in place.
popular_companies_df = pd.DataFrame(columns=[
	'company',
	'company_size',
	'job_title',
	'level',
	'domain',
	'yoe_total',
	'yoe_at_company',
	'total_compensation',
	'location'
])

for popular_company in popular_companies:
	# The headings hold display names, but crawl_company interpolates its
	# argument straight into URLs. Convert to the same slug form that the
	# hand-written ids (e.g. 'riot-games') and the job-title slugs use.
	# NOTE(review): assumes the site slug is the lower-cased, hyphenated
	# display name; names with punctuation may need a manual mapping.
	popular_company_id = popular_company.strip().lower().replace(' ', '-')
	crawl_company(popular_companies_df, popular_company_id)

popular_companies_df.to_csv('popular_companies.csv', index=False)

### We'd like to put the desired companies here

In [None]:
# Hand-picked companies, written as the ids that crawl_company puts into its URLs.
desired_companies = [
	'shopee', 'tiki', 'grab',
	'gojek-tech', 'kms-technology', 'riot-games',
]

# Column-only frame; crawl_company appends one row per scraped salary entry.
desired_companies_df = pd.DataFrame(
	columns=[
		'company', 'company_size',
		'job_title', 'level', 'domain',
		'yoe_total', 'yoe_at_company',
		'total_compensation', 'location',
	]
)

# Crawl the companies one after another, in list order.
for desired_company in desired_companies:
	crawl_company(
		desired_companies_df,
		desired_company,
	)

# Persist the collected rows without the index column.
desired_companies_df.to_csv(
	'desired_companies.csv',
	index=False,
)

In [32]:
# Stack both crawl results into one frame. Each input is numbered from 0, so
# renumber the rows; otherwise the combined frame carries duplicate index
# labels and label lookups such as df.loc[0] return several rows.
df = pd.concat([popular_companies_df, desired_companies_df], ignore_index=True)
df.to_csv('all_companies.csv', index=False)