In [1]:
import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm
import json

In [2]:
base_url = "https://www.google.com/about/careers/applications/jobs/results"  

In [3]:
def get_vacancies(level: str, pages: int, tqdm_: bool=False) -> list:
  vacancies = []
  
  bar = tqdm(range(1, pages + 1), desc=f'Vacancies for {level}') if tqdm_ else range(1, pages + 1)
  for _ in bar:  
    data = requests.get(base_url, params={'page': 1, 'target_level': level}) 
    
    if data.status_code != 200:
      print(f"Error: {data.status_code} - {data.reason}") 
      continue
    
    soup = BeautifulSoup(data.text, 'html.parser')
    for i in soup.find('main').find_all('ul')[0]: 
      vacancies.append({
        'title': i.h3.text,
        'requirements': [t.text for t in i.find_all('li')][:-2], 
        'level': level,
        'place': [t.text for t in i.find_all('span')][3], 
      })
      vacancies[-1]['country'] = vacancies[-1]['place'].split(',')[-1] 
     
  return vacancies

In [4]:
all_vacancies = []  

for level in ['INTERN_AND_APPRENTICE', 'EARLY', 'MID', 'ADVANCED', 'DIRECTOR_PLUS']:
  all_vacancies += get_vacancies(level, 80, True)

Vacancies for INTERN_AND_APPRENTICE:   0%|          | 0/80 [00:00<?, ?it/s]

Vacancies for EARLY:   0%|          | 0/80 [00:00<?, ?it/s]

Vacancies for MID:   0%|          | 0/80 [00:00<?, ?it/s]

Vacancies for ADVANCED:   0%|          | 0/80 [00:00<?, ?it/s]

Vacancies for DIRECTOR_PLUS:   0%|          | 0/80 [00:00<?, ?it/s]

In [5]:
len(all_vacancies)

8000

In [6]:
json.dump(all_vacancies, open('google_vacancies.json', 'w'), indent=4, ensure_ascii=False)