In [2]:
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
from time import sleep
import numpy as np

In [None]:
def get_ids_on_page(url:str, timeout: float = 30):
  """
    Fetch one page of search results and return the vacancy ids listed on it.

    Parameters:
    url (str): Fully built URL of a single results page.
    timeout (float): Seconds to wait for the server. Without a timeout a
      stalled connection would block the notebook indefinitely; with it,
      requests raises a timeout error instead.

    Returns:
    list: The 'id' value of every entry under the 'items' key of the JSON
      reply, or an empty list when that key is missing or empty.
  """
  payload = requests.get(url, timeout=timeout).json()
  # A reply without 'items' is treated as "no more results", not as a failure,
  # so callers can use an empty list as their stop signal.
  items = payload.get('items')
  if not items:
    return []
  return [item['id'] for item in items]

def get_all_ids(text: str):
  """
    Collect the ids of all vacancies found for the given search text.

    Result pages of 100 entries are requested one after another, starting
    at page 0, until a page comes back with no ids.

    Parameters:
    text (str): Search phrase; the query restricts matching to the vacancy
      name (search_field=name).

    Returns:
    list: Vacancy ids in the order the pages returned them, exactly as
      get_ids_on_page yields them (no type conversion is applied here).
  """
  from urllib.parse import quote

  # Percent-encode the phrase so characters such as '&', '#', '+' or '%'
  # remain part of the search text instead of changing the query string.
  encoded_text = quote(str(text), safe='')

  def page_url(page: int) -> str:
    # Single place where the search URL is assembled.
    return f'https://api.hh.ru/vacancies?text={encoded_text}&search_field=name&per_page=100&page={page}'

  ids = []
  page = 0
  # An empty page is the stop signal.
  while page_ids := get_ids_on_page(page_url(page)):
    ids.extend(page_ids)
    page += 1
  return ids

def _vacancy_to_row(data: dict) -> list:
  """Flatten one vacancy JSON object into a row in the dataset's column order."""

  def optional(obj, key):
    # 'department' and 'salary' may be null in the payload.
    return obj[key] if obj is not None else None

  # Strip HTML tags from the vacancy description.
  description_cleaned = re.sub(r"<[^>]*>", '', data['description'])
  return [
    data['id'],
    data['name'],
    data['published_at'],
    data['alternate_url'],
    data['type']['name'],
    data['employer']['name'],
    optional(data['department'], 'name'),
    data['area']['name'],
    data['experience']['name'],
    [skill['name'] for skill in data['key_skills']],
    data['schedule']['name'],
    data['employment']['name'],
    description_cleaned,
    optional(data['salary'], 'from'),
    optional(data['salary'], 'to'),
    optional(data['salary'], 'currency'),
  ]


def get_dataset(ids: list, delay: float = 0.5, timeout: float = 30):
  """
    Download the details of each vacancy and assemble them into a DataFrame.

    One request is made per id. A vacancy whose request fails, whose reply
    is not valid JSON, or whose payload lacks an expected field is reported
    with print() and skipped, so a single bad record does not discard the
    rows that were already collected.

    Parameters:
    ids (list): Vacancy ids; each one is inserted into the vacancy URL.
    delay (float): Seconds to pause after every request, successful or not,
      to stay within the API request limit.
    timeout (float): Seconds to wait for each response before requests
      raises a timeout error (reported and skipped like any other failure).

    Returns:
    pandas.DataFrame: One row per successfully processed vacancy with the
      columns id, name, published_at, alternate_url, type, employer,
      department, area, experience, key_skills, schedule, employment,
      description, salary_from, salary_to, currency_salary. The frame has
      these columns and no rows when nothing could be processed.
  """
  columns = ['id', 'name', 'published_at', 'alternate_url', 'type', 'employer',
             'department', 'area', 'experience', 'key_skills', 'schedule',
             'employment', 'description', 'salary_from', 'salary_to', 'currency_salary']
  dataset = []

  for vacancy_id in tqdm(ids):
    url = f"https://api.hh.ru/vacancies/{vacancy_id}"
    try:
      response = requests.get(url, timeout=timeout)
      try:
        data = response.json()
      finally:
        response.close()
      dataset.append(_vacancy_to_row(data))
    except Exception as e:
      # Best effort: report the problem and continue with the next vacancy.
      print(f"Error processing vacancy ID {vacancy_id}: {e}")
    # Throttle after failures too; otherwise a run of errors would hit the
    # API with no pause at all.
    sleep(delay)

  return pd.DataFrame(dataset, columns=columns)

In [None]:
# First collect the vacancy ids for analysts, then for DS specialists.
id_list_analyst = get_all_ids('аналитик')

# Several DS search phrases are queried and their results merged. Different
# phrases can return the same vacancy (e.g. 'ML' and 'ML engineer'), so only
# the first occurrence of each id is kept; otherwise that vacancy would be
# downloaded and stored more than once.
ds_queries = ['Data Scientist', 'DS', 'Специалист по машинному обучению', 'ML engineer', 'ML']
ds_ids = [vacancy_id for query in ds_queries for vacancy_id in get_all_ids(query)]
# dict.fromkeys de-duplicates while preserving order; the result is kept as
# a NumPy array, the type this variable had before.
id_list_ds = np.array(list(dict.fromkeys(ds_ids)))

In [None]:
# Download the details of every analyst vacancy (one API request per id, so this cell is slow).
df_analyst = get_dataset(id_list_analyst)

  1%|          | 24/2000 [00:26<36:44,  1.12s/it]

In [None]:
# Download the details of every DS vacancy (one API request per id, so this cell is slow).
df_ds = get_dataset(id_list_ds)

In [None]:
# Preview the first rows of the DS vacancies table.
df_ds.head()

In [None]:
# Optional export, left disabled: uncomment to write both tables to CSV files.
#df_analyst.to_csv('analytics-data.csv', index=False)
#df_ds.to_csv('data-scientists.csv', index=False)