In [36]:
from bs4 import BeautifulSoup as bs

# Библиотека для работы с HTTP-запросами
import requests

# Модуль Re для регулярных выражений в Python
import re

# Пакет для работы с данными в формате json
import json

# Библиотека для анализа данных, представляющая данные в табличном виде называемом DataFrame
import pandas as pd

In [37]:
# Browser-like request headers passed to every requests.get call in the
# site scrapers below.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.135 YaBrowser/21.6.3.757 '
        'Yowser/2.5 Safari/537.36'
    ),
}

In [47]:
def _parser_hh(vacancy, cnt_page):

    vacancy_date = []
    params = {
        'text': vacancy, \
        'search_field': 'name', \
        'items_on_page': '100', \
        'page': ''
    }
    
    link = 'https://hh.ru/search/vacancy'
       
    html = requests.get(link, params=params, headers=headers)
    
    if html.ok:
        parsed_html = bs(html.text,'html.parser')
        
    for page in range(cnt_page):
        params['page'] = page
        html = requests.get(link, params=params, headers=headers)
        
        if html.ok:
            parsed_html = bs(html.text,'html.parser')
            
            vacancy_items = parsed_html.find('div', {'data-qa': 'vacancy-serp__results'}).find_all('div', {'class': 'vacancy-serp-item'})
                
            for element in vacancy_items:
                vacancy_date.append(hh_get_vacancy_info(element))
                
    return vacancy_date

def _parse_hh_salary(text):
    """Split an hh.ru salary label into ``(salary_min, salary_max, currency)``."""
    # U+202F sits between thousands groups inside a number, so drop it before
    # tokenizing; whitespace, hyphens and dashes only separate tokens.
    cleaned = text.replace('\u202f', '')
    tokens = [tok for tok in re.split(r'[\s\-\u2013\u2014]+', cleaned) if tok]
    amount_positions = [i for i, tok in enumerate(tokens) if tok.isdecimal()]
    if not amount_positions:
        return None, None, None

    amounts = [int(tokens[i]) for i in amount_positions]
    # The currency is the token right after the last amount, when present.
    after_last = amount_positions[-1] + 1
    currency = tokens[after_last] if after_last < len(tokens) else None

    if len(amounts) > 1:
        return amounts[0], amounts[1], currency
    if tokens[0] == 'до':
        return None, amounts[0], currency
    # A single amount without the 'до' prefix is treated as a lower bound
    # (this covers the 'от' prefix).
    return amounts[0], None, currency


def hh_get_vacancy_info(element):
    """Build one vacancy record from an hh.ru search result card.

    Args:
        element: Parsed card node exposing the BeautifulSoup-style ``find``
            method. The title link is required; every other piece is optional.

    Returns:
        dict: Keys ``vacancy_name``, ``company_name``, ``city``,
        ``metro_station``, ``salary_min``, ``salary_max``, ``salary_currency``,
        ``vacancy_link`` and ``site``. Optional pieces that are absent from
        the card are stored as ``None``.

    Raises:
        AttributeError: If the card has no title link or the link has no href.
    """
    vacancy_date = {}

    # The title anchor carries both the vacancy name and its link.
    title_link = element.find('a', {'data-qa': 'vacancy-serp__vacancy-title'})
    vacancy_date['vacancy_name'] = title_link.getText().replace('\xa0', ' ')

    # company_name
    company_block = element.find('div', {'class': 'vacancy-serp-item__meta-info-company'})
    company_link = company_block.find('a') if company_block is not None else None
    vacancy_date['company_name'] = company_link.getText() if company_link is not None else None

    # city: the address text is "<city>, <rest>"; keep the first part only
    address = element.find('span', {'data-qa': 'vacancy-serp__vacancy-address'})
    vacancy_date['city'] = address.getText().split(', ')[0] if address is not None else None

    # metro station
    meta_info = element.find('span', {'class': 'vacancy-serp-item__meta-info'})
    metro_tag = meta_info.findChild() if meta_info is not None else None
    vacancy_date['metro_station'] = metro_tag.getText() if metro_tag is not None else None

    # salary
    salary_tag = element.find('span', {'data-qa': 'vacancy-serp__vacancy-compensation'})
    if salary_tag is None:
        salary_min = salary_max = salary_currency = None
    else:
        salary_min, salary_max, salary_currency = _parse_hh_salary(salary_tag.getText())
    vacancy_date['salary_min'] = salary_min
    vacancy_date['salary_max'] = salary_max
    vacancy_date['salary_currency'] = salary_currency

    # vacancy_link: drop the query string
    vacancy_date['vacancy_link'] = title_link.get('href').split('?')[0]

    # site
    vacancy_date['site'] = 'hh.ru'

    return vacancy_date

def _parser_superjob(vacancy, cnt_page):
    vacancy_date = []
    
    params = {
        'keywords': vacancy, \
        'profession_only': '1', \
        'geo[c][0]': '15', \
        'geo[c][1]': '1', \
        'geo[c][2]': '9', \
        'page': ''
    }
    
    link = 'https://www.superjob.ru/vacancy/search/'
       
    html = requests.get(link, params=params, headers=headers)
    
    for page in range(cnt_page):
        params['page'] = page
        html = requests.get(link, params=params, headers=headers)
        
        if html.ok:
            parsed_html = bs(html.text,'html.parser')
            vacancy_items = parsed_html.find_all('div', {'class': 'f-test-vacancy-item'})
                        
            for item in vacancy_items:
                vacancy_date.append(_parser_item_superjob(item))
                
    return vacancy_date

def _parser_item_superjob(item):
    

    vacancy_date = {}
    
    # vacancy_name
    vacancy_name = item.find('a').getText()
    vacancy_date['vacancy_name'] = vacancy_name
    
    # company_name
    company_name = item.find('span', {'class': 'f-test-text-vacancy-item-company-name'})
    
    if not company_name:
        company_name = item.findParent() \
                            .find('span', {'class': 'f-test-text-vacancy-item-company-name'}) \
                            .getText()
    else:
        company_name = company_name.getText()
    
    vacancy_date['company_name'] = company_name
    
    # city
    company_location = item.find('span', {'class': 'f-test-text-company-item-location'}) \
                            .findChildren()[2] \
                            .getText() \
                            .split(',')
    
    vacancy_date['city'] = company_location[0]
    
    #metro station
    if len(company_location) > 1:
        metro_station = company_location[1]
    else:
        metro_station = None
    
    vacancy_date['metro_station'] = metro_station
    
    #salary
    salary = item.find('span', {'class': 'f-test-text-company-item-salary'}).getText()

    if not salary or salary == 'По договорённости':
        salary_min = None
        salary_max = None
        salary_currency = None

    else:

        salary_split = salary.replace(u'\xa0', u' ').split(' ')
        salary_currency = salary_split[-1]

        if salary_split[0] == 'до':
            salary_min = None
            salary_max = int(salary_split[1] + salary_split[2])
        elif salary_split[0] == 'от':
            salary_min = int(salary_split[1] + salary_split[2])
            salary_max = None
        else:
            salary_min = int(salary_split[0] + salary_split[1])
            salary_max = int(salary_split[-3] + salary_split[-2])

    vacancy_date['salary_min'] = salary_min
    vacancy_date['salary_max'] = salary_max
    vacancy_date['salary_currency'] = salary_currency
    
    
    # link
    vacancy_link = item.find_all('a')
    
    if len(vacancy_link) > 1:
        vacancy_link = vacancy_link[-2]['href']
    else:
        vacancy_link = vacancy_link[0]['href']
    
    vacancy_date['vacancy_link'] = f'https://www.superjob.ru{vacancy_link }'
    
    # site
    vacancy_date['site'] = 'www.superjob.ru'
    
    return vacancy_date

In [51]:
def parser_vacancy(vacancy, cnt_page=None):
    """Scrape hh.ru and superjob.ru for ``vacancy`` and return the records as a table.

    The raw records are also written to ``vacancy_date.json`` in the current
    working directory, overwriting any previous file.

    Args:
        vacancy: Search text forwarded to both site scrapers.
        cnt_page: Number of result pages to fetch per site. When omitted, the
            module-level ``cnt_page`` variable is used, so existing
            ``parser_vacancy(vacancy)`` calls keep working.

    Returns:
        pandas.DataFrame: One row per scraped vacancy, hh.ru rows first.

    Raises:
        NameError: If ``cnt_page`` is neither passed nor defined at module level.
    """
    if cnt_page is None:
        # Backward compatibility: fall back to the module-level setting.
        try:
            cnt_page = globals()['cnt_page']
        except KeyError:
            raise NameError("name 'cnt_page' is not defined") from None

    vacancy_date = []
    vacancy_date.extend(_parser_hh(vacancy, cnt_page))
    vacancy_date.extend(_parser_superjob(vacancy, cnt_page))

    df = pd.DataFrame(vacancy_date)
    # json.dump escapes non-ASCII by default; the explicit encoding keeps the
    # file independent of the platform's default encoding.
    with open('vacancy_date.json', 'w', encoding='utf-8') as f:
        json.dump(vacancy_date, f)

    return df

# Number of result pages to fetch per site; the parser_vacancy(vacancy) call
# below picks this value up from the module namespace.
cnt_page = 2
# Search text passed to both site scrapers.
vacancy = 'Data scientist'
# Runs both scrapers and also writes vacancy_date.json as a side effect.
df = parser_vacancy(vacancy)

# Last expression of the cell: display the scraped vacancies.
df

Unnamed: 0,vacancy_name,company_name,city,metro_station,salary_min,salary_max,salary_currency,vacancy_link,site
0,Data Scientist (Отдел планирования и развития),Сбер для экспертов,Екатеринбург,Геологическая,,,,https://hh.ru/vacancy/47313004,hh.ru
1,Data Scientist,Сбер для экспертов,Екатеринбург,,,,,https://hh.ru/vacancy/47377173,hh.ru
2,Data Scientist,B2Broker Москва,Москва,Выставочная,,,,https://hh.ru/vacancy/46079759,hh.ru
3,Data Scientist,LeverX International,Минск,,,,,https://hh.ru/vacancy/47567633,hh.ru
4,Middle Data Scientist / Computer Vision Engineer,ZennoLab,Москва,,130000.0,200000.0,руб.,https://hh.ru/vacancy/47622728,hh.ru
...,...,...,...,...,...,...,...,...,...
59,Data Scientist RecSys,Сбербанк России,Москва,,,,,https://www.superjob.ru/clients/sberbank-rossi...,www.superjob.ru
60,Senior Data Scientist (Auto ML),Сбербанк России,Москва,,,,,https://www.superjob.ru/clients/sberbank-rossi...,www.superjob.ru
61,Data Scientist,Сбербанк России,Екатеринбург,,,,,https://www.superjob.ru/clients/sberbank-rossi...,www.superjob.ru
62,Data Scientist (Отдел планирования и развития),Сбербанк России,Екатеринбург,,,,,https://www.superjob.ru/clients/sberbank-rossi...,www.superjob.ru
