### Необходимо собрать информацию о вакансиях на вводимую должность (используем input или получаем должность через аргументы) с сайтов HH (обязательно) и/или Superjob (по желанию). Приложение должно анализировать несколько страниц сайта (их количество также вводим через input или аргументы). Получившийся список должен содержать в себе минимум:
- Наименование вакансии.
- Предлагаемую зарплату (разносим в три поля: минимальная, максимальная и валюта; суммы преобразуем к числовому типу).
- Ссылку на саму вакансию.
- Сайт, откуда собрана вакансия. (можно прописать статично hh.ru или superjob.ru)

#### Import all the necessary libs:

In [17]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urlencode
from datetime import datetime
import pymongo

#### Job title constant for hh.ru scraping (or can be arg in .py script):

In [18]:
# Search phrase for hh.ru; passed to main() below and sent as the `text` query parameter.
job_title = "Data engineer" 

#### Function that returns an appropriate url link for scraping:

In [19]:
def prepare_url(job_title, page=0):
    """Build the hh.ru vacancy-search URL for a job title and page index.

    Args:
        job_title: Search phrase placed in the ``text`` query parameter.
        page: Value of the ``page`` query parameter (defaults to 0).

    Returns:
        The search endpoint followed by ``?`` and the URL-encoded query string.
    """
    base_url = "https://hh.ru/search/vacancy"
    # Ordered pairs keep the parameters in a fixed, predictable order.
    query_pairs = [
        ("text", job_title),
        ("from", "suggest_post"),
        ("items_on_page", 20),
        ("clusters", "true"),
        ("ored_clusters", "true"),
        ("enable_snippets", "true"),
        ("page", page),
    ]
    return base_url + "?" + urlencode(query_pairs)

    

#### Function that returns a DOM object by going to a specific page using generic url link:

In [20]:
def get_dom_from_url(job_title, session, page=0, timeout=30):
    """Fetch one hh.ru search-results page and parse it into a DOM tree.

    Args:
        job_title: Search phrase forwarded to ``prepare_url``.
        session: Object whose ``get`` method performs the HTTP request
            (``main`` passes a ``requests.Session``).
        page: Page value forwarded to ``prepare_url`` (defaults to 0).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the scrape forever.

    Returns:
        A ``BeautifulSoup`` object built from the response text with the
        ``"html.parser"`` parser.
    """
    # Browser-like User-Agent; presumably needed so the site serves the
    # regular HTML page to a scripted client.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
    }
    response = session.get(
        prepare_url(job_title, page),
        headers=headers,
        timeout=timeout,
    )
    return BeautifulSoup(response.text, "html.parser")


#### Function that returns an integer value of maximum pages for current "job_title" url search:

In [21]:
def get_search_max_pages(job_title, session):
    """Return the number of result pages the hh.ru pager shows for a search.

    The first results page is downloaded and the page link located in the
    element just before the "next" button is read.

    Args:
        job_title: Search phrase forwarded to ``get_dom_from_url``.
        session: HTTP session forwarded to ``get_dom_from_url``.

    Returns:
        The page number found in the pager as an ``int``, or ``1`` when the
        page has no "next" button or no page link before it.
    """
    dom = get_dom_from_url(job_title, session)
    next_button = dom.find(
        "a", {"class": "bloko-button", "data-qa": "pager-next"}
    )
    if next_button is None:
        # No "next" control; presumably the results fit on a single page.
        return 1

    last_page_holder = next_button.previous_sibling
    if last_page_holder is None:
        return 1

    last_page_link = last_page_holder.find("a", {"data-qa": "pager-page"})
    if last_page_link is None:
        return 1

    return int(last_page_link.text)
    

#### Function that scrapes the vacancies on the page and writes them into the vacancy_list:

In [22]:
def _parse_salary(salary_text):
    """Split a compensation string into ``(salary_from, salary_to, currency)``.

    The text is split on spaces. The last token, with dots removed, is the
    currency. Bounds are read from fixed token positions depending on whether
    the text contains "от", "до" or "–"; narrow no-break spaces (U+202F)
    inside a number are removed before conversion to ``int``.

    Returns:
        A 3-tuple. Bounds that are absent, or that cannot be read because the
        text has an unexpected layout, are ``None``.
    """
    tokens = salary_text.split(" ")
    currency = tokens[-1].replace(".", "")

    def to_int(token):
        return int(token.replace("\u202f", ""))

    try:
        if "от" in salary_text:
            return to_int(tokens[1]), None, currency
        if "до" in salary_text:
            return None, to_int(tokens[1]), currency
        if "–" in salary_text:
            return to_int(tokens[0]), to_int(tokens[2]), currency
    except (IndexError, ValueError):
        # Unexpected layout: keep the currency but leave the bounds unknown
        # rather than aborting the whole scrape because of one vacancy.
        return None, None, currency
    return None, None, currency


def scrap_page(vacancies, vacancy_list, collection):
    """Extract vacancy records from one page and store them.

    For every vacancy card a dict with title, salary bounds, currency, link
    and source site is appended to ``vacancy_list`` and passed to
    ``insert_new_items``. Cards that lack the inner container or the title
    link are skipped.

    Args:
        vacancies: Iterable of parsed vacancy card elements (objects with a
            BeautifulSoup-style ``find`` method).
        vacancy_list: List that receives one dict per vacancy (mutated).
        collection: Target collection handed to ``insert_new_items``.
    """
    for vacancy in vacancies:
        vacancy_data = vacancy.find("div", {"class": ""})
        if vacancy_data is None:
            continue

        title_link = vacancy_data.find(
            "a", {"data-qa": "vacancy-serp__vacancy-title"}
        )
        if title_link is None:
            continue

        salary_tag = vacancy_data.find(
            "span", {"data-qa": "vacancy-serp__vacancy-compensation"}
        )
        if salary_tag is None:
            # No salary published for this vacancy.
            salary_from = salary_to = salary_currency = None
        else:
            salary_from, salary_to, salary_currency = _parse_salary(
                salary_tag.text
            )

        vacancy_info = {
            # The misspelled key is kept on purpose: it is the field name
            # used in the stored documents and in the CSV columns.
            "vacancy_titile": title_link.text,
            "salary_from": salary_from,
            "salary_to": salary_to,
            "salary_currency": salary_currency,
            "vacancy_link": title_link.get("href"),
            "site_from": "hh.ru",
        }
        vacancy_list.append(vacancy_info)
        # NOTE(review): pymongo's insert_one is documented to add an `_id`
        # key to the dict it receives, so the entry in vacancy_list may gain
        # that key as well - confirm whether the CSV should carry it.
        insert_new_items(collection, vacancy_info)
    

#### Function that iterates through the available pages and runs the "scrap_page()" function for each applicable url page:

In [23]:
def scrap_pages(job_title, session, vacancy_list, collection, page_limit=None):
    """Scrape the search-result pages for ``job_title``.

    Resets the module-level ``inserts`` counter, asks
    ``get_search_max_pages`` how many pages exist, then downloads each page
    and hands its vacancy cards to ``scrap_page``. Progress and totals are
    printed.

    Args:
        job_title: Search phrase.
        session: HTTP session forwarded to the download helpers.
        vacancy_list: List that accumulates the scraped vacancy dicts.
        collection: Collection forwarded to ``scrap_page``.
        page_limit: Optional upper bound on the number of pages to scrape.
            ``None`` (the default) scrapes every page the pager reports.
    """
    global inserts
    inserts = 0  # Reset per run; incremented by insert_new_items.
    max_pages = get_search_max_pages(job_title, session)
    if page_limit is not None:
        max_pages = min(max_pages, page_limit)
    print(f"Starting scraping through pages for {job_title} position")
    for page in range(max_pages):
        print(f"Scraping page {page+1} out of {max_pages}...")
        page_dom = get_dom_from_url(job_title, session, page)
        vacancies = page_dom.find_all(
            "div", {"class": "vacancy-serp-item-body__main-info"}
        )
        scrap_page(vacancies, vacancy_list, collection)
    print(f"Scraping is finished, found total of {len(vacancy_list)} vacancies")
    print(f"Total of {inserts} new documents added to mongodb")
        
        
        

#### Function that writes all the data scraped through the pages into one .csv file:

In [24]:
def write_to_csv(job_title, vacancy_list):
    """Write the scraped vacancies to a timestamped, ``;``-separated CSV file.

    The file is created in the current working directory and named
    ``<timestamp>_<job_title>_HH_Vacancies.csv``.

    Args:
        job_title: Search phrase; used in the file name after replacing
            characters that are unsafe in file names.
        vacancy_list: List of vacancy dicts; each becomes one CSV row.
    """
    df = pd.DataFrame(vacancy_list)
    current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    # job_title is free text: "/" or "\" would be read as a path separator
    # and the other characters are rejected by some filesystems.
    safe_title = "".join(
        "_" if char in '\\/:*?"<>|' else char for char in job_title
    )
    filename = f"{current_time}_{safe_title}_HH_Vacancies.csv"
    df.to_csv(filename, sep=';', float_format='%.2f', index=False)
    print(f'File "{filename}" successfully generated')

#### Function that opens a MongoDB connection:

In [25]:
def connect_to_mangodb():
    """Open the local MongoDB vacancies collection.

    Connects to 127.0.0.1:27017, selects the ``vacancies`` collection of the
    ``HH_ru_vacancies`` database and creates a unique index named
    ``unique_index`` on ``vacancy_link``.

    Returns:
        The collection, or ``None`` (after printing the error) when a
        ``pymongo.errors.ConnectionFailure`` is raised.
    """
    try:
        mongo_client = pymongo.MongoClient('127.0.0.1', 27017)
        collection = mongo_client['HH_ru_vacancies'].vacancies
        # The unique index makes a repeated vacancy_link fail on insert.
        collection.create_index(
            [('vacancy_link', 1)],
            name="unique_index",
            unique=True,
        )
    except pymongo.errors.ConnectionFailure as error:
        print(f"Mongodb connection has failed. {error}")
        return None
    return collection


#### Function that writes new documents into MongoDB:

In [26]:
def insert_new_items(collection, document):
    """Insert ``document`` into ``collection`` and count successful inserts.

    The module-level ``inserts`` counter is incremented after each successful
    insert; it is created on first use if it does not exist yet.

    Args:
        collection: Object with an ``insert_one`` method, or ``None``.
            ``connect_to_mangodb`` returns ``None`` when the connection
            failed; in that case nothing is written.
        document: Dict to store.
    """
    global inserts
    if collection is None:
        # No database available: skip the write so that scraping and the
        # CSV export can still complete.
        return
    try:
        collection.insert_one(document)
    except pymongo.errors.DuplicateKeyError:
        # The unique index on vacancy_link rejected an already stored vacancy.
        return
    inserts = globals().get("inserts", 0) + 1

#### Main function:

In [27]:
def main(job_title):
    """Scrape hh.ru vacancies for ``job_title`` into MongoDB and a CSV file.

    Args:
        job_title: Search phrase to look up.
    """
    vacancy_list = []
    collection = connect_to_mangodb()
    # The context manager closes the HTTP session even if scraping raises.
    with requests.Session() as session:
        scrap_pages(job_title, session, vacancy_list, collection)
    write_to_csv(job_title, vacancy_list)

In [31]:
# Run the full scrape (MongoDB inserts + CSV export) for the job title defined above.
main(job_title)

Starting scraping through pages for Data engineer position
Scraping page 1 out of 78...
Scraping page 2 out of 78...
Scraping page 3 out of 78...
Scraping page 4 out of 78...
Scraping page 5 out of 78...
Scraping page 6 out of 78...
Scraping page 7 out of 78...
Scraping page 8 out of 78...
Scraping page 9 out of 78...
Scraping page 10 out of 78...
Scraping page 11 out of 78...
Scraping page 12 out of 78...
Scraping page 13 out of 78...
Scraping page 14 out of 78...
Scraping page 15 out of 78...
Scraping page 16 out of 78...
Scraping page 17 out of 78...
Scraping page 18 out of 78...
Scraping page 19 out of 78...
Scraping page 20 out of 78...
Scraping page 21 out of 78...
Scraping page 22 out of 78...
Scraping page 23 out of 78...
Scraping page 24 out of 78...
Scraping page 25 out of 78...
Scraping page 26 out of 78...
Scraping page 27 out of 78...
Scraping page 28 out of 78...
Scraping page 29 out of 78...
Scraping page 30 out of 78...
Scraping page 31 out of 78...
Scraping page 32 out

#### Написать функцию, которая производит поиск и выводит на экран вакансии с заработной платой больше введённой суммы (необходимо анализировать оба поля зарплаты). То есть цифра вводится одна, а запрос проверяет оба поля

In [39]:
def print_vacancies_above_salary(collection, min_salary, currency="руб"):
    """Print every stored vacancy whose salary exceeds ``min_salary``.

    A vacancy matches when ``salary_from`` or ``salary_to`` is strictly
    greater than ``min_salary`` and ``salary_currency`` equals ``currency``.

    Args:
        collection: Collection to query (object with a ``find`` method).
        min_salary: Threshold compared against both salary fields.
        currency: Required value of ``salary_currency``.
    """
    query = {
        "$or": [
            {"salary_from": {"$gt": min_salary}},
            {"salary_to": {"$gt": min_salary}},
        ],
        "salary_currency": currency,
    }
    for document in collection.find(query):
        print(document)


client = pymongo.MongoClient('127.0.0.1', 27017)
db = client['HH_ru_vacancies']  # database
vacancies = db.vacancies  # collection
salary_filter = 100000

print_vacancies_above_salary(vacancies, salary_filter)

{'_id': ObjectId('62d668c583f80a0cf6eacec3'), 'vacancy_titile': 'Data Engineer', 'salary_from': 160000, 'salary_to': None, 'salary_currency': 'руб', 'vacancy_link': 'https://hh.ru/vacancy/67039410?from=vacancy_search_list&hhtmFrom=vacancy_search_list&query=Data%20engineer', 'site_from': 'hh.ru'}
{'_id': ObjectId('62d668c783f80a0cf6eacedc'), 'vacancy_titile': 'Senior Data Engineer', 'salary_from': 250000, 'salary_to': None, 'salary_currency': 'руб', 'vacancy_link': 'https://hh.ru/vacancy/67255069?from=vacancy_search_list&hhtmFrom=vacancy_search_list&query=Data%20engineer', 'site_from': 'hh.ru'}
{'_id': ObjectId('62d668c883f80a0cf6eacee0'), 'vacancy_titile': 'Data Engineer / MLOps', 'salary_from': 300000, 'salary_to': 400000, 'salary_currency': 'руб', 'vacancy_link': 'https://hh.ru/vacancy/67369863?from=vacancy_search_list&hhtmFrom=vacancy_search_list&query=Data%20engineer', 'site_from': 'hh.ru'}
{'_id': ObjectId('62d668c883f80a0cf6eacee1'), 'vacancy_titile': 'Data Engineer / MLOps', 'sa