In [6]:
!python -m pip install bs4 selenium webdriver_manager

Collecting webdriver_manager
  Obtaining dependency information for webdriver_manager from https://files.pythonhosted.org/packages/b1/51/b5c11cf739ac4eecde611794a0ec9df420d0239d51e73bc19eb44f02b48b/webdriver_manager-4.0.1-py2.py3-none-any.whl.metadata
  Downloading webdriver_manager-4.0.1-py2.py3-none-any.whl.metadata (12 kB)
Downloading webdriver_manager-4.0.1-py2.py3-none-any.whl (27 kB)
Installing collected packages: webdriver_manager
Successfully installed webdriver_manager-4.0.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [1]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
import csv
import os
import importlib  
try:
    constants = importlib.import_module("knowledge-bases.hw1.constants")
except:
    print('')
from constants import get_output_paths
    
# Chrome options object that process_job_page() passes to webdriver.Chrome.
chrome_options = webdriver.ChromeOptions()
# Ask Chrome to open with a maximized window.
chrome_options.add_argument("start-maximized")
# Run Chrome without a visible window.
chrome_options.add_argument('--headless')

def fill_job_template(url=None, company_name=None, job_title=None, salary=None, location=None, description=None, views=None, responses=None, company_type=None, employment=None, experience=None, english=None):
    """Build one CSV row (a dict) describing a job posting.

    Called with no arguments it returns a row of empty strings; the key order
    of that dict is what the CSV writers in this notebook use as field names.

    Args:
        url, company_name, job_title, location, description: plain values;
            a falsy value is stored as "".
        views, responses: counts; ``None`` is stored as "" while 0 is kept
            as a real value.
        salary, company_type, employment, experience, english: objects that
            expose a ``.text`` string attribute (the caller passes the result
            of ``soup.select_one``), or a falsy value when the element is
            missing.

    Returns:
        dict mapping column name to value, in CSV column order.
    """
    def _or_blank(value):
        # Plain values: anything falsy becomes an empty cell.
        return value if value else ""

    def _count_or_blank(value):
        # Zero is a legitimate count; only a missing value becomes an empty cell.
        return value if value is not None else ""

    def _stripped_text(node):
        # Element-like values: use their stripped text, or "" when absent.
        return node.text.strip() if node else ""

    return {
        "URL": _or_blank(url),
        "Company": _or_blank(company_name),
        "Job": _or_blank(job_title),
        # Only the last space-separated token of the salary text is kept.
        "Salary": _stripped_text(salary).split(" ")[-1],
        "Location": _or_blank(location),
        "Views": _count_or_blank(views),
        "Responses": _count_or_blank(responses),
        "Company type": _stripped_text(company_type),
        "Employment": _stripped_text(employment),
        "Experience": _stripped_text(experience),
        "English": _stripped_text(english),
        "Description": _or_blank(description),
    }

def make_output_path(filename:str) -> str:
    """Create (or truncate) the output CSV and write its header row.

    Args:
        filename: name handed to ``get_output_paths``, which is unpacked here
            as ``(output_path, output_dir)``.

    Returns:
        The path of the CSV file that now contains only the header.
    """
    output_path, output_dir = get_output_paths(filename)
    os.makedirs(output_dir, exist_ok=True)
    # newline='' is what the csv module expects for files it writes, and the
    # encoding matches the one write_job() uses when appending rows to this file.
    # The `with` block closes the handle even if writing the header fails.
    with open(output_path, "w", newline="", encoding="utf-8") as file:
        csv_writer = csv.DictWriter(file, fieldnames=fill_job_template())
        csv_writer.writeheader()
    return output_path






In [2]:
from bs4 import BeautifulSoup

def parse_html(html, url):
  """Extract the fields of one job posting from its rendered HTML.

  Args:
    html: page source of the job page.
    url: address of the page; surrounding whitespace (such as the trailing
      newline left by reading URLs line by line) is removed before storing it.

  Returns:
    The row dict produced by ``fill_job_template``.

  Raises:
    AttributeError: if one of the mandatory elements (company, title,
      location, description, view/response counters) is not in the page,
      because ``select_one`` then returns ``None``.
  """
  soup = BeautifulSoup(html, 'html.parser')
  company_name = soup.select_one('.job-details--title').text.strip()
  # The <h1> holds the title followed by other text; keep only the first text fragment.
  job_title = soup.select_one('div.detail--title-wrapper h1').get_text(strip=True, separator=';').split(';')[0]
  salary = soup.select_one('div.detail--title-wrapper h1 span')
  location = soup.select_one('div.job-additional-info--item-text span.location-text').text.strip().replace("\n", " ")
  description = soup.select_one('div.mb-4').text.strip()
  # The counters are the text nodes right after the icon spans; the first token is the number.
  views = int(soup.select_one('span.bi.bi-eye').next_sibling.strip().split()[0])
  responses = int(soup.select_one('span.bi.bi-people-fill').next_sibling.strip().split()[0])

  # `:-soup-contains()` is the current name of the deprecated `:contains()` pseudo-class.
  # NOTE(review): the last three selectors match literal values ("Office", "5 років",
  # "Upper-Intermediate"), so postings with other values give None and a blank
  # column — confirm this is intended.
  company_type = soup.select_one('li.job-additional-info--item:-soup-contains("Категорія") span')
  employment = soup.select_one('li.job-additional-info--item:-soup-contains("Office") div')
  experience = soup.select_one('li.job-additional-info--item:-soup-contains("5 років") div')
  english = soup.select_one('li.job-additional-info--item:-soup-contains("Upper-Intermediate") div')

  return fill_job_template(
    # strip() drops the trailing newline without eating a real character
    # when the URL arrives already clean (url[:-1] would cut the final "/").
    url.strip(),
    company_name,
    job_title,
    salary,
    location,
    description,
    views,
    responses,
    company_type,
    employment,
    experience,
    english
  )

def format_output(result: dict):
    """Render a mapping as one ``key:value`` line per item, followed by a blank line."""
    rendered = []
    for key, value in result.items():
        rendered.append(f"{key}:{value}")
    body = "\n".join(rendered)
    return body + "\n\n"


In [3]:
try:
    constants = importlib.import_module("knowledge-bases.hw1.constants")
except:
    print('')
from constants import JOBS_URL, URL
import requests
from bs4 import BeautifulSoup
import concurrent.futures
import os

def write_job_links(output_path, job_links):
    """Overwrite ``output_path`` with the given strings, written back to back.

    No separator is added between items, so each entry must already carry its
    own line ending if one is wanted.

    Note: a later cell defines a zero-argument ``write_job_links``; once that
    cell has run, the name refers to that later definition.
    """
    with open(output_path, mode='w', encoding='utf-8') as links_file:
        for link in job_links:
            links_file.write(link)

import threading

# main() calls write_job() from several worker threads that all append to the
# same CSV; this lock keeps each row's write atomic so rows cannot interleave.
_CSV_WRITE_LOCK = threading.Lock()

def write_job(output_path, job_info):
    """Append one job row to the CSV at ``output_path``.

    Args:
        output_path: path of an existing CSV whose columns are the keys of
            ``fill_job_template()``.
        job_info: row dict with those same keys.
    """
    with _CSV_WRITE_LOCK:
        with open(output_path, 'a', newline='', encoding='utf-8') as csvfile:
            csv_writer = csv.DictWriter(csvfile, fieldnames=fill_job_template())
            csv_writer.writerow(job_info)

def get_pages_urls(timeout=30):
    """Return the URL of every page of the paginated job listing.

    Args:
        timeout: seconds to wait for the listing request before giving up,
            so a stalled connection cannot hang the notebook indefinitely.

    Returns:
        List of ``{JOBS_URL}/?page=N`` strings for N from 1 to the page count.

    Raises:
        requests.RequestException: on timeout, connection failure or an HTTP
            error status.
    """
    response = requests.get(JOBS_URL, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    pagination = soup.find('ul', class_='pagination pagination_with_numbers')
    # The page count is read from the second-to-last <li>; presumably the last
    # one is a "next" control — confirm against the page markup.
    last_li = pagination.find_all('li')[-2]
    number_of_pages = int(last_li.text.strip())
    return [f"{JOBS_URL}/?page={page}" for page in range(1, number_of_pages + 1)]

def get_job_urls(url, timeout=30):
    """Collect the absolute URLs of all job postings linked from one listing page.

    Args:
        url: address of a listing page.
        timeout: seconds to wait for the request before giving up.

    Returns:
        List of ``URL`` + href strings, one per ``.job-list-item__link`` element.

    Raises:
        requests.RequestException: on timeout, connection failure or an HTTP
            error status (previously an error page silently produced no links).
    """
    print("processing", url)
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    job_links = soup.select('.job-list-item__link')
    return [f"{URL}{link['href']}" for link in job_links]


def get_job_urls_multi():
    """Fetch every listing page with 4 worker threads and return all job URLs.

    The URLs are returned as one flat list, in listing-page order.
    """
    page_urls = get_pages_urls()
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as pool:
        links_per_page = list(pool.map(get_job_urls, page_urls))
    all_links = []
    for page_links in links_per_page:
        all_links.extend(page_links)
    return all_links






In [4]:
import concurrent.futures
import importlib  
try:
    constants = importlib.import_module("knowledge-bases.hw1.constants")
except:
    print('')
from constants import OUTPUT_FILE_NAME
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def process_job_page(url, output_path):
    """Load one job page in headless Chrome, parse it and append it to the CSV.

    Args:
        url: address of the job page.
        output_path: CSV file that ``write_job`` appends the parsed row to.

    Raises:
        Exception: any error from loading, waiting, parsing or writing is
            printed and then re-raised.
    """
    print(f"Processing {url}")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),options=chrome_options)
    try:
        driver.get(url)
        # Wait up to 2 seconds for at least one <li> to be present before reading the page source.
        WebDriverWait(driver, 2).until(EC.presence_of_element_located((By.TAG_NAME, "li")))
        output = parse_html(driver.page_source, url)
        write_job(output_path, output)
    except Exception as e:
        print("EXCEPTION", e)
        raise
    finally:
        # Always shut the browser down; quitting only on failure left one
        # Chrome process running for every successfully scraped page.
        driver.quit()

def write_job_links():
    """Crawl the listing and save every job URL to ``jobs.txt``, one per line.

    The crawl runs before the file is opened for writing, so a failed crawl
    leaves an existing ``jobs.txt`` untouched instead of truncating it.
    """
    output_path, output_dir = get_output_paths("jobs.txt")
    # Create the target directory up front so a missing directory fails
    # (or is fixed) before the long crawl rather than after it.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    job_urls = get_job_urls_multi()
    with open(output_path, "w") as file:
        file.writelines([f"{url}\n" for url in job_urls])

def read_job_links():
    """Return the lines of ``jobs.txt`` as a list.

    Each entry keeps its trailing newline exactly as stored in the file.
    """
    links_path = get_output_paths("jobs.txt")[0]
    with open(links_path, "r") as links_file:
        return list(links_file)

def main():
    """Scrape every job URL listed in ``jobs.txt`` into the output CSV.

    Creates the CSV with its header, reads the saved links, then processes
    the pages on a pool of 5 worker threads. The CSV path is printed once
    all workers have finished.
    """
    output_path = make_output_path(OUTPUT_FILE_NAME)
    job_urls = read_job_links()
    print(f"Processing {len(job_urls)} pages: {', '.join(job_urls[:3])}\n...")
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        # One task per URL, all writing to the same CSV; leaving the `with`
        # block waits for every task to complete.
        for job_url in job_urls:
            executor.submit(process_job_page, job_url, output_path)
    print(output_path)





In [5]:
# Run the scraper: main() reads the saved job links and writes one CSV row per page.
main()

Processing 7254 pages: https://djinni.co/jobs/606137-account-manager/
, https://djinni.co/jobs/606136-business-analyst-gambling-/
, https://djinni.co/jobs/606135-dizainep-dlia-baneriv-dlia-play-market/

...
Processing https://djinni.co/jobs/606137-account-manager/

Processing https://djinni.co/jobs/606136-business-analyst-gambling-/

Processing https://djinni.co/jobs/606135-dizainep-dlia-baneriv-dlia-play-market/

Processing https://djinni.co/jobs/583012-senior-seo-specialist-e-commerce-us-long-term/

Processing https://djinni.co/jobs/606127-product-owner-erp-crm/





Processing https://djinni.co/jobs/606133-devops-engineer/

Processing https://djinni.co/jobs/606132-technical-setup-specialist/

Processing https://djinni.co/jobs/606131-lead-full-stack-engineer-node-js-react-js-ast/

Processing https://djinni.co/jobs/606130-full-stack-junior-middle-javascript-rozrobnik/

Processing https://djinni.co/jobs/595660-back-end-scala-developer/

Processing https://djinni.co/jobs/585393-sales-manager/

Processing https://djinni.co/jobs/606129-full-stack-developer-symfony-vue-js-/

Processing https://djinni.co/jobs/606126-android-developer/

Processing https://djinni.co/jobs/606123-node-developer/

Processing https://djinni.co/jobs/606122-senior-java-developer/

Processing https://djinni.co/jobs/606119-middle-senior-wordpress-developer/

Processing https://djinni.co/jobs/606118-copywriter-content-writer/

Processing https://djinni.co/jobs/606116-copywriter/

Processing https://djinni.co/jobs/490409-upworkleadgen-sdr/

Processing https://djinni.co/jobs/556704-py

KeyboardInterrupt: 

Processing https://djinni.co/jobs/551015-senior-affiliate-manager-for-clickdealer-/

Processing https://djinni.co/jobs/542518-senior-aqa-python-business-trip-to-belgium-fo/

Processing https://djinni.co/jobs/606103-junior-recruiter-non-it-/

Processing https://djinni.co/jobs/596548-ruby-on-rails-developer/

Processing https://djinni.co/jobs/606099-email-marketolog/

Processing https://djinni.co/jobs/606098-backend-net-developer-krakow/

Processing https://djinni.co/jobs/606094-full-stack-node-js-react-developer/

Processing https://djinni.co/jobs/606095-solution-advisor-for-artificial-intelligence-/

Processing https://djinni.co/jobs/606093-backend-software-engineer/

Processing https://djinni.co/jobs/606092-strong-junior-full-stack-developer-react-js-n/

Processing https://djinni.co/jobs/591728-sales-development-representative/

Processing https://djinni.co/jobs/606086-seo-specialist/

Processing https://djinni.co/jobs/606090-digital-marketing-manager/

Processing https://djinni.co/jo