In [6]:
!python -m pip install bs4 selenium webdriver_manager

Collecting webdriver_manager
  Obtaining dependency information for webdriver_manager from https://files.pythonhosted.org/packages/b1/51/b5c11cf739ac4eecde611794a0ec9df420d0239d51e73bc19eb44f02b48b/webdriver_manager-4.0.1-py2.py3-none-any.whl.metadata
  Downloading webdriver_manager-4.0.1-py2.py3-none-any.whl.metadata (12 kB)
Downloading webdriver_manager-4.0.1-py2.py3-none-any.whl (27 kB)
Installing collected packages: webdriver_manager
Successfully installed webdriver_manager-4.0.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [21]:
import csv
import os
import importlib  
try:
    constants = importlib.import_module("knowledge-bases.hw1.constants")
except:
    print('')
from constants import get_output_paths

def fill_job_template(url=None, company_name=None, job_title=None, salary=None, location=None, description=None, views=None, responses=None, category=None, employment=None, experience=None, english=None, domain=None):
    """Build one job record as a dict keyed by the CSV column names.

    Every argument is optional. An argument left as ``None`` becomes an empty
    string; any other value is stored unchanged, so legitimate zeros (0 views,
    0 responses, 0 years of experience) are kept instead of being blanked out.

    Called with no arguments, the result's keys (in insertion order) are the
    CSV header; other code in this file passes that dict to
    ``csv.DictWriter`` as ``fieldnames``, so the key order below matters.

    Returns:
        dict: column name -> value.
    """
    def _blank_if_missing(value):
        # Only None means "no data"; 0 is a real value and must survive.
        return "" if value is None else value

    return {
        "URL": _blank_if_missing(url),
        "Company": _blank_if_missing(company_name),
        "Job": _blank_if_missing(job_title),
        "Salary": _blank_if_missing(salary),
        "Location": _blank_if_missing(location),
        "Views": _blank_if_missing(views),
        "Responses": _blank_if_missing(responses),
        "Category": _blank_if_missing(category),
        "Employment": _blank_if_missing(employment),
        "Experience": _blank_if_missing(experience),
        "English": _blank_if_missing(english),
        "Domain": _blank_if_missing(domain),
        "Description": _blank_if_missing(description),
    }

def make_output_path(filename:str) -> str:
    """Create the output CSV for ``filename`` containing only the header row.

    The file path and its directory come from ``get_output_paths``; the
    directory is created when missing. An existing file at that path is
    truncated.

    Args:
        filename: name handed to ``get_output_paths``.

    Returns:
        The path of the freshly initialised CSV file.
    """
    output_path, output_dir = get_output_paths(filename)
    os.makedirs(output_dir, exist_ok=True)
    # newline='' is what the csv module asks for (the writer emits its own
    # line endings) and utf-8 matches how write_job appends rows to this file.
    # The with-block closes the handle even if writing the header fails.
    with open(output_path, "w", newline='', encoding='utf-8') as csv_file:
        csv.DictWriter(csv_file, fieldnames=fill_job_template()).writeheader()
    return output_path






In [61]:
from bs4 import BeautifulSoup

def parse_html(html, url):
    """Extract the fields of one job posting from its HTML.

    Args:
        html: markup of the job page.
        url: address of the page; stored in the record and shown in the
            error message.

    Returns:
        The dict built by ``fill_job_template`` on success. If a required
        element is missing, or anything else raises while parsing, the error
        is printed and ``None`` is returned, so callers must check for
        ``None`` before using the result.
    """
    # NOTE(review): soupsieve documents `:-soup-contains` as the replacement
    # for `:contains`; confirm the installed version before switching.
    try:
        soup = BeautifulSoup(html, 'html.parser')
        company_name = soup.select_one('.job-details--title').text.strip()
        # The <h1> holds the title followed by an optional salary <span>;
        # joining the text pieces with ';' lets us keep only the first piece.
        job_title = soup.select_one('div.detail--title-wrapper h1').get_text(strip=True, separator=';').split(';')[0]
        salary_elem = soup.select_one('div.detail--title-wrapper h1 span')
        salary = salary_elem.text.replace("від", "").replace("$", "").strip() if salary_elem else ""
        location = soup.select_one('div.job-additional-info--item-text span.location-text').text.strip().replace("\n", " ")
        description = soup.select_one('div.mb-4').text.strip()
        # The counters are the first word of the text node after each icon.
        views = int(soup.select_one('span.bi.bi-eye').next_sibling.strip().split()[0])
        responses = int(soup.select_one('span.bi.bi-people-fill').next_sibling.strip().split()[0])
        category = soup.select_one('li.job-additional-info--item:contains("Категорія:")').find_all('span')[1].text.strip().replace("\n", " ")

        # In a comma-separated selector list a trailing " div" binds only to
        # the last alternative, so it is attached to every alternative here.
        employment_selector = ", ".join(
            f'li.job-additional-info--item:contains("{keyword}") div'
            for keyword in ["Office", "Remote", "Гібридна", "віддалено", "офіс"]
        )
        employment = soup.select_one(employment_selector).text.strip().replace("\n", "")

        experience_text = soup.select_one('li.job-additional-info--item:contains("досвіду") div').text.strip().split()
        try:
            experience = int(experience_text[0])
        except (IndexError, ValueError):
            # Empty text or a non-numeric first word: record no experience.
            experience = 0

        english_elem = soup.select_one('li.job-additional-info--item:contains("Англійська") div')
        english = english_elem.text.strip().replace("\n", " ") if english_elem else ""

        domain_elem = soup.select_one('li.job-additional-info--item:contains("Домен") div')
        domain = domain_elem.text.strip().replace("\n", " ") if domain_elem else ""

        return fill_job_template(
            url=url,
            company_name=company_name,
            job_title=job_title,
            salary=salary,
            location=location,
            description=description,
            views=views,
            responses=responses,
            category=category,
            employment=employment,
            experience=experience,
            english=english,
            domain=domain,
        )
    except Exception as e:
        # Best effort: one malformed page must not stop the whole crawl.
        print('parse_html', url, e)
        return None


def format_output(result: dict):
    """Render a record as ``key:value`` lines followed by a blank line.

    Each item of ``result`` becomes one ``key:value`` line (no space after
    the colon); the lines are joined with newlines and two newlines are
    appended at the end.
    """
    rendered_lines = []
    for key, value in result.items():
        rendered_lines.append(f"{key}:{value}")
    body = "\n".join(rendered_lines)
    return body + "\n\n"


In [62]:
try:
    constants = importlib.import_module("knowledge-bases.hw1.constants")
except:
    print('')
from constants import JOBS_URL, URL
import requests
from bs4 import BeautifulSoup
import concurrent.futures
import os

def write_job_links(output_path, job_links):
    """Overwrite ``output_path`` (UTF-8) with the strings in ``job_links``.

    The strings are written back to back with nothing inserted between them,
    so each entry must carry its own line ending if one is wanted.

    NOTE(review): a later cell defines another ``write_job_links`` that takes
    no arguments; once that cell has run, this version is shadowed.
    """
    with open(output_path, mode='w', encoding='utf-8') as sink:
        for link in job_links:
            sink.write(link)

def write_job(output_path, job_info):
    """Append ``job_info`` as one CSV row to the file at ``output_path``.

    The column set and order are the keys of an empty ``fill_job_template()``
    record. The file is opened in append mode as UTF-8 with ``newline=''``.
    """
    columns = list(fill_job_template())
    with open(output_path, mode='a', encoding='utf-8', newline='') as sink:
        csv.DictWriter(sink, fieldnames=columns).writerow(job_info)

def get_pages_urls(timeout=30):
    """Return the URL of every page of the job listing.

    Fetches ``JOBS_URL`` and reads the highest page number from the
    second-to-last ``<li>`` of the pagination list (the last ``<li>`` is
    presumably the "next" control — TODO confirm against the live markup).

    Args:
        timeout: seconds to wait for the server; without a timeout
            ``requests`` can block indefinitely.

    Returns:
        list[str]: ``f"{JOBS_URL}/?page={n}"`` for n from 1 to the last page.
        If the page has no pagination list, only page 1 is returned.

    Raises:
        requests.RequestException: connection failure, timeout, or an HTTP
            error status.
        ValueError: the pagination item does not contain a number.
    """
    response = requests.get(JOBS_URL, timeout=timeout)
    # Fail with a clear HTTP error instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    pagination = soup.find('ul', class_='pagination pagination_with_numbers')
    if pagination is None:
        # No pagination list found: fall back to a single page rather than
        # crashing with an AttributeError on None.
        print("get_pages_urls: no pagination found, assuming a single page")
        number_of_pages = 1
    else:
        last_li = pagination.find_all('li')[-2]
        number_of_pages = int(last_li.text.strip())
    return [f"{JOBS_URL}/?page={page}" for page in range(1, number_of_pages + 1)]

def get_job_urls(url, timeout=30):
    """Collect the absolute job URLs linked from one listing page.

    Args:
        url: address of the listing page to fetch.
        timeout: seconds to wait for the server; without a timeout
            ``requests`` can block a worker thread indefinitely.

    Returns:
        list[str]: ``URL`` prefixed to the ``href`` of every element matching
        ``.job-list-item__link``; empty when the page has no such elements.

    Raises:
        requests.RequestException: connection failure or timeout.
    """
    print("processing", url)
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    job_links = soup.select('.job-list-item__link')
    return [f"{URL}{link['href']}" for link in job_links]


def get_job_urls_multi():
    """Gather the job URLs of every listing page using four worker threads.

    The listing pages come from ``get_pages_urls``; each is handed to
    ``get_job_urls`` and the per-page results are concatenated in page order
    into one flat list.
    """
    page_urls = get_pages_urls()
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as pool:
        links_per_page = list(pool.map(get_job_urls, page_urls))
    all_links = []
    for page_links in links_per_page:
        all_links.extend(page_links)
    return all_links






In [64]:
import concurrent.futures
import importlib  
try:
    constants = importlib.import_module("knowledge-bases.hw1.constants")
except:
    print('')
from constants import OUTPUT_FILE_NAME

def process_job_page(url, output_path, timeout=30):
    """Download one job page, parse it and append the record to the CSV.

    Args:
        url: address of the job page.
        output_path: CSV file that ``write_job`` appends to.
        timeout: seconds to wait for the server; without a timeout
            ``requests`` can block a worker thread indefinitely.

    Pages that ``parse_html`` cannot parse are skipped: it returns ``None``
    after printing the error, and passing ``None`` on to ``write_job`` would
    only raise inside the CSV writer.

    NOTE(review): main() runs this function from several threads that all
    append to the same file without a lock — confirm rows cannot interleave.
    """
    print(f"Processing {url}")
    response = requests.get(url, timeout=timeout)
    output = parse_html(response.text, url)
    if output is None:
        return
    write_job(output_path, output)


def write_job_links():
    """Crawl every listing page and save the job URLs to jobs.txt, one per line.

    The file location is the first element returned by
    ``get_output_paths("jobs.txt")``; an existing file is overwritten.

    NOTE(review): this zero-argument definition replaces the earlier
    two-argument ``write_job_links`` defined in a previous cell.
    """
    links_path = get_output_paths("jobs.txt")[0]
    # Crawl before opening: "w" mode truncates the file immediately, so a
    # failed crawl would otherwise leave an empty jobs.txt behind.
    job_urls = get_job_urls_multi()
    with open(links_path, "w") as file:
        # url[:-1] drops the final character of each URL — presumably a
        # trailing "/" (TODO confirm).
        file.writelines([f"{url[:-1]}\n" for url in job_urls])

def read_job_links():
    """Load the saved job URLs from jobs.txt.

    Reads the file at ``get_output_paths("jobs.txt")[0]`` and returns one
    entry per line with surrounding whitespace stripped.
    """
    links_path = get_output_paths("jobs.txt")[0]
    stripped_links = []
    with open(links_path, "r") as source:
        for raw_line in source:
            stripped_links.append(raw_line.strip())
    return stripped_links

def main():
    """Scrape the saved job links into a fresh CSV using five worker threads.

    Creates the output CSV (header only), reads the job URLs from jobs.txt,
    keeps at most ``LIMIT`` of them, and processes each with
    ``process_job_page``. A failure on one URL is printed and does not stop
    the remaining ones. Finally prints the CSV path.
    """
    LIMIT = 1000  # maximum number of job pages to process; falsy means no limit
    output_path = make_output_path(OUTPUT_FILE_NAME)
    job_urls = read_job_links()
    job_urls = job_urls[:LIMIT] if LIMIT else job_urls
    print(f"Processing {len(job_urls)} pages: {', '.join(job_urls[:3])}\n...")
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        # submit() + as_completed() lets every future's outcome be inspected;
        # the results of executor.map() were never read, so worker exceptions
        # were silently discarded.
        pending = {
            executor.submit(process_job_page, job_url, output_path): job_url
            for job_url in job_urls
        }
        for future in concurrent.futures.as_completed(pending):
            try:
                future.result()
            except Exception as error:
                print(f"Failed to process {pending[future]}: {error}")
    print(output_path)





In [66]:
# main()