In [None]:
import random
import re
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

In [57]:
# Category values exactly as they are inserted into the listing URL's query string.
categories = [
    'AI/ML',
    'Python',
    'Data+Engineer',
]
# Links of vacancies collected so far; used to detect duplicates across categories.
all_links = set()
# Accumulated vacancy records, one dict per unique link.
vacancies = list()

In [58]:
def scrape_dou_jobs(category, headless=False):
    """Collect the vacancy cards listed on jobs.dou.ua for one category.

    Opens the category listing in a fresh Chrome session, keeps clicking the
    "more" button until it no longer becomes clickable, then reads every
    vacancy card present on the page. The browser is always closed before
    returning, including when navigation or scraping fails.

    Args:
        category: Value inserted verbatim into the ``category`` query
            parameter. It is not URL-encoded here, so pass it in the form
            the site expects (e.g. ``"Data+Engineer"``).
        headless: When True, Chrome is started with ``--headless``.
            Defaults to False, which keeps the visible browser window.

    Returns:
        A list of dicts with the keys ``title``, ``company``, ``link`` and
        ``source_category``. Cards that raise a ``WebDriverException`` while
        being read (e.g. a missing title or company element) are skipped.
    """
    chrome_options = Options()
    if headless:
        chrome_options.add_argument("--headless")
    chrome_options.add_argument("--window-size=900,1000")
    chrome_options.add_argument("--window-position=960,0")

    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=chrome_options,
    )

    try:
        # Navigation happens inside the try block so that a failed page load
        # still reaches driver.quit() and does not leave Chrome running.
        driver.get(f"https://jobs.dou.ua/vacancies/?category={category}")

        # Expand the listing: click "more" until the button stops appearing.
        while True:
            try:
                more_button = WebDriverWait(driver, 3).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, ".more-btn a"))
                )
                more_button.click()
                time.sleep(2)  # give the next batch of cards time to render
            except WebDriverException:
                # TimeoutException (no clickable button left) is the normal
                # exit; any other browser error also ends pagination and the
                # cards loaded so far are used. KeyboardInterrupt is no longer
                # swallowed here and aborts the scrape as expected.
                break

        page_results = []
        for card in driver.find_elements(By.CSS_SELECTOR, "li.l-vacancy"):
            try:
                title_el = card.find_element(By.CSS_SELECTOR, "a.vt")
                company_el = card.find_element(By.CSS_SELECTOR, "a.company")

                page_results.append({
                    "title": title_el.text,
                    "company": company_el.text,
                    "link": title_el.get_attribute("href"),
                    "source_category": category,
                })
            except WebDriverException:
                # Best effort: skip a card that cannot be read.
                continue

        return page_results

    finally:
        driver.quit()

In [59]:
# Scrape every category and merge the results, keeping one record per link.
for cat in categories:
    raw_results = scrape_dou_jobs(cat)

    added_count = skipped_count = 0

    for job in raw_results:
        url = job['link']

        # A link seen earlier (in this or a previous category) is a duplicate.
        if url in all_links:
            skipped_count += 1
            continue

        all_links.add(url)
        vacancies.append(job)
        added_count += 1

    print(f"Знайдено: {len(raw_results)}. Додано нових: {added_count}. Дублікатів: {skipped_count}.\n")

Знайдено: 256. Додано нових: 256. Дублікатів: 0.

Знайдено: 239. Додано нових: 217. Дублікатів: 22.

Знайдено: 105. Додано нових: 97. Дублікатів: 8.



In [60]:
def show_stats(vacs_list, count):
    """Print a numbered preview of the leading entries of ``vacs_list``.

    Args:
        vacs_list: Sequence of vacancy dicts with ``title`` and ``company``
            keys and an optional ``source_category`` key.
        count: Upper bound for the slice; it is capped at ``len(vacs_list)``.

    Each entry prints its 1-based position, the title and the stripped
    company name in single quotes, followed by a line with the category
    (``-`` when the key is absent).
    """
    limit = min(count, len(vacs_list))
    preview = vacs_list[:limit]

    for position, vacancy in enumerate(preview, start=1):
        employer = vacancy['company'].strip()
        print(f"{position}. {vacancy['title']} '{employer}'")
        category_label = vacancy.get('source_category', '-')
        print(f"   Category: {category_label}")

In [61]:
# Preview the first ten collected vacancies.
show_stats(vacancies, 10)

1. AI Video Creator 'HOLYWATER TECH'
   Category: AI/ML
2. Motion Designer / AI Video Editor — PawChamp 'SKELAR'
   Category: AI/ML
3. Lead AI Engineer 'Spendbase'
   Category: AI/ML
4. Senior AI Engineer 'SIXT'
   Category: AI/ML
5. AI Video Creator 'AMO'
   Category: AI/ML
6. Senior Deep Learning Engineer (Computer Vision) 'Ajax Systems'
   Category: AI/ML
7. AI Designer Lead 'FREITTY'
   Category: AI/ML
8. AI Researcher 'Fuelfinance'
   Category: AI/ML
9. Machine Learning Tech Lead (AWS, LLMs) 'Provectus'
   Category: AI/ML
10. Senior Python/Data Engineer (Integrations, Data & AI Pipelines) 'Svitla Systems'
   Category: AI/ML


In [66]:
# Build the working frame from the collected vacancy dicts.
df = pd.DataFrame(vacancies)
total = df.shape[0]

# Trim the titles, then measure each one in characters and in
# whitespace-separated words.
df['title'] = df['title'].str.strip()
df['char_length'] = df['title'].map(len)
df['word_length'] = df['title'].str.split().map(len)

title_chars = df['char_length']
title_words = df['word_length']
mean_char, med_char = title_chars.mean(), title_chars.median()
mean_word, med_word = title_words.mean(), title_words.median()

# Absolute and relative number of vacancies per source category.
category_column = df['source_category']
counts = category_column.value_counts()
percents = category_column.value_counts(normalize=True)

print(f"Вакансій: {total}")

for cat, count in counts.items():
    pct = percents[cat] * 100
    print(f"{cat:<15} {count:>3} ({pct:>4.1f}%)")

# Each line shows "mean | median".
print(f"\nСимволи: {mean_char:>6.1f} | {med_char:<4.1f}")
print(f"Слова:   {mean_word:>6.1f} | {med_word:<4.1f}")

Вакансій: 570
AI/ML           256 (44.9%)
Python          217 (38.1%)
Data+Engineer    97 (17.0%)

Символи:   32.5 | 30.0
Слова:      4.5 | 4.0 


In [68]:
# A phone candidate must start and end with a digit (optionally preceded by
# "+" and/or "("), so whitespace-only or punctuation-only runs never qualify.
_PHONE_CANDIDATE_RE = re.compile(r'\+?\(?\d[\d\-\(\)\s]*\d')
# Minimum number of digits for a candidate to be masked; keeps spans such as
# "2020 - 2024" or "3000-5000" intact.
_MIN_PHONE_DIGITS = 9


def _mask_phone(match):
    """Return the phone placeholder for digit-rich matches, else the matched text unchanged."""
    candidate = match.group(0)
    digit_total = sum(ch.isdigit() for ch in candidate)
    if digit_total >= _MIN_PHONE_DIGITS:
        return ' <PHONE> '
    return candidate


def clean_text(text):
    """Normalise a text fragment and mask contact details.

    Steps, in order:
      1. Curly quotes and backticks are replaced with a plain apostrophe.
      2. ``http(s)://...`` and ``www....`` tokens become ``<URL>``.
      3. ``name@host.tld``-shaped tokens become ``<EMAIL>``.
      4. Digit sequences (optionally with ``+``, spaces, dashes and
         parentheses) containing at least 9 digits become ``<PHONE>``.
      5. Whitespace runs collapse to one space and the ends are trimmed.

    URLs are masked before e-mails and phones so that digits or ``@`` inside
    a URL are not matched again by the later patterns.

    Args:
        text: Value to clean. Non-string values are returned as ``str(text)``
            without any further processing.

    Returns:
        The cleaned string.
    """
    if not isinstance(text, str):
        return str(text)

    text = text.replace("’", "'").replace("`", "'").replace("‘", "'")
    text = re.sub(r'https?://\S+|www\.\S+', '<URL>', text)
    text = re.sub(r'\S+@\S+\.\S+', '<EMAIL>', text)
    text = _PHONE_CANDIDATE_RE.sub(_mask_phone, text)
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Sanity check on a sample containing an e-mail, a URL, a phone number and a
# curly apostrophe.
test_str = "Send CV to  hr@company.com or visit https://dou.ua . Call +380 50 123 45 67.  We’re hiring! "
print(f"ДО:    '{test_str}'")
cleaned_sample = clean_text(test_str)
print(f"ПІСЛЯ: '{cleaned_sample}'")

# Keep the cleaned titles next to the originals and preview the first rows.
df['title_cleaned'] = df['title'].map(clean_text)

title_preview = df[['title', 'title_cleaned']].head(5)
print(title_preview)

ДО:    'Send CV to  hr@company.com or visit https://dou.ua . Call +380 50 123 45 67.  We’re hiring! '
ПІСЛЯ: 'Send CV to <EMAIL> or visit <URL> . Call <PHONE> . We're hiring!'
                                          title  \
0                              AI Video Creator   
1  Motion Designer / AI Video Editor — PawChamp   
2                              Lead AI Engineer   
3                            Senior AI Engineer   
4                              AI Video Creator   

                                  title_cleaned  
0                              AI Video Creator  
1  Motion Designer / AI Video Editor — PawChamp  
2                              Lead AI Engineer  
3                            Senior AI Engineer  
4                              AI Video Creator  


In [69]:
def get_description(driver, url):
    """Load a vacancy page and return the text of its ``.b-vacancy`` element.

    Args:
        driver: Browser driver object providing ``get`` and ``find_element``.
        url: Address of the vacancy page to open.

    Returns:
        The element's text, or an empty string if any exception is raised
        while loading the page or locating the element.
    """
    try:
        driver.get(url)
        # Randomised pause of one to two seconds before reading the page.
        pause = random.uniform(1.0, 2.0)
        time.sleep(pause)
        return driver.find_element(By.CSS_SELECTOR, ".b-vacancy").text
    except Exception:
        return ""

In [None]:
# One Chrome session is reused for all vacancy pages (visible window; add
# "--headless" to the flags below to run without one).
chrome_options = Options()
for flag in ("--window-size=900,1000", "--window-position=960,0"):
    chrome_options.add_argument(flag)

chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=chrome_options)

try:
    for i, job in enumerate(vacancies):
        # Entries that already hold a description longer than 10 characters
        # are skipped, so re-running the cell only fills in what is missing.
        if len(job.get('description', '')) > 10:
            continue

        url = job['link']
        job['description'] = get_description(driver, url)

        processed = i + 1
        if processed % 10 == 0:
            print(f"[{processed}/{len(vacancies)}] Опрацьовано...")

except KeyboardInterrupt:
    # Descriptions are written into the vacancy dicts as they arrive, so an
    # interrupted run keeps everything fetched so far.
    print("\nСкрипт зупинено користувачем. Дані, що встигли зібрати - збережені.")

finally:
    driver.quit()