In [None]:
# apt-get handle the installation and removal of software on Debian, Ubuntu, and related Linux distributions
!apt-get update

# installing needed packages
!apt install chromium-chromedriver
!pip install selenium
!pip install beautifulsoup4

# Copy the chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

import sys
# This make sure that the selenium chromedriver is inthe path
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')

In [2]:
import tqdm
import requests
import urllib.request

# This import is needed in order to run the notebook on macOS
# import chromedriver_binary

import numpy as np
import pandas as pd

from datetime import date, timedelta
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from bs4.element import Comment
from google.colab import drive

In [3]:
# Mount Google Drive at /content/drive; the CSV export at the end of the
# notebook writes to a path under this mount point.
drive.mount("/content/drive")

Mounted at /content/drive


In [4]:
# Root of the site; relative job links found on the listing pages are appended to it.
base_url = 'https://www.jobs.bg/'

# Run Chrome without a display and with the flags commonly required inside
# containerised environments.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

# Pass the options by keyword only. The driver path used to be given positionally
# ('chromedriver'), which is already the default in Selenium 3 and is no longer
# accepted as a positional argument by recent Selenium 4 releases.
driver = webdriver.Chrome(options=chrome_options)

# Load the first page of the job listing and parse the rendered DOM.
url = 'https://www.jobs.bg/front_job_search.php?frompage=0&add_sh=1&from_hp=1&term=#paging'
driver.get(url)  # returns None, so there is nothing worth keeping
html = driver.execute_script("return document.documentElement.outerHTML")
soup = BeautifulSoup(html, 'html.parser')

In [31]:
def tag_visible(element):
    """Tell whether a parsed text node counts as visible page text.

    A node is treated as hidden when its parent tag is one of the
    non-content containers listed below, or when the node itself is an
    HTML comment.

    Args:
        element: A text node exposing ``parent.name``.

    Returns:
        bool: ``False`` for hidden nodes, ``True`` otherwise.
    """
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    # The parent check stays first so it short-circuits before the comment test.
    is_hidden = element.parent.name in hidden_parents or isinstance(element, Comment)
    return not is_hidden

def text_from_soup(soup):
    """Join all visible text nodes of a parsed document into one string.

    Args:
        soup: A parsed BeautifulSoup document or tag.

    Returns:
        str: Every text node accepted by ``tag_visible``, individually
        stripped, joined with single spaces, and stripped once more.
    """
    # find_all(string=True) is the current spelling of the deprecated
    # findAll(text=True); both select every text node in the tree.
    text_nodes = soup.find_all(string=True)
    visible_chunks = (node.strip() for node in text_nodes if tag_visible(node))
    return " ".join(visible_chunks).strip()

def text_from_html(body):
    """Return the visible text of an HTML document.

    Args:
        body: Markup accepted by ``BeautifulSoup``; it is parsed with the
            'html.parser' backend.

    Returns:
        str: The result of ``text_from_soup`` on the parsed document.
    """
    return text_from_soup(BeautifulSoup(body, 'html.parser'))

def extract_technologies(text_info_elem):
    """Collect the alt texts of the images on a job card as one string.

    The function reads the ``alt`` attribute of every ``<img>`` inside the
    card and treats each one as a technology name.

    Args:
        text_info_elem: Parsed job card element exposing ``find_all``.

    Returns:
        str: The alt texts joined with ", " in document order; an empty
        string when the card has no usable image.
    """
    # Images with a missing or empty alt are skipped. Indexing attrs['alt']
    # raised KeyError for them, which made the caller drop the whole job.
    alt_texts = (img.attrs.get('alt') for img in text_info_elem.find_all('img'))
    return ", ".join(alt for alt in alt_texts if alt)

def extract_rating_color(rating_elem):
    """Read the CSS ``color`` value from a rating element's inline style.

    Args:
        rating_elem: Element supporting ``.get("style")`` (for example a
            BeautifulSoup tag), or a falsy value when the card has no rating.

    Returns:
        str | float: The value of the ``color`` declaration with surrounding
        whitespace removed, or ``np.nan`` when the element is missing, has no
        inline style, or the style declares no ``color``.
    """
    if not rating_elem:
        return np.nan

    # .get() instead of ["style"]: an element without inline style yields NaN
    # rather than raising KeyError.
    inline_style = rating_elem.get("style") or ""

    for declaration in inline_style.split(";"):
        prop, separator, value = declaration.partition(":")
        # Compare the whole property name; a substring test for "color:" would
        # also match "background-color:" or "border-color:".
        if separator and prop.strip().lower() == "color":
            return value.strip()

    return np.nan

def extract_rating_value(rating_elem, full_start_string="\uf4b3", half_start_string="\uf4b1"):
    """Turn the star glyphs of a rating element into a numeric rating.

    Args:
        rating_elem: Element exposing ``.text``, or a falsy value when the
            card has no rating.
        full_start_string: Glyph counted as one full star.
        half_start_string: Glyph counted as half a star.

    Returns:
        float: Number of full-star glyphs plus half the number of half-star
        glyphs, or ``np.nan`` when ``rating_elem`` is falsy.
    """
    if not rating_elem:
        return np.nan

    glyphs = rating_elem.text.strip()
    whole_stars = glyphs.count(full_start_string)
    half_stars = glyphs.count(half_start_string)
    return whole_stars + (half_stars / 2)

def extract_job_date(text_info_elem):
    """Read the posting date shown on a job card.

    Args:
        text_info_elem: Parsed job card element exposing ``find``.

    Returns:
        str | float: The stripped text of the date ``<span>``. The relative
        labels 'днес' and 'вчера' are replaced by today's and yesterday's
        date formatted as ``dd.mm.yy``. ``np.nan`` when the span is absent.
    """
    date_span = text_info_elem.find(
        'span',
        attrs={'class': "card__subtitle mdc-typography mdc-typography--overline line-height-1-5"},
    )
    if not date_span:
        return np.nan

    label = date_span.text.strip()

    # Relative labels mapped to how many days before today they refer to.
    days_back = {'днес': 0, 'вчера': 1}.get(label)
    if days_back is None:
        return label

    return (date.today() - timedelta(days=days_back)).strftime("%d.%m.%y")

def extract_job_text(position_soup, iframe_timeout=30):
    """Extract the description text of a job from its detail page.

    Two page layouts are handled: the description is either served inside
    an ``<iframe id="customJobIframe">``, in which case the iframe document
    is downloaded, or embedded in the page around a ``<td class="jobTitle">``.

    Args:
        position_soup: Parsed detail page exposing ``find``.
        iframe_timeout: Seconds to wait for the iframe document before
            ``urlopen`` gives up.

    Returns:
        str | float: The visible text of the description, or ``np.nan`` when
        neither layout is found.

    Raises:
        urllib.error.URLError: If the iframe document cannot be fetched.
    """
    iframe_elem = position_soup.find('iframe', attrs={'id': 'customJobIframe'})
    raw_html_elem = position_soup.find('td', attrs={'class': 'jobTitle'})

    # An iframe without a src is treated like a missing iframe instead of
    # raising KeyError.
    iframe_url = iframe_elem.attrs.get('src') if iframe_elem else None

    if iframe_url:
        # The context manager closes the HTTP response once it is read (it was
        # previously left open); the timeout keeps one stalled server from
        # hanging the whole scrape.
        with urllib.request.urlopen(iframe_url, timeout=iframe_timeout) as job_http_response:
            return text_from_html(job_http_response.read())

    if raw_html_elem:
        # Take the text content of the title cell's grandparent and run it
        # through the same text clean-up as the iframe branch.
        description_container = raw_html_elem.parent.parent
        raw_soup = BeautifulSoup(description_container.text, 'html.parser')
        return text_from_soup(raw_soup)

    return np.nan

def get_pages(soup, jobs_per_page=15):
    """Compute how many result pages the job listing spans.

    The total number of job offers is read as the last whitespace-separated
    token of the counter cell, which is located by its exact inline style.

    Args:
        soup: Parsed listing page exposing ``find``.
        jobs_per_page: Number of job offers shown on one listing page.

    Returns:
        int: Total number of pages (0 when there are no job offers).

    Raises:
        AttributeError: If the counter cell is not present in ``soup``.
        ValueError: If the last token of the cell is not an integer.
    """
    td_searched_style = "height:25px;width:220px;font-weight:500;padding-bottom:5px;"
    counter_cell = soup.find('td', attrs={'style': td_searched_style})
    total_job_number = int(counter_cell.text.split()[-1])

    # Ceiling division. The former int(n / 15) + 1 reported one page too many
    # whenever the job count was an exact multiple of the page size.
    return -(-total_job_number // jobs_per_page)

def extract_job_info_as_array(base_url, text_info_elem, job_company_elem):
    """Gather every scraped field of one job offer into a flat list.

    Listing-card fields are read from ``text_info_elem`` and
    ``job_company_elem``. The job's detail page is then opened with the
    module-level Selenium ``driver`` to read the view counter and the
    description text.

    Args:
        base_url: Site root that the card's relative job link is appended to.
        text_info_elem: Listing cell holding title link, date, rating,
            benefits and technology icons.
        job_company_elem: Listing cell whose text is the company name.

    Returns:
        list: ``[position_id, position_url, company_name, position_views,
        job_text, job_date, rating, rating_color, position, benefits,
        technologies_string]``.

    Raises:
        AttributeError / TypeError: If an expected element (title link, view
            counter, benefits span) is missing from the markup.
    """
    title_class = 'card__title mdc-typography mdc-typography--headline6 text-overflow'
    benefits_class = 'card__subtitle mdc-typography mdc-typography--body2 top-margin'

    # One lookup of the title link serves the id, the url and the position name.
    title_link = text_info_elem.find('a', attrs={'class': title_class})
    job_href = title_link['href']

    # The id is the last path segment of the link. str.lstrip('job/') removes
    # any leading run of the characters j, o, b and / rather than the literal
    # prefix, so it only worked by accident for purely numeric ids.
    position_id = job_href.rstrip('/').rsplit('/', 1)[-1]
    position_url = base_url + job_href
    company_name = job_company_elem.text.strip()

    # Navigate the shared browser to the detail page and parse the rendered DOM.
    driver.get(position_url)
    job_html = driver.execute_script("return document.documentElement.outerHTML")
    position_soup = BeautifulSoup(job_html, 'html.parser')

    # The counter is the last token of the element that wraps the
    # 'Разглеждания:' label.
    position_views = position_soup.find('span', string='Разглеждания:').parent.text.split()[-1]
    job_text = extract_job_text(position_soup)
    job_date = extract_job_date(text_info_elem)

    rating_elem = text_info_elem.find('span', attrs={'class': 'iconed'})
    rating_color = extract_rating_color(rating_elem)
    rating = extract_rating_value(rating_elem)

    position = title_link.text.strip()
    benefits = text_info_elem.find('span', attrs={'class': benefits_class}).text.strip()
    technologies_string = extract_technologies(text_info_elem)

    return [position_id, position_url, company_name, position_views, job_text,
            job_date, rating, rating_color, position, benefits, technologies_string]

In [32]:
  def extract_jobs(start_page, end_page):
      """Scrape listing pages ``start_page`` .. ``end_page - 1`` into a DataFrame.

      Each listing page is loaded with the module-level Selenium ``driver``.
      Its ``td.offerslistRow`` cells are consumed in groups of three per job,
      and every job is expanded through ``extract_job_info_as_array``.
      Scraping is best-effort: a page or job that fails is reported on
      stdout and skipped.

      Args:
          start_page: Zero-based index of the first listing page to scrape.
          end_page: Index one past the last listing page to scrape.

      Returns:
          pandas.DataFrame: One row per scraped job with the columns
          position_id, position_url, company_name, position_views,
          job_description, date_posted, position_rating, rating_color,
          position, benefits, technologies.
      """
      jobs_per_page = 15  # also the step of the `frompage` offset in the url
      tds_per_job = 3     # cells consumed per job: [i] text info, [i+1] unused, [i+2] company

      jobs_list = []

      for page in tqdm.tqdm(range(start_page, end_page)):
          url = f'https://www.jobs.bg/front_job_search.php?frompage={page * jobs_per_page}&add_sh=1&from_hp=1&term=#paging'

          try:
              driver.get(url)
              html = driver.execute_script("return document.documentElement.outerHTML")
              soup = BeautifulSoup(html, 'html.parser')
              job_infos = soup.find_all("td", attrs={"class": "offerslistRow"})
          except Exception as e:
              print(f"Page number: {page}, url: {url}")
              # Python 3 exceptions have no `.message` attribute; formatting
              # the exception itself avoids crashing inside the handler.
              print(f"Unexpected error: {e}")
              # Without this, the loop below would run on an undefined
              # job_infos or on the previous page's rows.
              continue

          if not job_infos:
              print(f"Page number: {page} returned no job rows, url: {url}")

          # Walk only over complete groups of cells that are actually present
          # (capped at one page worth), so a short page does not raise IndexError.
          usable_tds = min(len(job_infos), jobs_per_page * tds_per_job)
          for i in range(0, usable_tds - tds_per_job + 1, tds_per_job):
              try:
                  text_info_elem = job_infos[i]
                  job_company_elem = job_infos[i + 2]

                  current_job_info = extract_job_info_as_array(base_url, text_info_elem, job_company_elem)
                  jobs_list.append(current_job_info)
              except Exception as e:
                  print(f"Curent page: {page}, problem with the {i}, {i+1}, {i+2} tds")
                  print(f"Unexpected error: {e}")

      df_columns = ["position_id", "position_url", "company_name", "position_views", "job_description",
                    "date_posted", "position_rating", "rating_color", "position", "benefits", "technologies"]

      jobs_df = pd.DataFrame(jobs_list, columns=df_columns)

      return jobs_df

In [33]:
# Scrape a single listing page: the half-open range 49..50 covers page 49 only.
df = extract_jobs(49, 50)

100%|██████████| 1/1 [00:22<00:00, 22.53s/it]


In [34]:
# Display the scraped jobs table.
df

Unnamed: 0,position_id,position_url,company_name,position_views,job_description,date_posted,position_rating,rating_color,position,benefits,technologies
0,5670542,https://www.jobs.bg/job/5670542,АБРИТЕС ООД,459,"We are a Bulgarian IT company, established in ...",28.04.21,5.0,#66c1ff,Graphic Designer,София,"Creative Cloud, Photoshop, Illustrator, InDesign"
1,5670713,https://www.jobs.bg/job/5670713,И:ФАО България ЕООД,144,"i:FAO is part of Amadeus IT group, one of the ...",28.04.21,5.0,#66c1ff,iOS Developer,София; Възможност за работа от вкъщи; Дистанци...,"iOS, Jenkins"
2,5670726,https://www.jobs.bg/job/5670726,Modis Bulgaria EOOD,749,Connecting businesses and tech talents in a fa...,28.04.21,5.0,#66c1ff,Vendor and Order Management Specialist,София; Възможност за работа от вкъщи; Дистанци...,Английски
3,5670656,https://www.jobs.bg/job/5670656,СОФАРМА ТРЕЙДИНГ АД,419,SAP Business Analyst Support Consultant We ar...,28.04.21,5.0,#66c1ff,SAP Business Analyst Support Consultant,София,SAP
4,5670711,https://www.jobs.bg/job/5670711,Самекс ЕООД / KFC,207,Фирма САМЕКС ЕООД оперира ресторантите на KFC ...,28.04.21,5.0,#66c1ff,Работник в ресторант,Виноградец,
5,5670603,https://www.jobs.bg/job/5670603,МЕНПАУЪР БЪЛГАРИЯ ООД,409,"Manpower is part of ManpowerGroup, a Fortune 5...",28.04.21,5.0,#66c1ff,React Developer,София; Възможност за работа от вкъщи; Заплата ...,"Английски, JavaScript, React"
6,5670688,https://www.jobs.bg/job/5670688,САЙТЕЛ БЪЛГАРИЯ ЕООД,167,Guided by 35+ years of experience and a custom...,28.04.21,5.0,#66c1ff,Turkish and English Speaking Customer Care Spe...,София; Дистанционно интервю,
7,5670672,https://www.jobs.bg/job/5670672,ЕЙЧ АР ЕС СЪРВИСИЗ БЪЛГАРИЯ ООД,148,"At HRS , we believe the right job can transfor...",28.04.21,5.0,#66c1ff,Data Engineer,София; Дистанционно интервю,"Английски, Python, SQL, MySQL, PostgreSQL, Tab..."
8,5670646,https://www.jobs.bg/job/5670646,Съдърланд Глобъл Сървисиз България ЕООД,58,Technical Support Consultant with Norwegian an...,28.04.21,5.0,#66c1ff,Technical Support Consultant with Norwegian an...,София,
9,5670640,https://www.jobs.bg/job/5670640,Съдърланд Глобъл Сървисиз България ЕООД,115,WE ARE GROWING! Travel advisor for Hilton flu...,28.04.21,5.0,#66c1ff,Travel advisor for Hilton fluent in Polish and...,София,


In [None]:
from pathlib import Path

# DataFrame.to_csv does not create missing folders, so make sure the target
# folder exists first. `parents` is deliberately left off: if Drive is not
# mounted, "My Drive" is missing and the cell still fails loudly instead of
# silently writing to the local disk.
output_path = Path("/content/drive/My Drive/jobsbg_scrapped/df.csv")
output_path.parent.mkdir(exist_ok=True)
df.to_csv(output_path, index=False)