# Installing all the needed packages (Run this only if you're using the Google Collab)

In [None]:
# apt-get handle the installation and removal of software on Debian, Ubuntu, and related Linux distributions
!apt-get update

# installing needed packages
!apt install chromium-chromedriver
!pip install selenium
!pip install beautifulsoup4

# Copy the chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

import sys
# This make sure that the selenium chromedriver is in the path
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')

# Settings all imports

In [4]:
import tqdm
import requests
import urllib.request

import numpy as np
import pandas as pd

from datetime import date, timedelta
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from bs4.element import Comment
# from google.colab import drive

# Moutning Google Drive device

In [5]:
# drive.mount("/content/drive")

# Create a web driver based on operating system. Please change the value of the `OPERATING_SYSTEM` constant to run this notebook on your system

In [6]:
OPERATING_SYSTEM = "windows"

if OPERATING_SYSTEM == "windows":
    executable_path = "drivers/windows/chromedriver.exe"
elif OPERATING_SYSTEM == "mac":
    executable_path = "drivers/mac/chromedriver.exe"
elif OPERATING_SYSTEM == "linux":
    executable_path = "chromedriver"

base_url = 'https://www.jobs.bg/'

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(executable_path=executable_path, options=chrome_options)

url = 'https://www.jobs.bg/front_job_search.php?frompage=0&add_sh=1&from_hp=1&term=#paging'
r = driver.get(url)
html = driver.execute_script("return document.documentElement.outerHTML")
soup = BeautifulSoup(html, 'html.parser')

In [7]:
def tag_visible(element):

    not_visible_elements = ['style', 'script', 'head', 'title', 'meta']
    if element.parent.name in not_visible_elements:
        return False

    if isinstance(element, Comment):
        return False

    return True

def text_from_soup(soup):
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)
    text_string = u" ".join(visible_text.strip() for visible_text in visible_texts)
    text_string = text_string.strip()
    return text_string

def text_from_html(body):
    soup = BeautifulSoup(body, 'html.parser')
    text_string = text_from_soup(soup)
    return text_string

def extract_technologies(text_info_elem):
  
    technologies = []

    for a in text_info_elem.find_all('img'):
        current_technology = a.attrs['alt']
        technologies.append(current_technology)

    technologies = ", ".join(technologies)

    return technologies

def extract_rating_color(rating_elem):
  
    rating_color = np.nan
  
    if rating_elem:
        rataing_elem_styles = rating_elem["style"].split(";")
        rating_elem_color_styles = [x for x in rataing_elem_styles if "color:" in x]
        rating_color = rating_elem_color_styles[0].split(":")[1]
    
    return rating_color

def extract_rating_value(rating_elem, full_start_string="\uf4b3", half_start_string="\uf4b1"):
  
    rating = np.nan

    if rating_elem:
        full_stars_count = rating_elem.text.strip().count(full_start_string)
        half_stars_count = rating_elem.text.strip().count(half_start_string)
        rating = full_stars_count + (half_stars_count / 2)

    return rating

def extract_job_date(text_info_elem):

    class_date_elem = "card__subtitle mdc-typography mdc-typography--overline line-height-1-5"
    date_elem = text_info_elem.find('span', attrs={'class': class_date_elem})
  
    if date_elem:
        job_date = date_elem.text.strip()

        if job_date == 'днес':
            job_date = date.today().strftime("%d.%m.%y")
        elif job_date == 'вчера':
            job_date = (date.today() - timedelta(days=1)).strftime("%d.%m.%y")
    
    return job_date

    return np.nan

def extract_job_text(position_soup):

    job_text = np.nan

    iframe_elem = position_soup.find('iframe', attrs={'id': 'customJobIframe'})
    raw_html_elem = position_soup.find('td', attrs={'class': 'jobTitle'})

    if iframe_elem:
        iframe_url = iframe_elem.attrs['src']
        job_http_response = urllib.request.urlopen(iframe_url)
        job_text = text_from_html(job_http_response)
    elif raw_html_elem:
        raw_html_elem = raw_html_elem.parent.parent
        raw_soup = BeautifulSoup(raw_html_elem.text, 'html.parser')
        job_text = text_from_soup(raw_soup)
  
    return job_text

def get_pages(soup):
    """Get the total number of job offerings and the total number of pages.
    output: total number of pages
    """
    
    td_searched_style = "height:25px;width:220px;font-weight:500;padding-bottom:5px;"
    total_job_number = soup.find('td', attrs={'style': td_searched_style}).text.split()[-1]
    total_job_number = int(total_job_number)
    total_pages = int(total_job_number / 15) + 1 # round up

    return total_pages

def extract_job_info_as_array(base_url, text_info_elem, job_company_elem):

    position_id = text_info_elem.find('a',attrs={'class':'card__title mdc-typography mdc-typography--headline6 text-overflow'})['href'].lstrip('job/')
    position_url = base_url + text_info_elem.find('a',attrs={'class':'card__title mdc-typography mdc-typography--headline6 text-overflow'})['href']
    company_name = job_company_elem.text.strip()

    _ = driver.get(position_url)
    job_html = driver.execute_script("return document.documentElement.outerHTML")
    position_soup = BeautifulSoup(job_html, 'html.parser')

    position_views = position_soup.find('span', string='Разглеждания:').parent.text.split()[-1]
    job_text = extract_job_text(position_soup)
    job_date = extract_job_date(text_info_elem)

    rating_elem = text_info_elem.find('span', attrs={'class':'iconed'})
    rating_color = extract_rating_color(rating_elem)
    rating = extract_rating_value(rating_elem)

    position = text_info_elem.find('a', attrs={'class': 'card__title mdc-typography mdc-typography--headline6 text-overflow'}).text.strip()
    benefits = text_info_elem.find('span', attrs={'class': 'card__subtitle mdc-typography mdc-typography--body2 top-margin'}).text.strip()
    technologies_string = extract_technologies(text_info_elem)

    current_job_info = [position_id, position_url, company_name, position_views, job_text, 
                        job_date, rating, rating_color, position, benefits, technologies_string]
    
    return current_job_info

In [8]:
def extract_jobs(start_page, end_page, category_id):

    jobs_list = []

    for page in tqdm.tqdm(range(start_page, end_page)):

        try:
            # This is the base url used when no filters by categories are applied
            url = "https://www.jobs.bg/front_job_search.php?frompage={}&add_sh=1&from_hp=1&term=#paging".format(page*15)
          
            if category_id:
                url = "https://www.jobs.bg/front_job_search.php?frompage={}&add_sh=1&categories%5B0%5D={}&term=#paging".format(page*15, category_id)
          
            r = driver.get(url)
            html = driver.execute_script("return document.documentElement.outerHTML")
            soup = BeautifulSoup(html, 'html.parser')
            job_infos = soup.find_all("td", attrs = {"class": "offerslistRow"})
        except Exception as e:
            print(f"Page number: {page}, url: {url}")
            print(f"Unexpected error: {e.message}")

        for i in range(0, 45, 3):
            try:
                text_info_elem = job_infos[i]
                job_logo_elem = job_infos[i+1]
                job_company_elem = job_infos[i+2]

                current_job_info = extract_job_info_as_array(base_url, text_info_elem, job_company_elem)
                jobs_list.append(current_job_info)
            except Exception as e:
                print(f"Curent page: {page}, problem with the {i}, {i+1}, {i+2} tds")
                print(f"Unexpected error: {e}")

    df_columns = ["position_id", "position_url", "company_name", "position_views", "job_description",
                  "date_posted", "position_rating", "rating_color", "position", "benefits", "technologies"]

    jobs_df = pd.DataFrame(jobs_list, columns=df_columns)

    return jobs_df

# Extracting the category names and ids so we can perform filters based on category

In [9]:
def extract_categories_dataframe():
  
    url = "https://www.jobs.bg/index.php"
    r = driver.get(url)
    categories_button = driver.find_element_by_id("categoriesChip")
    categories_button.click()

    html = driver.execute_script("return document.documentElement.outerHTML")
    soup = BeautifulSoup(html, 'html.parser')

    categories_elem = soup.find('div',attrs={'id':'categoriesSelectSheet'})
    categories_elements = categories_elem.find_all("div", attrs={"class": "mdc-chip mdc-ripple-upgraded"})

    categories_info = []
    categories_info.append((56, "IT JOBS"))

    for category_element in categories_elements:
        category_id = category_element['id'].split("_")[-1]
        category_name = category_element.text.replace("\n", "").strip()
        category_info = (category_id, category_name)
        categories_info.append(category_info)

    categories_info = pd.DataFrame(categories_info, columns=["category_id", "category_name"])

    return categories_info

In [10]:
categories_info = extract_categories_dataframe()
categories_info

Unnamed: 0,category_id,category_name
0,56,IT JOBS
1,36,"Ресторанти, Заведения, Хотели, Туризъм"
2,45,Търговия и Продажби
3,57,Производство
4,6,"Шофьори, Куриери, Транспорт, Логистика"
5,38,"Административни, Офис и Бизнес дейности"
6,3,"Архитектура, Строителство"
7,29,Инженери и Техници
8,52,Физически/Ръчен труд
9,17,Центрове за обслужване на клиенти и бизнес усл...


In [11]:
df = extract_jobs(start_page=0, end_page=10, category_id=56)
df

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:54<00:00,  5.49s/it]


Unnamed: 0,position_id,position_url,company_name,position_views,job_description,date_posted,position_rating,rating_color,position,benefits,technologies
0,5670852,https://www.jobs.bg/job/5670852,БАКБОУН (БГ) ЕООД,630,IT support Technician (Night Shift) Office Bas...,07.05.21,5.0,#66c1ff,IT support Technician (Night Shift) Office Bas...,София; Заплата от 1800 до 2000 BGN (Нето),"Android, Windows, Active Directory"
1,5663138,https://www.jobs.bg/job/5663138,ТЕК ЕКСПЪРТС ЕООД,259,html Customer Support Specialist ...,07.05.21,5.0,#66c1ff,Customer Support Specialist with English and G...,София,
2,5669897,https://www.jobs.bg/job/5669897,ГРАФИКСОФТ ООД,268,html Grafixoft has 23 years of experience ...,07.05.21,5.0,#66c1ff,SaaS Product System Administrator,София,"AWS, Windows, Linux"
3,5680444,https://www.jobs.bg/job/5680444,MentorMate Bulgaria Ltd.,206,html .NET Team Lead Note: We offe...,07.05.21,5.0,#66c1ff,.NET Team Lead,София; Възможност за работа от вкъщи; Дистанци...,"JavaScript, .NET, Entity Framework, ASP.NET, H..."
4,5672110,https://www.jobs.bg/job/5672110,Си Екс Джи ООД,129,html Technical Support with Germa...,07.05.21,5.0,#66c1ff,Technical Support with German and English,София; Дистанционно интервю,
...,...,...,...,...,...,...,...,...,...,...,...
145,5661944,https://www.jobs.bg/job/5661944,Sixty K Plc - International Contacts Centres,106,html 60K's story is a proof that ...,29.04.21,5.0,#66c1ff,Technical Support Specialist Level I – chat su...,София; Възможност за работа от вкъщи; Дистанци...,"Английски, JavaScript, HTML/CSS, SQL"
146,5661075,https://www.jobs.bg/job/5661075,Sixty K Plc - International Contacts Centres,837,html 60K's story is a proof that...,29.04.21,5.0,#66c1ff,Senior Tier 1 Support Agent,София; Възможност за работа от вкъщи; Дистанци...,"Английски, HTML/CSS"
147,5653591,https://www.jobs.bg/job/5653591,LS Consulting and Services Ltd.,029,html Start-up feeling with security of glo...,29.04.21,5.0,#66c1ff,Java developer,София; Възможност за работа от вкъщи; Дистанци...,"Java, SQL"
148,5673588,https://www.jobs.bg/job/5673588,НЕГОМЕТРИКС БЪЛГАРИЯ ЕООД,265,html Do you want to be the guardian of qual...,29.04.21,5.0,#66c1ff,Software QA Engineer (Automation testing),София,"C#, Java, Selenium, SQL"


In [None]:
# df.to_csv("/content/drive/My Drive/jobsbg_scrapped/it_jobs_20210509.csv", index=False)