In [2]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import urllib.robotparser
from itertools import chain
import json
#from gensim.summarization import summarize  # gensim had installation problems, so sumy is used instead
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer

In [3]:
def summarize_text(text, language, sentences_count = 10):
    """Build a short extractive summary of ``text`` with sumy's LSA summarizer.

    Args:
        text: Object to summarise; it is converted with ``str()`` first, so a
            parsed HTML document can be passed directly.
        language: Language name handed to sumy's ``Tokenizer``.
        sentences_count: Number of sentences requested from the summarizer.

    Returns:
        The selected sentences joined with single spaces.
    """
    # Remove anything that looks like an HTML tag, then turn double quotes
    # into dashes before the text reaches the tokenizer.
    plain_text = re.sub('<.*?>', '', str(text))
    plain_text = plain_text.replace('"', '-')

    document = PlaintextParser.from_string(plain_text, Tokenizer(language)).document
    picked_sentences = LsaSummarizer()(document, sentences_count)
    return ' '.join(map(str, picked_sentences))

In [4]:
def get_location(company_name):
    """Look up ``company_name`` with the Nominatim (OpenStreetMap) search API.

    Args:
        company_name: Free-text search query. It is converted with ``str()``
            and stripped of surrounding whitespace before being sent.

    Returns:
        ``{"display_name": ...}`` taken from the first search result, or
        ``None`` when the query is empty, the request fails or returns an
        error status, the body is not valid JSON, or nothing was found.
    """
    query = "" if company_name is None else str(company_name).strip()
    if not query:
        # Nothing to search for; skip the network round trip entirely.
        return None

    # Base URL for Nominatim API
    base_url = "https://nominatim.openstreetmap.org/search"
    # Parameters for the search query
    params = {
        "q": query,
        "format": "json",
        "addressdetails": 1  # Include address details in the response
    }
    # NOTE(review): Nominatim's usage policy asks for a User-Agent that
    # identifies the application; this browser-like value is kept unchanged
    # from the original code — consider replacing it.
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"
    }

    try:
        # Without a timeout requests can block forever on a stalled connection.
        response = requests.get(base_url, params = params, headers = headers, timeout = 10)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as error:
        # ValueError covers JSON decoding errors on older requests versions.
        print(f"Nominatim lookup failed for {query!r}: {error}")
        return None

    # A successful search returns a list of result objects; anything else
    # (empty list, error object) is treated as "not found".
    if isinstance(data, list) and data:
        first_result = data[0]
        if isinstance(first_result, dict) and "display_name" in first_result:
            return {"display_name": first_result["display_name"]}
    return None

In [5]:
def it_pracuj():
    """Collect offer links from the it.pracuj.pl listing and scrape each offer.

    The listing URL (with its filters) is hard-coded below. Links that start
    with ``https://www.pracuj.pl/praca/`` are gathered from the listing pages,
    de-duplicated while keeping first-seen order, and passed to
    ``jump_to_link_pracuj``.

    Returns:
        The list returned by ``jump_to_link_pracuj``, or ``None`` when any
        listing page cannot be fetched (network error or non-200 status).
    """
    base_url = "https://it.pracuj.pl/praca/krakow;wp?rd=0&et=17%2C4%2C18&sal=1&its=big-data-science"
    offer_prefix = "https://www.pracuj.pl/praca/"
    request_timeout = 30  # seconds; requests waits indefinitely without one

    def fetch_soup(url):
        # Parsed page, or None on a network error / non-200 response.
        try:
            response = requests.get(url, timeout = request_timeout)
        except requests.RequestException:
            return None
        if response.status_code != 200:
            return None
        return BeautifulSoup(response.text, 'html.parser')

    soup_page = fetch_soup(base_url)
    if soup_page is None:
        print("Failed to it.pracuj.pl")
        return None

    # NOTE(review): when a pagination element is present only pages 1 and 2
    # are visited, even if the listing has more — confirm this is intended.
    pages_number = 2 if soup_page.select('.listing_n1mxvncp.listing_n1mxvncp') else 1

    links_jump = []
    for page_number in range(1, pages_number + 1):
        soup = fetch_soup(base_url + "&pn=" + str(page_number))
        if soup is None:
            print("Failed to it.pracuj.pl")
            return None
        for anchor in soup.select('.c1fljezf .core_n194fgoq'):
            href = anchor.attrs.get('href')
            # Literal prefix test: the dots in the host name must match exactly.
            if isinstance(href, str) and href.startswith(offer_prefix):
                links_jump.append(href)

    # dict.fromkeys removes duplicates but keeps first-seen order, so the
    # result (and the ids assigned later) is stable between runs.
    links_jump = list(dict.fromkeys(links_jump))
    return jump_to_link_pracuj(links_jump)

In [6]:
def _pracuj_amount(raw_text):
    """Parse a salary string from an offer page into a float.

    Everything except digits, commas and dots is removed, commas are treated
    as dots, and only the part before the first dot is kept
    (e.g. ``"12 000,50 zł"`` -> ``12000.0``).

    Raises:
        ValueError: when no digits precede the first separator.
    """
    cleaned = re.sub(r'[^\d,.]', '', str(raw_text)).replace(',', '.')
    return float(cleaned.split('.')[0])


def _pracuj_monthly_gross(amount, period, tax_kind):
    """Scale ``amount`` by the scraper's normalisation factors and round it.

    Hourly rates (``'godz'``/``'hr'``) are multiplied by 168 and net amounts
    (``'netto'``/``'net'``) by 1.23; any other ``period``/``tax_kind``
    (including ``None``) leaves the amount unscaled.
    """
    factor = 1
    if period in ('godz', 'hr'):
        factor *= 168  # presumably working hours per month — TODO confirm
    if tax_kind in ('netto', 'net'):
        factor *= 1.23  # presumably 23% VAT added to a net rate — TODO confirm
    return round(amount * factor)


def _pracuj_seniority(raw_text):
    """Reduce the seniority label of an offer to ``Junior``/``Mid``/``Senior``.

    For comma-separated labels the first entry is used when the text contains
    ``'ekspert'``, otherwise the second. Labels that match none of the known
    phrases are returned stripped but otherwise unchanged.
    """
    seniority = str(raw_text)
    if ',' in seniority:
        parts = seniority.split(',')
        seniority = parts[0] if 'ekspert' in seniority else parts[1]
    # Entries after a comma start with a space; strip so the phrases below match.
    seniority = seniority.strip()

    replacements = (
        ("starszy specjalista (Senior)", "Senior"),
        ("specjalista (Mid / Regular)", "Mid"),
        ("junior specialist (Junior)", "Junior"),
        ("senior specialist (Senior)", "Senior"),
        ("młodszy specjalista (Junior)", "Junior"),
        # Same Polish phrase as it appears when UTF-8 bytes were decoded as Latin-1.
        ("mÅ\x82odszy specjalista (Junior)", "Junior"),
        ("specialist (Mid / Regular)", "Mid"),
    )
    for phrase, short_name in replacements:
        seniority = seniority.replace(phrase, short_name)
    return seniority


def jump_to_link_pracuj(links):
    """Scrape every pracuj.pl offer page in ``links`` into a list of job dicts.

    Salaries are normalised with ``_pracuj_monthly_gross``. When an offer shows
    a single amount instead of a range, that amount is stored (normalised) as
    ``"Min salary"`` and ``"Max salary"`` is ``None``; when it shows no amount
    both are ``None``.

    Args:
        links: Iterable of offer URLs.

    Returns:
        A list with one dict per successfully downloaded offer. Offers whose
        page cannot be downloaded are reported with ``print`` and skipped.

    Raises:
        IndexError: when an expected element (position, company, seniority,
            upper salary bound of a range) is missing from a downloaded page.
    """
    source = "it.pracuj.pl"
    category = "BigData/Data Science"
    currency = "PLN"
    jobs_data = []

    for link in links:
        try:
            # Without a timeout requests can block forever on a stalled connection.
            response = requests.get(link, timeout = 30)
        except requests.RequestException:
            print(f"Nie można pobrać strony: {link}")
            continue
        if response.status_code != 200:
            print(f"Nie można pobrać strony: {link}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')

        position = soup.select('.offer-viewzJYJpV .offer-viewkHIhn3')[0].contents[0]
        company = soup.select('.offer-viewwtdXJ4')[0].contents[0]

        # Salary description, e.g. pay period and gross/net wording. A missing
        # element or unknown wording falls back to "monthly, gross".
        kind_elements = soup.select('.offer-viewSGW6Yi')
        kind = kind_elements[0].contents[0] if kind_elements else ""
        period_match = re.search(r'\b(hr|mth|mies|godz)\.$', kind)
        tax_match = re.search(r'\b(brutto|gross|netto|net)\b', kind)
        mth_or_h = period_match.group(1) if period_match else None
        gross_or_net = tax_match.group(1) if tax_match else None

        lower_elements = soup.select('.offer-viewZGJhIB')
        upper_elements = soup.select('.offer-viewYo2KTr')
        if lower_elements:
            min_salary = _pracuj_monthly_gross(
                _pracuj_amount(lower_elements[0].contents[0]), mth_or_h, gross_or_net)
            max_salary = _pracuj_monthly_gross(
                _pracuj_amount(upper_elements[0].contents[0]), mth_or_h, gross_or_net)
        elif upper_elements:
            # Single amount: parse and normalise it like a range bound instead
            # of storing the raw page text.
            min_salary = _pracuj_monthly_gross(
                _pracuj_amount(upper_elements[0].contents[0]), mth_or_h, gross_or_net)
            max_salary = None
        else:
            min_salary = None
            max_salary = None

        # The seniority label is read from the third '.offer-viewXo2dpV' element.
        seniority = _pracuj_seniority(soup.select('.offer-viewXo2dpV')[2].contents[0])

        skills_all = soup.select(".offer-viewfjH4z3:first-of-type .offer-viewU0gxPf")
        skills = [skill.text.strip() for skill in skills_all]

        jobs_data.append({
            "Źródło": source,
            "Link": link,
            "Pozycja": position,
            "Firma": company,
            "Min salary": min_salary,
            "Max salary": max_salary,
            "Currency": currency,
            "Skills": skills,
            "Category": category,
            "Seniority": seniority,
            "Adres": get_location(company),
            "Podsumowanie": summarize_text(soup, language = 'English'),
        })
    return jobs_data

In [7]:
def justjoin_it():
    """Collect offer links from the justjoin.it listing and scrape each offer.

    The listing URL (with its filters) is hard-coded below. Relative ``href``
    values found on the page are prefixed with ``https://justjoin.it`` and
    passed to ``jump_to_link_justjoin``.

    Returns:
        The list returned by ``jump_to_link_justjoin``, or ``None`` when the
        listing page cannot be fetched (network error or non-200 status).
    """
    url = "https://justjoin.it/krakow/data/experience-level_junior.mid.senior/with-salary_yes"
    try:
        # Without a timeout requests can block forever on a stalled connection.
        response = requests.get(url, timeout = 30)
    except requests.RequestException:
        response = None

    if response is None or response.status_code != 200:
        # Name the site that actually failed (the old message said it.pracuj.pl).
        print("Failed to justjoin.it")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    links = [
        "https://justjoin.it" + anchor['href']
        for anchor in soup.select('.css-4lqp8g ')
        if 'href' in anchor.attrs
    ]
    return jump_to_link_justjoin(links)

In [8]:
def jump_to_link_justjoin(links):
    """Scrape every justjoin.it offer page in ``links`` into a list of job dicts.

    Args:
        links: Iterable of absolute offer URLs.

    Returns:
        A list with one dict per successfully downloaded offer. Offers whose
        page cannot be downloaded are reported with ``print`` and skipped.

    Raises:
        IndexError: when an expected element is missing from a downloaded page.
        ValueError: when a salary bound cannot be converted to ``float``.
    """
    source = "justjoin.it"
    category = "Data"
    jobs_data = []

    for link in links:
        try:
            # Without a timeout requests can block forever on a stalled connection.
            response = requests.get(link, timeout = 30)
        except requests.RequestException:
            print(f"Nie można pobrać strony: {link}")
            continue
        if response.status_code != 200:
            print(f"Nie można pobrać strony: {link}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')

        position = soup.select('.css-wx6bq4')[0].contents[-1].text
        company = soup.select('.css-u51ts9>*:not(:last-of-type)')[0].contents[1]

        seniority = soup.select('.css-15wyzmd')[1].contents[0]
        seniority = seniority.replace(" Senior", "Senior")

        # Query the salary element once; its children hold the lower bound
        # (index 0), the upper bound (index 2) and the currency (index 5).
        salary_parts = soup.select('.css-1pavfqb')[0].contents
        min_salary = float(salary_parts[0].text.replace(" ", ""))
        max_salary = float(salary_parts[2].text.replace(" ", ""))
        currency = salary_parts[5]

        skills = [skill.text.strip() for skill in soup.select('.css-x1xnx3')]

        jobs_data.append({
            "Źródło": source,
            "Link": link,
            "Pozycja": position,
            "Firma": company,
            "Min salary": min_salary,
            "Max salary": max_salary,
            "Currency": currency,
            "Skills": skills,
            "Category": category,
            "Seniority": seniority,
            "Adres": get_location(company),
            "Podsumowanie": summarize_text(soup, language = 'English')
        })
    return jobs_data

In [9]:
def main():
    """Run both scrapers and dump the merged offers to ``jobs_data.json``.

    Nothing is written when either scraper returns ``None``. Otherwise the
    it.pracuj.pl offers come first, followed by the justjoin.it offers, and
    every offer receives a 1-based ``"Id"`` key before the list is saved.
    """
    results = [it_pracuj(), justjoin_it()]
    if any(result is None for result in results):
        return

    pracuj_jobs, justjoin_jobs = results
    all_jobs = pracuj_jobs + justjoin_jobs
    for number, job in enumerate(all_jobs, 1):
        job["Id"] = number

    with open('jobs_data.json', 'w', encoding = 'utf-8') as output_file:
        json.dump(all_jobs, output_file, indent = 4, ensure_ascii = False)

In [10]:
# Run the scrapers only when this code is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()