In [61]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import time
from fake_useragent import UserAgent
import json

In [62]:

technologies = [
    "JavaScript",
    ".NET",
    "SQL",
    "Java",
    "Python",
    "React",
    "AWS",
    "TypeScript",
    "HTML",
    "Angular",
    "Azure",
    "PHP",
    "C++",
    "Android",
    "Kotlin",
    "Vue.js",
    "iOS",
    "Golang",
    "Spark",
    "Scala",
    "C",
    "Hadoop",
    "Ruby on Rails",
    "Ruby",
    "Flutter",
    "Elixir",
    "C#",
    "React native",
]


In [80]:
LANGUAGE = 'en'
UA = UserAgent()

In [84]:
def strip_whitespace(text):
    return text.strip()

def request_page(url, time_wait=0.0, cookies=None, headers=None, max_tries=5):
    if headers is None:
        headers= {'User-Agent': str(UA.random)}

    time.sleep(time_wait)
    response = requests.get(url, cookies=cookies, headers=headers)

    if response.status_code != 200:
        if max_tries == 0:
            raise RuntimeError(f"Error: {response.status_code} with Url: {url}")
        
        # wait more time, change cookies and headers and try again
        time_wait = 1.0 if time_wait == 0.0 else time_wait * 2
        print(f"Error: {response.status_code} with Url: {url}. Waiting {time_wait} seconds")
        return request_page(url, time_wait=time_wait, max_tries=max_tries-1)
    else:
        cookies = response.cookies

    return response, cookies, headers

<h1>Scraping offer Links</h1>

In [37]:
base_url = 'https://nofluffjobs.com/pl/{technology}?lang={language}&page={page_nr}'


all_offers = {}
unique_id = 1
cookies = None
headers = None

for tech in tqdm(technologies, desc="Tech iteration", position=0):
    url = base_url.format(technology=tech, language=LANGUAGE, page_nr=1)
    response, cookies, headers = request_page(url, cookies=cookies, headers=headers)
    assert response.status_code == 200, f'The request to {url} failed with code {response.status_code}'

    soup = BeautifulSoup(response.text, 'lxml')
    n_pages = int(soup.find_all('a', {'class': 'page-link'})[-2].text)

    for page_nr in tqdm(range(1, n_pages + 1), desc="Pages iteration", position=1):
        current_url = base_url.format(technology=tech, language=LANGUAGE, page_nr=page_nr)
        response, cookies, headers = request_page(current_url, cookies=cookies, headers=headers)
        assert response.status_code == 200, f'The request to {current_url} failed with code {response.status_code}'

        soup = BeautifulSoup(response.text, 'lxml')
        job_offers = soup.find_all('a', {'class': 'posting-list-item'})

        for job_offer in job_offers:
            link = job_offer['href']
            job_offer_id = link.split('/')[-1]

            details = job_offer.find('nfj-posting-item-title', {'class': 'align-items-lg-center'})
            title = details.find('h3').text.strip()
            company = details.find('span', {'class': 'd-block'}).text.strip()

            if job_offer_id not in all_offers:
                all_offers[job_offer_id] = {'Id': unique_id, 'JobTitle': title, 'Company': company, 'Url': link, 'Technologies': [tech]}
                unique_id += 1
            else:
                all_offers[job_offer_id]['Technologies'].append(tech)
            

Pages iteration: 100%|██████████| 26/26 [00:46<00:00,  1.77s/it]
Pages iteration: 100%|██████████| 7/7 [00:12<00:00,  1.86s/it]
Pages iteration: 100%|██████████| 26/26 [00:45<00:00,  1.74s/it]
Pages iteration: 100%|██████████| 20/20 [00:35<00:00,  1.76s/it]
Pages iteration: 100%|██████████| 20/20 [00:34<00:00,  1.74s/it]
Pages iteration: 100%|██████████| 11/11 [00:19<00:00,  1.79s/it]
Pages iteration: 100%|██████████| 16/16 [00:28<00:00,  1.79s/it]
Pages iteration: 100%|██████████| 10/10 [00:17<00:00,  1.77s/it]
Pages iteration: 100%|██████████| 10/10 [00:16<00:00,  1.67s/it]
Pages iteration: 100%|██████████| 9/9 [00:15<00:00,  1.75s/it]
Pages iteration: 100%|██████████| 15/15 [00:25<00:00,  1.71s/it]
Pages iteration: 100%|██████████| 5/5 [00:08<00:00,  1.66s/it]]
Pages iteration: 100%|██████████| 8/8 [00:13<00:00,  1.74s/it]]
Pages iteration: 100%|██████████| 3/3 [00:05<00:00,  1.68s/it]]
Pages iteration: 100%|██████████| 3/3 [00:04<00:00,  1.64s/it]]
Pages iteration: 100%|██████████|

In [46]:
with open('../data/all_offers.json', 'w') as f:
    json.dump(all_offers, f, indent=1)

<h1>Scraping offer descriptions</h1>

In [65]:
with open('../data/all_offers.json', 'r') as f:
    all_offers = json.load(f)

In [97]:
cookies = None
headers = None
base_url = 'https://nofluffjobs.com/{offer_link}?lang={lang}'

for key, job_offer in tqdm(all_offers.items()):
    url = base_url.format(offer_link=job_offer['Url'], lang=LANGUAGE)
    response, cookies, headers = request_page(url, cookies=cookies, headers=headers)
    assert response.status_code == 200, f'The request to {url} failed with code {response.status_code}'

    soup = BeautifulSoup(response.text, 'lxml')
    offer_description = soup.find('common-posting-content-wrapper').find('div', {'class': 'border'})
    offer_description = offer_description.strip()
    all_offers['']

  0%|          | 0/1398 [00:00<?, ?it/s]


In [98]:
all_offers

{'tools-developer-javascript-java-borealis-engineering-solutions-budapest': {'Id': 1,
  'JobTitle': 'Tools developer (Javascript, Java)',
  'Company': 'Borealis Engineering Solutions',
  'Url': '/pl/job/tools-developer-javascript-java-borealis-engineering-solutions-budapest',
  'Technologies': ['JavaScript', 'Java', 'C++', 'Vue.js', 'C']},
 'junior-javascript-developer-4-the-player-remote': {'Id': 2,
  'JobTitle': 'Junior JavaScript Developer',
  'Company': '4 The Player',
  'Url': '/pl/job/junior-javascript-developer-4-the-player-remote',
  'Technologies': ['JavaScript', 'TypeScript', 'HTML']},
 'medior-javascript-developer-gg-development-kft--budapest': {'Id': 3,
  'JobTitle': 'Medior Javascript Developer',
  'Company': 'GG Development Kft.',
  'Url': '/pl/job/medior-javascript-developer-gg-development-kft--budapest',
  'Technologies': ['JavaScript',
   'SQL',
   'React',
   'TypeScript',
   'HTML',
   'Angular']},
 'javascript-developer-gaming-solutions-zone-it-wroclaw': {'Id': 4,
 

In [96]:
print(offer_description.text.strip())

Tools developer (Javascript, Java)  Borealis Engineering Solutions  Category:  Fullstack ,  JavaScript  Mid Must have Java
 JavaScript
 Design Patterns
 CI
 Git
 Maven
 Jenkins
 UI
 English (C2)Nice to have C
 C++
 Eclipse
 Requirements description 
BSc or MSc in Electrical Engineering or Computer Science or similar with 5+ years of experience in software development
Java programming language knowledge
Javascript programming language knowledge
Good working knowledge of object-oriented design and design patterns
Strong knowledge of with version control and CI tools (GIT, Maven, Jenkins).
Strong analytical and communication skills
Good comand in english, both written and oral communication

The following qualifications will be considered a plus:

Experience with Vue.js and Quasar or other front end UI developer frameworks.
Experience with Eclipse RCP (Rich Client Platform) development.
Familiar with Eclipse plug-in development across multiple operating systems

The following qualificatio