In [51]:
# !pip install selenium
# !pip install pymongo
# !pip install numpy
# !pip install BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pymongo import MongoClient
from datetime import datetime
import time
import re
import numpy as np
from bs4 import BeautifulSoup



In [52]:
# Please check README.md first

class WebScraping:
    def __init__(self, query_term='business-analyst'):
        print('Initialise WebScraping instance')
        print('This time we will query with: ', query_term)
        # Corrected the URL to ensure the query term is properly embedded
        self.query_term = query_term
        self.target_url = f"https://nofluffjobs.com/pl/?lang=en&criteria=jobPosition%3D{self.query_term}"
        self.client = MongoClient('mongodb://localhost:27017/')
        self.db = self.client['BD_final']
        self.final_html=''

    def scrape_save_raw_to_db(self, clicks=10):
        driver = webdriver.Chrome()
        driver.maximize_window()
        driver.get(self.target_url)

        wait = WebDriverWait(driver, 10)

        # A. Handle Cookie pop-ups
        try:
            cookie_btn = wait.until(EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler")))
            cookie_btn.click()
            print("Cookies accepted.")
        except:
            print("No cookie banner found or already accepted.")

        # B. Loop to click "See more offers" button 10 times
        count = 0
        while count < clicks:
            try:
                # Find the button with the nfjloadmore attribute
                load_more_btn = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button[nfjloadmore]")))

                # Scroll to the button position to ensure it is in the viewport
                driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", load_more_btn)
                time.sleep(1.5) # Allow some buffer time for scrolling

                # Click to load more
                if load_more_btn:
                    load_more_btn.click()
                count += 1
                print(f"Clicked 'See more' ({count}/{clicks})")

                # Delay for new content to load
                time.sleep(2.5)
            except Exception as e:
                print(f"Finished loading or button not found: {e}")
                break

        # C. Get the complete HTML after 10 clicks and save it
        print("All pages loaded. Capturing final HTML...")
        self.final_html = driver.page_source
        self.save_raw_to_mongodb()

        driver.quit()

    def save_raw_to_mongodb(self):
        """
        Reference your initial logic to store the raw HTML into NoSQL
        """
        # only hold one raw_json html data
        self.db.jobs_raw.drop()
        website_document = {
            'url': self.target_url,
            'content': self.final_html,
            'date': datetime.now(),
            'query_term': self.query_term
        }

        # Store in the jobs_raw collection
        result = self.db.jobs_raw.insert_one(website_document)
        print(f"Raw HTML saved to MongoDB! Document ID: {result.inserted_id}")

    def _parse_salary(self, salary_str):
        """
        Internal helper method: Parse salary string
        Example input: "10 000 - 15 000 PLN", "20 000 PLN", "Undisclosed"
        Output: (min_salary, max_salary)
        """
        if not salary_str or "Undisclosed" in salary_str or "Agreement" in salary_str:
            return None, None

        # Remove spaces and thousands separators
        clean_str = salary_str.replace('\xa0', '').replace(' ', '')

        # Match all digits
        numbers = re.findall(r'\d+', clean_str)

        try:
            if len(numbers) >= 2:
                # Range salary: [10000, 15000]
                return float(numbers[0]), float(numbers[1])
            elif len(numbers) == 1:
                # Fixed salary: [20000]
                return float(numbers[0]), float(numbers[0])
        except Exception:
            pass

        return None, None

    def _parse_location(self, card):
        """
        Internal helper method: Parse location
        """
        location_tag = card.select_one('span.posting-info__location, nfj-posting-item-city')
        if not location_tag:
            return "Unknown"

        loc_text = location_tag.get_text(strip=True)

        # Special case handling: Remote work
        if "Remote" in loc_text or "Zdalna" in loc_text:
            return "Remote"

        # Extract city name (some include '+1', filter via split)
        city = loc_text.split('+')[0].strip()
        return city

    def process_and_save(self):
        """
        Read HTML from jobs_raw and extract fields to store in jobs_processed
        extract raw html and parse fields then save into
        """
        # 1. Get the most recently scraped HTML document
        raw_data = self.db.jobs_raw.find_one(sort=[("date", -1)])
        if not raw_data:
            print("No raw data found in MongoDB!")
            return

        soup = BeautifulSoup(raw_data['content'], 'html.parser')

        # 2. Locate all job cards
        postings = soup.select('a.posting-list-item')
        print(f"Found {len(postings)} job postings in HTML.")

        processed_list = []
        base_domain = "https://nofluffjobs.com" # Used to concatenate the full URL

        for post in postings:
            try:
                # --- Field 1: Job Title ---
                title_el = post.select_one('h3.posting-title__position, .posting-title__can-hide')
                if title_el:
                    # Find and remove any potential "NEW" tags or other badges
                    # NFJ's badge class names usually contain title-badge
                    for badge in title_el.select('.title-badge, .title-badge--new'):
                        badge.decompose()
                    job_title = title_el.get_text(strip=True)
                else:
                    job_title = "N/A"
                print('JOB TITLE:', job_title)

                # --- Field 2: Company Name ---
                company_el = post.select_one('span.d-block, .company-name')
                company_name = company_el.get_text(strip=True) if company_el else "N/A"

                # --- Fields 3 & 4: Min/Max Salary ---
                salary_el = post.select_one('span.text-truncate, nfj-posting-item-salary')
                salary_str = salary_el.get_text(strip=True) if salary_el else ""
                min_sal, max_sal = self._parse_salary(salary_str)

                # --- Field 5: Location ---
                location = self._parse_location(post)

                # --- Field 6: Jump URL (New) ---
                # Get relative path from the <a> tag's href attribute and concatenate domain
                relative_url = post.get('href')
                jump_url = base_domain + relative_url if relative_url else "N/A"

                # Construct document
                job_doc = {
                    'job_title': job_title,
                    'company_name': company_name,
                    'min_salary': min_sal,
                    'max_salary': max_sal,
                    'location': location,
                    'jump_url': jump_url, # Add to the document
                    'processed_at': datetime.now(),
                    'query_term':raw_data['query_term']
                }

                processed_list.append(job_doc)

            except Exception as e:
                print(f"Error parsing a single post: {e}")
                continue

        # 3. Batch save to the new collection
        if processed_list:
            self.db.jobs_processed.drop()
            self.db.jobs_processed.insert_many(processed_list)
            print(f"Successfully processed {len(processed_list)} jobs and saved to 'jobs_processed'.")


# run the WebScrapper
# query and save the raw html data into mongodb
scraper = WebScraping(query_term='business-analyst')
scraper.scrape_save_raw_to_db()
# extract data, and parse to fields and save into db again
scraper.process_and_save()

Initialise WebScraping instance
This time we will query with:  business-analyst
No cookie banner found or already accepted.
Clicked 'See more' (1/10)
Clicked 'See more' (2/10)
Clicked 'See more' (3/10)
Clicked 'See more' (4/10)
Clicked 'See more' (5/10)
Clicked 'See more' (6/10)
Finished loading or button not found: Message: 
Stacktrace:
Symbols not available. Dumping unresolved backtrace:
	0x7ff69e7088e5
	0x7ff69e708940
	0x7ff69e4e165d
	0x7ff69e539a33
	0x7ff69e539d3c
	0x7ff69e58df67
	0x7ff69e58ac97
	0x7ff69e52ac29
	0x7ff69e52ba93
	0x7ff69ea20640
	0x7ff69ea1af80
	0x7ff69ea396e6
	0x7ff69e725de4
	0x7ff69e72ed8c
	0x7ff69e712004
	0x7ff69e7121b5
	0x7ff69e6f7ee2
	0x7ff8c6e2e8d7
	0x7ff8c7a4c53c

All pages loaded. Capturing final HTML...
Raw HTML saved to MongoDB! Document ID: 695bcaccc295a6d335a2164d
Found 123 job postings in HTML.
JOB TITLE: Business Analyst
JOB TITLE: Business Analyst
JOB TITLE: Business Analyst
JOB TITLE: Business Analyst
JOB TITLE: Business-System Analyst
JOB TITLE: Busin