### Open a browser and set the cookies from a JSON file

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time,json, random, re, datetime
import pandas as pd
pd.options.mode.copy_on_write = True # recommended - https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

# ChromeDriver should match browser version. If outdated download from:
# https://googlechromelabs.github.io/chrome-for-testing/

def setCookiesFromJson():
    """Load cookies from ``cookies.json`` into the browser for BASE_URL's domain.

    BASE_URL is opened first because cookies can only be added for the domain
    of the page that is currently loaded. Every cookie whose ``domain`` equals
    the current domain (optionally prefixed with a dot) is added, and the page
    is refreshed when at least one cookie was added.

    Returns:
        dict: ``{'success': bool, 'functionDone': True, 'message': str}``.
        ``success`` is False when no cookie matched or when any step raised;
        in the latter case the exception text is returned in ``message``
        instead of being propagated ('functionDone' stays True because
        cookies are not essential for the rest of the run).
    """
    try:
        DRIVER.get(BASE_URL) #RUN BROWSER
        domainMatch = re.search(r'^https?://([^/]+)', DRIVER.current_url)
        if domainMatch is None:
            return {'success':False, 'functionDone':True, 'message':'could not read a domain from ' + str(DRIVER.current_url)}
        currentUrlDomain = re.sub(r'^www\.', '', domainMatch.group(1))
        currentUrlDomain = re.sub(r'^\.', '', currentUrlDomain)

        with open('cookies.json', 'r', encoding='utf-8') as inputdata:
            cookies = json.load(inputdata)

        # The domain is escaped and matched in full, so its dots are literal and
        # e.g. "example.com" does not accept "exampleXcom" or "example.com.evil.org".
        domainPattern = re.compile(r'\.?' + re.escape(currentUrlDomain))
        cookiesAdded = 0
        for cookie in cookies: #works only after driver.get
            if domainPattern.fullmatch(cookie.get('domain') or ''): # can only add cookies for current domain
                DRIVER.add_cookie(cookie)
                cookiesAdded += 1

        if cookiesAdded > 0:
            DRIVER.refresh() # to load cookies
            return {'success':True, 'functionDone':True, 'message':'cookies for ' + currentUrlDomain + ' successfully set'}
        return {'success':False, 'functionDone':True, 'message':'no cookies for ' + currentUrlDomain + ' found in cookies.json'}
    except Exception as exception:
        return {'success':False, 'functionDone':True, 'message':str(exception)} # 'functionDone':True because it's not necessary

# Start-up script: creates the module-level Chrome DRIVER used by every function
# in this notebook, opens the listing page and loads the saved cookies.

# service = Service(executable_path="chromedriver.exe")
service = Service(executable_path=r"C:\Users\Ya\Desktop\Kody\Phyhyton\selenium\chromedriver.exe") # for some reason relative path in notebook doesn't work any more

chrome_options = Options()

chrome_options.add_argument("--disable-search-engine-choice-screen")
chrome_options.add_argument("window-size=800,1000")
# chrome_options.add_experimental_option('excludeSwitches', ['enable-logging']) #disable error logging
DRIVER = webdriver.Chrome(service=service, options=chrome_options)

# BASE_URL is the offers listing that the URL-fetching functions scroll through.
BASE_URL = "https://justjoin.it/job-offers/bialystok"
DRIVER.get(BASE_URL)
# NOTE: setCookiesFromJson() opens BASE_URL again itself; its result dict is not checked here.
setCookiesFromJson()

### Fetch the URLs from all the pages - JUSTJOIN

In [None]:
# def getLastOfferIndex():
#     try:
#         DRIVER.execute_script("window.scrollTo(0, document.body.scrollHeight);") # scroll to the bottom
#         offersList = DRIVER.find_element(By.ID, 'up-offers-list')
#         offers = offersList.find_elements(By.XPATH, '//li[@data-index]') # amount depends on screen height 
#         lastIndex = offers[-1].get_attribute('data-index')
#         return lastIndex
#     except Exception as e:
#         print(e)
#         return

def offerNotFound():
    """Tell whether the currently loaded page lacks the offer content container.

    Returns:
        bool: False when the '.MuiBox-root.css-tnvghs' element and a div inside
        it can be located, True when locating them fails for any reason.
    """
    try:
        offerContent = DRIVER.find_element(By.CSS_SELECTOR, '.MuiBox-root.css-tnvghs')
        offerContent.find_element(By.CSS_SELECTOR, 'div') # 1st div
        # NEED TO FIND 'OFFER NOT FOUND MSG AND CHECK WHICH DIVS DOES IT HAVE
        # topDiv = topContainer.find_element(By.XPATH, ".//*[contains(@class, 'css-10x887j')]")
        return False # container found, offer is there
    except Exception:
        # `except Exception` (not a bare except) so Ctrl+C / SystemExit still stop the run
        return True

def anyOffersOnTheList():
    """Check whether the listing page currently renders at least one offer item.

    Returns:
        bool: True when the element with data-test-id "virtuoso-item-list" has
        at least one direct child div carrying a data-index attribute; False
        when it has none or when the lookup raises.
    """
    # The site has switched between the 'up-offers-list' <li data-index> markup
    # (~25.12.2024) and this "virtuoso" markup (back again ~15.01.2025).
    try:
        listContainer = DRIVER.find_element(By.XPATH, '//*[@data-test-id="virtuoso-item-list"]')
        renderedItems = listContainer.find_elements(By.XPATH, 'div[@data-index]')
    except Exception:
        return False
    return len(renderedItems) > 0

# Module-level accumulator of scraped offers: fetchCurrentlyVisibleOffersUrls()
# appends one {'index': ..., 'url': ...} dict per offer it has not saved yet.
OFFERS_URLS = []

def fetchCurrentlyVisibleOffersUrls(): # just the ones currently rendered in browser
    """Append the offers currently rendered in the list to OFFERS_URLS.

    Each rendered item contributes ``{'index': <data-index>, 'url': <href>}``.
    An item is skipped when its link cannot be read or when its index is
    already among the 30 most recently saved entries.

    Returns:
        None. When the list container cannot be located the exception is
        printed and OFFERS_URLS is left untouched.
    """
    try:
        offersList = DRIVER.find_element(By.XPATH, '//*[@data-test-id="virtuoso-item-list"]')
        offers = offersList.find_elements(By.XPATH, 'div[@data-index]') # amount depends on screen height
    except Exception as exception:
        print(exception)
        return

    for offer in offers: # ever-loading div among them
        try:
            index = offer.get_attribute('data-index')
            href = offer.find_element(By.XPATH, ".//div/div/a").get_property("href")
        except Exception:
            # item without a link (e.g. the loading placeholder) - skip it;
            # not a bare except, so Ctrl+C still interrupts the scrape
            continue

        # only the 30 last saved offers are compared (or fewer if less were saved)
        alreadySaved = any(saved['index'] == index for saved in OFFERS_URLS[-30:])
        if not alreadySaved:
            OFFERS_URLS.append({'index':index, 'url':href})

def fetchAllOffersUrls():
    """Scroll through the whole offers list and collect every offer into OFFERS_URLS.

    Starting from the top of the page, the function repeatedly saves the
    currently rendered offers and scrolls down by one window height. It stops
    after 5 consecutive rounds in which the newest saved index did not change
    (or nothing could be saved at all).

    Returns:
        None. Returns immediately when anyOffersOnTheList() reports no offers.
    """
    noNewResultsCounter = 0
    lastSeenIndex = None # index of the newest saved offer after the previous round
    if not anyOffersOnTheList():
        return

    DRIVER.execute_script("window.scrollTo(0, 0);") # scroll to the top
    time.sleep(0.5)

    while True: # endless loop only ending at return
        print('fetchAllOffersUrls while True:')
        print(len(OFFERS_URLS))
        fetchCurrentlyVisibleOffersUrls() # updates OFFERS_URLS
        print(len(OFFERS_URLS))

        if not OFFERS_URLS: # should have results already from the above function execution
            noNewResultsCounter += 1
        elif lastSeenIndex == OFFERS_URLS[-1]['index']:
            # no new offer index found
            noNewResultsCounter += 1
            DRIVER.execute_script("window.scrollBy(0, -2*innerHeight);") # scrolling up first helps the site to load the bottom
            time.sleep(1)
            DRIVER.execute_script("window.scrollBy(0, 3*innerHeight);") # scroll to the bottom
            print('noNewResults')
        else:
            # new offer index found - reset the counter
            noNewResultsCounter = 0

        # CHECK IF READY TO TERMINATE
        if noNewResultsCounter >= 5: # END IF NO NEW RESULTS FEW TIMES
            print('RETURN')
            if OFFERS_URLS: # nothing to report when no offer could be saved
                print('first and last OFFERS_URLS: ', OFFERS_URLS[0]['index'], OFFERS_URLS[-1]['index'])
            return

        #UPDATE LAST INDEX (kept unchanged while the list is still empty)
        if OFFERS_URLS:
            lastSeenIndex = OFFERS_URLS[-1]['index']
        time.sleep(0.5)
        DRIVER.execute_script("window.scrollBy(0, innerHeight);")

# Run the collection now; results end up in the module-level OFFERS_URLS list.
fetchAllOffersUrls()
# fetchCurrentlyVisibleOffersUrls()

### Analyse offer functions - JUSTJOIN

In [None]:
# JUST FOR IMPORTING MODULE IN THE NOTEBOOK
import sys
import os
# Get the current working directory (where the notebook is located)
current_directory = os.getcwd()
# Go one level up (to the parent directory)
parent_directory = os.path.dirname(current_directory)
# Add directory to sys.path
sys.path.append(parent_directory)
# Now you can import module
import settings

def getOfferDetails():
    """Scrape the justjoin.it offer page currently loaded in DRIVER.

    Every field is read best-effort: a field that cannot be located or parsed
    is returned as None instead of aborting the scrape. The first salary line
    is recalculated to a monthly net amount (gross -> net through
    settings.GROSS_TO_NET_MULTIPLIER; yearly, daily and hourly rates are
    scaled to a month) and truncated to ints.

    Returns:
        dict: keys 'datetimeLast', 'datetimeFirst', 'url', 'title',
        'salaryAndContract', 'salaryMin', 'salaryMax', 'employer', 'workModes',
        'positionLevels', 'location', 'techstackExpected', 'techstackOptional',
        'responsibilities', 'requirements', 'optionalRequirements',
        'fullDescription'. All values are None when the offer's top container
        cannot be found.
    """
    resultKeys = (
        'datetimeLast', 'datetimeFirst', 'url', 'title', 'salaryAndContract',
        'salaryMin', 'salaryMax', 'employer', 'workModes', 'positionLevels',
        'location', 'techstackExpected', 'techstackOptional', 'responsibilities',
        'requirements', 'optionalRequirements', 'fullDescription',
    )

    # BASIC PARAMETERS WHICH SHOULD ALWAYS BE NOT EMPTY ON THE SITE
    try:
        offerContent = DRIVER.find_element(By.CSS_SELECTOR, '.MuiBox-root.css-tnvghs')
        topContainer = offerContent.find_element(By.CSS_SELECTOR, 'div')
        topDiv = topContainer.find_element(By.XPATH, ".//*[contains(@class, 'css-10x887j')]") # .// = as deep as necessary
    except Exception:
        # no point of continuing, but has to return a dictionary of nones
        return dict.fromkeys(resultKeys)

    try:
        jobTitle = topDiv.find_element(By.CSS_SELECTOR, 'h1').text
    except Exception:
        jobTitle = None

    # EMPLOYER
    employerAndLocationDiv = None
    try:
        employerAndLocationDiv = topDiv.find_element(By.CSS_SELECTOR, '.MuiBox-root.css-yd5zxy')
        # This one field occasionally raises StaleElementReferenceException (the element
        # got detached from the DOM by a page update); the container itself stays usable
        # for the location lookups below.
        employer = employerAndLocationDiv.find_element(By.XPATH, './/h2').text # look for h2 as deep as necessary
    except Exception:
        employer = None

    # LOCATION
    location = None
    if employerAndLocationDiv is not None:
        try:
            location = employerAndLocationDiv.find_elements(By.CSS_SELECTOR, '.MuiBox-root.css-mswf74')[1].text # first one is employer
            location = re.sub(r'\+[0-9]+$', '', location) #remove '+x' where x is int
        except Exception:
            location = None
        #try clicking for more locations
        try:
            # NOTE(review): both XPaths start with '//' and therefore search the whole
            # document, not only the element they are called on - confirm this is intended.
            locationButton = employerAndLocationDiv.find_element("xpath", '//*[@name="multilocation_button"]')
            locationButton.click()
            locationsMenu = offerContent.find_element("xpath", '//ul[@role="menu"]')
            location = (location or '') + '\n' + locationsMenu.text # TEXT EMPTY WHEN MINIMIZED!
        except Exception:
            pass # no multi-location button - keep the single location

    #SALARY
    try:
        salaryAndContract = topContainer.find_element(By.CSS_SELECTOR, '.MuiBox-root.css-1km0bek').text
    except Exception:
        salaryAndContract = None

    salaryMinAndMax = [None, None] # Nones as these are INTs in DB
    if salaryAndContract is not None:
        try: #to recalculate salary to [PLN/month net]
            hoursPerMonthInFullTimeJob = 168
            salaryLines = salaryAndContract.splitlines()
            minAndMaxLine = salaryLines[0] # There could be multiple salaries depending on contract type though. It will be in salaryAndContract anyway
            secondLine = salaryLines[1]

            # Work on a local list so a failure half-way cannot leave strings or
            # partially converted values in salaryMinAndMax.
            amounts = []
            for part in re.split(r'[-–]', minAndMaxLine)[:2]: # split on dash for min and max
                part = re.sub(r"\s", '', part) # remove spaces (incl. non-breaking ones)
                part = re.sub(r",\d{1,2}", '', part) # drop decimals (needed when salary as 123,45)
                amounts.append(float(re.search(r"\d+", part).group()))
            if len(amounts) == 1: # some offers provide just 1 extremum
                amounts.append(amounts[0])

            # gross -> net
            if "brutto" in secondLine or "gross" in secondLine:
                amounts = [amount * settings.GROSS_TO_NET_MULTIPLIER for amount in amounts]
            # year -> month
            if "year" in secondLine or "rok" in secondLine:
                amounts = [amount / 12 for amount in amounts]
            # day -> month
            if "day" in secondLine or "dzień" in secondLine:
                amounts = [amount * hoursPerMonthInFullTimeJob / 8 for amount in amounts]
            # hr -> month
            if "hour" in secondLine or "/h" in secondLine or "godz" in secondLine:
                amounts = [amount * hoursPerMonthInFullTimeJob for amount in amounts]

            salaryMinAndMax = [int(amount) for amount in amounts] # to ints
        except Exception:
            pass # unparsable salary - leave [None, None]

    workModes = None
    positionLevels = None

    try:
        # MuiBox-root css-ktfb40
        fourRectanglesContainer = offerContent.find_elements(By.XPATH, "./div")[1] # only child divs, not grandchild or further - 1 level down
        rectangleTexts = [
            # second child div of the second child div (not grandchild or further)
            rectangle.find_elements(By.XPATH, "./div")[1].find_elements(By.XPATH, "./div")[1].text
            for rectangle in fourRectanglesContainer.find_elements(By.XPATH, "./div")
        ]
        salaryAndContract = (salaryAndContract or '') + '\n' + rectangleTexts[0] + ' | ' + rectangleTexts[2]
        positionLevels = rectangleTexts[1]
        workModes = rectangleTexts[3]
    except Exception as exception:
        print(exception)

    #TECHSTACK
    techstackExpected, techstackOptional = None, None
    try:
        # selector changed 04.2025 (previously '.MuiBox-root.css-qal8sw')
        techstackDiv = offerContent.find_elements(By.CSS_SELECTOR, '.MuiStack-root.css-6r2fzw')[0]

        technologies = techstackDiv.find_elements(By.XPATH, './/h4') # look for h4 in all children elements
        levels = techstackDiv.find_elements(By.XPATH, './/span') # look for span in all children elements
        expectedTechs, optionalTechs = [], []
        for technology, level in zip(technologies, levels):
            levelText = level.text
            techWithLvl = technology.text + ' - ' + levelText
            if levelText == 'Nice To Have':
                optionalTechs.append(techWithLvl)
            else: # junior/regular/advanced/master
                expectedTechs.append(techWithLvl)
        if optionalTechs:
            techstackOptional = '\n'.join(optionalTechs)
        if expectedTechs:
            techstackExpected = '\n'.join(expectedTechs)
    except Exception as exception:
        print(exception)

    fullDescription = None
    responsibilities, requirements, optionalRequirements = None, None, None

    optionalRequirementsKeywords = ['nice to', 'optional', 'ideal', 'preferr', 'asset', 'appreciat', 'atut', 'dodatk', 'mile widzi']
    requirementsKeywords = ['require', 'expect', 'skill', 'look', 'qualifications', 'must', 'competen', 'wymaga', 'oczek', 'umiejętn', 'aplikuj, jeśli', 'potrzeb', 'szukamy', 'kompeten']
    responsibilitiesKeywords = ['responsib', 'task', 'role', 'project', 'obowiązk', 'zadani', 'projek']
    # only used to cut the text, so the "what we offer" part is not glued to the previous section
    whatTheyOfferKeywords = ['offer', 'benefit', 'oferuj', 'oferow']

    # Order matters: optional requirements are checked first as they are more specific
    # than requirements and contain similar keywords.
    allKeywordsDict = {'optionalRequirementsKeywords':optionalRequirementsKeywords, 'requirementsKeywords':requirementsKeywords, 'responsibilitiesKeywords':responsibilitiesKeywords}

    def splitTextByKeywords(text, keywords):
        """Cut text into paragraphs, each starting at a line that contains a keyword."""
        lines = text.split("\n")  # Split text into lines
        loweredKeywords = [keyword.lower() for keyword in keywords]
        keywordIndices = [i for i, line in enumerate(lines) if any(keyword in line.lower() for keyword in loweredKeywords)]
        # If no keywords found, return the original text as one paragraph
        if not keywordIndices:
            return [text]

        paragraphs = []
        startIndex = 0
        for keywordIndex in keywordIndices:
            paragraphs.append("\n".join(lines[startIndex:keywordIndex]).strip())  # Capture paragraph before keyword
            startIndex = keywordIndex  # Update start for next section
        paragraphs.append("\n".join(lines[startIndex:]).strip())  # Capture the last paragraph
        return paragraphs

    try:
        # selector changed 31.03.2025 (previously the second '.MuiBox-root.css-qal8sw')
        descriptionDiv = offerContent.find_element(By.CSS_SELECTOR, '.MuiBox-root.css-rcazos')

        # Remove empty lines (including those with spaces)
        fullDescription = re.sub(r'^\s*\n', '', descriptionDiv.text, flags=re.MULTILINE)

        keywords = optionalRequirementsKeywords + requirementsKeywords + responsibilitiesKeywords + whatTheyOfferKeywords # just concat keywords lists as they will be assigned to a category later
        paragraphsByCategory = {category: [] for category in allKeywordsDict}
        for paragraph in splitTextByKeywords(fullDescription, keywords):
            if not paragraph.strip():
                continue # don't try to analyze an empty string
            header = paragraph.splitlines()[0] # keywords are looked for in the 1st line only
            for category, categoryKeywords in allKeywordsDict.items():
                if any(re.search(re.escape(keyword), header, re.IGNORECASE) for keyword in categoryKeywords):
                    # first matching category wins, so a paragraph is stored exactly once
                    paragraphsByCategory[category].append(paragraph)
                    break

        optionalRequirements = '\n'.join(paragraphsByCategory['optionalRequirementsKeywords']) or None
        requirements = '\n'.join(paragraphsByCategory['requirementsKeywords']) or None
        responsibilities = '\n'.join(paragraphsByCategory['responsibilitiesKeywords']) or None
    except Exception:
        pass # description not found - leave Nones

    datetimeNow = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    return {'datetimeLast':datetimeNow, 'datetimeFirst':datetimeNow, 'url':DRIVER.current_url, 'title':jobTitle, 'salaryAndContract':salaryAndContract, 'salaryMin':salaryMinAndMax[0], 'salaryMax':salaryMinAndMax[1], 'employer':employer, 'workModes':workModes, 'positionLevels':positionLevels, 'location':location, 'techstackExpected':techstackExpected, 'techstackOptional':techstackOptional, 'responsibilities':responsibilities, 'requirements':requirements, 'optionalRequirements':optionalRequirements, 'fullDescription':fullDescription}

# testing below
# Manual smoke test: load one sample justjoin.it offer and parse it with the
# getOfferDetails() defined in this cell.
DRIVER.get('https://justjoin.it/job-offer/itlt-senior-tech-lead-python-developer-warszawa-python')

# DRIVER.switch_to.window(DRIVER.window_handles[-1])
getOfferDetails()

### Analyse offer functions - THEPROTOCOL

In [None]:
# JUST FOR IMPORTING MODULE IN THE NOTEBOOK
import sys
import os
# Get the current working directory (where the notebook is located)
current_directory = os.getcwd()
# Go one level up (to the parent directory)
parent_directory = os.path.dirname(current_directory)
# Add directory to sys.path
sys.path.append(parent_directory)
# Now you can import module
import settings

def getOfferDetails():
    #JOB TITLE
    try:
        jobTitle = DRIVER.find_element(By.XPATH, '//*[@data-test="text-offerTitle"]') # this element should always exist
        jobTitle = jobTitle.text
    except:
        jobTitle = None
    
    #SALARY
    try:
        salaryContainer = DRIVER.find_element(By.XPATH, '//*[@data-test="section-contract"]') # this element should always exist
        salaryAndContract = salaryContainer.text
        # print(salaryAndContract)
        # print(salaryAndContract  + '\n')
    except:
        salaryAndContract = None
    
    salaryMinAndMax = [None, None] # Nones as these are INTs in DB
    if salaryAndContract != None:
        try: #to recalculate salary to [PLN/month net] #PLN=only unit on protocol?
            hoursPerMonthInFullTimeJob = 168
            lines = salaryAndContract.splitlines()
            if len(lines) >= 3: #should be 2-3 tho
                lines[0] = lines[0].replace(" ", "") #remove spaces
                lines[0] = re.sub(r",\d{1,2}", '', lines[0]) #removes dash and /d x(1-2)  (needed when salary as 123,45)
                salaryMinAndMax = re.findall(r"\d+", lines[0]) #r = raw
                # print(salaryMinAndMax.split(',', 1)[0])
                # salaryUnit = re.findall(r"[^\d–-]", lines[0]) #[exclude digits and –/-]
                # salaryUnit = ''.join(salaryUnit) #join list elements
                if re.findall("brutto", lines[1]) or re.findall("gross", lines[1]): # gross -> net
                    salaryMinAndMax = [(float(elmnt) * settings.GROSS_TO_NET_MULTIPLIER) for elmnt in salaryMinAndMax]
                    # print(salaryMinAndMax)
                if re.findall("godz", lines[1]) or re.findall("hr.", lines[1]): # hr -> month
                    salaryMinAndMax = [(float(elmnt) * hoursPerMonthInFullTimeJob) for elmnt in salaryMinAndMax] #possible input float/str

                if salaryMinAndMax[1] == None: # some offers provide just 1 extremum
                    salaryMinAndMax[1] = salaryMinAndMax[0]
                salaryMinAndMax = [int(elmnt) for elmnt in salaryMinAndMax] # to ints
        except:
            pass    # salaryMinAndMax = [None, None]

    # EMPLOYER
    try:
        employerElement = DRIVER.find_element("xpath", '//*[@data-test="anchor-company-link"]') # this element should always exist
        employer = employerElement.text # + ' ' + employerElement.get_property("href")
        employer = re.sub('company: |firma: ', '', employer, flags=re.IGNORECASE).strip()

    except:
        employer= None
    # print(employer  + '\n')
    
    # WORKFROM, EXP, VALIDTO, LOCATION - "PARAMETERS"
    workModes, positionLevels, location = None, None, None
    try:
        parametersContainer = DRIVER.find_element(By.CLASS_NAME, "c21kfgf")
        parameters = parametersContainer.find_elements(By.CLASS_NAME, "s1bu9jax")
        for param in parameters:
            paramType = param.get_attribute("data-test") #element description
            match paramType:
                case "section-workModes":
                    workModes = param.text
                case "section-positionLevels":
                    positionLevels = param.text
                # case "section-offerValidTo":
                #     offerValidTo = param.text
                case "section-workplace":
                    location = param.text
                    try: #to find and click 'more locations' button then fetch what's inside
                        moreLocations = DRIVER.find_element("xpath", '//*[@data-test="button-locationPicker"]')
                        moreLocations.click()
                        locations = moreLocations.find_element("xpath", '//*[@data-test="modal-locations"]')
                        location = locations.text
                    except:
                        pass #leave location as it was
        # print(workModes + '\n\n' + positionLevels + '\n\n' + '\n\n' +  location + '\n')
    except:
        pass # leave Nones
    
    # IF STILL NOT FOUND, TRY SEARCHING NEW HTML ELEMENTS (04.2025)
    if location == None: # checking location, as it's the toughest one to gather
        parametersContainer = DRIVER.find_element(By.CLASS_NAME, "m1vgkec8")
        parameters = parametersContainer.find_elements(By.CLASS_NAME, "b12rofz")
        # print(parameters)
        for param in parameters:
            paramType = param.text #element description
            match paramType:
                case thisCase if any(keyword in thisCase.lower() for keyword in ("mode", "tryb")):
                    lines = param.text.splitlines()
                    workModes = "".join(lines[1:])  # Join all lines except the first one (param description)
                case thisCase if any(keyword in thisCase.lower() for keyword in ("level", "poziom")):
                    lines = param.text.splitlines()
                    positionLevels = "".join(lines[1:])
                case _: # IF IT'S NOT MODE OR LEVEL, IT MUST BE LOCATION DIV
                    location = param.text # fine for a single location
                    # remove description keyword
                    location = re.sub('location:|lokalizacja:', '', location, flags=re.IGNORECASE).strip()
                    # TRY CLICKING 'MORE' BUTTON
                    try: #to find and click 'more locations' button then fetch what's inside
                        moreLocations = param.find_element("xpath", '//button[@class="m8ercsp"]')
                        moreLocations.click()
                        locations = moreLocations.find_element("xpath", '//*[@class="mtlwq3f"]')
                        location = locations.text # overwrites a single one
                        location = re.sub('view on map', '', location, flags=re.IGNORECASE).strip()
                    except:
                        pass # leave location as it was


    #TECHSTACK
    techstackExpected, techstackOptional = None, None
    try:
        descriptionsContainer = DRIVER.find_element(By.CSS_SELECTOR, '#TECHNOLOGY_AND_POSITION')
        techstack = descriptionsContainer.find_elements(By.CLASS_NAME, "c1fj2x2p")
        for group in techstack:
            if re.search('expected|wymagane', group.text.lower()):
                lines = group.text.splitlines()
                techstackExpected = "\n".join(lines[1:]) # join lines except first one
            if re.search('optional|mile widziane', group.text.lower()):
                lines = group.text.splitlines()
                techstackOptional = "\n".join(lines[1:])
        # print(str(techstackExpected) + '\n\n' + str(techstackOptional) + '\n')
    except:
        pass # leave Nones
    # IF STILL NOT FOUND, TRY SEARCHING NEW HTML ELEMENTS (04.2025)
    if techstackExpected == None:
        try:
            technologiesContainer = DRIVER.find_element("xpath", '//*[@data-test="section-technologies"]')
            techstack = technologiesContainer.find_elements("xpath", './div') # divs 1 level down
            for group in techstack:
                if re.search('expected|wymagane', group.text.lower()):
                    lines = group.text.splitlines()
                    techstackExpected = "\n".join(lines[1:]) # join lines except first one
                if re.search('optional|mile widziane', group.text.lower()):
                    lines = group.text.splitlines()
                    techstackOptional = "\n".join(lines[1:])
            # print(str(techstackExpected) + '\n\n' + str(techstackOptional) + '\n')
        except:
            pass # leave Nones


    #RESPONSIBILITIES
    try:
        try:
            responsibilities = descriptionsContainer.find_element("xpath", '//*[@data-test="section-responsibilities"]/ul').text #/only ul elements
        except:
            responsibilities = descriptionsContainer.find_element("xpath", '//*[@data-test="section-responsibilities"]').text #/if it's a single entry
    except:
        responsibilities= None
        # print('RESPONSIBILITIES:\n' + str(responsibilities) + '\n' + driver.current_url)
    # IF STILL NOT FOUND, TRY SEARCHING NEW HTML ELEMENTS (04.2025)
    if responsibilities == None:
        try:
            responsibilities = DRIVER.find_element("xpath", '//*[@data-test="section-responsibilities"]').text
            # responsibilities = "\n".join(responsibilities.split('\n')[1:]) # remove 1st line
        except:
            responsibilities= None
            # print('RESPONSIBILITIES:\n' + str(responsibilities))

    #REQUIREMENTS
    try:
        try:
            requirements = descriptionsContainer.find_element("xpath", '//*[@data-test="section-requirements"]/ul').text
        except:
            requirements = descriptionsContainer.find_element("xpath", '//*[@data-test="section-requirements"]').text #/if it's a single entry
    except:
        requirements= None
        # print('REQUIREMENTS:\n' + str(requirements) + '\n' + driver.current_url)
    # IF STILL NOT FOUND, TRY SEARCHING NEW HTML ELEMENTS (04.2025)
    if requirements == None:
        try:
            requirements = DRIVER.find_element("xpath", '//*[@data-test="section-requirements-expected"]').text
            # requirements = "\n".join(requirements.split('\n')[1:]) # remove 1st line
        except:
            requirements= None
            # print('REQUIREMENTS:\n' + str(requirements))


    #OPTIONAL REQUIREMENTS
    try:
        optionalRequirementsContainer = descriptionsContainer.find_elements("xpath", '//*[@data-test="section-requirements-optional"]/li')
        if len(optionalRequirementsContainer) > 0:
            optionalRequirements = ''
            for optionalRequirement in optionalRequirementsContainer:
                optionalRequirements += optionalRequirement.text + '\n'
        elif len(optionalRequirementsContainer) == 0:
            try:
                optionalRequirements = descriptionsContainer.find_element("xpath", '//*[@data-test="section-requirements-optional"]').text
            except:
                optionalRequirements= None
                # print('OPTIONAL:\n' + str(optionalRequirements) + '\n' + driver.current_url)        
    except:
        optionalRequirements= None
    # IF STILL NOT FOUND, TRY SEARCHING NEW HTML ELEMENTS (04.2025)
    if optionalRequirements == None:
        try:
            optionalRequirements = DRIVER.find_element("xpath", '//*[@data-test="section-requirements-optional"]').text
        except:
            optionalRequirements = None
    # print('OPTIONAL:\n' + str(optionalRequirements) + '\n' + driver.current_url)

    datetimeNow = str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

    # FULL DESCRIPTION
    fullDescription = None
    try:
        fullDescription = DRIVER.find_element(By.CSS_SELECTOR, '#TECHNOLOGY_AND_POSITION').text
    except:
        fullDescription = None
    # IF STILL NOT FOUND, TRY SEARCHING NEW HTML ELEMENTS (04.2025)
    if fullDescription == None:   
        try:
            fullDescription = DRIVER.find_element(By.CSS_SELECTOR, '#REQUIREMENTS').text
        except:
            fullDescription = None    

    return {'datetimeLast':datetimeNow, 'datetimeFirst':datetimeNow, 'url':DRIVER.current_url, 'title':jobTitle, 'salaryAndContract':salaryAndContract, 'salaryMin':salaryMinAndMax[0], 'salaryMax':salaryMinAndMax[1], 'employer':employer, 'workModes':workModes, 'positionLevels':positionLevels, 'location':location, 'techstackExpected':techstackExpected, 'techstackOptional':techstackOptional, 'responsibilities':responsibilities, 'requirements':requirements, 'optionalRequirements':optionalRequirements, 'fullDescription':fullDescription}

# Manual smoke test. All sample DRIVER.get(...) calls below are commented out, so
# getOfferDetails() parses whatever page DRIVER currently has loaded.
# DRIVER.get('https://theprotocol.it/szczegoly/praca/programista---programistka-python-kielce,oferta,edf50000-0713-3634-26bd-08dd72be8965')
# DRIVER.get('https://theprotocol.it/szczegoly/praca/programista---programistka-python-kielce,oferta,edf50000-0713-3634-26bd-08dd72be8965?s=8231999580&searchId=9171f050-10af-11f0-95fd-733af48622c0')
# DRIVER.get('https://theprotocol.it/szczegoly/praca/python-ai-developer-katowice,oferta,07d70000-4902-b661-c56b-08dd71de3699?s=8231999580&searchId=9171f050-10af-11f0-95fd-733af48622c0')

# DRIVER.switch_to.window(DRIVER.window_handles[-1])
getOfferDetails()

## testing element search

In [None]:
# TESTING ELEMENT SEARCH
# Scratch cell: load one sample theprotocol.it offer and print the text of the
# element with id "REQUIREMENTS" to check that the selector still matches.
# The lookup is not guarded, so it raises when the element is missing.

# DRIVER.get('https://theprotocol.it/szczegoly/praca/expert-haskell-developer-warszawa,oferta,edf50000-0713-3634-b34f-08dd7281c08f?s=8231999580&searchId=9171f050-10af-11f0-95fd-733af48622c0')
# DRIVER.get('https://theprotocol.it/szczegoly/praca/java-developer-in-test-suite-project-smartthings-warszawa-plac-europejski-1,oferta,07d70000-4902-b661-242c-08dd72871111?s=8231999580&searchId=9171f050-10af-11f0-95fd-733af48622c0')
# DRIVER.get('https://theprotocol.it/szczegoly/praca/python-ai-developer-katowice,oferta,07d70000-4902-b661-c56b-08dd71de3699?s=8231999580&searchId=9171f050-10af-11f0-95fd-733af48622c0')
DRIVER.get('https://theprotocol.it/szczegoly/praca/programista---programistka-python-kielce,oferta,edf50000-0713-3634-26bd-08dd72be8965?s=8231999580&searchId=9171f050-10af-11f0-95fd-733af48622c0')


fullDescription = DRIVER.find_element(By.CSS_SELECTOR, '#REQUIREMENTS').text

print(fullDescription)


### Scraping to database

In [None]:
from databaseFunctions import Database
from settings import DATABASE_COLUMNS
# import numpy as np

def scrapToDatabase():
    """Visit every URL in OFFERS_URLS and store the offer in the database.

    For each entry the page is opened in DRIVER, then:
      * if offerNotFound() reports the page as missing, the URL is printed and skipped;
      * if Database.recordFound() already knows the current URL, only
        Database.updateDatetimeLast() is called for it;
      * otherwise the result of getOfferDetails(), reduced to the keys listed in
        DATABASE_COLUMNS, is passed to Database.insertRecord().

    Progress is printed after every processed offer and the insert/update
    totals are printed at the end. Nothing is returned.
    """
    inserts = 0
    updates = 0
    # f-string formatting works whether countAllRecords() returns a str or a number
    print(f'{Database.countAllRecords()} records before run')
    offersTotal = len(OFFERS_URLS)
    for offerNumber, offer in enumerate(OFFERS_URLS, start=1):
        DRIVER.get(offer['url'])
        if offerNotFound():
            print('OFFER NOT FOUND: ' + DRIVER.current_url)
            continue

        offerDetailsDict = getOfferDetails()
        if Database.recordFound(DRIVER.current_url):
            Database.updateDatetimeLast(DRIVER.current_url)
            updates += 1
        else:
            # getOfferDetails() can return more keys than the custom shortened DB has columns,
            # so keep only the keys that appear in both
            commonKeysDict = {key: offerDetailsDict[key] for key in DATABASE_COLUMNS if key in offerDetailsDict}
            Database.insertRecord(commonKeysDict)
            inserts += 1
        print(f'{offerNumber}/{offersTotal} done')
    print(f'{inserts} inserts | {updates} updates')

# Run the scrape over every entry of OFFERS_URLS
scrapToDatabase()

### Test code

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time,json, random, re, datetime
import pandas as pd
pd.options.mode.copy_on_write = True # recommended - https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

# ChromeDriver should match browser version. If outdated download from:
# https://googlechromelabs.github.io/chrome-for-testing/

def setCookiesFromJson():
    """Load the cookies stored in cookies.json into the browser for the current site.

    BASE_URL is opened first because cookies can only be added after a page has
    been loaded, and only for the domain of that page. Every cookie whose
    'domain' equals the current domain (optionally with one leading dot) is
    added, and the page is refreshed so the cookies take effect.

    Returns:
        dict: {'success': bool, 'functionDone': True, 'message': str}.
        Exceptions are not raised; they are caught and reported in 'message'.
    """
    try:
        DRIVER.get(BASE_URL) #RUN BROWSER
        # Reduce the current URL to its host, without a leading 'www.' or a leading dot
        currentUrlDomain = re.search(r'^https?://([^/]+)', DRIVER.current_url).group(1)
        currentUrlDomain = re.sub(r'^www\.', '', currentUrlDomain)
        currentUrlDomain = re.sub(r'^\.', '', currentUrlDomain)

        # Read the file fully and close it before talking to the browser
        with open('cookies.json', 'r', newline='') as inputdata:
            cookies = json.load(inputdata)

        # The domain is escaped so its dots are literal, and the whole cookie domain must match:
        # look-alike domains (e.g. 'example.com.evil.org') are no longer passed to add_cookie
        domainPattern = re.compile(r'\.?' + re.escape(currentUrlDomain))
        cookiesAdded = 0
        for cookie in cookies: #works only after driver.get
            if domainPattern.fullmatch(cookie['domain']): # can only add cookies for current domain
                DRIVER.add_cookie(cookie)
                cookiesAdded += 1

        if cookiesAdded == 0:
            return {'success':False, 'functionDone':True, 'message':'no cookies for ' + currentUrlDomain + ' found in cookies.json'}
        DRIVER.refresh() # to load cookies
        return {'success':True, 'functionDone':True, 'message':'cookies for ' + currentUrlDomain + ' successfully set'}
    except Exception as exception:
        return {'success':False, 'functionDone':True, 'message':str(exception)} # 'functionDone':True because it's not necessary

# ChromeDriver binary, given as a relative path
service = Service(executable_path="chromedriver.exe")
chrome_options = Options()

# Chrome switches: presumably hides the search-engine choice screen, and sets the window size to 800,1000
chrome_options.add_argument("--disable-search-engine-choice-screen")
chrome_options.add_argument("window-size=800,1000")
# chrome_options.add_experimental_option('excludeSwitches', ['enable-logging']) #disable error logging
# Page that setCookiesFromJson() opens before adding cookies
BASE_URL = "https://justjoin.it/job-offer/relativity-senior-software-engin"

# Browser start-up is disabled in this cell - uncomment to launch Chrome, open BASE_URL and load the cookies
# DRIVER = webdriver.Chrome(service=service, options=chrome_options)
# DRIVER.get(BASE_URL)
# setCookiesFromJson()

# Find elements containing specific text

def offerNotFound():
    """Tell whether the page loaded in DRIVER shows the "cannot display this page" notice.

    Every element with class "css-czlivx" is inspected; when the text of one of
    them contains 'we cannot display this page' (case-insensitive), that text is
    printed and True is returned.

    Returns:
        bool: True when the notice was found; False otherwise, including when
        the element lookup itself raised an exception.
    """
    # Full notice: Sorry, we cannot display this page. It is possible that its address has changed or it has been removed.
    notFoundMsg = 'we cannot display this page'.lower()
    try:
        notFoundMsgDivs = DRIVER.find_elements(By.CLASS_NAME, "css-czlivx") # only 1 element of that class found tho
        for div in notFoundMsgDivs:
            divText = div.text # read once - each access asks the browser again
            # Case-insensitive check
            if notFoundMsg in divText.lower():
                print(divText)
                return True
        return False
    except Exception:
        # Best effort: a failed lookup counts as "notice not shown".
        # Unlike a bare except, this lets KeyboardInterrupt / SystemExit propagate.
        return False
    
# Manual check against whatever page DRIVER currently has loaded
offerNotFound()

In [144]:
import re

# Scratch sample with mixed-case keywords, a quoted table name, punctuation and
# non-ASCII characters. It is replaced by the CTE sample right below.
query = """SELECT * FRoM "tab-L-e_1" Where []  _, != ()<> :'.,;ąśż ORDER BY"""

# Sample actually fed to the regex: a CTE followed by the outer SELECT.
query = """
WITH top_users AS (
    SELECT id, name, score
    FROM users
    WHERE score > 80
)
SELECT *
FROM top_users
ORDER BY score DESC
"""

# Group 1: the word right after FROM (an optional ", [ or ' around it is left out).
# Group 2: the rest of the query after the table name.
dividedQuery = re.search(
    r'SELECT.*\s+FROM\s+(?:["\[\']?)([\w-]+)(?:["\]\']?)\s+(.+)$',
    query,
    flags=re.IGNORECASE,
)
# NOTE: re.search() returns None when nothing matches, so the unpacking below raises AttributeError in that case.
tableName, queryAfterTableName = dividedQuery.groups()
queryPlot = "SELECT datetimeFirst, datetimeLast, title, salaryMin, salaryMax FROM " + " ".join((tableName, queryAfterTableName))

print(tableName, queryAfterTableName, queryPlot, sep="\n")

top_users
ORDER BY score DESC
SELECT datetimeFirst, datetimeLast, title, salaryMin, salaryMax FROM top_users ORDER BY score DESC


In [14]:
import re
# Multi-line sample: everything after the table name, line breaks included, must land in group 2.
query = (
    "SELECT salaryMax FROM test1\n"
    "WHERE salaryMax > 20000 AND 1=1 AND 2=2\n"
    "AND 1=1\n"
)

# re.DOTALL lets '.' match newlines, so the trailing group can span several lines.
dividedQuery = re.search(
    r'SELECT.*\s+FROM\s+(?:["\[\']?)([\w-]+)(?:["\]\']?)\s*(.*)',
    query,
    flags=re.IGNORECASE | re.DOTALL,
)
# Table name first, then the remainder of the query.
print(dividedQuery.group(1), dividedQuery.group(2), sep="\n")

test1
WHERE salaryMax > 20000 AND 1=1 AND 2=2
AND 1=1

