### Open a browser and set the cookies from a JSON file

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time,json, random, re, datetime
import pandas as pd
pd.options.mode.copy_on_write = True # recommended - https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

# ChromeDriver should match browser version. If outdated download from:
# https://googlechromelabs.github.io/chrome-for-testing/

def setCookiesFromJson():
    """Load BASE_URL and add the matching cookies from cookies.json to DRIVER.

    Only cookies whose 'domain' equals the domain of the loaded page (with an
    optional leading dot) are added, because the browser only accepts cookies
    for the domain it is currently on.

    Returns:
        dict with the keys 'success' (bool), 'functionDone' (always True, as
        setting cookies is optional for the rest of the run) and 'message'.
        Any exception is caught and reported through 'message'.
    """
    try:
        DRIVER.get(BASE_URL)  # cookies can only be added after the page is loaded
        hostMatch = re.search(r'^https?://([^/]+)', DRIVER.current_url)
        if hostMatch is None:
            return {'success':False, 'functionDone':True, 'message':'could not read the domain from ' + str(DRIVER.current_url)}
        currentUrlDomain = re.sub(r'^www\.', '', hostMatch.group(1))
        currentUrlDomain = re.sub(r'^\.', '', currentUrlDomain)

        with open('cookies.json', 'r', encoding='utf-8') as inputdata:
            cookies = json.load(inputdata)

        # Escape the domain and match the whole string: an unescaped '.' would
        # match any character and a prefix match would accept e.g.
        # 'justjoin.it.example.com' as well.
        domainPattern = re.compile(r"\.?" + re.escape(currentUrlDomain))
        cookiesAdded = 0
        for cookie in cookies:
            cookieDomain = str(cookie.get('domain', ''))  # a cookie without a domain is skipped, not fatal
            if domainPattern.fullmatch(cookieDomain):
                DRIVER.add_cookie(cookie)
                cookiesAdded += 1

        if cookiesAdded == 0:
            return {'success':False, 'functionDone':True, 'message':'no cookies for ' + currentUrlDomain + ' found in cookies.json'}

        DRIVER.refresh()  # reload so the page is rendered with the new cookies
        return {'success':True, 'functionDone':True, 'message':'cookies for ' + currentUrlDomain + ' successfully set'}
    except Exception as exception:
        return {'success':False, 'functionDone':True, 'message':str(exception)} # 'functionDone':True because it's not necessary

# Start Chrome through the chromedriver.exe located in the working directory.
service = Service(executable_path="chromedriver.exe")
chrome_options = Options()

# Skip Chrome's search-engine choice dialog so it does not cover the page.
chrome_options.add_argument("--disable-search-engine-choice-screen")
# Fixed window size; per the notes in the fetch functions, the number of offers
# rendered at once depends on the screen height.
chrome_options.add_argument("window-size=800,1000")
# chrome_options.add_experimental_option('excludeSwitches', ['enable-logging']) #disable error logging
# Module-level browser session shared by every function in this notebook.
DRIVER = webdriver.Chrome(service=service, options=chrome_options)

# Listing page that is scraped; also the page loaded before cookies are set.
BASE_URL = "https://justjoin.it/job-offers/bialystok"
DRIVER.get(BASE_URL)
# NOTE(review): setCookiesFromJson() loads BASE_URL again itself and its result
# dict (success / message) is discarded here, so a failure goes unnoticed.
setCookiesFromJson()

### Fetch the URLs from all the pages

In [None]:
# def getLastOfferIndex():
#     try:
#         DRIVER.execute_script("window.scrollTo(0, document.body.scrollHeight);") # scroll to the bottom
#         offersList = DRIVER.find_element(By.ID, 'up-offers-list')
#         offers = offersList.find_elements(By.XPATH, '//li[@data-index]') # amount depends on screen height 
#         lastIndex = offers[-1].get_attribute('data-index')
#         return lastIndex
#     except Exception as e:
#         print(e)
#         return

def offerNotFound():
    """Return True when the currently loaded page has no offer content.

    The check looks for the offer content container and its first child div;
    if either lookup fails the offer is treated as missing.
    """
    try:
        offerContent = DRIVER.find_element(By.CSS_SELECTOR, '.MuiBox-root.css-tnvghs')
        offerContent.find_element(By.CSS_SELECTOR, 'div') # 1st div
        # NEED TO FIND 'OFFER NOT FOUND MSG AND CHECK WHICH DIVS DOES IT HAVE
        # topDiv = topContainer.find_element(By.XPATH, ".//*[contains(@class, 'css-10x887j')]")
        return False # container found, offer is there
    except Exception:
        # `except Exception` instead of a bare `except` so that Ctrl-C
        # (KeyboardInterrupt) is not silently reported as "offer not found".
        return True

def anyOffersOnTheList():
    """Tell whether the listing page currently renders at least one offer.

    Returns False both when the list is empty and when the list container
    cannot be located at all.
    """
    # Markup history (from earlier notes): the site used the
    # `virtuoso-item-list` container, switched to `up-offers-list` with
    # `li[@data-index]` rows around 25.12.2024 and went back to virtuoso
    # around 15.01.2025.
    try:
        listContainer = DRIVER.find_element(By.XPATH, '//*[@data-test-id="virtuoso-item-list"]')
        renderedOffers = listContainer.find_elements(By.XPATH, 'div[@data-index]')
    except Exception:
        return False
    return len(renderedOffers) > 0

# Collected offers as {'index': <data-index attribute>, 'url': <offer link>},
# appended in the order they are first seen while scrolling.
OFFERS_URLS = []

def fetchCurrentlyVisibleOffersUrls(): # just the ones currently rendered in browser
    """Append the offers currently rendered in the list to OFFERS_URLS.

    Only rows whose 'data-index' has not been stored yet are appended. Rows
    without a link (e.g. the ever-loading placeholder row) are skipped.
    Returns None; if the list container cannot be found the exception is
    printed and nothing is added.
    """
    try:
        offersList = DRIVER.find_element(By.XPATH, '//*[@data-test-id="virtuoso-item-list"]')
        offers = offersList.find_elements(By.XPATH, 'div[@data-index]') # amount depends on screen height
    except Exception as exception:
        print(exception)
        return

    # Compare against every stored index, not only the most recent ones:
    # scrolling back up can re-render rows that were saved long ago.
    savedIndexes = {saved['index'] for saved in OFFERS_URLS}

    for offer in offers: # ever-loading div among them
        try:
            index = offer.get_attribute('data-index')
            href = offer.find_element(By.XPATH, ".//div/div/a").get_property("href")
        except Exception:
            continue # row without a link (or no longer attached) - skip it

        if index in savedIndexes:
            continue
        savedIndexes.add(index)
        OFFERS_URLS.append({'index':index, 'url':href})

def fetchAllOffersUrls():
    """Scroll through the whole offers list and collect every URL into OFFERS_URLS.

    The list is virtualised, so only a window of offers exists in the DOM at
    a time. The page is scrolled down one viewport per round and the rendered
    offers are harvested after each step. The loop stops after 5 consecutive
    rounds without a new last index. Returns None.
    """
    noNewResultsCounter = 0
    lastSeenIndex = None
    if not anyOffersOnTheList():
        return

    DRIVER.execute_script("window.scrollTo(0, 0);") # scroll to the top
    time.sleep(0.5)

    while True: # endless loop only ending at return
        print('fetchAllOffersUrls while True:')
        print(len(OFFERS_URLS))
        fetchCurrentlyVisibleOffersUrls() # updates OFFERS_URLS
        print(len(OFFERS_URLS))

        # None while nothing has been collected yet; guards every [-1] access
        # below, which used to raise IndexError on an empty OFFERS_URLS.
        newestIndex = OFFERS_URLS[-1]['index'] if OFFERS_URLS else None

        if newestIndex is None: # should have results already from the above function execution
            noNewResultsCounter += 1
        elif newestIndex == lastSeenIndex:
            # no new offer index found
            noNewResultsCounter += 1
            # Scrolling up first and then back down makes the site load the
            # bottom of the list.
            DRIVER.execute_script("window.scrollBy(0, -2*innerHeight);")
            time.sleep(1)
            DRIVER.execute_script("window.scrollBy(0, 3*innerHeight);") # scroll to the bottom
            print('noNewResults')
        else:
            # new offer index found - reset the counter
            noNewResultsCounter = 0

        # CHECK IF READY TO TERMINATE
        if noNewResultsCounter >= 5: # END IF NO NEW RESULTS FEW TIMES
            print('RETURN')
            if OFFERS_URLS:
                print('first and last OFFERS_URLS: ', OFFERS_URLS[0]['index'], OFFERS_URLS[-1]['index'])
            return

        # UPDATE LAST INDEX
        lastSeenIndex = newestIndex
        time.sleep(0.5)
        DRIVER.execute_script("window.scrollBy(0, innerHeight);")

# Run the full scroll-and-collect pass; results accumulate in OFFERS_URLS.
fetchAllOffersUrls()
# fetchCurrentlyVisibleOffersUrls()

### Analyse offer functions

In [3]:
def _parseSalaryRange(salaryText):
    """Return [min, max] salary as ints recalculated to PLN/month net.

    Only the first salary of the text is analysed (there can be several, one
    per contract type). The first line is expected to hold the amounts
    ('min - max') and the second line the unit description.
    Either element is None when it cannot be read, as these are INTs in DB.
    """
    grossToNetMultiplier = 0.7
    hoursPerMonthInFullTimeJob = 168

    lines = salaryText.splitlines()
    if not lines:
        return [None, None]

    amounts = []
    for part in re.split(r'[-–]', lines[0])[:2]: # split on dash for min and max
        part = re.sub(r"\s", '', part) # remove spaces (thousands separators)
        part = re.sub(r",\d{1,2}", '', part) # drop decimals (needed when salary as 123,45)
        digits = re.search(r"\d+", part)
        if digits is None:
            break
        amounts.append(float(digits.group()))
    if not amounts:
        return [None, None]

    # NOTE(review): assumes the unit description sits on the line right after
    # the amounts - confirm against the live page.
    unitLine = lines[1] if len(lines) > 1 else ''
    if re.search("brutto|gross", unitLine, re.IGNORECASE): # gross -> net
        amounts = [amount * grossToNetMultiplier for amount in amounts]
    # NOTE(review): "hr." is kept as the original regex, where '.' matches any
    # character - confirm the exact wording used by the site.
    if re.search("godz|hr.", unitLine, re.IGNORECASE): # hr -> month
        amounts = [amount * hoursPerMonthInFullTimeJob for amount in amounts]

    amounts = [int(round(amount)) for amount in amounts]
    return (amounts + [None, None])[:2] # pad with None when only one value was given


def _categorizeDescription(fullDescription):
    """Sort the paragraphs of an offer description into named sections.

    A paragraph is assigned by looking for keywords in its first line. The
    categories are checked in the order below and the first hit wins, so the
    more specific 'optionalRequirements' is tested before 'requirements'.
    Returns a dict of section name -> text; paragraphs are joined with a
    blank line and a section without any paragraph is ''.
    """
    keywordsByCategory = {
        'optionalRequirements': ['nice to', 'optional', 'ideal', 'preferr', 'asset', 'appreciat', 'atut', 'dodatk', 'mile widzi'],
        'requirements': ['require', 'expect', 'skill', 'look', 'qualifications', 'experience', 'must', 'competen', 'wymaga', 'oczek', 'umiejętn', 'aplikuj, jeśli', 'oczekuj', 'potrzeb', 'szukamy', 'kompeten'],
        'responsibilities': ['responsib', 'task', 'role', 'project', 'obowiązk', 'zadani', 'projek'],
        'theyOffer': ['offer', 'benefit', 'oferuj', 'oferow'],
    }
    sections = {category: [] for category in keywordsByCategory}

    paragraphs = re.split(r"\n{2,}", fullDescription) # split on 2 or more newlines
    # USUALLY THE ABOVE SPLITTING IS ENOUGH, BUT IF NOT TRY FINDING PARAGRAPHS ANOTHER WAY
    if len(paragraphs) < 3: # 1 is possible minimum
        # split on 1 or more spaces at the line beginning OR on 2 or more newlines;
        # non-capturing groups keep the separators out of the result
        paragraphs = re.split(r"(?:(?:^|\n) +|\n{2,})", fullDescription)

    for paragraph in paragraphs:
        if not paragraph or not paragraph.strip():
            continue # nothing to analyse in an empty paragraph
        header = paragraph.strip().splitlines()[0].lower() # keywords are looked for in the first line only
        for category, keywords in keywordsByCategory.items():
            if any(keyword in header for keyword in keywords):
                sections[category].append(paragraph)
                break # one paragraph belongs to exactly one section

    return {category: '\n\n'.join(found) for category, found in sections.items()}


def getOfferDetails():
    """Scrape the offer page currently loaded in DRIVER.

    Returns:
        dict with the offer fields (see the return statement), or None when
        the basic offer containers are not found on the page.
        'salaryMin' / 'salaryMax' are ints in PLN/month net or None.
        'theyOffer' is returned in addition to the previously returned keys.
    """
    # BASIC PARAMETERS WHICH SHOULD ALWAYS BE NOT EMPTY ON THE SITE
    try:
        offerContent = DRIVER.find_element(By.CSS_SELECTOR, '.MuiBox-root.css-tnvghs')
        topContainer = offerContent.find_element(By.CSS_SELECTOR, 'div')
        topDiv = topContainer.find_element(By.XPATH, ".//*[contains(@class, 'css-10x887j')]") # .// = as deep as necessary
    except Exception:
        return None # no point of continuing

    employerAndLocationDiv = None
    try:
        jobTitle = topDiv.find_element(By.CSS_SELECTOR, 'h1').text
        employerAndLocationDiv = topDiv.find_element(By.CSS_SELECTOR, '.MuiBox-root.css-yd5zxy')
        employer = employerAndLocationDiv.find_element(By.XPATH, './/h2').text # look for h2 as deep as necessary
    except Exception:
        jobTitle, employer = '', ''

    # LOCATION
    location = ''
    if employerAndLocationDiv is not None:
        try:
            location = employerAndLocationDiv.find_elements(By.CSS_SELECTOR, '.MuiBox-root.css-mswf74')[1].text # first one is employer
            location = re.sub(r'\+[0-9]+$', '', location) # remove '+x' where x is int
        except Exception:
            location = ''
        # try clicking for more locations
        try:
            # A leading '//' makes both lookups search the whole document,
            # not only the element they are called on.
            locationButton = employerAndLocationDiv.find_element(By.XPATH, '//*[@name="multilocation_button"]')
            locationButton.click()
            locationsMenu = offerContent.find_element(By.XPATH, '//ul[@role="menu"]')
            location += '\n' + locationsMenu.text # TEXT EMPTY WHEN MINIMIZED!
        except Exception:
            pass # single-location offer, no button to click

    # SALARY
    try:
        salaryAndContract = topContainer.find_element(By.CSS_SELECTOR, '.MuiBox-root.css-1km0bek').text
    except Exception:
        salaryAndContract = ''
    # parsed before the rectangles' texts are appended to salaryAndContract below
    salaryMinAndMax = _parseSalaryRange(salaryAndContract)

    # FOUR RECTANGLES (contract / level / employment type / work mode)
    workModes = ''
    positionLevels = ''
    try:
        fourRectanglesContainer = offerContent.find_elements(By.XPATH, "./div")[1] # only child divs - 1 level down
        rectangleTexts = []
        for rectangle in fourRectanglesContainer.find_elements(By.XPATH, "./div"):
            valueHolder = rectangle.find_elements(By.XPATH, "./div")[1] # second child div (not grandchild or further)
            rectangleTexts.append(valueHolder.find_elements(By.XPATH, "./div")[1].text)
        salaryAndContract += '\n' + rectangleTexts[0] + ' | ' + rectangleTexts[2]
        positionLevels = rectangleTexts[1]
        workModes = rectangleTexts[3]
    except Exception as exception:
        print(exception)

    # TECHSTACK
    techstackExpected, techstackOptional = '', ''
    try:
        techstackDiv = offerContent.find_elements(By.CSS_SELECTOR, '.MuiBox-root.css-qal8sw')[0]
        techstackDiv = techstackDiv.find_element(By.CSS_SELECTOR, 'div')
        technologies = techstackDiv.find_elements(By.XPATH, './/h4') # look for h4 in all children elements
        levels = techstackDiv.find_elements(By.XPATH, './/span') # look for span in all children elements
        expected, optional = [], []
        for technology, level in zip(technologies, levels):
            levelText = level.text
            techWithLvl = technology.text + ' - ' + levelText
            if levelText == 'Nice To Have':
                optional.append(techWithLvl)
            else: # junior/regular/advanced/master
                expected.append(techWithLvl)
        techstackExpected = '\n'.join(expected)
        techstackOptional = '\n'.join(optional)
    except Exception as exception:
        print(exception) # leave empty strs

    # DESCRIPTION
    fullDescription = '' # ADD TO DB
    responsibilities, requirements, optionalRequirements, theyOffer = '', '', '', ''
    try:
        descriptionDiv = offerContent.find_elements(By.CSS_SELECTOR, '.MuiBox-root.css-qal8sw')[1]
        descriptionDiv = descriptionDiv.find_elements(By.XPATH, "./div")[1] # second child div
        fullDescription = descriptionDiv.text
        sections = _categorizeDescription(fullDescription)
        responsibilities = sections['responsibilities']
        requirements = sections['requirements']
        optionalRequirements = sections['optionalRequirements']
        theyOffer = sections['theyOffer']
    except Exception as exception:
        print(exception)

    datetimeNow = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    return {'datetimeLast':datetimeNow, 'datetimeFirst':datetimeNow, 'url':DRIVER.current_url, 'title':jobTitle, 'salaryAndContract':salaryAndContract, 'salaryMin':salaryMinAndMax[0], 'salaryMax':salaryMinAndMax[1], 'employer':employer, 'workModes':workModes, 'positionLevels':positionLevels, 'location':location, 'techstackExpected':techstackExpected, 'techstackOptional':techstackOptional, 'responsibilities':responsibilities, 'requirements':requirements, 'optionalRequirements':optionalRequirements, 'theyOffer':theyOffer, 'fullDescription':fullDescription}
    
# testing below
# DRIVER.get('https://justjoin.it/job-offer/jit-team-senior-frontend-developer-bialystok-javascript')
# DRIVER.switch_to.window(DRIVER.window_handles[-1])
# getOfferDetails()

### Scraping to database

In [None]:
from databaseFunctions import Database
from settings import DATABASE_COLUMNS
# import numpy as np

def scrapToDatabase():
    """Visit every URL in OFFERS_URLS and store the scraped offer in the database.

    An offer whose URL is already stored only gets its 'datetimeLast'
    refreshed; a new one is inserted. Pages without offer content are
    reported and skipped. Prints progress and a final inserts/updates summary.
    """
    inserts = 0
    updates = 0
    # str() so the line works whether the count comes back as int or str
    print(str(Database.countAllRecords()) + ' records before run')

    # DATABASE_COLUMNS entries are either plain column names or dicts that
    # carry the name under 'dbColumnName' (the form used in the test cell).
    columnNames = [column["dbColumnName"] if isinstance(column, dict) else column for column in DATABASE_COLUMNS]

    totalOffers = len(OFFERS_URLS)
    for position, offer in enumerate(OFFERS_URLS, start=1):
        DRIVER.get(offer['url'])
        # getOfferDetails() returns None when the offer containers are missing
        offerDetailsDict = None if offerNotFound() else getOfferDetails()
        if offerDetailsDict is None:
            print('OFFER NOT FOUND: ' +  DRIVER.current_url)
            continue

        # LOOK FOR COMMON KEYS AS getOfferDetails() can return more keys than custom shortened DB has columns
        commonKeysDict = {key: offerDetailsDict[key] for key in columnNames if key in offerDetailsDict}

        if Database.recordFound(DRIVER.current_url):
            Database.updateDatetimeLast(DRIVER.current_url)
            updates += 1
        else:
            Database.insertRecord(commonKeysDict) # insert into database
            inserts += 1
        print (str(position) + '/' + str(totalOffers) + ' done')
        # time.sleep(random.uniform(0.35,0.85)) # Humanize requests frequency - justjoin slow already
    print(str(inserts) + ' inserts | ' + str(updates) + ' updates')

# Scrape every collected offer URL and write the results to the database.
scrapToDatabase()

### Test code

In [19]:
from settings import DATABASE_COLUMNS

# Scratch check of the column filtering: keep only those keys of a scraped
# offer that are also database columns (DATABASE_COLUMNS items carry the
# column name under "dbColumnName").

# Stand-in for a getOfferDetails() result.
offerDetailsDict = {
    "datetimeLast": "aa",  # don't remove this one tho as it's updated if offer url found in DB
    "datetimeFirst": "bbb",
    "optionalRequirements": "cccc",
    "fullDescription": "dd",
}

commonKeysDict = {
    item["dbColumnName"]: offerDetailsDict[item["dbColumnName"]]
    for item in DATABASE_COLUMNS
    if item["dbColumnName"] in offerDetailsDict
}

print(commonKeysDict)

{'datetimeLast': 'aa', 'datetimeFirst': 'bbb', 'optionalRequirements': 'cccc', 'fullDescription': 'dd'}
