## Open a browser and set the cookies from a JSON file

In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import json
import random

def setCookiesFromJson():
    """Load the cookies stored in ``cookies.json`` into the browser session.

    Reads ``cookies.json`` from the current working directory, passes every
    entry of the decoded JSON to ``driver.add_cookie`` and finally refreshes
    the page so it is requested again with the cookies applied.

    The module-level ``driver`` must already have opened a page
    (``driver.get``) before this is called, otherwise the cookies cannot be
    added.

    Raises:
        FileNotFoundError: if ``cookies.json`` does not exist.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    # JSON text is UTF-8 by specification; decode it explicitly so the result
    # does not depend on the platform's default locale encoding.
    with open('cookies.json', 'r', encoding='utf-8') as inputdata:
        cookies = json.load(inputdata)
    for cookie in cookies:  # works only after driver.get
        driver.add_cookie(cookie)
    driver.refresh()  # reload so the freshly added cookies take effect

# Chrome options: the flag below is passed straight to the browser
# (presumably hides the "choose your search engine" dialog on first start).
chrome_options = Options()
chrome_options.add_argument("--disable-search-engine-choice-screen")
# chrome_options.add_experimental_option('excludeSwitches', ['enable-logging']) #disable error logging

# The ChromeDriver binary is looked up relative to the working directory.
service = Service(executable_path="chromedriver.exe")
driver = webdriver.Chrome(service=service, options=chrome_options)

# Listing URL used as the prefix for every paginated request.
base_url = "https://theprotocol.it/filtry/python;t"
# driver.get("https://theprotocol.it/filtry/python;t/trainee,assistant,junior;p/zdalna;rw/praca/data-scientist-deep-learning-warszawa,oferta,8e0c0000-a5a4-c69a-3164-08dcbb7766ef?s=-1499548823&searchId=20a1f390-66d0-11ef-8a60-57a9d1202edd")

# A page has to be open before cookies can be added to the session.
driver.get(base_url)
setCookiesFromJson()

## Fetch the URLs from all the pages

In [3]:
def anyOffersOnTheList():
    """Return True when the currently loaded listing page contains offers.

    The page counts as empty when the element matched by the CSS selector
    below is present (presumably the "no results" placeholder; the class
    names look auto-generated, so re-check them if the site markup changes).

    Returns:
        bool: False if the marker element exists, True otherwise.
    """
    # find_elements returns an empty list when nothing matches instead of
    # raising, so no exception handling is needed. Genuine WebDriver failures
    # (lost session, invalid selector) now propagate instead of being
    # silently reported as "offers present".
    emptyListMarkers = driver.find_elements(
        By.CSS_SELECTOR,
        '#main-offers-listing > div.hfenof > div.t2re51w > div',
    )
    return not emptyListMarkers
    
# Collected offer links; filled page by page by fetchOffersUrlsFromSinglePage.
offers_urls = []


def fetchOffersUrlsFromSinglePage():
    """Add the ``href`` of every offer on the current page to ``offers_urls``.

    Works on whatever page the module-level ``driver`` currently shows.
    """
    listing = driver.find_element("xpath", '//*[@id="main-offers-listing"]/div[1]/div')
    # Alternative selector that was tried for the offer links: '#offer-title'
    offerLinks = listing.find_elements(By.CLASS_NAME, 'a4pzt2q ')
    offers_urls.extend(link.get_property("href") for link in offerLinks)

# Walk the paginated listing until a page without offers is reached; the
# number of pages is not known in advance.
page = 1
while True:
    # driver.get() returns None, so its result is not kept.
    driver.get(base_url + "?pageNumber=" + str(page))
    if not anyOffersOnTheList():
        print('fetched ' + str(len(offers_urls)) + ' offer urls in total')
        break  # no results on this page: the previous one was the last
    time.sleep(random.uniform(0.5, 1))  # humanize the request rhythm
    fetchOffersUrlsFromSinglePage()
    print('page ' + str(page) + ' urls fetched')
    page += 1

page 1 urls fetched
page 2 urls fetched
page 3 urls fetched
page 4 urls fetched
page 5 urls fetched
page 6 urls fetched
page 7 urls fetched
page 8 urls fetched
page 9 urls fetched
page 10 urls fetched
page 11 urls fetched
page 12 urls fetched
fetched 590 offer urls in total


## Analyse offer

In [12]:
def _firstMatchText(context, xpaths):
    """Return the text of the first element matched by any of *xpaths*.

    The XPaths are tried in order against *context* (a driver or element).
    ``find_elements`` is used so that a missing element yields an empty list
    rather than an exception; None is returned when nothing matches at all.
    """
    for xpath in xpaths:
        matches = context.find_elements("xpath", xpath)
        if matches:
            return matches[0].text
    return None


def getOfferDetails():
    """Scrape the details of the offer page currently open in ``driver``.

    Returns:
        dict: the collected fields, keyed by
        ``salary``, ``employer``, ``workModes``, ``positionLevels``,
        ``offerValidTo``, ``location``, ``techstackExpected``,
        ``techstackOptional``, ``responsibilities``, ``requirements`` and
        ``optionalRequirements``. Parameter and tech-stack fields default to
        ``''``; the three description sections are None when absent.

    Raises:
        selenium.common.exceptions.NoSuchElementException: if one of the
        elements looked up with ``find_element`` (contract section, company
        link, parameters container, description container) is missing.
    """
    # SALARY - text of the contract section; this element should always exist
    salary = driver.find_element(By.XPATH, '//*[@data-test="section-contract"]').text

    # EMPLOYER - link text plus target URL; this element should always exist
    employerElement = driver.find_element("xpath", '//*[@data-test="anchor-company-link"]')
    employer = employerElement.text + ' ' + employerElement.get_property("href")

    # WORKFROM, EXP, LOCATION - "PARAMETERS"
    # Maps the data-test attribute of a parameter tile to the result field.
    parameterFields = {
        "section-workModes": "workModes",
        "section-positionLevels": "positionLevels",
        "section-offerValidTo": "offerValidTo",
        "section-workplace": "location",
    }
    parameterValues = dict.fromkeys(parameterFields.values(), '')
    parametersContainer = driver.find_element(By.CLASS_NAME, "c21kfgf")
    for param in parametersContainer.find_elements(By.CLASS_NAME, "s1bu9jax"):
        field = parameterFields.get(param.get_attribute("data-test"))
        if field is not None:
            parameterValues[field] = param.text

    # TECHSTACK
    descriptionsContainer = driver.find_element(By.CSS_SELECTOR, '#TECHNOLOGY_AND_POSITION')
    techstackExpected = ''
    techstackOptional = ''
    for group in descriptionsContainer.find_elements(By.CLASS_NAME, "c1fj2x2p"):
        groupText = group.text  # read once instead of up to three times
        # [9:] skips the 8-letter heading and the separator that follows it.
        # The previous [9:-1] slice also cut off the last character of the
        # list, since the element text has no trailing newline to strip.
        if groupText.startswith('EXPECTED'):
            techstackExpected = groupText[9:]
        elif groupText.startswith('OPTIONAL'):
            techstackOptional = groupText[9:]

    # NOTE: the XPaths below start with '//', which XPath evaluates from the
    # document root even when called on an element, so these lookups are
    # effectively page-wide. They are left unchanged to keep matching the same
    # elements as before.

    # RESPONSIBILITIES - prefer the <ul>, fall back to the whole section
    # (single entry without a list)
    responsibilities = _firstMatchText(descriptionsContainer, (
        '//*[@data-test="section-responsibilities"]/ul',
        '//*[@data-test="section-responsibilities"]',
    ))
    if responsibilities is None:
        # Report offers without this section so they can be inspected by hand.
        print('RESPONSIBILITIES:\n' + str(responsibilities) + '\n' + driver.current_url)

    # REQUIREMENTS - same lookup strategy as responsibilities
    requirements = _firstMatchText(descriptionsContainer, (
        '//*[@data-test="section-requirements"]/ul',
        '//*[@data-test="section-requirements"]',
    ))
    if requirements is None:
        print('REQUIREMENTS:\n' + str(requirements) + '\n' + driver.current_url)

    # OPTIONAL REQUIREMENTS - one line per <li>, else the whole section text
    optionalItems = descriptionsContainer.find_elements(
        "xpath", '//*[@data-test="section-requirements-optional"]/li')
    if optionalItems:
        optionalRequirements = ''.join(item.text + '\n' for item in optionalItems)
    else:
        optionalRequirements = _firstMatchText(descriptionsContainer, (
            '//*[@data-test="section-requirements-optional"]',
        ))

    return {
        'salary': salary,
        'employer': employer,
        'workModes': parameterValues['workModes'],
        'positionLevels': parameterValues['positionLevels'],
        'offerValidTo': parameterValues['offerValidTo'],
        'location': parameterValues['location'],
        'techstackExpected': techstackExpected,
        'techstackOptional': techstackOptional,
        'responsibilities': responsibilities,
        'requirements': requirements,
        'optionalRequirements': optionalRequirements,
    }

    
# Trial run: open one collected offer (index 5) and run the detail scraper on it.
for i in range(5, 6):
    driver.get(offers_urls[i])
    print(f'working on {i} - {driver.current_url}')
    getOfferDetails()
    # time.sleep(random.uniform(5,15)) #Humanize request times
print('doned')

# # PROBLEMATIC OFFERS:
# driver.get('https://theprotocol.it/szczegoly/praca/web3-junior-python-developer-ka-warszawa-polczynska-112a,oferta,90c00000-1543-c628-7588-08dcc5c378f6?s=-684638444&searchId=d56840d0-6c44-11ef-a828-7d551d1dc879')
# getOfferDetails()

working on 5 - https://theprotocol.it/szczegoly/praca/python-developer-krakow-kapelanka-42a,oferta,f4300000-66b8-523b-4016-08dcbac6b719?s=-684638444&searchId=e71d31e0-6c6d-11ef-8a1f-19dd49dcc0b8
doned


In [227]:
####TODO: data-test="text-offerNotFound" - CHECK IF OFFER FOUND
# driver.get('https://theprotocol.it/szczegoly/praca/internship-in-the-it-area-php-ja1ee5-576a-08dcc1cef0c4?s=-684638444&searchId=e71d31e0-6c6d-11ef-')



#NOTES
# driver.execute_script("window.open(\"" + offer_url + "\");")
# time.sleep(0.5)
# driver.switch_to.window(driver.window_handles[1])
# driver.close()
# driver.switch_to.window(driver.window_handles[0])
# print(driver.current_url)

# try/except test
# val = 1 
# try:
#     try:
#         print('1')
#         val = val/0
#         print('post1') # not displayed
#     except:
#         print('2')
#         # val = ''
#         val += 'asd'
#         print('post2')        
# except:
#     print('3')
# finally:
#     print('fin ' + str(val))

1
2
3
fin 1
