### Open a browser and set the cookies from a JSON file

In [24]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time,json, random
import pandas as pd
import datetime

def setCookiesFromJson():
    """Load the cookies stored in ``cookies.json`` into the browser and reload.

    Uses the module-level ``driver``. As the original author noted, adding
    cookies only works after ``driver.get`` has been called.
    """
    with open('cookies.json', 'r', newline='') as cookieFile:
        storedCookies = json.load(cookieFile)

    for entry in storedCookies:
        driver.add_cookie(entry)

    # Reload the page so it is rendered with the freshly added cookies.
    driver.refresh()

# Start a Chrome session driven by the chromedriver binary at the given path.
service = Service(executable_path="chromedriver.exe")
chrome_options = Options()
# Command-line switch handed to Chrome at launch.
# NOTE(review): judging by its name it hides the search-engine choice screen - confirm.
chrome_options.add_argument("--disable-search-engine-choice-screen")
# chrome_options.add_experimental_option('excludeSwitches', ['enable-logging']) #disable error logging
driver = webdriver.Chrome(service=service, options=chrome_options)

# Listing URL that is scraped; "?pageNumber=N" is appended to it when paging through results.
base_url = "https://theprotocol.it/filtry/python;t/ai-ml;sp"
# Alternative listing URL (kept disabled):
# base_url = "https://theprotocol.it/filtry/python;t/ai-ml;sp/bialystok;wp"
# Open the listing first: setCookiesFromJson adds cookies to the session, which works only after driver.get.
driver.get(base_url)
setCookiesFromJson()

### Fetch the URLs from all the pages

In [25]:
def anyOffersOnTheList():
    """Return True when the page currently open in ``driver`` lists offers.

    The check is negative: the CSS selector below targets an element whose
    presence this function treats as "no offers" (presumably the empty-results
    placeholder - TODO confirm against the site markup). The function returns
    False when that element is present and True when it is absent.

    ``find_elements`` is used instead of ``find_element`` wrapped in a bare
    ``except``: it yields an empty list when nothing matches, so a missing
    element needs no exception handling, while genuine failures (closed
    browser, Ctrl+C) propagate instead of being reported as "offers present".
    """
    noOffersMarkers = driver.find_elements(
        By.CSS_SELECTOR, '#main-offers-listing > div.hfenof > div.t2re51w > div'
    )
    return not noOffersMarkers
    
# Accumulates the offer hrefs collected from the listing pages (appended to by fetchOffersUrlsFromSinglePage).
offers_urls = []

def fetchOffersUrlsFromSinglePage():
    """Append the ``href`` of every offer link on the current page to ``offers_urls``.

    Works on whatever listing page is open in the module-level ``driver``.
    """
    listing = driver.find_element("xpath", '//*[@id="main-offers-listing"]/div[1]/div')
    # The author noted that By.CSS_SELECTOR with '#offer-title' also works as a locator here.
    offerLinks = listing.find_elements(By.CLASS_NAME, 'a4pzt2q ')
    offers_urls.extend(link.get_property("href") for link in offerLinks)

# Walk the paginated listing until a page reports no offers, collecting offer URLs on the way.
page = 1  # theprotocol enumerates pages starting from 1
while True:  # the number of pages is not known up front
    # driver.get returns nothing useful, so its result is not kept.
    driver.get(f"{base_url}?pageNumber={page}")
    if not anyOffersOnTheList():
        # Ran past the last page: report the total and stop.
        print(f'fetched {len(offers_urls)} offer urls in total')
        break
    time.sleep(random.uniform(0.5, 1))  # humanize: short random pause before reading the page
    fetchOffersUrlsFromSinglePage()
    print(f'page {page} urls fetched')
    page += 1

page 1 urls fetched
page 2 urls fetched
fetched 60 offer urls in total


### Analyse offer

In [26]:
def offerNotFound():
    """Return True when the open page contains the ``text-offerNotFound`` element.

    ``find_elements`` returns an empty list when nothing matches, so no
    exception handling is needed for the "element absent" case. Unlike the
    previous bare ``except``, real failures (closed browser, Ctrl+C) are not
    silently turned into "offer exists".
    """
    notFoundMarkers = driver.find_elements("xpath", '//*[@data-test="text-offerNotFound"]')
    return bool(notFoundMarkers)
    
def getOfferDetails():
    #JOB TITLE
    jobTitle = driver.find_element(By.XPATH, '//*[@data-test="text-offerTitle"]') # this element should always exist
    jobTitle = jobTitle.text
    
    #SALARY
    salaryContainer = driver.find_element(By.XPATH, '//*[@data-test="section-contract"]') # this element should always exist
    salary = salaryContainer.text
    # print(salary  + '\n')

    # EMPLOYER
    employerElement = driver.find_element("xpath", '//*[@data-test="anchor-company-link"]') # this element should always exist
    employer = employerElement.text + ' ' + employerElement.get_property("href")
    # print(employer  + '\n')
    
    #WORKFROM, EXP, VALIDTO, LOCATION - "PARAMETERS"
    workModes, positionLevels, offerValidTo, location = '', '', '', ''
    parametersContainer = driver.find_element(By.CLASS_NAME, "c21kfgf")
    parameters = parametersContainer.find_elements(By.CLASS_NAME, "s1bu9jax")
    for param in parameters:
        paramType = param.get_attribute("data-test") #element description
        match paramType:
            case "section-workModes":
                workModes = param.text
            case "section-positionLevels":
                positionLevels = param.text
            case "section-offerValidTo":
                offerValidTo = param.text
            case "section-workplace":
                location = param.text
                try: #to find and click 'more locations' button then fetch what's inside
                    moreLocations = driver.find_element("xpath", '//*[@data-test="button-locationPicker"]')
                    moreLocations.click()
                    # time.sleep(0.05) #probably necessary
                    locations = moreLocations.find_element("xpath", '//*[@data-test="modal-locations"]')
                    location = locations.text
                except:
                    pass #leave location as it was
    # print(workModes + '\n\n' + positionLevels + '\n\n' +  offerValidTo + '\n\n' +  location + '\n')

    # # # TECHSTACK
    descriptionsContainer = driver.find_element(By.CSS_SELECTOR, '#TECHNOLOGY_AND_POSITION')

    techstack = descriptionsContainer.find_elements(By.CLASS_NAME, "c1fj2x2p")
    techstackExpected = ''
    techstackOptional = ''
    for group in techstack:
        if group.text[0:8] == 'EXPECTED':
            techstackExpected = group.text[9:]
        elif group.text[0:8] == 'OPTIONAL':
            techstackOptional = group.text[9:]
    # print(techstackExpected + '\n\n' + techstackOptional + '\n')

    #RESPONSIBILITIES
    responsibilities = None
    try:
        try:
            responsibilities = descriptionsContainer.find_element("xpath", '//*[@data-test="section-responsibilities"]/ul').text #/only ul elements
        except:
            responsibilities = descriptionsContainer.find_element("xpath", '//*[@data-test="section-responsibilities"]').text #/if it's a single entry
    except:
        pass #do nothing as the value is already None
        # print('RESPONSIBILITIES:\n' + str(responsibilities) + '\n' + driver.current_url)

    #REQUIREMENTS
    requirements = None
    try:
        try:
            requirements = descriptionsContainer.find_element("xpath", '//*[@data-test="section-requirements"]/ul').text
        except:
            requirements = descriptionsContainer.find_element("xpath", '//*[@data-test="section-requirements"]').text #/if it's a single entry
    except:
        pass #do nothing as the value is already None
        # print('REQUIREMENTS:\n' + str(requirements) + '\n' + driver.current_url)


    #OPTIONAL REQUIREMENTS
    optionalRequirements = None
    try:
        optionalRequirementsContainer = descriptionsContainer.find_elements("xpath", '//*[@data-test="section-requirements-optional"]/li')
        if len(optionalRequirementsContainer) > 0:
            optionalRequirements = ''
            for optionalRequirement in optionalRequirementsContainer:
                optionalRequirements += optionalRequirement.text + '\n'
        elif len(optionalRequirementsContainer) <= 0:
            try:
                optionalRequirements = descriptionsContainer.find_element("xpath", '//*[@data-test="section-requirements-optional"]').text
            except:
                pass #do nothing as the value is already None
                # print('OPTIONAL:\n' + str(optionalRequirements) + '\n' + driver.current_url)        
    except:
        pass #do nothing as the value is already None
    # print('OPTIONAL:\n' + str(optionalRequirements) + '\n' + driver.current_url)

    return [str(datetime.datetime.now()), driver.current_url, jobTitle, salary, employer, workModes, positionLevels, offerValidTo, location, techstackExpected, techstackOptional, responsibilities, requirements, optionalRequirements]

# driver.get(offers_urls[1])
# getOfferDetails()

### Results to dataframe and CSV

In [27]:
# Column order matches the list returned by getOfferDetails().
columns = [
    'datetime', 'url', 'title', 'salary', 'employer', 'workModes', 'positionLevels',
    'offerValidTo', 'location', 'techstackExpected', 'techstackOptional',
    'responsibilities', 'requirements', 'optionalRequirements',
]
resultsDataFrame = pd.DataFrame([], columns=columns)  # empty frame carrying only the header

# Currently restricted to the single offer at index 5;
# iterate over range(len(offers_urls)) instead to process every collected URL.
for i in range(5, 6):
    driver.get(offers_urls[i])
    if offerNotFound():
        print(f'OFFER NOT FOUND: {driver.current_url}')
    else:
        nextRow = len(resultsDataFrame.index)
        resultsDataFrame.loc[nextRow] = getOfferDetails()  # append as a new row
        print(f'{i + 1}/{len(offers_urls)} done')
    time.sleep(random.uniform(0.25, 0.75))  # humanize the request frequency
print(resultsDataFrame.title)

# CSV export, disabled for now:
# resultsDataFrame.to_csv('results.csv', sep=',', encoding='utf-8-sig', index=True, header=True)

6/60 done
0    Inżynier Uczenia Maszynowego
Name: title, dtype: object


### Database management functions

In [109]:
# print(resultsDataFrame.employer)
# # resultsDataFrame.to_sql('offers', 'resultsDf.db') #alchemy needed
import sqlite3

def selectAll():
    """Print every row of the ``test`` table in ``results.db``.

    Returns nothing; the rows are written to stdout as a list of tuples.

    Raises:
        sqlite3.Error: propagated from sqlite3, e.g. when the table is missing.
            The cursor and connection are closed in that case too.
    """
    connection = sqlite3.connect('results.db')
    try:
        cursor = connection.cursor()
        try:
            # A plain SELECT changes nothing, so no commit is required.
            cursor.execute("SELECT * FROM test")
            print(cursor.fetchall())
        finally:
            cursor.close()
    finally:
        connection.close()

def executeQuery(query):
    """Execute one SQL statement against ``results.db`` and commit it.

    Args:
        query: SQL text, executed verbatim. It is not parameterized, so it
            must never be assembled from untrusted input.

    Raises:
        sqlite3.Error: propagated from sqlite3 when the statement fails. The
            cursor and connection are still closed, so a failed statement does
            not leave the database file held open.
    """
    connection = sqlite3.connect('results.db')
    try:
        cursor = connection.cursor()
        try:
            cursor.execute(query)
            connection.commit()
        finally:
            cursor.close()
    finally:
        connection.close()

def insertRecord(dictionary):
    """Insert one row into the ``test`` table of ``results.db`` and commit it.

    Args:
        dictionary: mapping that supplies the named SQL parameters ``col1``,
            ``col2`` and ``col3``.

    Raises:
        sqlite3.Error: propagated from sqlite3, e.g. when a parameter is
            missing or the table does not exist. The cursor and connection are
            closed in that case too.
    """
    connection = sqlite3.connect('results.db')
    try:
        cursor = connection.cursor()
        try:
            cursor.execute("INSERT INTO test VALUES (:col1, :col2, :col3)", dictionary)
            connection.commit()
        finally:
            cursor.close()
    finally:
        connection.close()

# Draft schema for storing the scraped offers, kept for reference (currently disabled).
# NOTE(review): it has no `title` column although the results DataFrame defines one - confirm before enabling.
# executeQuery("""CREATE TABLE IF NOT EXISTS test(
#              datetime TEXT, 
#              url TEXT, 
#              salary TEXT,
#              employer TEXT,
#              workModes TEXT,
#              positionLevels TEXT,
#              offerValidTo TEXT,
#              location TEXT,
#              techstackExpected TEXT,
#              techstackOptional TEXT,
#              responsibilities TEXT,
#              requirements TEXT,
#              optionalRequirements TEXT
#              )""")

# executeQuery("DROP TABLE test")

# Smoke test of the helpers above against a three-column `test` table.
# The table is created only if it is not there yet, so rows from earlier runs remain.
executeQuery("""CREATE TABLE IF NOT EXISTS test(col1 INTEGER, col2 TEXT, col3 INTEGER)""")

# Print the rows before the insert, add one record (col2 is stored as NULL), then print again.
selectAll()
insertRecord({'col1': 2, 'col2': None, 'col3': 3})

selectAll()
# Disabled clean-up / re-creation statements:
# executeQuery("DROP TABLE test")

# executeQuery("""CREATE TABLE IF NOT EXISTS test(col1 INTEGER, col2 TEXT, col3 INTEGER)""")
# insertRecord({'col1': 2, 'col2': 'ixde', 'col3': 4})

[(2, None, 4), (2, None, 4)]
[(2, None, 4), (2, None, 4), (2, None, 3)]
