### Open a browser and set the cookies from a JSON file

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time,json, random, re, datetime
import pandas as pd

def setCookiesFromJson():
    with open('cookies.json', 'r', newline='') as inputdata:
        cookies = json.load(inputdata)
    for cookie in cookies: #works only after driver.get
        driver.add_cookie(cookie)
    driver.refresh() # to load cookies

service = Service(executable_path="chromedriver.exe")
chrome_options = Options()
chrome_options.add_argument("--disable-search-engine-choice-screen")
# chrome_options.add_experimental_option('excludeSwitches', ['enable-logging']) #disable error logging
driver = webdriver.Chrome(service=service, options=chrome_options)

# base_url = "https://theprotocol.it/filtry/python;t/ai-ml;sp"
# base_url = "https://theprotocol.it/filtry/python;t/ai-ml;sp/bialystok;wp"
base_url = "https://theprotocol.it/filtry/sql,python,javascript;t/junior,trainee,assistant;p"
# base_url = "https://theprotocol.it/filtry/helpdesk;sp/warszawa,bialystok;wp/zdalna,hybrydowa,stacjonarna;rw/"
# base_url = "https://theprotocol.it/filtry/trainee,assistant,junior;p"
driver.get(base_url)
setCookiesFromJson()

### Fetch the URLs from all the pages

In [2]:
def anyOffersOnTheList():
    try:
        driver.find_element(By.CSS_SELECTOR, '#main-offers-listing > div.hfenof > div.t2re51w > div')
        return False
    except:
        return True
    
offers_urls = []

def fetchOffersUrlsFromSinglePage():
    offersContainer = driver.find_element("xpath", '//*[@id="main-offers-listing"]/div[1]/div')
    offers = offersContainer.find_elements(By.CLASS_NAME, 'a4pzt2q ')
    # offers = offersContainer.find_elements(By.CSS_SELECTOR, '#offer-title') #also works
    # print('\t'+ str(len(offers)) + ' offers:')
    for offer in offers:
        offers_urls.append(offer.get_property("href"))

page = 1 #theprotocol enumerates pages starting from 1
while True: # because not sure how many pages are there
    site = driver.get(base_url + "?pageNumber=" + str(page))
    if not anyOffersOnTheList():
        print('fetched ' + str(len(offers_urls)) + ' offer urls in total')
        break # break if no results
    else:
        time.sleep(random.uniform(0.5, 1)) #humanize
        fetchOffersUrlsFromSinglePage()
        print('page ' + str(page) + ' urls fetched')
        page += 1

page 1 urls fetched
page 2 urls fetched
page 3 urls fetched
fetched 148 offer urls in total


### Analyse offer

In [3]:
def offerNotFound():
    try:
        driver.find_element("xpath", '//*[@data-test="text-offerNotFound"]')
        return True
    except:
        return False
    
def getOfferDetails():
    #JOB TITLE
    try:
        jobTitle = driver.find_element(By.XPATH, '//*[@data-test="text-offerTitle"]') # this element should always exist
        jobTitle = jobTitle.text
    except:
        jobTitle = None
    
    #SALARY
    try:
        salaryContainer = driver.find_element(By.XPATH, '//*[@data-test="section-contract"]') # this element should always exist
        salaryAndContract = salaryContainer.text
        # print(salaryAndContract  + '\n')
    except:
        salaryAndContract = None
    
    salaryMinAndMax = [None, None]
    if salaryAndContract:
        try: #to recalculate salary to [PLN/month net] #PLN=only unit on protocol?
            grossToNetMultiplier = 0.7
            hoursPerMonthInFullTimeJob = 168
            lines = salaryAndContract.splitlines()
            if len(lines) >= 3: #should be 2-3 tho
                lines[0] = lines[0].replace(" ", "") #remove spaces
                salaryMinAndMax = re.findall(r"\d+", lines[0]) #r = raw
                # salaryUnit = re.findall(r"[^\d–-]", lines[0]) #[exclude digits and –/-]
                # salaryUnit = ''.join(salaryUnit) #join list elements
                if re.findall("brutto", lines[1]) or re.findall("gross", lines[1]): # gross -> net
                    salaryMinAndMax = [(float(elmnt) * grossToNetMultiplier) for elmnt in salaryMinAndMax]
                if re.findall("godz", lines[1]) or re.findall("hr.", lines[1]): # hr -> month
                    salaryMinAndMax = [(float(elmnt) * hoursPerMonthInFullTimeJob) for elmnt in salaryMinAndMax] #possible input float/str 
                salaryMinAndMax = [int(elmnt) for elmnt in salaryMinAndMax] # to ints
        except:
            pass    # salaryMinAndMax = [None, None]

    # EMPLOYER
    try:
        employerElement = driver.find_element("xpath", '//*[@data-test="anchor-company-link"]') # this element should always exist
        employer = employerElement.text + ' ' + employerElement.get_property("href")
    except:
        employer = None
    # print(employer  + '\n')
    
    #WORKFROM, EXP, VALIDTO, LOCATION - "PARAMETERS"
    workModes, positionLevels, offerValidTo, location = '', '', '', ''
    parametersContainer = driver.find_element(By.CLASS_NAME, "c21kfgf")
    parameters = parametersContainer.find_elements(By.CLASS_NAME, "s1bu9jax")
    for param in parameters:
        paramType = param.get_attribute("data-test") #element description
        match paramType:
            case "section-workModes":
                workModes = param.text
            case "section-positionLevels":
                positionLevels = param.text
            case "section-offerValidTo":
                offerValidTo = param.text
            case "section-workplace":
                location = param.text
                try: #to find and click 'more locations' button then fetch what's inside
                    moreLocations = driver.find_element("xpath", '//*[@data-test="button-locationPicker"]')
                    moreLocations.click()
                    # time.sleep(0.05) #probably necessary
                    locations = moreLocations.find_element("xpath", '//*[@data-test="modal-locations"]')
                    location = locations.text
                except:
                    pass #leave location as it was
    # print(workModes + '\n\n' + positionLevels + '\n\n' +  offerValidTo + '\n\n' +  location + '\n')

    # # # TECHSTACK
    descriptionsContainer = driver.find_element(By.CSS_SELECTOR, '#TECHNOLOGY_AND_POSITION')

    techstack = descriptionsContainer.find_elements(By.CLASS_NAME, "c1fj2x2p")
    techstackExpected = None
    techstackOptional = None
    for group in techstack:
        if group.text[0:8] == 'EXPECTED' or group.text[0:8] == 'WYMAGANE':
            techstackExpected = group.text[9:]
        elif group.text[0:8] == 'OPTIONAL': #never saw polish version yet
            techstackOptional = group.text[9:]
    # print(techstackExpected + '\n\n' + techstackOptional + '\n')

    #RESPONSIBILITIES
    try:
        try:
            responsibilities = descriptionsContainer.find_element("xpath", '//*[@data-test="section-responsibilities"]/ul').text #/only ul elements
        except:
            responsibilities = descriptionsContainer.find_element("xpath", '//*[@data-test="section-responsibilities"]').text #/if it's a single entry
    except:
        responsibilities = None
        # print('RESPONSIBILITIES:\n' + str(responsibilities) + '\n' + driver.current_url)

    #REQUIREMENTS
    try:
        try:
            requirements = descriptionsContainer.find_element("xpath", '//*[@data-test="section-requirements"]/ul').text
        except:
            requirements = descriptionsContainer.find_element("xpath", '//*[@data-test="section-requirements"]').text #/if it's a single entry
    except:
        requirements = None
        # print('REQUIREMENTS:\n' + str(requirements) + '\n' + driver.current_url)


    #OPTIONAL REQUIREMENTS
    try:
        optionalRequirementsContainer = descriptionsContainer.find_elements("xpath", '//*[@data-test="section-requirements-optional"]/li')
        if len(optionalRequirementsContainer) > 0:
            optionalRequirements = ''
            for optionalRequirement in optionalRequirementsContainer:
                optionalRequirements += optionalRequirement.text + '\n'
        elif len(optionalRequirementsContainer) <= 0:
            try:
                optionalRequirements = descriptionsContainer.find_element("xpath", '//*[@data-test="section-requirements-optional"]').text
            except:
                optionalRequirements = None
                # print('OPTIONAL:\n' + str(optionalRequirements) + '\n' + driver.current_url)        
    except:
        optionalRequirements = None
    # print('OPTIONAL:\n' + str(optionalRequirements) + '\n' + driver.current_url)
    datetimeNow = str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    return [datetimeNow, datetimeNow, driver.current_url, jobTitle, salaryAndContract, salaryMinAndMax[0], salaryMinAndMax[1], employer, workModes, positionLevels, offerValidTo, location, techstackExpected, techstackOptional, responsibilities, requirements, optionalRequirements]

# driver.get('https://theprotocol.it/szczegoly/praca/internship---tv-apps-and-services-intern-warszawa-plac-europejski-1,oferta,1a840000-f5ea-0ac0-4543-08dcce51a597?s=8321028996&searchId=a5384d80-770f-11ef-a8ea-4bb4c8c16093')
# getOfferDetails()

### Results to dataframe and CSV

In [4]:
# columns = ['datetime', 'url', 'title', 'salaryAndContract', salaryMin, salaryMax, 'employer', 'workModes', 'positionLevels', 'offerValidTo', 'location', 'techstackExpected', 'techstackOptional', 'responsibilities', 'requirements', 'optionalRequirements']
# resultsDataFrame = pd.DataFrame([], columns = columns) #create dataframe with columns only

# # for i in range (len(offers_urls)):
# for i in range (5,6):
#     driver.get(offers_urls[i])
#     if not offerNotFound():
#         resultsDataFrame.loc[len(resultsDataFrame.index)] = getOfferDetails() #append new row
#         print (str(i+1) + '/' + str(len(offers_urls)) + ' done')
#     else:
#         print('OFFER NOT FOUND: ' +  driver.current_url)
#     time.sleep(random.uniform(0.25,0.75)) #Humanize requests frequency

# # resultsDataFrame.to_csv('results.csv', sep=',', encoding='utf-8-sig', index=True, header=True) #export to CSV

### Database management functions

In [99]:
# print(resultsDataFrame.employer)
# # resultsDataFrame.to_sql('offers', 'resultsDf.db') #alchemy needed
import sqlite3

tableName = 'test4' #not needed as an argument

class database():
    def createTableIfNotExists(): #if not exists
        connection = sqlite3.connect('results.db')
        cursor = connection.cursor()
        cursor.execute("CREATE TABLE IF NOT EXISTS " + tableName + """ (
                    datetimeFirst TEXT,
                    datetimeLast TEXT,
                    url TEXT,
                    title TEXT, 
                    salaryAndContract TEXT,
                    salaryMin INT,
                    salaryMax INT,
                    employer TEXT,
                    workModes TEXT,
                    positionLevels TEXT,
                    offerValidTo TEXT,
                    location TEXT,
                    techstackExpected TEXT,
                    techstackOptional TEXT,
                    responsibilities TEXT,
                    requirements TEXT,
                    optionalRequirements TEXT);""")
        connection.commit()
        cursor.close()
        connection.close()

    def selectAll():
        connection = sqlite3.connect('results.db')
        cursor = connection.cursor()
        cursor.execute("SELECT * FROM" + tableName +";")
        connection.commit()
        print(cursor.fetchall())
        cursor.close()
        connection.close()

    def executeQuery(query):
        connection = sqlite3.connect('results.db')
        cursor = connection.cursor()
        cursor.execute(query)
        connection.commit()
        # print(cursor.fetchall())
        cursor.close()
        connection.close()
    
    def recordFound(url):
        urlPartToCompare = re.split("[?]s=", url)[0] #split on '?s=' because after that it's only session related stuff. If no pattern found url unchanged
        # print(urlPartToCompare)
        connection = sqlite3.connect('results.db')
        cursor = connection.cursor()
        cursor.execute("SELECT datetimeFirst FROM " + tableName + " WHERE url LIKE ('%" + urlPartToCompare + "%');")
        connection.commit()
        result = cursor.fetchall()
        cursor.close()
        connection.close()
        if len(result) >0:
            return True
        else:
            return False

    def insertRecord(dictionary):
        connection = sqlite3.connect('results.db')
        cursor = connection.cursor()
        cursor.execute("INSERT INTO " + tableName + " VALUES (:datetimeFirst, :datetimeLast, :url, :title, :salaryAndContract, :salaryMin, :salaryMax, :employer, :workModes, :positionLevels, :offerValidTo, :location, :techstackExpected, :techstackOptional, :responsibilities, :requirements, :optionalRequirements)", dictionary)
        connection.commit()
        cursor.close()
        connection.close()

    def updateDatetimeLast(url):
        urlPartToCompare = re.split("[?]s=", url)[0] #split on '?s=' because after that it's only session related stuff. If no pattern found url unchanged
        connection = sqlite3.connect('results.db')
        cursor = connection.cursor()
        cursor.execute("UPDATE " + tableName + " SET datetimeLast = '" + str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) + "'  WHERE url LIKE ('%" + urlPartToCompare + "%');")
        # cursor.execute("SELECT datetimeLast FROM " + tableName + " WHERE url LIKE ('%" + urlPartToCompare + "%');")
        connection.commit()
        cursor.close()
        connection.close()
    
    def countAllRecords():
        connection = sqlite3.connect('results.db')
        cursor = connection.cursor()
        cursor.execute("SELECT COUNT (*) FROM " + tableName +";")
        connection.commit()
        resultTuple = cursor.fetchall()[0]
        (count,) = resultTuple #unpacking tuple
        cursor.close()
        connection.close()
        return str(count)

    def queryBuilder(onlyLastSeenLessThan24hAgo, keywordsNecessary, keywordsUnacceptable): #keywords lists
        connection = sqlite3.connect('results.db')
        cursor = connection.cursor()
        fullQuery = "SELECT datetimeLast, techstackExpected FROM " + tableName + " WHERE 1=1" #Includes WHERE to always append a line starting with AND
        if onlyLastSeenLessThan24hAgo:
            fullQuery += "\nAND (JULIANDAY(strftime('%Y-%m-%d %H:%M:%S', DATETIME('now', 'localtime'))) - JULIANDAY(datetimeLast)) * 24 < 24" 
        if keywordsNecessary:
            for keyword in keywordsNecessary:
                fullQuery += "\nAND (techstackExpected LIKE ('%"+keyword+"%') OR requirements LIKE ('%"+keyword+"%'))"
        if keywordsUnacceptable:
            for keyword in keywordsUnacceptable:
                fullQuery += "\nAND techstackExpected NOT LIKE ('%"+keyword+"%') AND requirements NOT LIKE ('%"+keyword+"%')"
        fullQuery += ';'
        print(fullQuery)
        cursor.execute(fullQuery)
        connection.commit()
        # print(cursor.fetchall())
        print('\n'+str(len(cursor.fetchall())) + ' records found')
        cursor.close()
        connection.close()
    
database.createTableIfNotExists()
database.queryBuilder(True, ['SQL'], ['javascript', 'c++'])
database.countAllRecords()
# database.executeQuery("DROP TABLE" + tableName)

SELECT datetimeLast, techstackExpected FROM test4 WHERE 1=1
AND (JULIANDAY(strftime('%Y-%m-%d %H:%M:%S', DATETIME('now', 'localtime'))) - JULIANDAY(datetimeLast)) * 24 < 24
AND (techstackExpected LIKE ('%SQL%') OR requirements LIKE ('%SQL%'))
AND techstackExpected NOT LIKE ('%javascript%') AND requirements NOT LIKE ('%javascript%')
AND techstackExpected NOT LIKE ('%c++%') AND requirements NOT LIKE ('%c++%');

61 records found


'171'

In [8]:
columns = ['datetimeFirst', 'datetimeLast', 'url', 'title', 'salaryAndContract', 'salaryMin', 'salaryMax', 'employer', 'workModes', 'positionLevels', 'offerValidTo', 'location', 'techstackExpected', 'techstackOptional', 'responsibilities', 'requirements', 'optionalRequirements']

import numpy as np

timeDeltas = []
inserts = 0
updates = 0
print(database.countAllRecords() + ' records before run')
# for i in range (0,2):
for i in range (len(offers_urls)):
    driver.get(offers_urls[i])
    if not offerNotFound():
        resultsList = getOfferDetails()
        outputDictionary = {}
        for column, offerDetail in zip(columns, resultsList):
            outputDictionary[column] = offerDetail #combine 2 lists into 1 dictionary
        before = time.time()
        if database.recordFound(driver.current_url):
            database.updateDatetimeLast(driver.current_url)
            # print(driver.current_url)
            updates += 1
        else:
            database.insertRecord(outputDictionary) # insert into databas
            inserts += 1
            # print('insert')
        timeDeltas.append(time.time() - before)
        #ending here and starting in an above for/zip loop it takes ~(1/100)s - good enough
        print (str(i+1) + '/' + str(len(offers_urls)) + ' done')
    else:
        print('OFFER NOT FOUND: ' +  driver.current_url)
    time.sleep(random.uniform(0.35,0.85)) #Humanize requests frequency
# print(np.mean(timeDeltas))
print(str(inserts) + ' inserts | ' + str(updates) + ' updates')

148 records before run
1/148 done
2/148 done
3/148 done
4/148 done
5/148 done
6/148 done
7/148 done
8/148 done
9/148 done
10/148 done
11/148 done
12/148 done
13/148 done
14/148 done
15/148 done
16/148 done
17/148 done
18/148 done
19/148 done
20/148 done
21/148 done
22/148 done
23/148 done
24/148 done
25/148 done
26/148 done
27/148 done
28/148 done
29/148 done
30/148 done
31/148 done
32/148 done
33/148 done
34/148 done
35/148 done
36/148 done
37/148 done
38/148 done
39/148 done
40/148 done
41/148 done
42/148 done
43/148 done
44/148 done
45/148 done
46/148 done
47/148 done
48/148 done
49/148 done
50/148 done
51/148 done
52/148 done
53/148 done
54/148 done
55/148 done
56/148 done
57/148 done
58/148 done
59/148 done
60/148 done
61/148 done
62/148 done
63/148 done
64/148 done
65/148 done
66/148 done
67/148 done
68/148 done
69/148 done
70/148 done
71/148 done
72/148 done
73/148 done
74/148 done
75/148 done
76/148 done
77/148 done
78/148 done
79/148 done
80/148 done
81/148 done
82/148 done
83

In [35]:
## TODO: waluty -> zł? chyba nie ma innej waluty na stronie
## elif group.text[0:8] == 'OPTIONAL': #never saw polish version yet
# wykresy posortowane po salaryMin/Max
# wykresy po zliczeniu słów?
# możliwość łączenia filtrów i pokazywania wyników na wykresie posortowanych po x?
# search or exclude by keyword

15
