### Open a browser and set the cookies from a JSON file

In [682]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time,json, random, re, datetime
import pandas as pd
pd.options.mode.copy_on_write = True # recommended - https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

# ChromeDriver should match browser version. If outdated download from:
# https://googlechromelabs.github.io/chrome-for-testing/

def setCookiesFromJson():
    """Load the cookies stored in ``cookies.json`` into the running browser.

    BASE_URL is opened first because Selenium only accepts cookies for the
    domain of the page that is currently loaded. Every cookie whose ``domain``
    equals the current host (with an optional leading dot) is added, then the
    page is refreshed so the cookies take effect.

    Returns:
        dict: ``success`` (bool), ``functionDone`` (always True, cookies are
        optional for the rest of the notebook) and ``message`` (str).
    """
    try:
        DRIVER.get(BASE_URL)  # a page has to be open before cookies can be added
        domainMatch = re.search(r'^https?://([^/]+)', DRIVER.current_url)
        if domainMatch is None:
            return {'success':False, 'functionDone':True, 'message':'could not read the domain from ' + DRIVER.current_url}
        currentUrlDomain = domainMatch.group(1)
        currentUrlDomain = re.sub(r'^www\.', '', currentUrlDomain)
        currentUrlDomain = re.sub(r'^\.', '', currentUrlDomain)

        with open('cookies.json', 'r', encoding='utf-8') as inputdata:
            cookies = json.load(inputdata)

        # The domain is escaped so that its dots are literal, and the whole
        # cookie domain has to match - not just its beginning.
        domainPattern = re.compile(r"\.?" + re.escape(currentUrlDomain))
        cookiesAdded = 0
        for cookie in cookies:
            if domainPattern.fullmatch(cookie.get('domain', '')): # can only add cookies for current domain
                DRIVER.add_cookie(cookie)
                cookiesAdded += 1

        if cookiesAdded > 0:
            DRIVER.refresh() # to load cookies
            return {'success':True, 'functionDone':True, 'message':'cookies for ' + currentUrlDomain + ' successfully set'}
        return {'success':False, 'functionDone':True, 'message':'no cookies for ' + currentUrlDomain + ' found in cookies.json'}
    except Exception as exception:
        # Best effort: report the problem instead of stopping the notebook.
        return {'success':False, 'functionDone':True, 'message':str(exception)}

# Start Chrome through the chromedriver.exe found in the working directory.
service = Service(executable_path="chromedriver.exe")
chrome_options = Options()

# Skip Chrome's "choose your search engine" screen and use a fixed window size.
chrome_options.add_argument("--disable-search-engine-choice-screen")
chrome_options.add_argument("window-size=800,1000")
# chrome_options.add_experimental_option('excludeSwitches', ['enable-logging']) #disable error logging
# DRIVER is the single browser session used by every function in this notebook.
DRIVER = webdriver.Chrome(service=service, options=chrome_options)

# Listing page the scraping starts from.
BASE_URL = "https://justjoin.it/job-offers/bialystok?with-salary=yes"
DRIVER.get(BASE_URL)
# NOTE(review): setCookiesFromJson() opens BASE_URL itself, so the get() above looks redundant.
setCookiesFromJson()

{'success': True,
 'functionDone': True,
 'message': 'cookies for justjoin.it successfully set'}

### Fetch the URLs from all the pages

In [683]:
# def offerNotFound():
#     try:
#         elemnt = DRIVER.find_element("xpath", '/html/body/div[2]/div/div/div/div[2]/div/div/div/div[1]/p')
#         # print(el.get_attribute('data-index'))
#         print(elemnt.text)
#         if elemnt.text == 'We did not find any offers for the above search criteria.':
#             return True
#     except:
#         return False
# offerNotFound()

# Offer URLs collected by fetchOffersUrlsFromSinglePage().
offers_urls = []
# data-index attribute of every collected list item, in collection order.
lastIndexesList = []

def anyOffersOnTheList():
    """Tell whether the page currently open in DRIVER lists at least one offer.

    Returns:
        bool: True when at least one ``div[@data-index]`` element is found,
        False when none is found or when any of the lookups fails.
    """
    try:
        content = DRIVER.find_element(By.CSS_SELECTOR, '.MuiBox-root.css-ggjav7')
        offersList = content.find_element(By.XPATH, '//*[@data-test-id="virtuoso-item-list"]')
        # NOTE(review): an XPath starting with '//' is evaluated against the whole
        # document, not only below the element it is called on - confirm whether
        # './/' was intended here.
        offers = offersList.find_elements(By.XPATH, '//div[@data-index]')
    except Exception:
        return False
    # Always answer with a bool (an empty list used to fall through and return None).
    return len(offers) > 0
# Manual check against the page that is currently open (the result is the cell output).
anyOffersOnTheList()

def fetchOffersUrlsFromSinglePage():
    """Collect the offers rendered on the currently open listing page.

    For every ``div[@data-index]`` item found, its ``data-index`` is appended to
    the global ``lastIndexesList`` and the ``href`` of its link is appended to
    the global ``offers_urls``. Nothing is returned.
    """
    # Author's note: the "offers found" counter shown on the page
    # ('.MuiTypography-root.MuiTypography-subtitle4.css-pmys26') does not match
    # the number of items counted through data-index, so it is not used.
    pageContent = DRIVER.find_element(By.CSS_SELECTOR, '.MuiBox-root.css-ggjav7')
    virtuosoList = pageContent.find_element(By.XPATH, '//*[@data-test-id="virtuoso-item-list"]')
    # Author's note: 30 or 31 offers are found here.
    for listItem in virtuosoList.find_elements(By.XPATH, '//div[@data-index]'):
        lastIndexesList.append(listItem.get_attribute('data-index'))
        offerLink = listItem.find_element(By.XPATH, ".//div/div/a")
        offers_urls.append(offerLink.get_property("href"))

# Collect the offers visible now, then scroll down by one window height.
# The loop below is disabled, so only one page worth of offers is fetched.
# for i in range(10):
fetchOffersUrlsFromSinglePage()
DRIVER.execute_script("window.scrollBy(0, innerHeight);")
# Show the first and the last data-index that were collected.
print(lastIndexesList[0], lastIndexesList[-1])

0 23


### Analyse offer functions

In [690]:
# try:
#     jobTitle = DRIVER.find_elements(By.CSS_SELECTOR, '.css-sy4ig6')
#     print(jobTitle)
#     jobTitle = jobTitle.text
# except:
#     jobTitle = None
# print(jobTitle)

# singleLocation = https://justjoin.it/job-offer/emagine-polska-technical-architect-warszawa-architecture

def _parseSalaryRange(salaryAndContract, grossToNetMultiplier=0.7, hoursPerMonthInFullTimeJob=168):
    """Turn the text of the salary box into ``[min, max]`` in PLN/month net.

    The first line is expected to hold the amounts ("21 000 - 25 000 PLN") and
    the second line the terms ("Net/month - B2B"). Further salary lines (other
    contract types) are ignored; they stay available in the raw text.

    Returns:
        list: two ints, or ``[None, None]`` when no amount can be read.
    """
    lines = salaryAndContract.splitlines()
    if not lines:
        return [None, None]

    amountsLine = re.sub(r"\s", "", lines[0])           # "21 000 - 25 000" -> "21000-25000"
    amountsLine = re.sub(r",\d{1,2}", "", amountsLine)  # drop decimals written as "123,45"
    amounts = []
    for part in re.split(r"[-–—]", amountsLine)[:2]:    # min and max are separated by a dash
        number = re.search(r"\d+", part)
        if number is None:
            return [None, None]
        amounts.append(float(number.group()))
    if len(amounts) == 1:
        amounts.append(amounts[0])                       # single value: min == max

    termsLine = lines[1] if len(lines) > 1 else ''
    if re.search(r"brutto|gross", termsLine, re.IGNORECASE):   # gross -> net
        amounts = [amount * grossToNetMultiplier for amount in amounts]
    # hour -> month; '/h' is presumably how this site abbreviates hourly rates - TODO confirm
    if re.search(r"godz|hr\.|/h\b", termsLine, re.IGNORECASE):
        amounts = [amount * hoursPerMonthInFullTimeJob for amount in amounts]

    # round() instead of int(): int() would cut float artefacts such as 8189.999999 down to 8189
    return [round(amount) for amount in amounts]


def getOfferDetails():
    """Scrape the offer page that is currently open in DRIVER.

    Every field is read on a best-effort basis: a field that cannot be found
    is returned as an empty string (or None for the salary numbers).

    Returns:
        list | None: None when the main offer containers cannot be found.
        Otherwise 17 values in this order: datetimeFirst, datetimeLast, url,
        title, salaryAndContract, salaryMin, salaryMax, employer, workModes,
        positionLevels, offerValidTo, location, techstackExpected,
        techstackOptional, responsibilities, requirements, optionalRequirements.
    """
    # BASIC PARAMETERS WHICH SHOULD ALWAYS BE NOT EMPTY ON THE SITE
    try:
        offerContent = DRIVER.find_element(By.CSS_SELECTOR, '.MuiBox-root.css-tnvghs')
        topContainer = offerContent.find_element(By.CSS_SELECTOR, 'div') # first div inside
        topDiv = topContainer.find_element(By.XPATH, ".//*[contains(@class, 'css-10x887j')]") # .// = as deep as necessary
    except Exception:
        return None # no point of continuing

    # TITLE AND EMPLOYER
    jobTitle, employer = '', ''
    employerAndLocationDiv = None
    try:
        jobTitle = topDiv.find_element(By.CSS_SELECTOR, 'h1').text
    except Exception:
        pass
    try:
        employerAndLocationDiv = topDiv.find_element(By.CSS_SELECTOR, '.MuiBox-root.css-yd5zxy')
        employer = employerAndLocationDiv.find_element(By.XPATH, './/h2').text # look for h2 as deep as necessary
    except Exception:
        pass

    # LOCATION
    location = ''
    if employerAndLocationDiv is not None:
        try:
            location = employerAndLocationDiv.find_elements(By.CSS_SELECTOR, '.MuiBox-root.css-mswf74')[1].text # first one is employer
        except Exception:
            location = ''
        # try clicking for more locations
        try:
            locationButton = employerAndLocationDiv.find_element("xpath", '//*[@name="multilocation_button"]')
            locationButton.click()
            locationsMenu = offerContent.find_element("xpath", '//ul[@role="menu"]')
            location += '\n' + locationsMenu.text # TEXT EMPTY WHEN MINIMIZED!
        except Exception:
            pass # no extra locations for this offer

    # SALARY
    try:
        salaryAndContract = topContainer.find_element(By.CSS_SELECTOR, '.MuiBox-root.css-1km0bek').text
    except Exception:
        salaryAndContract = ''
    salaryMinAndMax = _parseSalaryRange(salaryAndContract) # Nones or ints, as these are INTs in DB

    # CONTRACT, LEVEL AND WORK MODE
    workModes = ''
    positionLevels = ''
    try:
        fourRectangles = offerContent.find_elements(By.CSS_SELECTOR, '.MuiBox-root.css-snbmy4') # contains just what we need
        salaryAndContract += '\n' + fourRectangles[0].text + ' | ' + fourRectangles[2].text
        positionLevels = fourRectangles[1].text
        workModes = fourRectangles[3].text
    except Exception:
        pass

    # TECHSTACK
    expectedTechs, optionalTechs = [], []
    try:
        techstackDiv = offerContent.find_elements(By.CSS_SELECTOR, '.MuiBox-root.css-qal8sw')[0]
        techstackDiv = techstackDiv.find_element(By.CSS_SELECTOR, 'div') # first div inside
        technologies = techstackDiv.find_elements(By.XPATH, './/h4') # technology names
        levels = techstackDiv.find_elements(By.XPATH, './/span') # their levels
        for technology, level in zip(technologies, levels):
            levelText = level.text
            techWithLvl = technology.text + ' - ' + levelText
            if levelText == 'Nice To Have':
                optionalTechs.append(techWithLvl)
            else: # junior/regular/advanced/master
                expectedTechs.append(techWithLvl)
    except Exception:
        pass # keep whatever was collected so far
    techstackExpected = '\n'.join(expectedTechs)
    techstackOptional = '\n'.join(optionalTechs)
    print('===========================')

    # DESCRIPTION - work in progress: the text is only printed, not parsed yet
    offerValidTo = '' # REMOVE FROM DB?
    responsibilities, requirements, optionalRequirements = '', '', ''
    try:
        descriptionDiv = offerContent.find_elements(By.CSS_SELECTOR, '.MuiBox-root.css-qal8sw')[1]
        descriptionDiv = descriptionDiv.find_elements(By.XPATH, "./div") # 1 level down
        print(descriptionDiv[1].text)
        # Headings seen so far: Must have | Nice to have skills: | Responsibilities:
        # DODATKOWE INFORMACJE: | WYMAGANIA:
    except Exception:
        pass

    print(responsibilities, requirements, optionalRequirements)

    datetimeNow = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    return [datetimeNow, datetimeNow, DRIVER.current_url, jobTitle, salaryAndContract, salaryMinAndMax[0], salaryMinAndMax[1], employer, workModes, positionLevels, offerValidTo, location, techstackExpected, techstackOptional, responsibilities, requirements, optionalRequirements]

# Manual run against the offer page that is currently open in the browser.
# driver.get('')

getOfferDetails()

Salary: 1000 - 1200 PLN/MD + VAT on B2B
Remote: 100% remote or hybrid from Warsaw, Tricity or Łódź
  Why choose this offer?
You can expect a flexible work organization
The international work environment will give you the opportunity to interact with the English language on a daily basis
Scandinavian organizational culture will provide you with work-life balance, you will gain time for additional training (financed by Jit)
The Jit community will bring you a nice time during regular integration meetings
  Project
The project encompasses the full onboarding of applications and platforms into CyberArk solution, including all stages from integration to password management. The scope involves building custom connection components and password managers, enabling secure and efficient access management across the organization. The goal is to ensure seamless integration of systems with the CyberArk platform, allowing for centralized password and access management while enhancing the overall secu

['2024-12-23 10:39:11',
 '2024-12-23 10:39:11',
 'https://justjoin.it/job-offer/jit-team-senior-it-security-consultant-with-pam-bialystok-security',
 'Senior IT Security Consultant with PAM',
 '21 000 - 25 000 PLN\nNet/month - B2B\nFull-time | B2B',
 21000,
 25000,
 'Jit Team',
 'Remote',
 'Senior',
 '',
 'Białystok, +9\nWarszawa, Aleje Jerozolimskie\nWrocław, -\nGdańsk, -\nGdynia, -\nKraków, -\nKatowice, -\nPoznań, -\nToruń, -\nŁódź, -',
 'Cybersecurity - Advanced\nEnglish - Advanced\nAgile - Regular',
 '',
 '',
 '',
 '']

In [None]:
import random

# Scratch experiment: an exception raised inside a try block in a loop does not
# undo what earlier iterations already appended to the string.
strng = ''
for i in range(10):
    appendLetter = random.random() > 0.5
    try:
        # the int branch raises TypeError, which is swallowed on purpose
        strng += 'a' if appendLetter else 2
    except:
        pass

print(strng)

In [None]:
# def getOfferDetails():
#     #JOB TITLE
#     try:
#         jobTitle = driver.find_element(By.XPATH, '//*[@data-test="text-offerTitle"]') # this element should always exist
#         jobTitle = jobTitle.text
#     except:
#         jobTitle = None
    
#     #SALARY
#     try:
#         salaryContainer = driver.find_element(By.XPATH, '//*[@data-test="section-contract"]') # this element should always exist
#         salaryAndContract = salaryContainer.text
#         # print(salaryAndContract  + '\n')
#     except:
#         salaryAndContract = None
    
#     salaryMinAndMax = [None, None]
#     if salaryAndContract:
#         try: #to recalculate salary to [PLN/month net] #PLN=only unit on protocol?
#             grossToNetMultiplier = 0.7
#             hoursPerMonthInFullTimeJob = 168
#             lines = salaryAndContract.splitlines()
#             if len(lines) >= 3: #should be 2-3 tho
#                 lines[0] = lines[0].replace(" ", "") #remove spaces
#                 lines[0] = re.sub(r",\d{1,2}", '', lines[0]) #removes dash and /d x(1-2)  (needed when salary as 123,45)
#                 salaryMinAndMax = re.findall(r"\d+", lines[0]) #r = raw
#                 # print(salaryMinAndMax.split(',', 1)[0])
#                 # salaryUnit = re.findall(r"[^\d–-]", lines[0]) #[exclude digits and –/-]
#                 # salaryUnit = ''.join(salaryUnit) #join list elements
#                 if re.findall("brutto", lines[1]) or re.findall("gross", lines[1]): # gross -> net
#                     salaryMinAndMax = [(float(elmnt) * grossToNetMultiplier) for elmnt in salaryMinAndMax]
#                     # print(salaryMinAndMax)
#                 if re.findall("godz", lines[1]) or re.findall("hr.", lines[1]): # hr -> month
#                     salaryMinAndMax = [(float(elmnt) * hoursPerMonthInFullTimeJob) for elmnt in salaryMinAndMax] #possible input float/str

#                 salaryMinAndMax = [int(elmnt) for elmnt in salaryMinAndMax] # to ints
#         except:
#             pass    # salaryMinAndMax = [None, None]

#     # EMPLOYER
#     try:
#         employerElement = driver.find_element("xpath", '//*[@data-test="anchor-company-link"]') # this element should always exist
#         employer = employerElement.text + ' ' + employerElement.get_property("href")
#     except:
#         employer = None
#     # print(employer  + '\n')
    
#     #WORKFROM, EXP, VALIDTO, LOCATION - "PARAMETERS"
#     workModes, positionLevels, offerValidTo, location = '', '', '', ''
#     parametersContainer = driver.find_element(By.CLASS_NAME, "c21kfgf")
#     parameters = parametersContainer.find_elements(By.CLASS_NAME, "s1bu9jax")
#     for param in parameters:
#         paramType = param.get_attribute("data-test") #element description
#         match paramType:
#             case "section-workModes":
#                 workModes = param.text
#             case "section-positionLevels":
#                 positionLevels = param.text
#             case "section-offerValidTo":
#                 offerValidTo = param.text
#             case "section-workplace":
#                 location = param.text
#                 try: #to find and click 'more locations' button then fetch what's inside
#                     moreLocations = driver.find_element("xpath", '//*[@data-test="button-locationPicker"]')
#                     moreLocations.click()
#                     # time.sleep(0.05) #probably necessary
#                     locations = moreLocations.find_element("xpath", '//*[@data-test="modal-locations"]')
#                     location = locations.text
#                 except:
#                     pass #leave location as it was
#     # print(workModes + '\n\n' + positionLevels + '\n\n' +  offerValidTo + '\n\n' +  location + '\n')

#     #TECHSTACK
#     descriptionsContainer = driver.find_element(By.CSS_SELECTOR, '#TECHNOLOGY_AND_POSITION')

#     techstack = descriptionsContainer.find_elements(By.CLASS_NAME, "c1fj2x2p")
#     techstackExpected = None
#     techstackOptional = None
#     for group in techstack:
#         if group.text[0:8] == 'EXPECTED' or group.text[0:8] == 'WYMAGANE': # eng/pl same word length
#             techstackExpected = group.text[9:]
#         elif group.text[0:8] == 'OPTIONAL':
#             techstackOptional = group.text[9:]
#         elif group.text[0:13] == 'MILE WIDZIANE': # polish version
#             techstackOptional = group.text[14:]
#     # print(str(techstackExpected) + '\n\n' + str(techstackOptional) + '\n')

#     #RESPONSIBILITIES
#     try:
#         try:
#             responsibilities = descriptionsContainer.find_element("xpath", '//*[@data-test="section-responsibilities"]/ul').text #/only ul elements
#         except:
#             responsibilities = descriptionsContainer.find_element("xpath", '//*[@data-test="section-responsibilities"]').text #/if it's a single entry
#     except:
#         responsibilities = None
#         # print('RESPONSIBILITIES:\n' + str(responsibilities) + '\n' + driver.current_url)

#     #REQUIREMENTS
#     try:
#         try:
#             requirements = descriptionsContainer.find_element("xpath", '//*[@data-test="section-requirements"]/ul').text
#         except:
#             requirements = descriptionsContainer.find_element("xpath", '//*[@data-test="section-requirements"]').text #/if it's a single entry
#     except:
#         requirements = None
#         # print('REQUIREMENTS:\n' + str(requirements) + '\n' + driver.current_url)


#     #OPTIONAL REQUIREMENTS
#     try:
#         optionalRequirementsContainer = descriptionsContainer.find_elements("xpath", '//*[@data-test="section-requirements-optional"]/li')
#         if len(optionalRequirementsContainer) > 0:
#             optionalRequirements = ''
#             for optionalRequirement in optionalRequirementsContainer:
#                 optionalRequirements += optionalRequirement.text + '\n'
#         elif len(optionalRequirementsContainer) <= 0:
#             try:
#                 optionalRequirements = descriptionsContainer.find_element("xpath", '//*[@data-test="section-requirements-optional"]').text
#             except:
#                 optionalRequirements = None
#                 # print('OPTIONAL:\n' + str(optionalRequirements) + '\n' + driver.current_url)        
#     except:
#         optionalRequirements = None
#     # print('OPTIONAL:\n' + str(optionalRequirements) + '\n' + driver.current_url)
#     datetimeNow = str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
#     return [datetimeNow, datetimeNow, driver.current_url, jobTitle, salaryAndContract, salaryMinAndMax[0], salaryMinAndMax[1], employer, workModes, positionLevels, offerValidTo, location, techstackExpected, techstackOptional, responsibilities, requirements, optionalRequirements]

# driver.get('https://theprotocol.it/szczegoly/praca/mlodszy-specjalista-it-warszawa-nowoursynowska-162j,oferta,f03f0000-5202-f248-4bdc-08dce9bbe033?s=-3293542755&searchId=533dda90-88b7-11ef-942d-5f9061073a19')
# getOfferDetails()



### Database management functions

In [None]:
# print(resultsDataFrame.employer)
# # resultsDataFrame.to_sql('offers', 'resultsDf.db') #alchemy needed
import sqlite3

tableName = 'test4' #not needed as an argument

class database():
    """Helpers around the SQLite file that stores the scraped offers.

    All helpers are static and are called on the class itself, e.g.
    ``database.countAllRecords()``. Each call opens its own connection to
    ``dbPath`` and closes it again, also when the query fails. The table name
    is read from the module-level ``tableName`` at call time.
    """

    dbPath = 'results.db'

    @staticmethod
    def _run(query, parameters=(), fetch=False):
        """Execute one statement; return all rows when ``fetch`` is True, else None."""
        connection = sqlite3.connect(database.dbPath)
        try:
            cursor = connection.cursor()
            cursor.execute(query, parameters)
            rows = cursor.fetchall() if fetch else None
            connection.commit()
            return rows
        finally:
            connection.close()

    @staticmethod
    def _urlWithoutSession(url):
        """Cut the url at '?s=' because after that it's only session related stuff."""
        return re.split("[?]s=", url)[0] # if no pattern found url unchanged

    @staticmethod
    def createTableIfNotExists():
        """Create the offers table unless it already exists."""
        database._run("CREATE TABLE IF NOT EXISTS " + tableName + """ (
                    datetimeFirst TEXT,
                    datetimeLast TEXT,
                    url TEXT,
                    title TEXT, 
                    salaryAndContract TEXT,
                    salaryMin INT,
                    salaryMax INT,
                    employer TEXT,
                    workModes TEXT,
                    positionLevels TEXT,
                    offerValidTo TEXT,
                    location TEXT,
                    techstackExpected TEXT,
                    techstackOptional TEXT,
                    responsibilities TEXT,
                    requirements TEXT,
                    optionalRequirements TEXT);""")

    @staticmethod
    def selectAll():
        """Print every row of the table."""
        # the space after FROM was missing, which made this query invalid
        print(database._run("SELECT * FROM " + tableName + ";", fetch=True))

    @staticmethod
    def executeQuery(query):
        """Execute an arbitrary statement and commit it; nothing is returned."""
        database._run(query)

    @staticmethod
    def recordFound(url):
        """Return True when a stored url contains ``url`` (without its '?s=' part)."""
        urlPartToCompare = database._urlWithoutSession(url)
        # The url is bound as a parameter and compared with instr(), so quotes
        # cannot break the query and '_' / '%' are not treated as wildcards.
        result = database._run(
            "SELECT datetimeFirst FROM " + tableName + " WHERE instr(url, ?) > 0;",
            (urlPartToCompare,),
            fetch=True,
        )
        return len(result) > 0

    @staticmethod
    def insertRecord(dictionary):
        """Insert one offer; ``dictionary`` needs a key for every table column."""
        database._run("INSERT INTO " + tableName + " VALUES (:datetimeFirst, :datetimeLast, :url, :title, :salaryAndContract, :salaryMin, :salaryMax, :employer, :workModes, :positionLevels, :offerValidTo, :location, :techstackExpected, :techstackOptional, :responsibilities, :requirements, :optionalRequirements)", dictionary)

    @staticmethod
    def updateDatetimeLast(url):
        """Set datetimeLast to the current local time for the rows matching ``url``."""
        urlPartToCompare = database._urlWithoutSession(url)
        datetimeNow = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        database._run(
            "UPDATE " + tableName + " SET datetimeLast = ? WHERE instr(url, ?) > 0;",
            (datetimeNow, urlPartToCompare),
        )

    @staticmethod
    def countAllRecords():
        """Return the number of rows in the table as a string."""
        result = database._run("SELECT COUNT (*) FROM " + tableName + ";", fetch=True)
        (count,) = result[0] #unpacking tuple
        return str(count)

    @staticmethod
    def queryToDataframe(fullQuery):
        """Run ``fullQuery`` and return the result as a pandas DataFrame."""
        connection = sqlite3.connect(database.dbPath)
        try:
            return pd.read_sql(fullQuery, con=connection)
        finally:
            connection.close()
    
# Make sure the table exists, then show how many records it holds (cell output).
database.createTableIfNotExists()
database.countAllRecords()
# To start over, drop the table (note the space after TABLE):
# database.executeQuery("DROP TABLE " + tableName)

### Scraping to database

In [None]:
# Column names of the offers table, in the same order as the values returned
# by getOfferDetails(). (move out of global scope later)
columnsAll = [
    'datetimeFirst',
    'datetimeLast',
    'url',
    'title',
    'salaryAndContract',
    'salaryMin',
    'salaryMax',
    'employer',
    'workModes',
    'positionLevels',
    'offerValidTo',
    'location',
    'techstackExpected',
    'techstackOptional',
    'responsibilities',
    'requirements',
    'optionalRequirements',
]

import numpy as np

def scrapToDatabase(maxOffers=2):
    """Visit the collected offer URLs and store every offer in the database.

    An offer whose URL is already stored only gets its ``datetimeLast``
    refreshed; a new offer is inserted. Progress and a final summary are printed.

    Args:
        maxOffers: how many URLs from ``offers_urls`` to visit, starting from
            the first one. The default of 2 keeps test runs short; pass None
            to visit all of them.
    """
    inserts = 0
    updates = 0
    print(database.countAllRecords() + ' records before run')
    # Slicing (instead of indexing a fixed range) also copes with fewer URLs than maxOffers.
    urlsToVisit = offers_urls if maxOffers is None else offers_urls[:maxOffers]
    for position, offerUrl in enumerate(urlsToVisit, start=1):
        DRIVER.get(offerUrl)
        # getOfferDetails() returns None when the offer content is not on the page.
        resultsList = getOfferDetails()
        if resultsList is not None:
            outputDictionary = dict(zip(columnsAll, resultsList)) #combine 2 lists into 1 dictionary
            if database.recordFound(DRIVER.current_url):
                database.updateDatetimeLast(DRIVER.current_url)
                updates += 1
            else:
                database.insertRecord(outputDictionary) # insert into database
                inserts += 1
            print (str(position) + '/' + str(len(offers_urls)) + ' done')
        else:
            print('OFFER NOT FOUND: ' +  DRIVER.current_url)
        time.sleep(random.uniform(0.35,0.85)) #Humanize requests frequency
    print(str(inserts) + ' inserts | ' + str(updates) + ' updates')

# Run the scraper for the URLs collected earlier in the notebook.
scrapToDatabase()

In [None]:
from flask import Flask, render_template, request, send_file
from bokeh.plotting import figure
from bokeh.resources import CDN
from bokeh.models.widgets import DataTable, TableColumn
from bokeh.models import ColumnDataSource, WheelZoomTool, HTMLTemplateFormatter, HoverTool, TapTool, Range1d, LinearAxis
from bokeh.embed import json_item
from bokeh.io import curdoc #for dark theme
import io #for a csv buffer

def makeBokehPlot(dataframe): #Only offers with specified salary?
    """Build the salary bar chart for the given offers.

    Offers with a known salary get a bar at their average salary plus a
    min-max segment. Offers without a salary get a bar at a placeholder
    height: the mean of the average min and average max of the first two
    offers that do have one, or 4200 when no offer has one. A second y-axis
    shows for how many days each offer has been active.

    Args:
        dataframe: offers with the columns title, salaryMin, salaryMax,
            datetimeFirst and datetimeLast. Author's notes: it has at least
            one row and is already ordered by (salaryMin+salaryMax)/2
            ascending - neither is checked here.

    Returns:
        The Bokeh figure, which is also added to ``curdoc()`` so that the dark
        theme is applied to it.
    """
    pd.options.mode.copy_on_write = True #recommended - https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
    pd.set_option('future.no_silent_downcasting', True)

    def daysActive(frame):
        # whole days between the first and the last time the offer was seen
        deltas = pd.to_datetime(frame["datetimeLast"]) - pd.to_datetime(frame["datetimeFirst"])
        return [delta.days for delta in deltas.tolist()]

    nonNanRowsDf = dataframe[dataframe['salaryMin'].notna()]
    nanRowsDf = dataframe[dataframe['salaryMin'].isna()]

    # SPECIFY UNSPECIFIED BARS HEIGHT
    if len(nonNanRowsDf) > 0: #otherwise division by 0 possible
        lookUpToValues = 2 #how many values to count average
        lowestMinSalaries = nonNanRowsDf['salaryMin'].head(lookUpToValues).tolist() #select up to 2 values
        avgOfNLowestMinSalaries = sum(lowestMinSalaries) / len(lowestMinSalaries)
        lowestMaxSalaries = nonNanRowsDf['salaryMax'].head(lookUpToValues).tolist() #select up to 2 values
        avgOfNLowestMaxSalaries = sum(lowestMaxSalaries) / len(lowestMaxSalaries)
    else: #if only unspecified salaries found
        avgOfNLowestMinSalaries = 4200 #some value to plot
        avgOfNLowestMaxSalaries = 4200
    # The placeholder goes straight into the plot data below; the salary
    # columns of nanRowsDf are never read, so they are no longer filled in.
    unspecifiedSalaryAvg = (avgOfNLowestMinSalaries + avgOfNLowestMaxSalaries) / 2

    dataSalaryUnspecified = {
        'x': nanRowsDf.index.tolist(),
        'title': nanRowsDf['title'].values.tolist(),
        'activeFor': daysActive(nanRowsDf),
        'salaryAvg': [unspecifiedSalaryAvg for _ in range(len(nanRowsDf))]
    }
    specifiedMin = nonNanRowsDf['salaryMin'].values.tolist()
    specifiedMax = nonNanRowsDf['salaryMax'].values.tolist()
    dataSalarySpecified = {
        'x': nonNanRowsDf.index.tolist(),
        'title': nonNanRowsDf['title'].values.tolist(),
        'activeFor': daysActive(nonNanRowsDf),
        'salaryMin': specifiedMin,
        'salaryMax': specifiedMax,
        'salaryAvg': [(low + high) / 2 for low, high in zip(specifiedMin, specifiedMax)],
    }

    #Calculate ranges - SAFE MAX by declaring default values used if empty list
    maxActiveFor = int(max(max(dataSalaryUnspecified['activeFor'], default=0) , max(dataSalarySpecified['activeFor'], default=0))) +1 #add 1 day
    maxSalary = max(max(dataSalaryUnspecified['salaryAvg'], default=0) , max(dataSalarySpecified['salaryMax'], default=0)) * 1.05 # 5% headroom above the bars

    sourceSalaryUnspecified = ColumnDataSource(dataSalaryUnspecified) #2 data sources
    sourceSalarySpecified = ColumnDataSource(dataSalarySpecified)
    plot = figure(title="", x_axis_label='Offer index', y_axis_label='Salary', height = 400, sizing_mode='stretch_width')
    plot.y_range = Range1d(start=0 - 1, end=maxSalary)
    plot.x_range = Range1d(start=0 - 1, end=int(len(dataframe))) #too much empty space by default
    plot.extra_y_ranges = {"y2": Range1d(start=0, end=maxActiveFor)}
    #COLORS
    salaryUnspecifiedColor = 'rgb(60,60,160)'
    salarySpecifiedColor = 'rgb(80,80,220)'
    daysActiveColor = 'rgb(60,100,40)'
    # The hover tools below pick their renderer by position, so the order of
    # the four vbar calls must stay: [0] unspecified salary, [2] specified salary.
    # SALARY UNSPECIFIED BARS
    plot.vbar('x', top = 'salaryAvg', width = 0.70, source = sourceSalaryUnspecified, color=salaryUnspecifiedColor, alpha = 1) # MAIN BAR
    plot.vbar('x', top = 'activeFor', y_range_name="y2", source = sourceSalaryUnspecified, color=daysActiveColor, alpha = 0.15, width=0.90) # Active for
    # SALARY SPECIFIED BARS
    plot.vbar('x', top = 'salaryAvg', width = 0.70, source = sourceSalarySpecified, color=salarySpecifiedColor, alpha = 1) # MAIN BAR
    plot.vbar('x', top = 'activeFor', y_range_name="y2", source = sourceSalarySpecified, color=daysActiveColor, alpha = 0.15, width=0.90) # Active for
    plot.segment(x0='x', y0='salaryMin', x1='x', y1='salaryMax', source=sourceSalarySpecified, line_width=1.5, color='black', alpha=0.75) #Error bar

    plot.add_layout(LinearAxis(y_range_name="y2", axis_label="Days active"), 'right') # Add the second y-axis to the right

    # Configure minor gridlines
    plot.xgrid.minor_grid_line_color = 'rgb(80,80,80)'
    plot.ygrid.minor_grid_line_color = 'rgb(80,80,80)'
    plot.xgrid.minor_grid_line_alpha = 0.5 # Opacity
    plot.ygrid.minor_grid_line_alpha = 0.5

    taptool = TapTool() #highlight on tap
    wheel_zoom = WheelZoomTool()
    # NOTE(review): this tool is set as the active scroll tool but is not added to the plot below - confirm this is intended.
    plot.toolbar.active_scroll = wheel_zoom
    hoverSalaryUnpecified = HoverTool(tooltips=[("Offer index:", "@x"), ("Job title:", "@title"), ("Salary:", "Unspecified"), ("Active for:", "@activeFor days")])
    hoverSalaryUnpecified.renderers = [plot.renderers[0]]# hover tool only on the salary bars
    hoverSalarySpecified = HoverTool(tooltips=[("Offer index:", "@x"), ("Job title:", "@title"), ("Min/Avg/Max:", "@salaryMin{0.}/@salaryAvg{0.}/@salaryMax{0.}"), ("Active for:", "@activeFor days")]) #{0} = no decimals
    hoverSalarySpecified.renderers = [plot.renderers[2]]# hover tool only on the salary bars
    plot.add_tools(hoverSalarySpecified, hoverSalaryUnpecified, taptool) #wheel_zoom removed for now
    #DARK THEME
    curdoc().theme = 'dark_minimal'
    curdoc().add_root(plot) #to apply the theme
    return plot

def makeBokehTable(dataframe):
    """Build a Bokeh DataTable with one column per column of `dataframe`.

    The 'url' column gets an HTML template formatter so that each value is
    rendered as a link opening in a new tab; every other column uses the
    default cell rendering.

    Returns the DataTable (fixed height of 800, editable, stretched to the
    available width).
    """
    linkTemplate = """<a href="<%= value %>" target="_blank"><%= value %></a>"""

    def buildColumn(name):
        # Only the url column needs special rendering (clickable hyperlink).
        if name != 'url':
            return TableColumn(field=name, title=name)
        return TableColumn(field=name, title=name, formatter=HTMLTemplateFormatter(template=linkTemplate))

    dataSource = ColumnDataSource(dataframe)
    tableColumns = [buildColumn(name) for name in dataframe.columns]
    return DataTable(source=dataSource, columns=tableColumns, height=800, editable=True, sizing_mode="stretch_width")

# Flask application object; the @app.route decorators below register their views on it.
app = Flask(__name__)

@app.route('/downloadCsv')
def downloadCsv():
    """Send the most recent query results as a downloadable CSV file.

    The results are read from the module-level `dataframeTable`, which the
    form view assigns after running a query. The CSV is built in memory, so
    nothing is written to disk.

    Returns:
        A file-download response named "jobScrappingResults <timestamp>.csv",
        or a plain-text 404 response when no query results exist yet.
    """
    try:
        results = dataframeTable
    except NameError:
        # The global is only created once a query has been submitted.
        results = None
    if not hasattr(results, 'to_csv'):
        return 'No query results available yet - submit the form first.', 404

    csvName = "jobScrappingResults " + datetime.datetime.now().strftime("%Y-%m-%d %H.%M.%S") + ".csv"
    buffer = io.BytesIO() # in-memory buffer to avoid saving the csv on a disk
    results.to_csv(buffer, sep=',', encoding='utf-8-sig', index=True, header=True)
    buffer.seek(0)  # rewind so send_file reads from the beginning
    return send_file(buffer, as_attachment=True, download_name=csvName, mimetype='text/csv')

@app.route('/', methods=['GET', 'POST'])
def form():
    """Serve the filter form, or run the submitted filters and return plot + table.

    Any non-POST request (GET, and the HEAD requests Flask dispatches to the
    same view) renders "app.html".

    A POST request must carry a JSON object. For every column in `columnsAll`
    the keys `<column>Show`, `<column>Necessary`, `<column>Forbidden`,
    `<column>Above` and `<column>Below` are read; missing or empty entries
    mean "no filter".

    Returns:
        GET/HEAD: the rendered template.
        POST: a JSON array `[plotItem, tableItem, rowCount]`, or
        `["noResultsFound"]` when either query returns no rows.

    Side effects:
        Stores the table query result in the global `dataframeTable` so it
        can be downloaded later.
    """
    global dataframeTable #to make it accessible to download at all times

    if request.method != 'POST':
        # Also covers HEAD, which would otherwise fall through and return None.
        return render_template("app.html", columnsAll=columnsAll, resources=CDN.render())

    def makeFormOutputDictionary():
        """Convert the posted JSON into {column: {'show', 'necessary', 'forbidden', 'above', 'below'}}."""
        formDictFromJson = request.get_json() #get form values from a request
        outputDict = {}
        for column in columnsAll:
            # Form fields are present only when not empty; empty/missing -> False / None.
            outputDict[column] = {
                'show': bool(formDictFromJson.get(column + 'Show')),
                'necessary': formDictFromJson.get(column + 'Necessary') or None,
                'forbidden': formDictFromJson.get(column + 'Forbidden') or None,
                'above': formDictFromJson.get(column + 'Above') or None,
                'below': formDictFromJson.get(column + 'Below') or None,
            }
        return outputDict

    def queryBuilder(formDictionary):
        """Build the table query and the plot query (same filters and ordering) as SQL strings."""

        def escapeSqlText(value):
            # Double single quotes so user text cannot terminate the SQL string literal.
            # NOTE(review): bound parameters would be safer, but that depends on what
            # database.queryToDataframe accepts - confirm before changing.
            return str(value).replace("'", "''")

        def handleBracketsAndLogicalOperators(expression, param, like):
            """Turn e.g. "(a OR b) AND c" into "(param LIKE ('%a%') OR param LIKE ('%b%')) AND param LIKE ('%c%')".

            " OR " / " AND " (upper case, surrounded by spaces) are the operators.
            Brackets and spaces at the edges of each phrase are treated as grouping
            syntax and kept in place; everything in between is the searched phrase.
            """
            likePart = ' LIKE ' if like else ' NOT LIKE '
            # The capturing group keeps the operators: even indexes hold phrases, odd indexes operators.
            pieces = re.split(r"( OR | AND )", expression)
            for index in range(0, len(pieces), 2):
                opening, phrase, closing = re.fullmatch(r"([ (]*)(.*?)([ )]*)", pieces[index], re.DOTALL).groups()
                pieces[index] = opening + param + likePart + "('%" + escapeSqlText(phrase) + "%')" + closing
            return ''.join(pieces)

        shownColumns = []
        conditions = []
        for columnName, filters in formDictionary.items():
            if filters.get('show'):
                shownColumns.append(columnName)
            # Phrase filters are wrapped in brackets so an unbracketed OR inside them
            # is not split apart by the ANDs joining the individual filters.
            if filters.get('necessary'):
                conditions.append("(" + handleBracketsAndLogicalOperators(filters['necessary'], columnName, like=True) + ")")
            if filters.get('forbidden'):
                conditions.append("(" + handleBracketsAndLogicalOperators(filters['forbidden'], columnName, like=False) + ")")
            if filters.get('above'):
                conditions.append(columnName + " > '" + escapeSqlText(filters['above']) + "'")
            if filters.get('below'):
                conditions.append(columnName + " < '" + escapeSqlText(filters['below']) + "'")

        wherePart = ("\nWHERE " + "\nAND ".join(conditions)) if conditions else ""
        # Order by average salary, then by time between first and last sighting (in minutes).
        orderPart = '\nORDER BY (salaryMin+SalaryMax)/2 ASC, (JULIANDAY(datetimeLast) - JULIANDAY(datetimeFirst)) * 24 * 60 DESC;'
        queryMainPart = wherePart + orderPart

        query = "SELECT " + ', '.join(shownColumns) + " FROM " + tableName + queryMainPart
        # 2nd query - always select datetimes and salaries for plotting
        queryPlot = "SELECT datetimeFirst, datetimeLast, title, salaryMin, salaryMax FROM " + tableName + queryMainPart
        return query, queryPlot

    queryTable, queryPlot = queryBuilder(makeFormOutputDictionary())
    # The global only ever holds a query result, never the query string.
    dataframeTable = database.queryToDataframe(queryTable)
    dataframePlot = database.queryToDataframe(queryPlot)

    if len(dataframePlot) > 0 and len(dataframeTable) > 0: #tho their lengths should be equal
        plot = makeBokehPlot(dataframePlot)
        table = makeBokehTable(dataframeTable)
        return json.dumps([json_item(plot), json_item(table), int(len(dataframeTable))])

    return json.dumps(['noResultsFound']) #when no results return a str

# Start the Flask server only when this code runs as the main module.
# Debug mode is on; the auto-reloader is switched off (the author's "JUPYTER" marker
# suggests this is needed when running from a notebook - TODO confirm).
if __name__ == "__main__":
    app.run(debug=True, use_reloader=False)#JUPYTER

In [None]:
# Scratch check of the domain-matching pattern: one optional leading character followed by the domain.
# NOTE(review): the domain is concatenated unescaped, so each "." in it matches any character;
# re.escape(strr) would make the match literal.
strr = "theprotocol.it"
print(re.match(r".?"+strr, ".theprotocol.it")) # is r"(www)?.?" too much?