# Data Scraping

## Load Libraries 

In [3]:
import pandas as pd
import numpy as np
import csv
import time
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import re
from unidecode import unidecode

## Pre-compilation
Start Selenium, store the login credentials (username and password) in variables, and create the empty DataFrame (`df`) that will hold the scraped data.

In [5]:
# Selenium 4 expects the chromedriver location to be wrapped in a Service
# object; recent releases no longer accept the path as a positional argument
# of webdriver.Chrome().
from selenium.webdriver.chrome.service import Service

path = "chromedriver.exe"  # chromedriver binary, resolved relative to the working directory
driver = webdriver.Chrome(service=Service(path))
# LinkedIn credentials handed to login(); they are empty in the source and
# have to be filled in before running the notebook.
username = ""
password = ""
# Result table: one row per scraped job offer.
df = pd.DataFrame(columns = ['Company Name', 'Title','Role','Role url','Localisation','Type','Level','Description'])
# Explicit-wait helper bound to the driver with a 20 second timeout.
wait = WebDriverWait(driver, 20)

In [None]:
## Logging in

In [18]:
def login(login, password):
    """Sign in to LinkedIn through the module-level ``driver``.

    Opens the LinkedIn sign-in page, types the credentials into the
    ``username`` and ``password`` fields and clicks the form's button.

    Args:
        login: Account identifier typed into the ``username`` field.
        password: Password typed into the ``password`` field.
    """
    driver.get('https://www.linkedin.com/checkpoint/rm/sign-in-another-account?fromSignIn=true&trk=guest_homepage-basic_nav-header-signin')
    time.sleep(3)  # fixed pause so the sign-in form has time to render
    # Use the ``login`` argument; the previous body read the global
    # ``username`` and silently ignored what the caller passed in.
    driver.find_element(by=By.ID, value="username").send_keys(login)
    time.sleep(3)
    driver.find_element(by=By.ID, value="password").send_keys(password)
    time.sleep(3)
    driver.find_element(by=By.CSS_SELECTOR, value=".login__form button").click()

## Scrolling on the page
LinkedIn contains a security feature that prevents the entire page of listings from loading at once. To see all the listings on a page (25 listings per page), you have to scroll to the bottom of the results block; this function performs that scrolling so that the entire page gets loaded.

In [19]:
def scroll_down(driver):
    """Scroll the job-results pane downwards in 17 steps of 200 px.

    The pane is the element matching ``.jobs-search-results-list``. Its
    ``scrollTop`` is set to 200, 400, ..., 3400 in turn, with a 2 second
    pause before every second step.

    Args:
        driver: Selenium WebDriver showing a LinkedIn job-search page.
    """
    results_pane = driver.find_element(
        by=By.CSS_SELECTOR, value=".jobs-search-results-list"
    )
    step = 1
    while step <= 17:
        # Pause before each even-numbered step (2nd, 4th, ...).
        if step % 2 == 0:
            time.sleep(2)
        offset = step * 200
        driver.execute_script(f"arguments[0].scrollTop = {offset};", results_pane)
        step += 1

## Role check
The scraping function takes the name of each block (the offer title, e.g. "Data Scientist company name") and passes it to this function, which checks whether the title matches one of the roles in the key. If it does not, the function returns False, meaning the block does not have to be clicked. This saves run time, and also space, because the data of irrelevant offers is ignored.

In [20]:
# Ordered (pattern, label) pairs; the first pattern found in the lower-cased
# title decides the label. Patterns are raw strings so that ``\s`` and ``\b``
# reach the regex engine instead of being treated as string escapes.
_ROLE_PATTERNS = (
    (re.compile(r"data\s+scien"), "Data Scientist"),
    (re.compile(r"machine\s+learn"), "Machine Learning"),
    (re.compile(r"data\s+anal"), "Data Analyst"),
    # Word boundaries (rather than mandatory surrounding whitespace) so that
    # titles that start or end with the term, e.g. "AI Engineer" or
    # "Head of AI", are recognised, while "ai" inside a longer word
    # (e.g. "maintenance") is not.
    (re.compile(r"\bartificial\s+intelligence\b|\bai\b"), "A.I."),
)


def role_checker(role):
    """Classify a job title into one of the tracked role categories.

    The match is case-insensitive (the title is lower-cased first).

    Args:
        role: Job title text as shown on the listing card.

    Returns:
        ``(True, label)`` where ``label`` is ``"Data Scientist"``,
        ``"Machine Learning"``, ``"Data Analyst"`` or ``"A.I."`` when the
        title matches a category (checked in that order), otherwise
        ``(False, None)``.
    """
    lowered = role.lower()
    for pattern, label in _ROLE_PATTERNS:
        if pattern.search(lowered):
            return True, label
    return False, None

## Check work type
This function checks the type of work (e.g. full-time, remote). It is included because some companies do not provide this information, which could result in incorrect data being loaded; handling it here saves time on cleaning later.

In [21]:
def requirements_checker(requirements):
    """Split a job-insight line into ASCII-transliterated fields.

    The line is only accepted when it contains one of the recognised
    employment-type markers: ``Pe.ny etat``, ``Niepe.ny etat`` or
    ``Zlecenie`` (presumably the Polish LinkedIn labels for full-time,
    part-time and contract work; confirm against the UI language in use).
    The match is case-sensitive.

    Args:
        requirements: Text of the job-insight element.

    Returns:
        A list. When a marker is present and the text contains "·", one
        stripped, ``unidecode``-transliterated entry per "·"-separated part;
        when a marker is present without "·", ``[transliterated_text, None]``;
        otherwise ``[None, None]``.
    """
    # One raw-string pattern replaces three branches with identical bodies.
    if not re.search(r"(?:Pe|Niepe).ny\setat|Zlecenie", requirements):
        return [None, None]
    if "·" in requirements:
        return [unidecode(part.strip()) for part in requirements.split("·")]
    return [unidecode(requirements), None]

## Scraping the data
The algorithm opens the given link and goes through the offers on the page, scrolling so that they all load. One by one, it enters the blocks for which the role_checker function returns True according to its key. For each such offer the program scrapes the data, from the offer's URL through to the offer description, and assigns these values to our df.

In [22]:
def search(url, max_pages=40):
    """Scrape matching job offers from a LinkedIn search into ``df``.

    For every result page the listing pane is scrolled with ``scroll_down``,
    each card's title is passed to ``role_checker``, and matching cards are
    clicked so that their details can be read. One row per scraped offer is
    appended to the module-level ``df``.

    Args:
        url: Base LinkedIn job-search URL (must already contain a query
            string, since ``&start=...`` is appended to it).
        max_pages: Number of result pages to request; each page is requested
            with a ``start`` offset of ``25 * page``. Defaults to 40.
    """
    time.sleep(1)
    for page in range(max_pages):
        # Build the page URL from the untouched base URL. Reassigning ``url``
        # here would pile up parameters ("&start=0&start=25&start=50...").
        page_url = f"{url}&start={25 * page}"
        driver.get(page_url)
        time.sleep(4)
        scroll_down(driver)
        cards = driver.find_elements(by=By.CSS_SELECTOR, value=".job-card-list")  # job panels
        titles = driver.find_elements(by=By.CSS_SELECTOR, value='.artdeco-entity-lockup__title')  # their titles
        print(len(cards))
        # zip() pairs card j with title j and stops at the shorter list, so a
        # length mismatch cannot raise IndexError.
        for card, title_element in zip(cards, titles):
            time.sleep(3)
            try:
                title = title_element.text
                add_bool, role = role_checker(title)
                if not add_bool:
                    continue  # title is not one of the tracked roles
                card.click()  # open this offer in the details pane
                time.sleep(3)
                print(1)
                panel = driver.find_element(by=By.CSS_SELECTOR, value=".jobs-search__job-details")
                time.sleep(3)
                company_name = panel.find_element(by=By.CSS_SELECTOR, value=".jobs-unified-top-card__company-name").text
                time.sleep(1)
                position_url = panel.find_element(by=By.CLASS_NAME, value="ember-view").get_attribute("href")
                time.sleep(1)
                localisation = driver.find_element(by=By.XPATH, value='//*[@id="main"]/div/section[2]/div/div[2]/div[1]/div/div[1]/div/div[1]/div[1]/div[1]/span[1]/span[2]').text
                time.sleep(1)
                position = requirements_checker(panel.find_element(by=By.CSS_SELECTOR, value=".jobs-unified-top-card__job-insight").text)
                time.sleep(1)
                description = panel.find_element(by=By.CSS_SELECTOR, value=".jobs-description").text
                df.loc[len(df)] = [company_name, title, role, position_url, localisation,
                                   position[0], position[1], description]
                print(2)
            except Exception as exc:
                # Best effort: skip an offer that cannot be read and carry on.
                # ``Exception`` (not a bare except) keeps Ctrl+C working.
                print("error", type(exc).__name__)

In [23]:
# Sign in using the module-level `username` / `password` values.
login(username,password)

In [1]:
# Run the scraper on a LinkedIn job search for the keywords "data scientist";
# the remaining query parameters (geoId, distance, f_E, currentJobId) are
# passed through to LinkedIn exactly as they appear in the URL.
search("https://www.linkedin.com/jobs/search/?currentJobId=3309502750&distance=25&f_E=1%2C2&geoId=103644278&keywords=data%20scientist")

In [25]:
df.to_csv("jobs_usa.csv", index=False) # save the results to a CSV file, without the index column

In [26]:
# Shut down the WebDriver session now that scraping is finished.
driver.quit()