In [19]:
# homemade functions
from linkedin import *

# selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# other
import pandas as pd
import time

In [63]:
class Scraper:
    """Selenium helper that drives a Firefox session through LinkedIn job search.

    Every method acts on the single browser session stored in ``self.driver``.
    Most actions are followed by a pause derived from ``self.sec_sleep`` so the
    page has time to react before the next step.
    """

    # Column order of the DataFrame returned by ``get_job_details``.
    _JOB_COLUMNS = ['Job ID', 'Title', 'Company', 'Location', 'Description', 'Date']

    def __init__(self, sec_sleep=0.5):
        """Open a new Firefox window.

        Parameters
        ----------
        sec_sleep : float, default 0.5
            Base pause in seconds; the methods sleep for this value or a
            multiple of it after interacting with the page.
        """
        # Browser session shared by all methods.
        self.driver = webdriver.Firefox()
        # Base delay (seconds) used between page interactions.
        self.sec_sleep = sec_sleep

    def close_browser(self):
        """Quit the browser session held by this scraper."""
        self.driver.quit()

    def scroll_to_bottom(self):
        """Scroll the job results list to its end.

        Sleeps ``sec_sleep * 10`` seconds, waits up to 10 seconds for the
        ``.jobs-search-results-list`` element to be present, then brings it
        into view and sets its scroll position to its full scroll height.
        """
        time.sleep(self.sec_sleep*10)
        # ``until`` returns the located element, so no second lookup is needed.
        results_list = WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".jobs-search-results-list"))
        )
        self.driver.execute_script("arguments[0].scrollIntoView(true);", results_list)
        self.driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight;", results_list)

    def load_page(self, url, try_quitting_first=True):
        """Navigate to ``url`` and pause ``sec_sleep * 2.5`` seconds.

        Parameters
        ----------
        url : str
            Address to open.
        try_quitting_first : bool, default True
            Currently unused; kept so existing callers keep working.
        """
        self.driver.get(url)
        time.sleep(self.sec_sleep*2.5)

    def accept_cookies(self):
        """Click the first button of the global alert banner.

        Presumably this is the "accept cookies" button (verify against the
        live page). Raises if the banner button is not present.
        """
        cookies_xpath = '//*[@id="artdeco-global-alert-container"]/div/section/div/div[2]/button[1]'
        cookies_button = self.driver.find_element(By.XPATH, cookies_xpath)
        cookies_button.click()
        time.sleep(self.sec_sleep)

    def login(self, email, password):
        """Fill the sign-in form and submit it.

        Parameters
        ----------
        email : str
            Text typed into the ``session_key`` field.
        password : str
            Text typed into the ``session_password`` field.
        """
        email_field = self.driver.find_element(By.XPATH, '//*[@id="session_key"]')
        email_field.send_keys(email)
        time.sleep(self.sec_sleep)

        password_field = self.driver.find_element(By.XPATH, '//*[@id="session_password"]')
        password_field.send_keys(password)
        time.sleep(self.sec_sleep)

        button_signin = self.driver.find_element(By.XPATH, '//*[@id="main-content"]/section[1]/div/div/form/div[2]/button')
        button_signin.click()
        time.sleep(self.sec_sleep)

    def press_enter_and_scroll(self):
        """Send ENTER to the location search box, then scroll the results list."""
        search_field = self.driver.find_element(By.XPATH, "(//*[contains(@id, 'jobs-search-box-location-id-ember')])[last()]")
        search_field.send_keys(Keys.ENTER)
        time.sleep(self.sec_sleep)
        self.scroll_to_bottom()

    def close_message(self):
        """Click the last messaging-overlay header control button.

        Presumably this collapses the messaging pop-up (verify against the
        live page). Raises if the button is not present.
        """
        message_close = self.driver.find_element(By.XPATH, "(//button[contains(@class, 'msg-overlay-bubble-header__control--new-convo-btn')])[last()]")
        message_close.click()
        time.sleep(self.sec_sleep)

    def enter_keywords(self, keywords):
        """Replace the content of the keyword search box with ``keywords``."""
        search_field = self.driver.find_element(By.XPATH, "(//*[contains(@id, 'jobs-search-box-keyword-id-ember')])[last()]")
        search_field.clear()
        search_field.send_keys(keywords)
        time.sleep(self.sec_sleep)

    def enter_location(self, location):
        """Replace the content of the location search box with ``location``."""
        search_field = self.driver.find_element(By.XPATH, "(//*[contains(@id, 'jobs-search-box-location-id-ember')])[last()]")
        search_field.clear()
        search_field.send_keys(location)
        time.sleep(self.sec_sleep)

    @staticmethod
    def _text_or_na(card, by, selector):
        """Return the text of the first match of ``selector`` inside ``card``, or 'N/A'.

        ``find_elements`` is used because it returns an empty list when nothing
        matches, whereas ``find_element`` raises and would abort the whole scrape.
        """
        matches = card.find_elements(by, selector)
        return matches[0].text if matches else 'N/A'

    def get_job_details(self):
        """Collect the job cards currently present on the page.

        Returns
        -------
        pandas.DataFrame
            One row per ``job-card-container`` element that has a
            ``data-job-id`` attribute, with columns 'Job ID', 'Title',
            'Company', 'Location', 'Description' (the full card text) and
            'Date'. A field whose element is missing from a card is 'N/A'.
        """
        # Find all job card elements
        job_cards = self.driver.find_elements(By.CLASS_NAME, 'job-card-container')

        records = []
        for card in job_cards:
            job_id = card.get_attribute('data-job-id')
            if not job_id:
                # Cards without an id are skipped entirely.
                continue

            records.append({
                'Job ID': job_id,
                'Title': self._text_or_na(card, By.CSS_SELECTOR, '.job-card-container__link.job-card-list__title'),
                'Company': self._text_or_na(card, By.CSS_SELECTOR, '.job-card-container__primary-description'),
                'Location': self._text_or_na(card, By.CSS_SELECTOR, '.job-card-container__metadata-wrapper li'),
                'Description': card.text,
                # The leading "." keeps the XPath relative to this card; a bare
                # "//" would search the whole document and return the same
                # date for every card.
                'Date': self._text_or_na(card, By.XPATH, ".//span[@class='tvm__text tvm__text--neutral']/span"),
            })

        # Build the frame once; ``columns`` keeps the schema even with no rows.
        return pd.DataFrame(records, columns=self._JOB_COLUMNS)

    def scrap_jobs(self, max_page=100, verbose=False, csv_path='../data/job_df.csv'):
        """Scrape the current results page and the following ones.

        Starting from the page currently displayed, repeatedly clicks the
        "Page N" button (N = 2, 3, ...), scrolls the results list and collects
        the job cards, until no such button exists or ``max_page`` is reached.

        Parameters
        ----------
        max_page : int, default 100
            Highest page number to visit (inclusive).
        verbose : bool, default False
            If True, print the page number and the number of jobs collected
            after each page.
        csv_path : str or None, default '../data/job_df.csv'
            File rewritten with the accumulated results after every page so
            partial progress survives a crash; ``None`` disables saving.

        Returns
        -------
        pandas.DataFrame
            All collected jobs, most recently scraped page first.
        """
        # get job infos from first page
        job_df = self.get_job_details()

        # start with next page
        page = 2

        while page <= max_page:

            # go to next page
            next_page_xpath = f'//button[@aria-label="Page {page}"]'
            next_page_buttons = self.driver.find_elements(By.XPATH, next_page_xpath)

            # if next page not found, we stop scrapping and return the dataframe
            if not next_page_buttons:
                print("Last page, scrapping over")
                return job_df

            try:
                next_page_buttons[0].click()
            except Exception as exc:
                # Best effort: keep what was collected so far instead of
                # losing it, but say why the scrape stopped.
                print(f"Could not open page {page} ({exc!r}), scrapping stopped")
                return job_df

            # wait and scroll down
            self.scroll_to_bottom()

            # get job infos
            new_job_df = self.get_job_details()

            # print verbosity
            if verbose:
                print(f"Page {page} scrapped.")
                print(f"Jobs founded: {len(new_job_df)}")

            job_df = pd.concat([new_job_df, job_df], ignore_index=True)
            if csv_path is not None:
                job_df.to_csv(csv_path, index=False)

            page += 1

        return job_df

In [49]:
# credentials
# NOTE(review): the account e-mail is hard-coded here; consider reading it
# from an environment variable before sharing this notebook.
email = 'joseph.barbierdarnal@gmail.com'
with open('../credentials.txt', 'r') as file:
    # Strip surrounding whitespace: a trailing newline in the file would
    # otherwise be typed into the password field along with the password.
    password = file.read().strip()

In [64]:
# If a scraper from a previous run of this cell is still alive, close its
# browser first so that Firefox windows do not pile up.
try:
    scraper.close_browser()
except NameError:
    # First run in this kernel: there is no previous scraper to close.
    pass
except Exception as exc:
    # Best effort: the old browser may already be gone; report and carry on.
    print(f"Could not close previous browser: {exc!r}")

# Open a fresh session, dismiss the cookie banner and sign in.
scraper = Scraper()
scraper.load_page("https://www.linkedin.com/")
scraper.accept_cookies()
scraper.login(email, password)

In [54]:
# Drive the search through the `scraper` instance created above rather than
# through free functions and a global `driver`, which this notebook never defines.
scraper.load_page("https://www.linkedin.com/jobs/collections/recommended/")
scraper.close_message()

# Fill in the search form, submit it and scroll through the first result page.
scraper.enter_keywords('data science')
scraper.enter_location('European Economic Area')
scraper.press_enter_and_scroll()

# Walk through the result pages and collect every job card.
job_df = scraper.scrap_jobs()

Last page, scrapping over


In [56]:
# Display the scraped jobs (the bare last expression gets the rich table rendering).
job_df

Unnamed: 0,Job ID,Title,Company,Location,Description,Date
0,3782287068,"Business Intelligence Engineer II, S&OP Automa...",Amazon,"Luxembourg, Luxembourg, Luxembourg","Business Intelligence Engineer II, S&OP Automa...",
1,3775375662,Internship Artificial Intelligence & Computer ...,Korro AI,"Darmstadt, Hesse, Germany (Hybrid)",Internship Artificial Intelligence & Computer ...,
2,3782579723,Consultant Data Science (all genders),Accenture DACH,"Kronberg, Hesse, Germany (On-site)",Consultant Data Science (all genders)\nAccentu...,
3,3781591727,Data Scientist for A320 Efficiency Leader (M/F),Airbus,"Toulouse, Occitanie, France (Hybrid)",Data Scientist for A320 Efficiency Leader (M/F...,
4,3790751316,Data Engineer,'s Heeren Loo,"Amersfoort, Utrecht, Netherlands (On-site)","Data Engineer\n's Heeren Loo\nAmersfoort, Utre...",
...,...,...,...,...,...,...
279,3784864205,Senior Machine Learning Engineer,Ani Biome,"Zagreb, Zagreb, Croatia (On-site)",Senior Machine Learning Engineer\nAni Biome\nZ...,2 weeks ago
280,3784559034,Lecturer in Data/Machine Learning Engineering ...,Breda University of Applied Sciences,"Breda, North Brabant, Netherlands (On-site)",Lecturer in Data/Machine Learning Engineering ...,2 weeks ago
281,3651025688,Product Data Scientist Intern,Criteo,"Paris, Île-de-France, France (Hybrid)","Product Data Scientist Intern\nCriteo\nParis, ...",2 weeks ago
282,3775702364,Senior Analyst,INVL Asset Management,"Vilnius, Vilniaus, Lithuania (Hybrid)",Senior Analyst\nINVL Asset Management\nVilnius...,2 weeks ago


In [44]:
#job_df.head()