# Imports

In [95]:
import requests
import pickle
import h5py
from selenium import webdriver
from time import sleep
from tqdm import tqdm
import matplotlib.pyplot as plt
import pandas as pd
from datetime import timedelta, date
from bs4 import BeautifulSoup
from tqdm.contrib.concurrent import process_map  # or thread_map
import unidecode


# WebDriver

In [315]:
def scroll_job_results():
    """
    Scrolls the side bar containing the job results and wait for loading
    """
    #Scroll down to load all jobs
    job_results_side_bar = driver.find_element_by_class_name('jobs-search-results')
    driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight/4', job_results_side_bar)
    sleep(0.5)
    driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight/2', job_results_side_bar)
    sleep(0.5)
    driver.execute_script('3*(arguments[0].scrollTop = arguments[0].scrollHeight)/4', job_results_side_bar)
    sleep(0.5)
    driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', job_results_side_bar)
    sleep(0.5)
    
def get_job_links(page_source,  most_recent_saved = None):
    """
    Get the htlm content of the page and scrap the link toward job pages.
    `most_recent_saved` is the most recent job previously saved
    """
    # get the html content
    soup = BeautifulSoup(page_source, 'html.parser')
    #get the job page links
    job_raw_links = soup.find_all("a", class_="disabled ember-view job-card-container__link job-card-list__title")
    #proper format for links
    job_links = ['https://www.linkedin.com' + l.attrs["href"] for l in job_raw_links]
    
    if most_recent_saved in job_links:
        idx_most_recent = job_links.index(most_recent_saved)
        return job_links[:idx_most_recent] 
    else:
        return job_links

def load_job_links():
    """
    Loads the list of job links previouly scrapped
    """
    try:
        return pickle.load(open('./Data/job_links.p', 'rb'))
    except FileNotFoundError:
        return []
    
def load_lastest_job_saved():
    
    job_saved = load_job_links()
    if job_saved:
        return job_saved[0]
    else:
        return None

def save_job_links(links):
    """
    Loads the list of job links previouly scrapped
    """
    try:
        job_list_old =  pickle.load(open('./Data/job_links.p', 'rb'))
        #appd the old list, most recent on top
        job_list_augmented = list(set(links+job_list_old ))
        pickle.dump(job_list_augmented, open('./Data/job_links.p', 'wb'))
    except FileNotFoundError:
        pickle.dump(links , open('./Data/job_links.p', 'wb'))
        
def save_job_page(new_df):
    """
    Loads the list of job links previouly scrapped
    """
    try:
        job_df_old =  pickle.load(open('./Data/df.p', 'rb'))
        #appd the old list, most recent on top
        job_df_augmented = job_df_old.append(new_df)
        pickle.dump(job_df_augmented, open('./Data/df.p', 'wb'))
    except FileNotFoundError:
        pickle.dump(new_df , open('./Data/df.p', 'wb'))

In [183]:
driver = webdriver.Chrome()
LINKEDIN_WEB = 'https://www.linkedin.com/jobs/search/?location=Switzerland&sortBy=DD'

# Access Linkedin and connect–
driver.get(LINKEDIN_WEB)

log_in = driver.find_element_by_xpath('/html/body/header/nav/div/a[2]')
log_in.click()

email_in = driver.find_element_by_xpath('//*[@id="username"]')
email_in.click()
email_in.send_keys('prost.jb@gmail.com')

pass_w = driver.find_element_by_xpath('//*[@id="password"]')
pass_w.click()
pass_w.send_keys('Labalme1728')

# click log in button
driver.find_element_by_xpath('//*[@id="app__container"]/main/div[3]/form/div[3]/button').click()

In [316]:
driver = webdriver.Chrome()
LINKEDIN_WEB = 'https://www.linkedin.com/jobs/search/?location=Switzerland&sortBy=DD'

# Access Linkedin and connect–
driver.get(LINKEDIN_WEB)

log_in = driver.find_element_by_xpath('/html/body/header/nav/div/a[2]')
log_in.click()

email_in = driver.find_element_by_xpath('//*[@id="username"]')
email_in.click()
email_in.send_keys('prost.jb@gmail.com')

pass_w = driver.find_element_by_xpath('//*[@id="password"]')
pass_w.click()
pass_w.send_keys('Labalme1728')

# click log in button
driver.find_element_by_xpath('//*[@id="app__container"]/main/div[3]/form/div[3]/button').click()

#pause for connection
sleep(2)
#scrolls down to load all 25 jobs displayed on the page
scroll_job_results()

most_recent = load_lastest_job_saved()
#get the job page links
job_links = get_job_links(driver.page_source, most_recent)

# next pages ...
for job_increment in tqdm(range(25, 1000, 25)):
    driver.get(LINKEDIN_WEB + "&start={}".format(job_increment))
    scroll_job_results()
    
    #get the job page links
    job_links_tmp = get_job_links(driver.page_source,  most_recent)
    job_links += job_links_tmp
    
    if len(job_links_tmp) < 25:
        print('All available jobs scrapped')
        break
    
    sleep(1)
driver.close()

  0%|          | 0/39 [00:07<?, ?it/s]

All available jobs scrapped





In [319]:
# save the new links
save_job_links(job_links)
# load the old-saved links 
#job_links = load_job_links()
print('{} jobs to inspect'.format(len(job_links)))

4374 jobs to inspect


In [320]:
class LinkedInScrapper:
    
    def __init__(self, job_links):
        
        self.job_links = job_links
        self.to_process_job_links = job_links.copy()
        self.failed_job_links = []
        self.df = pd.DataFrame()
    
    #@abstractmethod
    def get_job_date(self, html_soup):
        """
        Get the posting date of the job. 
        Best accuracy is at the day scale, then week, then month

        return a datetime.date type
        """
        time_raw= html_soup.find("span", 
                              class_ = "topcard__flavor--metadata posted-time-ago__text")

        # if job is defined as `NEW`     
        if not time_raw:
            time_raw= html_soup.find("span", 
                              class_ = "topcard__flavor--metadata posted-time-ago__text posted-time-ago__text--new")

        time_raw = time_raw.text.split()[:2]
        time_scale = time_raw[1]

        today = ['minutes', 'minute', 'hour', 'hours', 'seconds']
        if time_scale in today:
            job_date = date.today()
        elif (time_scale == 'day') or (time_scale == 'days'):
            job_date = date.today() + timedelta(days = int(time_raw[0]))
        elif (time_scale == 'week') or (time_scale == 'weeks'):
            job_date = date.today() + timedelta(weeks = int(time_raw[0]))
        elif (time_scale == 'month') or (time_scale == 'months'):
            job_date = date.today() + timedelta(weeks = 4*int(time_raw[0]))
        else:
            print(time_raw)
            raise ValueError

        # has to convert to str for multiprocessing
        return job_date.strftime('%d/%m/%Y') 
    
    def format_city(self, city_str):
        """
        Put the city string in the appropriate format
        """
        city_str = unidecode.unidecode(city_str)

        if len(city_str.split())==2 :
            composed_str = city_str.split()
            first_str = composed_str[0]
            sec_str = composed_str[1]
            
            if first_str == 'St' or first_str=='Saint' or first_str=='Sankt':
                return 'St. ' + sec_str
        # specific cases - frequent mistakes
        if city_str == 'Geneva':
            return 'Geneve'
        elif city_str == 'Lucerne':
            return 'Luzern'
        elif city_str == 'Biel' or city_str == 'Bienne':
            return 'Biel/Bienne'
        elif city_str == 'Berne':
            return 'Bern'
        elif city_str == 'Schlatt (Zurich)':
            return 'Zurich'
        else:
            return city_str
    
    #@abstractmethod
    def get_job_characteristic(self, html_soup):
        """
        Get the HTML file and extract job information. Store it in a dict format
        to late append to a pd.DataFrame
        """
        job_charac = {}
        job_charac['title'] = html_soup.h1.text
        company_span = html_soup.find("span", class_ ='topcard__flavor')
        job_charac['company'] = company_span.text
        
        if company_span.a: # if the company name is clickable
            
            location = html_soup.find_all("span",class_ ='topcard__flavor')[1]\
                                                .text.split(', ')
            if location :
                if len(location) > 2:
                    job_charac['city'] = self.format_city(location[0])
                    job_charac['canton'] =  unidecode.unidecode(location[1])
                    job_charac['country'] = location[2]

                elif len(location) == 1:
                    location = location.split()
                    if len(location) > 1:
                        # Greater XXX Area
                        if location[0] == 'Greater':
                            job_charac['city'] = location[1]
                        else:# XXX Metropolitan Area
                            job_charac['city'] = location[0]
                    else:
                        job_charac['country'] = 'Switzerland'
                else : 
                    job_charac['city'] = self.format_city(location[0])
                    job_charac['country'] = location[1]
            else:
                print(job_charac['title'],job_charac['company'])
        else:
            job_charac['city']  = 'unknown'
            
        job_charac['date'] = self.get_job_date(html_soup)
        job_charac['content'] = html_soup.find("div", 
                                               class_ ="description__text description__text--rich").prettify()


        for subheader in html_soup.find_all("h3", class_ ='job-criteria__subheader'):
            job_charac[subheader.text] = [x.text for x in subheader.find_next_siblings("span")]

        job_charac['Seniority level'] = job_charac['Seniority level'][0]
        job_charac['Employment type'] = job_charac['Employment type'][0]


        return job_charac
       
    def scrap_job_page(self, job_link):
        """
        Get a web link, access the page and scrap its content
        """
        job_page_html = requests.get(job_link).text
        html_soup = BeautifulSoup(job_page_html, 'html.parser')
        
        try:
            scrapped = self.get_job_characteristic(html_soup)
            self.to_process_job_links.remove(job_link)
        except:
            self.failed_job_links.append(job_link)
            print('Error', job_link)
            scrapped = {}
        
        return scrapped
    
    def scrap_all_list(self, use_ultiprocessing=True):
        
        if use_ultiprocessing:
            job_pages = process_map(self.scrap_job_page, self.to_process_job_links,
                        max_workers= 4,
                        chunksize = 1)
        else:
            job_pages = []
            for job in tqdm(self.to_process_job_links):
                job_pages.append(self.scrap_job_page(job))
        
        print('{} Jobs failed to be parsed'.format(len(self.failed_job_links)))
        return job_pages

In [321]:
LiS = LinkedInScrapper(job_links)
job_pages = LiS.scrap_all_list(True)

HBox(children=(FloatProgress(value=0.0, max=4374.0), HTML(value='')))

Error https://www.linkedin.com/jobs/view/2227368104/?eBP=NotAvailableFromMidTier&recommendedFlavor=SCHOOL_RECRUIT&refId=c77a841b-592e-4c77-87b4-2ec480c7fff5&trackingId=%2B%2FtVpZNdkrhSDvyC8KbKpw%3D%3D&trk=flagship3_search_srp_jobs 1
Error https://www.linkedin.com/jobs/view/2222648954/?eBP=NotAvailableFromMidTier&recommendedFlavor=PRE_SCREENING_QUESTIONS&refId=2abe44fe-289e-44fa-b438-bec96213f256&trackingId=yx99CtTSrzGjIyc46dqnDA%3D%3D&trk=flagship3_search_srp_jobs 1
Error https://www.linkedin.com/jobs/view/2222440743/?eBP=NotAvailableFromMidTier&recommendedFlavor=IN_NETWORK&refId=9991d4ce-8ab8-4a5f-b3c7-94a066b63414&trackingId=Jp6e3YzDGOG3N6Idj34jYg%3D%3D&trk=flagship3_search_srp_jobs 1
Error https://www.linkedin.com/jobs/view/2222447057/?eBP=NotAvailableFromMidTier&recommendedFlavor=SCHOOL_RECRUIT&refId=19836fa2-052b-41dc-86ff-07ddc9f79401&trackingId=xn1PZzWf8ObGvx9VEsm%2BSg%3D%3D&trk=flagship3_search_srp_jobs 1
Error https://www.linkedin.com/jobs/view/2253926151/?eBP=NotAvailableFrom

Error https://www.linkedin.com/jobs/view/2249751470/?eBP=NotAvailableFromMidTier&recommendedFlavor=SCHOOL_RECRUIT&refId=b7f398f9-2f52-44ac-a54a-78921de40c40&trackingId=FhJcJrazfP1x6na9%2BF%2FkYA%3D%3D&trk=flagship3_search_srp_jobs 1
Error https://www.linkedin.com/jobs/view/2227334112/?eBP=NotAvailableFromMidTier&recommendedFlavor=JOB_SEEKER_QUALIFIED&refId=81b1d29d-4ede-4df1-aaba-ca2539bcaf03&trackingId=Z1CfB3bHZ%2Fp44HsJGkLpKA%3D%3D&trk=flagship3_search_srp_jobs 1
Error https://www.linkedin.com/jobs/view/2222444571/?eBP=NotAvailableFromMidTier&recommendedFlavor=ACTIVELY_HIRING_COMPANY&refId=19836fa2-052b-41dc-86ff-07ddc9f79401&trackingId=uPHLCi2AicMW%2FL%2B6mjAW6g%3D%3D&trk=flagship3_search_srp_jobs 1
Error https://www.linkedin.com/jobs/view/2227364280/?eBP=NotAvailableFromMidTier&refId=b55fb122-9247-4533-929a-a6f617086ade&trackingId=%2Bh6VrhKznbEhpKeqUfXgww%3D%3D&trk=flagship3_search_srp_jobs 1
Error https://www.linkedin.com/jobs/view/2253640993/?eBP=NotAvailableFromMidTier&recommend

In [328]:
df = pd.DataFrame(job_pages)
df.dropna(how='all' , inplace=True)
df = df.join(pd.get_dummies(df.Industries.apply(pd.Series).stack()).\
             sum(level=0)).drop(columns=['Industries'])

In [329]:
canton_code = {'Zurich' : 'ZH',
                'Bern': 'BE',
                'Berne' : 'BE',
                'Luzern' : 'LU',
                'Lucerne' : 'LU',
                'Uri' : 'UR',
                'Schwyz' : 'SZ',
                'Obwalden' : 'OW',
                'Nidwalden' : 'NW',
                'Glarus' : 'GL',
                'Zug' : 'ZG',
                'Fribourg' : 'FR',
                'Freiburg' : 'FR',
                'Solothurn' : 'SO',
                'Basel' : 'BS',
                'Basel-Stadt' : 'BS',
                'Basel-Landschaft' : 'BL',
                'Basel-Country' : 'BL', 
                'Schaffhausen' : 'SH',
                'Appenzell Ausserrhoden' : 'AR',
                'Appenzell Innerrhoden' : 'AI',
                'Appenzell Outer-Rhoden' : 'AR',
                'Appenzell Inner-Rhoden' : 'AI',
                'St. Gallen' : 'SG',
                'St Gallen' : 'SG',
                'Graubunden': 'GR',
                'Grigioni': 'GR',
                'Grischun' : 'GR',
                'Aargau' : 'AG',
                'Thurgau' : 'TG',
                'Ticino' : 'TI',
                'Vaud' : 'VD',
                'Valais': 'VS',
                'Wallis' : 'VS',
                'Neuchatel' : 'NE',
                'Geneve' : 'GE',
                'Geneva' : 'GE',
                'Jura' : 'JU'}
for job in tqdm(df.index) :
    try : 
        df.loc[job,'canton'] = canton_code[df.loc[job,'canton']]
    except KeyError:
        df.loc[job,'canton'] = 'unknown'

100%|██████████| 4325/4325 [00:01<00:00, 2368.64it/s]


In [330]:
df.head()

Unnamed: 0,title,company,city,canton,country,date,content,Seniority level,Employment type,Job function,...,Sports,Staffing and Recruiting,Telecommunications,Tobacco,Transportation/Trucking/Railroad,Utilities,Venture Capital & Private Equity,Veterinary,Warehousing,Wholesale
0,Onboarding Project Manager,Yokoy,Zurich,ZH,Switzerland,08/11/2020,"<div class=""description__text description__tex...",Entry level,Full-time,[Project Management],...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,IT-Consultant (m/w) als Business Analyst Public,adesso Schweiz AG,Bern,BE,Switzerland,08/11/2020,"<div class=""description__text description__tex...",Associate,Full-time,[Information Technology],...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Heizungsmonteur/in,G&L Partner AG,,unknown,,09/11/2020,"<div class=""description__text description__tex...",Entry level,Full-time,"[Management, Manufacturing]",...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Programmverantwortliche*n Partizipation, 70%",Pro Juventute,Zurich,ZH,Switzerland,09/11/2020,"<div class=""description__text description__tex...",Not Applicable,Full-time,"[Engineering, Information Technology]",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Personalbereichsleiter/in 80-100%,Real Personal Werner Blumer AG,Zurich,ZH,Switzerland,09/11/2020,"<div class=""description__text description__tex...",Not Applicable,Full-time,[Other],...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [326]:
 save_job_page(df)

In [29]:
df = pickle.load(open('./Data/df.p', 'rb'))

# Single page inspection

In [None]:
process_map()

In [327]:
pickle.dump(df , open('./Data/df.p', 'wb'))

In [297]:
job_link ='https://www.linkedin.com/jobs/view/2201840333/?alternateChannel=search&refId=13f113ec-812b-41cf-bf9a-8b07c35743e0&trackingId=AAy3Z227vjz4BTQ0FStZOQ%3D%3D'
job_page_html = requests.get(job_link).text
html_soup = BeautifulSoup(job_page_html, 'html.parser')

In [298]:
html_soup.prettify()

'<!DOCTYPE html>\n<html lang="en">\n <head>\n  <meta content="d_jobs_guest_details" name="pageKey"/>\n  <meta content="en_US" name="locale"/>\n  <meta data-app-id="com.linkedin.jobs-guest-frontend.d_web" data-custom-tracking-code="" data-tracking-page-type="" id="config">\n   <link href="https://ch.linkedin.com/jobs/view/praktikum-online-marketing-manager-sem-seo-affiliate-content-at-digital-minds-2201840333?refId=13f113ec-812b-41cf-bf9a-8b07c35743e0" rel="canonical"/>\n   <meta content="https://ch.linkedin.com/jobs/view/praktikum-online-marketing-manager-sem-seo-affiliate-content-at-digital-minds-2201840333?refId=13f113ec-812b-41cf-bf9a-8b07c35743e0" property="al:android:url"/>\n   <meta content="com.linkedin.android" property="al:android:package"/>\n   <meta content="LinkedIn" property="al:android:app_name"/>\n   <meta content="https://ch.linkedin.com/jobs/view/praktikum-online-marketing-manager-sem-seo-affiliate-content-at-digital-minds-2201840333?refId=13f113ec-812b-41cf-bf9a-8b07c

# Old

In [None]:
time_limit_bt.click()
time_limit_bt = driver.find_element_by_xpath('/html/body/header/section/form/ul/li[2]/div/button')
time_limit_bt.click()
last_month =  driver.find_element_by_xpath('//*[@id="TIME_POSTED-dropdown"]/fieldset/div[1]/ul/li[3]/label')
last_month.click()
end_time_filtering = driver.find_element_by_xpath('//*[@id="TIME_POSTED-dropdown"]/fieldset/div[2]/button')
end_time_filtering.click()

In [None]:
df = pd.DataFrame(columns=['title', 'company', 'location', 'date', 'content',
                          'Seniority level','Employment type','Job function',
                           'Industries'], dtype=object).\
        astype({'title': str, 
                'company': str, 
                'location': object,
                'date' : object,
                'content' : str,
                'Seniority level' : str,
                'Employment type': object,
                'Job function' : object,
                'Industries' : object})



#df = df.append(get_job_characteristic(soup), ignore_index=True)