In [1]:
# Importing Libraries
import random
import re
import time
from urllib.parse import quote, unquote

import bs4
import pandas as pd
import requests
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize

In [2]:
def get_soup_object(url):
    """
    Function to return the soup object for the input URL
    Input: URL (string)
    output: BeautifulSoup object

    The page is requested with a 5 second timeout. The HTTP status code is
    not checked: whatever body the server sends back is parsed.
    """
    page = requests.get(url, timeout=5)
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns about it), so the parsed
    # tree could differ from one environment to another.
    soup = BeautifulSoup(page.content, 'html.parser')
    return soup


def get_title(soup):
    """
    Function to get the title of the job from the input soup object
    Input: BeautifulSoup object
    output: Job Title (string)
    """
    # the text of the <h1> found on the page is used as the job title, as-is
    heading = soup.find('h1')
    job_title = heading.text
    return job_title

def get_job_desc(soup):
    """
    Function to return the job description for a given soup object,
    split into sentences
    Input: BeautifulSoup object
    output: Job description sentences (list of strings)
    """
    # the description is read from the div with class 'show-more-less-html__markup'
    markup = soup.find('div', {'class': 'show-more-less-html__markup'})
    stripped = []
    for sentence in sent_tokenize(text = markup.text):
        # remove surrounding whitespace from every sentence
        stripped.append(sentence.strip())
    return stripped

def get_job_features(soup):
    """
    Function to get some features related to the job
    Input: BeautifulSoup object
    output: features about job (dictionary mapping each header to its value);
            empty dictionary if the page has no job-criteria list
    """
    feat = soup.find('ul', {'class': "description__job-criteria-list"})
    if feat is None:
        # no criteria list on this page, so there is nothing to parse
        return {}
    # Split the list text on blank lines (two consecutive newlines). No capture
    # group is used, so the separators themselves are not returned.
    parts = [part.strip() for part in re.split(r'\n{2}', feat.text)]
    # Keep every non-empty piece. Filtering by a minimum length instead would
    # also throw away short real values such as 'Other' and shift all the
    # following header/value pairs by one.
    feats = [part for part in parts if part]
    # Pieces alternate header, value, header, value, ...; an unpaired trailing
    # header is ignored.
    return dict(zip(feats[::2], feats[1::2]))

In [3]:
# Example job posting used to try out the helper functions
URL = "https://ca.linkedin.com/jobs/view/data-consultant-at-assent-3508970129?refId=HcWTYp%2FyFLQIYu9mofA8sA%3D%3D&trackingId=AWRTSD3zoRSTqWy33f5kMA%3D%3D&trk=public_jobs_topcard-title"
soup = get_soup_object(URL)
# Preview the first 1000 characters of the parsed page
str(soup)[:1000]

'<!DOCTYPE html>\n<html lang="en">\n<head>\n<meta content="d_jobs_guest_details" name="pageKey"/>\n<!-- --> <meta content="en_US" name="locale"/>\n<meta data-app-version="2.0.1256" data-browser-id="0a3eb28b-766f-47bd-88fd-1660552ac141" data-call-tree-id="AAX2C6x4Cn8A+bknqSaibQ==" data-disable-jsbeacon-pagekey-suffix="false" data-enable-page-view-heartbeat-tracking="" data-member-id="0" data-multiproduct-name="jobs-guest-frontend" data-page-instance="urn:li:page:d_jobs_guest_details;/AJShBgtTbuJpLOw6azp8Q==" data-service-name="jobs-guest-frontend" id="config"/>\n<link href="https://ca.linkedin.com/jobs/view/data-consultant-at-assent-3508970129" rel="canonical"/>\n<!-- --><!-- -->\n<!-- -->\n<!-- -->\n<meta content="https://ca.linkedin.com/jobs/view/data-consultant-at-assent-3508970129" property="al:android:url"/>\n<meta content="com.linkedin.android" property="al:android:package"/>\n<meta content="LinkedIn" property="al:android:app_name"/>\n<meta content="https://ca.linkedin.com/jobs/vi

### Getting Title of Job Posting

In [4]:
# Job title of the example posting
get_title(soup)

'Data Consultant'

### Getting raw text from job description
##### This is the text that we will annotate to extract skills from the job description.

In [5]:
# Job description of the example posting, as a list of sentences
get_job_desc(soup)

['Company DescriptionAssent is the supply chain sustainability management solution for the world’s most responsible, sustainability-focused complex manufacturers.',
 'The #1 business risk for complex manufacturers is hidden.',
 'Most supply chains were never built with sustainability in mind, and most manufacturers do not have deep visibility into their supply chain risks while outsourcing the majority of their production.',
 'Shaped by regulatory experts, customers and suppliers, Assent is the foundation manufacturers need for cross-enterprise sustainability.',
 'At Assent, your work will help bring transparency, sustainability, and fairness to the global community.We’re one of the fastest-growing technology companies - from $5 million to $50 million ARR in under five years!',
 "As the global leader in supply chain sustainability management, we provide transparency into the supply chains of some of the world's largest and most successful companies, including 75 percent of Fortune 500 

### Getting raw features related to Job description
##### These are just some extra features (not to be confused with annotation: the skills required for the job are annotated manually)

In [6]:
# Job criteria of the example posting, as a dictionary
get_job_features(soup)

{'Seniority level': 'Mid-Senior level',
 'Employment type': 'Full-time',
 'Job function': 'Consulting',
 'Industries': 'Software Development'}

### Getting links for job profiles

In [7]:
def get_job_url_list(soup):
    """
    Function to collect the job posting URLs found in a given soup object
    (this soup object is for the LinkedIn job search page)
    Input: BeautifulSoup object
    output: list of urls (string)
    """
    # take the href of every anchor tag on the page ...
    links = (tag['href'] for tag in soup.find_all('a', href=True))
    # ... and keep only those that follow the job posting URL format
    return [link for link in links if "linkedin.com/jobs/view/" in link]

def create_urls(searches, locations):
    """
    Function to get a list of LinkedIn job search URLs for web scraping
    Input: searches {the searches that we make in the linkedin search box}
           locations {all locations for which we want job posting data;
                      plain ('New York') and already percent-encoded
                      ('New%20York') values are both accepted}
    output: list of urls (string), one per (search, location) combination,
            grouped by search
    """
    def _encode(value):
        # unquote first so that values which are already percent-encoded are
        # not encoded twice, then quote so that characters such as '#', '&'
        # or '+' cannot break the query string
        return quote(unquote(value))

    encoded_locations = [_encode(location) for location in locations]
    urls = []
    for search in searches:
        # collapse runs of whitespace into single spaces before encoding
        keywords = _encode(' '.join(search.split()))
        for location in encoded_locations:
            url = f'https://www.linkedin.com/jobs/search/?keywords={keywords}&location={location}'
            urls.append(url)
    return urls
        

#### Define the search parameters for LinkedIn job search

In [16]:
# Keywords to type into the LinkedIn job search box
searches = ['data science', 
            'machine learning', 
            'NLP', 
            'software engineer', 
            'database developer', 
            'full stack developer', 
            'cloud engineer', 
            'web developer', 
            'cybersecurity']
# Locations to search in, with spaces already percent-encoded as %20
locations = ['San%20Francisco','Los%20Angeles','San%20Jose','San%20Diego','New%20York','Texas', 'Arizona', 'Canada', 'Ireland','England']
# One search URL per (search, location) combination
linkedin_search_urls = create_urls(searches, locations)
linkedin_search_urls

['https://www.linkedin.com/jobs/search/?keywords=data%20science&location=San%20Francisco',
 'https://www.linkedin.com/jobs/search/?keywords=data%20science&location=Los%20Angeles',
 'https://www.linkedin.com/jobs/search/?keywords=data%20science&location=San%20Jose',
 'https://www.linkedin.com/jobs/search/?keywords=data%20science&location=San%20Diego',
 'https://www.linkedin.com/jobs/search/?keywords=data%20science&location=New%20York',
 'https://www.linkedin.com/jobs/search/?keywords=data%20science&location=Texas',
 'https://www.linkedin.com/jobs/search/?keywords=data%20science&location=Arizona',
 'https://www.linkedin.com/jobs/search/?keywords=data%20science&location=Canada',
 'https://www.linkedin.com/jobs/search/?keywords=data%20science&location=Ireland',
 'https://www.linkedin.com/jobs/search/?keywords=data%20science&location=England',
 'https://www.linkedin.com/jobs/search/?keywords=machine%20learning&location=San%20Francisco',
 'https://www.linkedin.com/jobs/search/?keywords=machi

#### We have created the LinkedIn search URLs above; we will now use them to get the URLs of individual job posts

We normally get around 25 job posts for each URL

In [17]:
%%time
# Collect the job posting URLs from every search page; the set drops exact duplicate URLs
job_post_urls = set()
for url in linkedin_search_urls:
    soup = get_soup_object(url)
    urls = get_job_url_list(soup)
    job_post_urls.update(urls)
    # pause between requests for a random 0 to 2 sec (sum of two random() draws)
    time.sleep(random.random()+random.random())
print(f'We got {len(job_post_urls)} job URLs')
# peek at three of the collected URLs
[i for i in job_post_urls][:3]

We got 2249 job URLs
CPU times: total: 14.2 s
Wall time: 2min 52s


['https://www.linkedin.com/jobs/view/database-developer-at-cemco-llc-3508915401?refId=q15GbJidwNGYlwEguR8CJQ%3D%3D&trackingId=R62IFhP%2BofvLpERb0gLJXw%3D%3D&position=2&pageNum=0&trk=public_jobs_jserp-result_search-card',
 'https://www.linkedin.com/jobs/view/software-engineer-i-full-time-united-states-at-cisco-3427893305?refId=HdP237pv%2BNZgcLApiKUemA%3D%3D&trackingId=%2Fzk5dKVtBjy5yufTTmvihw%3D%3D&position=17&pageNum=0&trk=public_jobs_jserp-result_search-card',
 'https://www.linkedin.com/jobs/view/penetration-tester-network-cloud-application-usds-at-tiktok-3499521742?refId=B7ItTGeGAx1DGjYnwzGibA%3D%3D&trackingId=5%2F49%2FiUKpH06qQTCZyTzdQ%3D%3D&position=23&pageNum=0&trk=public_jobs_jserp-result_search-card']

### Let's get the actual job data for each job URL

In [18]:
%%time
urls = []
titles = []
descriptions = []
seniorities = []
employment_types = []
for url in job_post_urls:
    try:
        soup = get_soup_object(url)
        job_title = get_title(soup)
        job_description = get_job_desc(soup)
        job_features = get_job_features(soup)
        urls.append(url)
        titles.append(job_title)
        descriptions.append(job_description)
        if 'Seniority level' in job_features:
            seniorities.append(job_features['Seniority level'])
        else:
            seniorities.append(None)
        if 'Employment type' in job_features:
            employment_types.append(job_features['Employment type'])
        else:
            employment_types.append(None)
    except:
        print('Failed for:')
        print(url)
    # sleep randomly for 2 to 3 sec
    time.sleep(random.random()+random.random()+random.random())

Failed for:
https://www.linkedin.com/jobs/view/looking-for-google-cloud-data-engineer-cloud-engineer-san-diego-ca-full-time-at-extend-information-systems-inc-3494565712?refId=wNTeYfS1G7NlMJnr%2Fk%2BWwg%3D%3D&trackingId=6oBXPeMBRKkx7SBJp2yOzw%3D%3D&position=5&pageNum=0&trk=public_jobs_jserp-result_search-card
Failed for:
https://www.linkedin.com/jobs/view/front-end-developer-at-diverse-lynx-3480579558?refId=9QREhi1%2Fy0IWoKqeLW6LNg%3D%3D&trackingId=3yHaGLt5LUotgXEKiLqRMQ%3D%3D&position=21&pageNum=0&trk=public_jobs_jserp-result_search-card
Failed for:
https://www.linkedin.com/jobs/view/application-developer-c%23-fullstack-developer-at-sunsoft-online-3507435934?refId=Mc5mmprGEcMlIUYxD%2Fou8Q%3D%3D&trackingId=xRmiqLNpK6uBuE5v5%2Bp6OA%3D%3D&position=21&pageNum=0&trk=public_jobs_jserp-result_search-card
CPU times: total: 5min 26s
Wall time: 1h 14min 53s


In [19]:
# Assemble the scraped fields into one DataFrame (one row per successfully scraped posting)
df = pd.DataFrame({
    'url': urls,
    'title': titles,
    'job_description': descriptions,
    'seniority_level': seniorities,
    'employment_type': employment_types
})

df

Unnamed: 0,url,title,job_description,seniority_level,employment_type
0,https://www.linkedin.com/jobs/view/database-de...,Database Developer,[SummaryThe Database Developer is part of the ...,Entry level,Full-time
1,https://www.linkedin.com/jobs/view/software-en...,Software Engineer I (Full Time) United States,[What You’ll DoOur software engineers are the ...,Not Applicable,Full-time
2,https://www.linkedin.com/jobs/view/penetration...,Penetration Tester (Network/Cloud/Application)...,[Responsibilities About TikTokTikTok is the le...,Not Applicable,Full-time
3,https://www.linkedin.com/jobs/view/sql-develop...,SQL Developer,"[SQL Developer - Long Beach, CA - Infosys Nee...",Entry level,Contract
4,https://ca.linkedin.com/jobs/view/cyber-securi...,Cyber Security Specialist,[As one of Canada’s largest and fastest growin...,Not Applicable,Full-time
...,...,...,...,...,...
2241,https://www.linkedin.com/jobs/view/sql-server-...,SQL Server Developer,[Job Description Job Title: SQL Server Develop...,Entry level,Contract
2242,https://www.linkedin.com/jobs/view/embedded-so...,Embedded Software Engineer,[Summary Apple's embedded firmware/software te...,Not Applicable,Full-time
2243,https://uk.linkedin.com/jobs/view/cloud-engine...,Cloud Engineer,[Great opportunity for a Cloud Engineer to joi...,Mid-Senior level,Full-time
2244,https://www.linkedin.com/jobs/view/cloud-engin...,Cloud Engineer,"[Job DescriptionCloud EngineerSan Jose, CAW2 C...",Entry level,Contract


In [20]:
def get_job_id(url):
    """
    Function to extract the job id from a job posting URL
    Input: job posting URL (string), with or without a query string
    output: the text after the last '-' in the part of the URL before '?' (string)
    raises: ValueError if the part of the URL before '?' contains no '-'
    """
    # ignore the query string (refId, trackingId, ...) when there is one
    path = url.partition('?')[0]
    _, dash, job_id = path.rpartition('-')
    if not dash:
        raise ValueError(f'no job id found in URL: {url}')
    return job_id
    
# Preview the job id extracted from every URL (the result is only displayed, not stored in df)
df['url'].apply(get_job_id)

0       3508915401
1       3427893305
2       3499521742
3       3349162441
4       3505332417
           ...    
2241    3511024967
2242    3306867444
2243    3508268320
2244    3511125113
2245    3511377496
Name: url, Length: 2246, dtype: object

In [21]:
# Save the scraped data to 'linkedin_data.csv'
# NOTE(review): job_description cells hold Python lists, so they are presumably
# written as their string form — confirm before reloading the file
df.to_csv('linkedin_data.csv', )

In [22]:
# Average number of whitespace-separated words per job description
sum(df['job_description'].apply(lambda x: len(' '.join(x).split())).values)/len(df)

460.6099732858415

In [23]:
# Total number of whitespace-separated words across all job descriptions
sum(df['job_description'].apply(lambda x: len(' '.join(x).split())).values)

1034530

In [31]:
# Display the scraped DataFrame
df

Unnamed: 0,url,title,job_description,seniority_level,employment_type
0,https://www.linkedin.com/jobs/view/database-de...,Database Developer,[SummaryThe Database Developer is part of the ...,Entry level,Full-time
1,https://www.linkedin.com/jobs/view/software-en...,Software Engineer I (Full Time) United States,[What You’ll DoOur software engineers are the ...,Not Applicable,Full-time
2,https://www.linkedin.com/jobs/view/penetration...,Penetration Tester (Network/Cloud/Application)...,[Responsibilities About TikTokTikTok is the le...,Not Applicable,Full-time
3,https://www.linkedin.com/jobs/view/sql-develop...,SQL Developer,"[SQL Developer - Long Beach, CA - Infosys Nee...",Entry level,Contract
4,https://ca.linkedin.com/jobs/view/cyber-securi...,Cyber Security Specialist,[As one of Canada’s largest and fastest growin...,Not Applicable,Full-time
...,...,...,...,...,...
2241,https://www.linkedin.com/jobs/view/sql-server-...,SQL Server Developer,[Job Description Job Title: SQL Server Develop...,Entry level,Contract
2242,https://www.linkedin.com/jobs/view/embedded-so...,Embedded Software Engineer,[Summary Apple's embedded firmware/software te...,Not Applicable,Full-time
2243,https://uk.linkedin.com/jobs/view/cloud-engine...,Cloud Engineer,[Great opportunity for a Cloud Engineer to joi...,Mid-Senior level,Full-time
2244,https://www.linkedin.com/jobs/view/cloud-engin...,Cloud Engineer,"[Job DescriptionCloud EngineerSan Jose, CAW2 C...",Entry level,Contract
