In [1]:
import json
import math
import time
import requests
from datetime import datetime, timedelta
from bs4 import BeautifulSoup

In [2]:
def scrap_jobs_details_for_page(content, keyword, pages, start, url, jobs_data):
    # set the headers
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'}
    
    # find all jobs present on the page
    jobs = content.find_all('div', class_='base-card relative w-full hover:no-underline focus:no-underline base-card--link base-search-card base-search-card--link job-search-card')
    # jobs = content.find_all('ul', class_='jobs-search__results-list')
    # print(len(jobs))
    
    # iterate through jobs found on the page
    for job in jobs:
        # job_title = job.find('h3', class_='base-search-card__title').text.strip()
        company = job.find('h4', class_='base-search-card__subtitle').text.strip()
        job_link = job.find('a', class_='base-card__full-link')['href']
        date_posted = job.find('time', class_="job-search-card__listdate")
 
        if not date_posted:
            print(date_posted)
            print("***********")
        else:
            date_posted = date_posted['datetime']
            job_post_date = datetime.strptime(date_posted, '%Y-%m-%d')

            previous_year = datetime.today() - timedelta(days=365)
            
            if job_post_date < previous_year:
                print(f"Skipping job post as older than a year, job_post_date={job_post_date}")
                continue

        if company in jobs_data:
            # if company already present in the dict
            # add keyword and job details
            jobs_data[company].append({keyword: job_link})

        elif len(jobs_data.keys()) < 20:
            # add new company into dict
            jobs_data[company] = [{keyword: job_link}]
        
        # if len(jobs_data.keys()) >=20 :
        #     break

    print(f"Pages={pages}")
    print(f"Job data keys = {len(jobs_data.keys())}")

    # go through more pages 
    if pages > 0: # and len(jobs_data.keys()) < 20:
        pages -= 1
        url = url.replace("start="+str(start), "start="+str(start+25))
        print(url)

        response = requests.get(url, headers=headers)

        if response.status_code == 200:
            content = BeautifulSoup(response.content,'html.parser')
        scrap_jobs_details_for_page(content, keyword, pages, start, url, jobs_data)

    return jobs_data


In [10]:
import random 

def get_headers():
    user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15'
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15'
    ]
    return {'user-agent': user_agents[random.randrange(len(user_agents))]}

def fetch_linked_jobs(keywords, country_id_map, file_prefix): 
    """Scrap the Linked jobs for a provided keywords and conutries upto 20 companies
    :param keyword: unique set of keywords to search on the LinkedIn
    :param country_id_map: country name and corresponding LinkedIn mapped ID for a country
    :param file_prefix: file name prefix to save as JSON
    """
    
    base_url = "https://www.linkedin.com/jobs/search?keywords={}&location={}&geoId={}&trk=public_jobs_jobs-search-bar_search-submit&position=1&pageNum=0&start={}"

    headers = get_headers()

    # data_science_keywords = ["Data Science", "Big data"]

    # search jobs for each country
    for country, geoID in country_id_map.items():
        print(f"Scrapping data for country: {country}")
        jobs_data = {}

        for keyword in keywords:
            start = 0
            search_keyword = keyword.replace(" ", "%20")
            url = base_url.format(search_keyword, country, geoID, start)
            print(url)

            response = requests.get(url, headers=headers)

            if response.status_code == 200:
                content = BeautifulSoup(response.content,'html.parser')

                jobs_found = content.find('span', class_="results-context-header__job-count")
                if jobs_found:
                    jobs_found = int(jobs_found.get_text().strip("+").replace(",",""))
                else:
                    jobs_found = 0
                print(jobs_found)
                # print(content.find('span', class_='results-context-header__query-search').get_text())

                num_of_pages = math.ceil(jobs_found/25)

                jobs_data = scrap_jobs_details_for_page(content, keyword, num_of_pages, start, url, jobs_data)
                time.sleep(2)
            else:
                print(f"Something went wrong while fetching the data: {response.status_code}")
        
        # write output JSON for a country
        with open(file_prefix + "_" + country + ".json", "w") as outfile:
            json.dump(jobs_data, outfile, sort_keys=True)
        

In [None]:
# scrap data science and machine learning jobs from LinkedIn
data_science_keywords = ["Data Science", "Big data", "Machine learning", "Data mining", "Artificial intelligence", 
                         "Predictive modeling", "Statistical analysis", "Data visualization", "Deep learning", 
                         "Natural language processing", "Business intelligence", "Data warehousing", "Data management", 
                         "Data cleaning", "Feature engineering", "Time series analysis", "Text analytics", "Database",
                         "SQL", "NoSQL", "Neural networks", "Regression analysis", "Clustering", "Dimensionality reduction", 
                         "Anomaly detection", "Recommender systems", "Data integration", "Data governance"]

machine_learning = ["Machine learning", "Data preprocessing", "Feature selection", "Feature engineering", 
                    "Data visualization", "Model selection", "Hyperparameter tuning", "Cross-validation", 
                    "Ensemble methods", "Neural networks", "Deep learning", "Convolutional neural networks", 
                    "Recurrent neural networks", "Natural language processing", "Computer vision", "Reinforcement learning", 
                    "Unsupervised learning", "Clustering", "Dimensionality reduction", "Bayesian methods", "Time series analysis",
                    "Random forest", "Gradient boosting", "Support vector machines", "Decision trees", "Regression analysis"]



# base_url = "https://www.linkedin.com/jobs/search?keywords={}&location={}&geoId={}&trk=public_jobs_jobs-search-bar_search-submit&f_TPR={}&position=1&pageNum=0&start={}"

# base_url = "https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords={}&location={}&geoId={}&trk=public_jobs_jobs-search-bar_search-submit&position=1&pageNum=0&start={}"
country_id_map = {"Sweden": 105117694, "Finland": 100456013, "Denmark": 104514075, 
                  "Netherlands": 102890719, "Germany": 101282230}

fetch_linked_jobs(set(data_science_keywords + machine_learning), country_id_map, file_prefix="ds")
# fetch_linked_jobs(['Data governance', 'Feature Engineering'], {"Sweden": 105117694}, file_prefix="ds")

In [138]:
# scrap full stack developer jobs from LinkedIn
fs_keywords= ["Front-end development", "HTML", "CSS", "JavaScript", "React", "Angular", 'Vue.js', "Bootstrap", 
              "jQuery", "responsive design", "Back-end development", "Node.js", "Python", "Ruby", "PHP", "Java",
               ".NET", "SQL", 'NoSQL', "RESTful APIs", "web servers", "Database management",  "MySQL", "PostgreSQL", 
               "MongoDB", "Redis", "Cassandra", "Oracle", "SQL Server", "DevOps", "AWS", "Azure", "Google Cloud", 
               "Docker", "Kubernetes", "Git", "Jenkins", "Travis CI", "CircleCI", "monitoring and logging tools", 
               "Project management"," Agile", "Scrum", "Kanban", "JIRA", "Trello", "Asana", "project planning", 
               "team collaboration"," communication skills"]

country_id_map = {"Sweden": 105117694, "Finland": 100456013, "Denmark": 104514075, 
                  "Netherlands": 102890719, "Germany": 101282230}

fetch_linked_jobs(set(fs_keywords), country_id_map, file_prefix="fs")

Scapping data for country: Sweden
https://www.linkedin.com/jobs/search?keywords=SQL&location=Sweden&geoId=105117694&trk=public_jobs_jobs-search-bar_search-submit&position=1&pageNum=0&start=0
13000
25
Add another company post
None
***********
Add another company post
Pages=520
Job data keys = 20
https://www.linkedin.com/jobs/search?keywords=NoSQL&location=Sweden&geoId=105117694&trk=public_jobs_jobs-search-bar_search-submit&position=1&pageNum=0&start=0
215
25
Pages=9
Job data keys = 21
https://www.linkedin.com/jobs/search?keywords=%20communication%20skills&location=Sweden&geoId=105117694&trk=public_jobs_jobs-search-bar_search-submit&position=1&pageNum=0&start=0
25000
25
Pages=1000
Job data keys = 22
https://www.linkedin.com/jobs/search?keywords=MongoDB&location=Sweden&geoId=105117694&trk=public_jobs_jobs-search-bar_search-submit&position=1&pageNum=0&start=0
865
24
Pages=35
Job data keys = 23
https://www.linkedin.com/jobs/search?keywords=HTML&location=Sweden&geoId=105117694&trk=public_job