In [None]:
import requests
import random
from bs4 import BeautifulSoup
import pandas as pd
import time

In [None]:
# Search parameters. Values are interpolated verbatim into the request URL,
# so they must already be URL-encoded (%20 = space, %22 = double quote, %2C = comma).

# Two exact-phrase terms joined by OR: "Data Engineer" OR "Data Scientist".
# The phrase quotes are written as %22; wrapping the literal in triple quotes
# would silently swallow the outer quotes and leave the phrases unbalanced.
keywords = "%22Data%20Engineer%22%20OR%20%22Data%20Scientist%22"
location = "United%20States"
time_filter = "r1800"  # Posted within ~30 minutes
# Experience levels: 1=Internship, 2=Entry, 3=Associate (comma-separated list).
# Not applied unless "&f_E={experience_level}" is appended to the search URL.
experience_level = "2%2C3"

# Lists to store all data
all_job_ids = []
all_job_details = []

# Pagination variables
start = 0  # Starting position (first page)
jobs_per_page = 10
has_more_jobs = True


In [None]:
# Page through the guest search endpoint, collecting job IDs into all_job_ids
# until a page comes back empty, comes back short, or the request fails.
while has_more_jobs:
    list_url = f"https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords={keywords}&location={location}&f_TPR={time_filter}&start={start}"

    # Get page content. A timeout keeps one stalled connection from hanging the
    # notebook; on any network/HTTP failure stop paginating but keep the IDs
    # gathered so far instead of parsing an error page as "no results".
    try:
        response = requests.get(list_url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        has_more_jobs = False
        print(f"Request failed at position {start}: {exc}")
        break

    list_data = response.text
    list_soup = BeautifulSoup(list_data, 'html.parser')
    page_jobs = list_soup.find_all("li")

    # If no jobs found on this page, exit the loop
    if len(page_jobs) == 0:
        has_more_jobs = False
        print(f"No more jobs found after position {start}")
        break

    # Get job IDs from current page. Cards without a "data-entity-urn" are
    # skipped, so a stale or undefined job_id can never be recorded.
    page_ids = []
    for job_card in page_jobs:
        base_card_div = job_card.find("div", {"class": "base-card"})
        entity_urn = base_card_div.get("data-entity-urn") if base_card_div is not None else None
        if not entity_urn:
            continue
        # The ID is the last colon-separated component of the URN.
        job_id = entity_urn.split(":")[-1]
        print(f"Found job ID: {job_id}")
        page_ids.append(job_id)

    # Add page IDs to overall list
    all_job_ids.extend(page_ids)

    # Move to next page
    start += jobs_per_page

    # If fewer cards than expected, we're on the last page. The card count is
    # used (not the ID count) so a skipped card does not end pagination early.
    if len(page_jobs) < jobs_per_page:
        has_more_jobs = False
        print(f"Last page had {len(page_jobs)} jobs instead of {jobs_per_page}, stopping pagination")
    else:
        # Wait before next page request (only when there is one)
        time.sleep(random.uniform(2, 4))
print(list_url)
print(f"Total job IDs found: {len(all_job_ids)}")

In [None]:
# Fetch each posting's detail page and pull the fields of interest into job_list
# (one dict per successfully fetched job).
job_list = []

# Output field -> (tag name, exact class attribute value) locating it in the posting HTML.
field_selectors = {
    "company_name": ("a", "topcard__org-name-link topcard__flavor--black-link"),
    "job_title": ("h2", "top-card-layout__title"),
    "location": ("span", "topcard__flavor topcard__flavor--bullet"),
    "time_posted": ("span", "posted-time-ago__text"),
    "job_description": ("div", "description__text description__text--rich"),
}

for job_id in all_job_ids:

    # Sleep BEFORE making the request
    time.sleep(random.uniform(0.5, 4.5))  # Random sleep to avoid being blocked

    job_url = f"https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{job_id}"

    # A timeout prevents an indefinite hang; a failed or rejected request is
    # reported and skipped rather than stored as a row of "Not available" values.
    try:
        job_response = requests.get(job_url, timeout=30)
        job_response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Skipping job {job_id}: request failed ({exc})")
        continue

    job_soup = BeautifulSoup(job_response.text, 'html.parser')
    job_post = {"job_id": job_id}

    # Each field falls back to "Not available" when its element is missing from the page.
    for field, (tag_name, css_class) in field_selectors.items():
        element = job_soup.find(tag_name, {"class": css_class})
        job_post[field] = element.text.strip() if element is not None else "Not available"

    job_list.append(job_post)
    print(f"Processed job: {job_post['job_title']} at {job_post['company_name']}")

In [None]:
# Turn the scraped records (one dict per posting) into a table.
df = pd.DataFrame(data=job_list)

# Report the row count, then leave the frame as the cell's displayed value.
print(f"Total jobs collected: {len(df)}")
df

In [None]:
# Write the collected jobs to a CSV file in the current working directory,
# leaving out the DataFrame's index column.
df.to_csv(path_or_buf="linkedin_jobs.csv", index=False)