# **📌LinkedIn Job Scraper**

# Stable Chrome + Chromedriver Setup for Colab

In [None]:
%%bash
# Install Google Chrome and the ChromeDriver build that matches its major
# version. Stop at the first failing command (including failures inside
# pipelines and use of unset variables) so a broken download can never be
# followed by installing a stale or mismatched binary.
set -euo pipefail

# Remove any old Chrome/Chromedriver
rm -rf /usr/bin/chromedriver /usr/local/bin/chromedriver

# Install Google Chrome.
# -O overwrites a previous download; without it a re-run saves "*.deb.1" and
# apt would reinstall the old file.
wget -q -O google-chrome-stable_current_amd64.deb https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
apt-get -y install ./google-chrome-stable_current_amd64.deb

# Get Chrome major version (third word of "Google Chrome <version>", up to the first dot)
CHROME_VERSION=$(google-chrome --version | cut -d " " -f3 | cut -d "." -f1)
echo "Installed Chrome version: $CHROME_VERSION"

# Download matching ChromeDriver
DRIVER_VERSION=$(wget -qO- "https://googlechromelabs.github.io/chrome-for-testing/LATEST_RELEASE_${CHROME_VERSION}")
echo "Matching Driver version: $DRIVER_VERSION"

wget -q -O chromedriver-linux64.zip "https://storage.googleapis.com/chrome-for-testing-public/${DRIVER_VERSION}/linux64/chromedriver-linux64.zip"
# -o overwrites files left by an earlier run instead of prompting for confirmation
unzip -q -o chromedriver-linux64.zip
mv -f chromedriver-linux64/chromedriver /usr/bin/chromedriver
chmod +x /usr/bin/chromedriver

# Verify installation
google-chrome --version
chromedriver --version



Reading package lists...
Building dependency tree...
Reading state information...
The following additional packages will be installed:
  libvulkan1 mesa-vulkan-drivers
The following NEW packages will be installed:
  google-chrome-stable libvulkan1 mesa-vulkan-drivers
0 upgraded, 3 newly installed, 0 to remove and 35 not upgraded.
Need to get 10.9 MB/130 MB of archives.
After this operation, 438 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 libvulkan1 amd64 1.3.204.1-2 [128 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 mesa-vulkan-drivers amd64 23.2.1-1ubuntu3.1~22.04.3 [10.7 MB]
Get:3 /content/google-chrome-stable_current_amd64.deb google-chrome-stable amd64 139.0.7258.127-1 [119 MB]
Fetched 10.9 MB in 1s (7,769 kB/s)
Selecting previously unselected package libvulkan1:amd64.
(Reading database ... (Reading database ... 5%(Reading database ... 10%(Reading database ... 15%(Reading database ... 20%(Reading data

In [None]:
# Smoke test: confirm that the installed Chrome + ChromeDriver pair can start
# headlessly and load a page.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

# Setup Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")   # Run headless (no GUI)
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# Point to chromedriver
service = Service("/usr/bin/chromedriver")

# Start browser
driver = webdriver.Chrome(service=service, options=chrome_options)
try:
    driver.get("https://www.google.com")
    print("✅ Page title:", driver.title)
finally:
    # Always shut the browser down, even when the page load raises, so a
    # failed check does not leave a Chrome process running in the kernel.
    driver.quit()


✅ Page title: Google


# Finalized Scraper

In [None]:
# -----------------------------------------------
# Dependencies: selenium (pinned to 4.18.1 by the command below),
# webdriver-manager, undetected-chromedriver, pandas, beautifulsoup4, lxml,
# tenacity, python-dateutil.
# Only selenium is version-pinned; the other packages resolve to whatever
# version pip selects at install time.
# -----------------------------------------------
!pip install -q selenium==4.18.1 webdriver-manager undetected-chromedriver pandas beautifulsoup4 lxml tenacity python-dateutil

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/65.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.4/65.4 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m61.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m499.2/499.2 kB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for undetected-chromedriver (setup.py) ... [?25l[?25hdone


In [None]:
import json
import os
import random
import time
from datetime import datetime
from typing import Dict, List, Optional
from urllib.parse import urlencode

import pandas as pd
import undetected_chromedriver as uc
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [None]:
# -----------------------------------------------
# ==================== CONFIG ====================
# -----------------------------------------------

# Scraper settings. main() receives this dict as an argument, while
# apply_random_delay(), extract_job_card() and scrape_jobs() read the
# module-level CONFIG directly for their retry/delay values.
CONFIG = {
    "keywords": ["Data Analyst", "Software Engineer"],
    "locations": ["Pune, Maharashtra, India", "Bengaluru, Karnataka, India"],
    "result_limit": 30,           # Max jobs per keyword-location combo
    "fetch_descriptions": False,  # Toggle to scrape full job descriptions

    # --- FILTERS ---
    # Each non-empty value is appended to the search URL by build_search_url().
    "filters": {
        # NOTE(review): LinkedIn's f_WT codes are commonly reported as 1=on-site, 2=remote — confirm the mapping below
        "remote": None,           # options: None, "1" (remote), "2" (on-site), "3" (hybrid)
        "date_posted": "r86400", # past 24 hours (options: None, r86400=24h, r604800=week, r2592000=month)
        "experience": None,       # options: 1=Internship, 2=Entry, 3=Associate, etc.
        "employment_type": None,  # options: 1=Full-time, 2=Part-time, etc.
    },

    "dedupe_on": ["Link"],        # Remove duplicates based on job link
    "save_chunk_size": 50,        # Auto-save once at least N jobs have been collected
    "max_retries": 3,             # Retry failed pages/cards N times
    "retry_delay": 3,             # Seconds between retries

    # --- HUMAN-LIKE DELAY ---
    "random_delay": {
        "enabled": False,          # True = wait random seconds, False = skip waiting
        "min_sec": 2,             # Minimum seconds to wait
        "max_sec": 6              # Maximum seconds to wait
    }
}

In [None]:
# -----------------------------------------------
# ==================== DRIVER SETUP ====================
# -----------------------------------------------

def setup_driver() -> uc.Chrome:
    """Start a headless undetected-chromedriver Chrome session.

    Returns:
        A ``uc.Chrome`` driver launched with the headless, no-sandbox and
        disable-dev-shm-usage flags and a page-load timeout of 30.
    """
    launch_flags = (
        "--headless=new",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    )

    chrome_opts = uc.ChromeOptions()
    for flag in launch_flags:
        chrome_opts.add_argument(flag)

    browser = uc.Chrome(options=chrome_opts)
    browser.set_page_load_timeout(30)
    return browser

In [None]:
# -----------------------------------------------
# ==================== AUTH ====================
# -----------------------------------------------

def login_linkedin(driver):
    """Authenticate the browser session with LinkedIn when credentials exist.

    Credentials are read from environment variables, in order of preference:
    ``LI_AT`` (session cookie value), then ``LINKEDIN_USER`` together with
    ``LINKEDIN_PASS``. The login page is opened in every case; when nothing is
    configured a warning is printed and the session stays anonymous.

    Args:
        driver: Selenium-style driver used to open pages and set cookies.

    Returns:
        None. The success messages are printed without checking that the
        login actually went through.
    """
    session_cookie = os.getenv("LI_AT")
    user = os.getenv("LINKEDIN_USER")
    secret = os.getenv("LINKEDIN_PASS")

    # The cookie can only be attached once a linkedin.com page is loaded.
    driver.get("https://www.linkedin.com/login")

    if session_cookie:
        driver.add_cookie({"name": "li_at", "value": session_cookie, "domain": ".linkedin.com"})
        driver.refresh()
        print("[INFO] Logged in with li_at cookie")
        return

    if not (user and secret):
        print("[WARN] No login → scraping public jobs only")
        return

    # Form login: wait for the username field, fill both fields, submit.
    username_field = WebDriverWait(driver, 15).until(
        EC.presence_of_element_located((By.ID, "username"))
    )
    username_field.send_keys(user)
    driver.find_element(By.ID, "password").send_keys(secret)
    driver.find_element(By.XPATH, "//button[@type='submit']").click()
    print("[INFO] Logged in with username/password")

In [None]:
# -----------------------------------------------
# ==================== URL BUILDER ====================
# -----------------------------------------------

def build_search_url(keyword: str, location: str, start: int = 0, filters: dict = None) -> str:
    """Build a LinkedIn job-search URL for one keyword/location page.

    All values are URL-encoded, so keywords and locations containing spaces,
    commas, ``&``, ``+`` or ``#`` (e.g. "C++ & C#") produce a valid query
    string instead of being truncated or split into extra parameters.

    Args:
        keyword: Search phrase, sent as ``keywords``.
        location: Location text, sent as ``location``.
        start: Result offset for pagination, sent as ``start``.
        filters: Optional mapping with the keys ``remote``, ``date_posted``,
            ``experience`` and ``employment_type``. Falsy values are skipped.

    Returns:
        The full search URL as a string.
    """
    # Insertion order is preserved, so parameters appear in a stable order.
    params = {"keywords": keyword, "location": location, "start": start}

    # CONFIG filter key -> LinkedIn query parameter name.
    filter_params = (
        ("remote", "f_WT"),
        ("date_posted", "f_TPR"),
        ("experience", "f_E"),
        ("employment_type", "f_JT"),
    )
    if filters:
        for filter_key, query_name in filter_params:
            value = filters.get(filter_key)
            if value:
                params[query_name] = value

    return "https://www.linkedin.com/jobs/search/?" + urlencode(params)

In [None]:
# -----------------------------------------------
# ==================== RANDOM DELAY ====================
# -----------------------------------------------

def apply_random_delay():
    """Pause for a random interval when the human-like delay is enabled.

    Reads ``CONFIG["random_delay"]``: does nothing unless ``enabled`` is
    truthy, otherwise sleeps for a uniformly random number of seconds between
    ``min_sec`` and ``max_sec`` and prints the chosen duration.
    """
    if not CONFIG["random_delay"]["enabled"]:
        return

    lower = CONFIG["random_delay"]["min_sec"]
    upper = CONFIG["random_delay"]["max_sec"]
    pause = random.uniform(lower, upper)
    print(f"[WAIT] Sleeping {pause:.2f} seconds (human-like delay)")
    time.sleep(pause)

In [None]:
# -----------------------------------------------
# ==================== JOB EXTRACTION ====================
# -----------------------------------------------

def extract_job_card(card, fetch_desc: bool, driver, retries: Optional[int] = None) -> Optional[Dict]:
    """Extract job info from a single LinkedIn job card, with retries.

    Args:
        card: Selenium element for one entry of the results list.
        fetch_desc: When True, click the card and try to read the full job
            description from the page.
        driver: Driver that owns ``card``; used to wait for and parse the
            description panel.
        retries: Maximum number of extraction attempts. ``None`` (default)
            uses ``CONFIG["max_retries"]`` as it is at call time.

    Returns:
        A dict with the keys "Title", "Company", "Location", "Date Posted",
        "Link" and "Scraped At" (plus "Job Description" when ``fetch_desc`` is
        True; its value is None if no description could be read), or None
        when every attempt failed.
    """
    if retries is None:
        # Resolved per call: a def-time default would freeze the value and
        # ignore later edits to CONFIG until this cell is re-run.
        retries = CONFIG["max_retries"]

    for attempt in range(retries):
        try:
            title = card.find_element(By.CSS_SELECTOR, "h3").text.strip()
            company = card.find_element(By.CSS_SELECTOR, "h4").text.strip()
            loc = card.find_element(By.CSS_SELECTOR, "span.job-search-card__location").text.strip()
            # Drop the query string so the same job always yields the same link.
            link = card.find_element(By.TAG_NAME, "a").get_attribute("href").split("?")[0]
            date_posted = card.find_element(By.CSS_SELECTOR, "time").get_attribute("datetime")

            job = {
                "Title": title,
                "Company": company,
                "Location": loc,
                "Date Posted": date_posted,
                "Link": link,
                "Scraped At": datetime.utcnow().isoformat()
            }

            # Fetch job description if enabled
            if fetch_desc:
                description = None
                try:
                    card.click()
                    WebDriverWait(driver, 5).until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, "div.jobs-description"))
                    )
                    soup = BeautifulSoup(driver.page_source, "lxml")
                    desc = soup.select_one("div.jobs-description__content")
                    if desc:
                        description = desc.get_text(" ", strip=True)
                except Exception:
                    # Best effort: a missing description must not discard the card.
                    description = None
                # Always set the key so every record has the same columns,
                # including when the content element was not found.
                job["Job Description"] = description

            return job
        except Exception:
            print(f"[WARN] Job extraction retry {attempt+1}/{retries}")
            # No point waiting after the final attempt; nothing follows it.
            if attempt + 1 < retries:
                time.sleep(CONFIG["retry_delay"])
    return None

In [None]:
# -----------------------------------------------
# ==================== SCRAPER ====================
# -----------------------------------------------

def scrape_jobs(driver, keyword: str, location: str, limit: int, fetch_desc: bool, filters: dict) -> List[Dict]:
    """Collect up to ``limit`` unique jobs for one keyword/location pair.

    Pages through the search results, loading each page with up to
    ``CONFIG["max_retries"]`` attempts, and extracts every card on the page.
    Jobs are de-duplicated by their "Link" value.

    Pagination stops when any of the following happens:
      * ``limit`` jobs have been collected,
      * a page fails to load after all retries,
      * a page contains no cards,
      * a page yields no new job (all cards failed or were duplicates).
        Without this check the loop would never end if the site keeps
        serving cards that were already seen.

    Args:
        driver: Selenium driver used to load the result pages.
        keyword: Search keyword.
        location: Search location.
        limit: Maximum number of jobs to return.
        fetch_desc: Passed through to ``extract_job_card``.
        filters: Search filters passed to ``build_search_url``.

    Returns:
        List of job dicts, at most ``limit`` long.
    """
    card_selector = "ul.jobs-search__results-list li"
    jobs = []
    seen_links = set()
    start = 0

    while len(jobs) < limit:
        url = build_search_url(keyword, location, start, filters)

        # Retry page load
        for attempt in range(CONFIG["max_retries"]):
            try:
                driver.get(url)
                WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, card_selector))
                )
                break
            except Exception:
                print(f"[WARN] Page load failed → retry {attempt+1}/{CONFIG['max_retries']}")
                time.sleep(CONFIG["retry_delay"])
        else:
            # for/else: reached only when no attempt hit ``break``.
            print("[ERROR] Skipping page due to repeated failures")
            break

        cards = driver.find_elements(By.CSS_SELECTOR, card_selector)
        if not cards:
            break

        jobs_before_page = len(jobs)
        for card in cards:
            job = extract_job_card(card, fetch_desc, driver)
            if job and job["Link"] not in seen_links:
                jobs.append(job)
                seen_links.add(job["Link"])

                # Apply random human-like delay
                apply_random_delay()

            if len(jobs) >= limit:
                break

        if len(jobs) == jobs_before_page:
            # Nothing new on this page: further pages are unlikely to help,
            # and continuing could loop forever on repeated results.
            print("[INFO] No new jobs on this page → stopping pagination")
            break

        start += len(cards)

    return jobs

In [None]:
# -----------------------------------------------
# ==================== MAIN ====================
# -----------------------------------------------

def _save_jobs(all_jobs, dedupe_on, csv_path, json_path):
    """Write the collected jobs to CSV and JSON and return the saved frame."""
    df = pd.DataFrame(all_jobs)
    # De-duplicate only on columns that exist: with zero jobs the frame has no
    # columns and drop_duplicates(subset=["Link"]) would raise KeyError.
    subset = [col for col in (dedupe_on or []) if col in df.columns]
    if subset:
        df = df.drop_duplicates(subset=subset)
    df.to_csv(csv_path, index=False)
    df.to_json(json_path, orient="records", indent=2)
    return df


def main(CONFIG):
    """Run the scraper for every keyword/location pair and save the results.

    Output goes to ``linkedin_jobs_<UTC timestamp>.csv`` and ``.json`` in the
    working directory. The files are rewritten after each keyword/location
    pair once at least ``save_chunk_size`` jobs are collected, and once more
    at the end.

    Args:
        CONFIG: Settings dict. This function reads "keywords", "locations",
            "result_limit", "fetch_descriptions", "filters", "dedupe_on" and
            "save_chunk_size" from it. Note that scrape_jobs() and its
            helpers read retry/delay settings from the module-level CONFIG,
            not from this argument.
    """
    driver = setup_driver()

    all_jobs = []
    ts = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
    csv_path = f"linkedin_jobs_{ts}.csv"
    json_path = f"linkedin_jobs_{ts}.json"

    try:
        login_linkedin(driver)

        for kw in CONFIG["keywords"]:
            for loc in CONFIG["locations"]:
                jobs = scrape_jobs(driver, kw, loc, CONFIG["result_limit"], CONFIG["fetch_descriptions"], CONFIG["filters"])
                all_jobs.extend(jobs)
                print(f"[INFO] Fetched {len(jobs)} jobs for {kw} in {loc}")

                # Save incrementally
                if len(all_jobs) >= CONFIG["save_chunk_size"]:
                    df = _save_jobs(all_jobs, CONFIG["dedupe_on"], csv_path, json_path)
                    print(f"[AUTO-SAVE] Saved {len(df)} jobs → {csv_path}, {json_path}")
    finally:
        # Close the browser even when login or scraping raises, so a failed
        # run does not leave a Chrome process behind in the kernel.
        driver.quit()

    # Final save
    df = _save_jobs(all_jobs, CONFIG["dedupe_on"], csv_path, json_path)

    print(f"[SUMMARY] Total scraped: {len(all_jobs)}, Unique after dedupe: {len(df)}")
    print(f"Saved → {csv_path}, {json_path}")
    print(df.head())
    print(df.shape)

if __name__ == "__main__":
    main(CONFIG)

[WARN] No login → scraping public jobs only
[INFO] Fetched 30 jobs for Data Analyst in Pune, Maharashtra, India
[INFO] Fetched 30 jobs for Data Analyst in Bengaluru, Karnataka, India
[AUTO-SAVE] Saved 60 jobs → linkedin_jobs_20250816_105146.csv, linkedin_jobs_20250816_105146.json
[INFO] Fetched 30 jobs for Software Engineer in Pune, Maharashtra, India
[AUTO-SAVE] Saved 88 jobs → linkedin_jobs_20250816_105146.csv, linkedin_jobs_20250816_105146.json
[INFO] Fetched 30 jobs for Software Engineer in Bengaluru, Karnataka, India
[AUTO-SAVE] Saved 118 jobs → linkedin_jobs_20250816_105146.csv, linkedin_jobs_20250816_105146.json
[SUMMARY] Total scraped: 120, Unique after dedupe: 118
Saved → linkedin_jobs_20250816_105146.csv, linkedin_jobs_20250816_105146.json
                                               Title  \
0                                            Analyst   
1                                          Associate   
2                                       Data Analyst   
3  Senior Analys

## Author
Hemant K  
📧 hemant777.karpe@gmail.com
🔗 [LinkedIn](https://www.linkedin.com/in/hemant-karpe)