In [1]:
import json
import pandas as pd
import time
from datetime import datetime, timedelta
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

In [2]:
options = Options()
options.add_argument("--headless")  
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
service = Service(ChromeDriverManager().install())

In [3]:
driver = webdriver.Chrome(service=service, options=options)

In [4]:
def calculate_posted_date(posted_on):
    posted_date = None
    if "today" in posted_on.lower():
        posted_date = datetime.now()
    elif "yesterday" in posted_on.lower():
        posted_date = datetime.now() - timedelta(days=1)
    else:
        try:
            num, period = posted_on.split()[1:3]
            if "week" in period:
                posted_date = datetime.now() - timedelta(weeks=int(num))
            elif "day" in period:
                posted_date = datetime.now() - timedelta(days=int(num))
        except Exception as e:
            print(f"Error parsing date: {e}")
            posted_date = None
    return posted_date.strftime('%d-%m-%Y') if posted_date else None

In [5]:
def extract_job_info(job_card):
    try:
        job_info = {}
        job_info['company'] = job_card.find('h4', class_='base-search-card__subtitle').text.strip() if job_card.find('h4', class_='base-search-card__subtitle') else 'null'
        job_info['job_title'] = job_card.find('h3', class_='base-search-card__title').text.strip() if job_card.find('h3', class_='base-search-card__title') else 'null'
        job_info['linkedin_job_id'] = job_card.find('a')['href'].split('/')[-2] if job_card.find('a') else 'null'
        job_info['location'] = job_card.find('span', class_='job-search-card__location').text.strip() if job_card.find('span', class_='job-search-card__location') else 'null'
        job_info['posted_on'] = job_card.find('time').text.strip() if job_card.find('time') else 'null'
        job_info['posted_date'] = calculate_posted_date(job_info['posted_on'])

        # Navigate to the job details page to extract additional information
        job_link = job_card.find('a')['href'] if job_card.find('a') else None
        if job_link:
            driver.get(job_link)
            time.sleep(2)
            job_soup = BeautifulSoup(driver.page_source, 'html.parser')
            job_info['work_mode'] = job_soup.find('span', text='Work mode').find_next('span').text.strip() if job_soup.find('span', text='Work mode') else 'null'
            job_info['employment_type'] = job_soup.find('span', text='Employment type').find_next('span').text.strip() if job_soup.find('span', text='Employment type') else 'null'
            job_info['skills'] = [skill.text.strip() for skill in job_soup.find_all('span', class_='job-criteria__text job-criteria__text--criteria')] if job_soup.find_all('span', class_='job-criteria__text job-criteria__text--criteria') else []
        else:
            job_info['work_mode'] = 'null'
            job_info['employment_type'] = 'null'
            job_info['skills'] = []

        return job_info
    except Exception as e:
        print(f"Error extracting job info: {e}")
        return None


In [6]:
# URLs
urls = []
while True:
    url = input("Enter LinkedIn job search URL (or type 'done' to finish): ")
    if url.lower() == 'done':
        break
    urls.append(url)

Enter LinkedIn job search URL (or type 'done' to finish):  https://www.linkedin.com/jobs/search?keywords=&location=India&geoId=102713980&f_TPR=r86400&f_C=1586&position=1&pageNum=0
Enter LinkedIn job search URL (or type 'done' to finish):  done


In [7]:
# Collecting job data
job_data = []
for url in urls:
    driver.get(url)
    try:
       
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'ul.jobs-search__results-list li'))
        )
        time.sleep(2)  
    except Exception as e:
        print(f"Error loading page: {e}")
        continue

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    
    
    print("Page source loaded for URL:", url)
    
    job_cards = soup.select('ul.jobs-search__results-list li')
    
    
    print(f"Found {len(job_cards)} job cards")

    for job_card in job_cards:
        job_info = extract_job_info(job_card)
        if job_info:
            # Debug print: Verify job info extracted
            print(f"Extracted job info: {job_info}")
            job_data.append(job_info)
        if len(job_data) >= 50:
            break
    if len(job_data) >= 50:
        break

Page source loaded for URL: https://www.linkedin.com/jobs/search?keywords=&location=India&geoId=102713980&f_TPR=r86400&f_C=1586&position=1&pageNum=0
Found 60 job cards


  job_info['work_mode'] = job_soup.find('span', text='Work mode').find_next('span').text.strip() if job_soup.find('span', text='Work mode') else 'null'
  job_info['employment_type'] = job_soup.find('span', text='Employment type').find_next('span').text.strip() if job_soup.find('span', text='Employment type') else 'null'


Extracted job info: {'company': 'Amazon', 'job_title': 'Business Operations Analyst', 'linkedin_job_id': 'view', 'location': 'Bengaluru, Karnataka, India', 'posted_on': '2 hours ago', 'posted_date': None, 'work_mode': 'null', 'employment_type': 'null', 'skills': []}
Extracted job info: {'company': 'Amazon', 'job_title': 'Virtual Customer Support Associate - Tamil Nadu, India', 'linkedin_job_id': 'view', 'location': 'Tamil Nadu, India', 'posted_on': '19 hours ago', 'posted_date': None, 'work_mode': 'null', 'employment_type': 'null', 'skills': []}
Extracted job info: {'company': 'Amazon', 'job_title': 'Sales Associate, Direct Sales', 'linkedin_job_id': 'view', 'location': 'Gurugram, Haryana, India', 'posted_on': '2 hours ago', 'posted_date': None, 'work_mode': 'null', 'employment_type': 'null', 'skills': []}
Extracted job info: {'company': 'Amazon', 'job_title': 'Production Planning Analyst, Supply Chain', 'linkedin_job_id': 'view', 'location': 'Bengaluru, Karnataka, India', 'posted_on':

In [8]:
driver.quit()

print(f"Collected job data: {job_data}")

Collected job data: [{'company': 'Amazon', 'job_title': 'Business Operations Analyst', 'linkedin_job_id': 'view', 'location': 'Bengaluru, Karnataka, India', 'posted_on': '2 hours ago', 'posted_date': None, 'work_mode': 'null', 'employment_type': 'null', 'skills': []}, {'company': 'Amazon', 'job_title': 'Virtual Customer Support Associate - Tamil Nadu, India', 'linkedin_job_id': 'view', 'location': 'Tamil Nadu, India', 'posted_on': '19 hours ago', 'posted_date': None, 'work_mode': 'null', 'employment_type': 'null', 'skills': []}, {'company': 'Amazon', 'job_title': 'Sales Associate, Direct Sales', 'linkedin_job_id': 'view', 'location': 'Gurugram, Haryana, India', 'posted_on': '2 hours ago', 'posted_date': None, 'work_mode': 'null', 'employment_type': 'null', 'skills': []}, {'company': 'Amazon', 'job_title': 'Production Planning Analyst, Supply Chain', 'linkedin_job_id': 'view', 'location': 'Bengaluru, Karnataka, India', 'posted_on': '2 hours ago', 'posted_date': None, 'work_mode': 'null'

In [9]:
if job_data:
    with open('job_data.json', 'w') as json_file:
        json.dump(job_data, json_file, indent=4)
    print("job_data.json file has been saved.")
else:
    print("No job data collected to save in JSON file.")

df = pd.DataFrame(job_data)
df.to_csv('job_data.csv', index=False)
print("job_data.csv file has been saved.")


job_data.json file has been saved.
job_data.csv file has been saved.
