In [None]:
from concurrent.futures import ThreadPoolExecutor
from requests.packages.urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from bs4 import BeautifulSoup
import random
import logging
import threading
import time
import re
import requests
import pandas as pd

In [None]:
# Search terms sent to the job-listing endpoint.
title, location = "Machine Learning", "California"
# Upper bound on the number of result pages the scrape loop requests.
max_pages = 5
# Collects one dict per scraped job posting.
job_list = list()

In [None]:
logging.basicConfig(level=logging.DEBUG)

# Retry policy shared by every request made through `session`: a single retry,
# with backoff, for GET requests answered with one of the listed status codes.
_retry_kwargs = dict(
    total=1,
    backoff_factor=10,
    status_forcelist=[429, 500, 502, 503, 504],
)
try:
    # `allowed_methods` is the current keyword; `method_whitelist` was
    # deprecated in urllib3 1.26 and removed in 2.0 (passing it raises TypeError).
    retry_strategy = Retry(allowed_methods=["GET"], **_retry_kwargs)
except TypeError:
    # Older urllib3 releases only understand the legacy keyword.
    retry_strategy = Retry(method_whitelist=["GET"], **_retry_kwargs)

# One session, reused for all requests, with the retry policy mounted for
# both URL schemes.
session = requests.Session()
adapter = HTTPAdapter(max_retries=retry_strategy)
session.mount("https://", adapter)
session.mount("http://", adapter)
logger = logging.getLogger(__name__)
base_url = "https://www.linkedin.com"

In [None]:
def fetch_job_details(job_id):
    """Download the guest job-posting page for ``job_id``.

    Args:
        job_id: Identifier interpolated into the job-posting URL.

    Returns:
        The response body as text, or ``None`` (after logging an error) when
        the request fails or the server answers with an HTTP error status.
    """
    posting_url = f"{base_url}/jobs-guest/jobs/api/jobPosting/{job_id}"
    try:
        reply = session.get(posting_url, timeout=300)
        reply.raise_for_status()
        body = reply.text
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching job details for job ID {job_id}: {e}")
        return None
    return body

In [None]:
def _text_or_none(element):
    """Return the whitespace-stripped text of ``element``, or ``None`` if it is ``None``."""
    if element is None:
        return None
    return str(element.text).strip()


def extract_job_details(job_soup):
    """Pull the fields of interest out of a parsed job-posting page.

    Args:
        job_soup: Parsed posting; only its ``find(name, attrs)`` method is used.

    Returns:
        A dict with ``job_title``, ``company_name``, ``time_posted`` and
        ``num_applicants``; each value is ``None`` when the corresponding
        element is absent from the page (previously a missing title, company
        or posted-time element raised ``AttributeError``). ``description`` is
        included only when the description section is present.
    """
    job_post = {}

    job_dis_element = job_soup.find("section", {"class": "core-section-container my-3 description"})
    if job_dis_element is not None:
        job_dis = job_dis_element.get_text(separator="\n")
        # Drop any leftover tag-like fragments, then collapse all whitespace
        # runs (including the newline separators) into single spaces.
        job_dis = re.sub(r'<[^>]*>', '', job_dis)
        job_dis = re.sub(r'\s+', ' ', job_dis)
        job_post["description"] = job_dis.strip()

    job_post["job_title"] = _text_or_none(
        job_soup.find("h2", {"class": "top-card-layout__title"})
    )
    job_post["company_name"] = _text_or_none(
        job_soup.find("a", {"class": "topcard__org-name-link"})
    )
    job_post["time_posted"] = _text_or_none(
        job_soup.find("span", {"class": "posted-time-ago__text"})
    )
    job_post["num_applicants"] = _text_or_none(
        job_soup.find("span", {"class": "num-applicants__caption"})
    )

    return job_post


In [None]:
# URNs of listings that have already been claimed for processing.
processed_job_ids = set()
# Guards processed_job_ids: process_job_listing is invoked from thread-pool
# workers, so the membership check and the insert must happen atomically.
_processed_job_ids_lock = threading.Lock()


def process_job_listing(job):
    """Fetch and parse the posting behind one search-result list item.

    Args:
        job: Parsed list item; its ``base-card`` div is expected to carry the
            posting URN in the ``data-entity-urn`` attribute.

    Returns:
        The dict produced by ``extract_job_details`` with ``job_id`` added, or
        ``None`` when the item has no base card or URN, the URN was already
        processed, the URN has fewer than four ``:``-separated parts, or the
        posting could not be fetched.
    """
    base_card_div = job.find("div", {"class": "base-card"})
    if not base_card_div:
        return None

    job_urn = base_card_div.get("data-entity-urn")
    if not job_urn:
        return None

    # Claim the URN under the lock so two workers never process the same one.
    with _processed_job_ids_lock:
        if job_urn in processed_job_ids:
            return None
        processed_job_ids.add(job_urn)

    # The job id is the fourth ':'-separated field of the URN; skip malformed
    # URNs instead of letting an IndexError abort the whole batch.
    urn_parts = job_urn.split(":")
    if len(urn_parts) < 4:
        return None
    job_id = urn_parts[3]

    job_details = fetch_job_details(job_id)
    if not job_details:
        return None

    job_soup = BeautifulSoup(job_details, "html.parser")
    job_post = extract_job_details(job_soup)
    job_post["job_id"] = job_id
    return job_post


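###### Reference on handling 403 Forbidden errors when scraping (motivates the rotating User-Agent list below): https://scrapeops.io/web-scraping-playbook/403-forbidden-error-web-scraping/#:~:text=Solution%E2%80%8B,scraper%20or%20a%20real%20user.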

In [None]:
# Browser User-Agent strings; one is chosen at random for each listing
# request. Built once here rather than on every loop iteration.
user_agents_list = [
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.155 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36',
    'Mozilla/5.0 (X11; OpenBSD i386) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2309.372 Safari/537.36',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2117.157 Safari/537.36',
]

# Caps concurrent posting fetches at two (same limit as the pool's max_workers).
semaphore = threading.Semaphore(2)


def process_job_with_semaphore(job):
    """Run ``process_job_listing(job)`` while holding ``semaphore``; returns its result."""
    with semaphore:
        return process_job_listing(job)


# Offset step between listing requests. NOTE(review): assumes the endpoint
# serves up to 25 listings per request, as the loop bound and page-number
# arithmetic imply -- confirm against the live API.
page_size = 25
list_url = f"{base_url}/jobs-guest/jobs/api/seeMoreJobPostings/search"

start = 0
while start < max_pages * page_size:
    try:
        response = session.get(
            list_url,
            # Let requests build and escape the query string.
            params={"keywords": title, "location": location, "start": start},
            headers={'User-Agent': random.choice(user_agents_list)},
            timeout=300,  # same timeout fetch_job_details uses
        )
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching job listings at start={start}: {e}")
        break

    list_soup = BeautifulSoup(response.text, "html.parser")
    page_jobs = list_soup.find_all("li")

    if not page_jobs:
        logger.warning(f"No job listings found on page {start // page_size + 1}. Exiting Loop.")
        break

    # Process this page's listings inside the loop so every page is handled,
    # not just the last one fetched.
    with ThreadPoolExecutor(max_workers=2) as executor:
        job_posts = list(executor.map(process_job_with_semaphore, page_jobs))
    job_posts = [job for job in job_posts if job is not None]
    job_list.extend(job_posts)

    # Advance to the next page; without this the loop refetches offset 0 forever.
    start += page_size
    # Pause between pages to avoid hammering the server.
    time.sleep(2)

logger.info(f"Total job listings found: {len(job_list)}")

In [None]:
# Tabulate the scraped postings, keep only rows with no missing values,
# and write them to CSV without the index column.
jobss_df = pd.DataFrame(data=job_list)
jobs_df = jobss_df.dropna(axis=0, how="any")
jobs_df.to_csv(path_or_buf='cali_11_ml.csv', index=False)