<h1> Collect Job Post Data </h1>
<p> Given a query, find applicable Indeed.com posts </p>

In [4]:
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
import httpx
import re
import json
import time
# import datatime

Uses "data scientist" and "Santa Clara" as default query fields. 
Make a request mimicking the behavior of a browser request.

In [5]:

# query = input("Enter your job query: ")
# location = input("Enter your location: ")
query = "data scientist"
location = "Santa Clara"

# Build the `q` parameter: words are separated by "+", and each word is
# percent-encoded first so characters that are meaningful inside a query
# string (the "+" in "C++", the "&" in "R&D", "#", "=") are sent literally
# instead of being read as separators.
words_query = query.split()
processed_query = "+".join(requests.utils.quote(word, safe="") for word in words_query)

# Percent-encode the location too; safe="" also escapes "/" so that it cannot
# be mistaken for a path separator.
processed_location = requests.utils.quote(location, safe="")

# Headers that make the request look like it came from a desktop browser.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    # "Accept-Encoding": "gzip, deflate, br",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    # "Connection": "keep-alive",
    "Referer": "https://www.indeed.com/",
}

# Search-results URL for the query/location pair.
URL = f"https://www.indeed.com/jobs?q={processed_query}&l={processed_location}"

# Show the URL before sending the request so it is visible even when the
# request below fails.
print(URL)

# Fetch the results page. raise_for_status() raises requests.HTTPError on a
# 4xx/5xx answer (for example a 403 when the site blocks the request);
# `response` is still bound in that case.
response = requests.get(URL, headers=headers, timeout=10)
response.raise_for_status()


HTTPError: 403 Client Error: Forbidden for url: https://www.indeed.com/jobs?q=data+scientist&l=Santa%20Clara

**Add all of the job search results into a DataFrame**

Each job is stored as one dictionary in the `all_jobs` list, with these fields:

```python
all_jobs = {
    Title: the job title/position,
    Company Name: the company name of the job post
    Location: the location information of the job post
    Link: the redirection link to get more details of the job
}
```

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import requests # Assuming requests is imported elsewhere

# One dictionary per job card; a list of dicts converts directly to a DataFrame.
all_jobs = []
# Prefix that turns the relative hrefs on job cards into absolute links.
default_indeed_url = "https://www.indeed.com"

try:
    # Parse the search-results page fetched by the query cell.
    soup = BeautifulSoup(response.text, 'html.parser')

    # Company names and locations are collected as two separate lists and
    # paired by position.
    # NOTE(review): this assumes every job card exposes both elements, in the
    # same order - confirm against the live markup.
    company_names = soup.find_all(attrs={"data-testid": "company-name"})
    text_location = soup.find_all(attrs={"data-testid": "text-location"})

    # Use the longer list so no record is dropped; the shorter one is padded with 'N/A'.
    max_jobs = max(len(company_names), len(text_location))

    for i in range(max_jobs):
        company_name = company_names[i].get_text(strip=True) if i < len(company_names) else 'N/A'
        # Named job_location so the search `location` set in the query cell
        # is not overwritten by this loop.
        job_location = text_location[i].get_text(strip=True) if i < len(text_location) else 'N/A'

        all_jobs.append({
            "Title": 'N/A',  # Placeholder, filled in below
            "Location": job_location,
            "Company Name": company_name,
            "Link": 'N/A',  # Placeholder, filled in below
        })

    # soup.title is None when the page has no <title>; treat that as an empty title.
    page_title = soup.title.text.lower() if soup.title is not None else ""

    # Check for CAPTCHA/blocking page
    if "captcha" in response.text.lower() or "not found" in page_title:
        print("CAPTCHA or Block detected. The scraping attempt was blocked.")
    else:
        # Title anchors are matched to the records above by index.
        job_links = soup.find_all('a', class_='jcs-JobTitle')

        for i, title_element in enumerate(job_links):
            if i >= len(all_jobs):
                print(f"Warning: Found more links ({len(job_links)}) than initial records ({len(all_jobs)}).")
                break

            # The title is read from the `title` attribute of the anchor's <span>.
            title_span = title_element.find("span")
            raw_title = title_span.get("title") if title_span else 'N/A'

            # The href is a site-relative path; an anchor without one keeps the placeholder.
            indiv_link = title_element.get('href')
            full_link = default_indeed_url + indiv_link if indiv_link else 'N/A'

            all_jobs[i]['Title'] = raw_title
            all_jobs[i]['Link'] = full_link

except requests.exceptions.HTTPError as e:
    print(f"\nHTTP Error (Status Code {e.response.status_code}): The server definitively blocked the request.")
except Exception as e:
    print(f"\nAn unexpected error occurred: {e}")
else:
    print(f"Total jobs successfully parsed: {len(all_jobs)}")

# Built outside the try block so df_jobs is always defined, even when parsing
# failed part-way (it then holds whatever records were collected so far).
df_jobs = pd.DataFrame(all_jobs)

df_jobs

title_element:  <a aria-label="full details of Data Scientist" class="jcs-JobTitle css-1baag51 eu4oa1w0" data-hide-spinner="true" data-hiring-event="false" data-jk="78e6280457103317" data-mobtk="1jb4q3411gqlp88m" href="/rc/clk?jk=78e6280457103317&amp;bb=Nm8sqwyT04Wia_3UsmYCTYA7U8sa7KpzE8QPHEHXPeaGvaSwoLGtdppyXdLYnSc7glFAIdWno8-KWwMm_wZ2-CzCV66aIs0Wft-gULfdlggrY9wNLNEmb9HkhkkkY2RqGSajWNKOkLWcIWQPOC46IQ%3D%3D&amp;xkcb=SoDa67M3qk8DXQADNr0LbzkdCdPP&amp;fccid=fa5bbc12d1498bc0&amp;vjs=3" id="job_78e6280457103317" role="button"><span id="jobTitle-78e6280457103317" title="Data Scientist">Data Scientist</span></a>
title_element:  <a aria-label="full details of Data Scientist" class="jcs-JobTitle css-1baag51 eu4oa1w0" data-hide-spinner="true" data-hiring-event="false" data-jk="a4f3e98fa98cb261" data-mobtk="1jb4q3411gqlp88m" href="/rc/clk?jk=a4f3e98fa98cb261&amp;bb=Nm8sqwyT04Wia_3UsmYCTaqJ-mbPN_BVFqrz3E2p8PjK-eNtzKwWpgU2Ffe75aPOJXm6gCXmLyYTG8P-3MKpvFU9Pp77w5VjPdf0KcQ0w7P__6kQHMmQjNW9J7dtfeCjTimGA

Unnamed: 0,Title,Location,Company Name,Link
0,Data Scientist,"Pleasanton, CA 94566",Sajix Software Solution Private Limited,https://www.indeed.com/rc/clk?jk=78e6280457103...
1,Data Scientist,"Pleasanton, CA 94566",Sajix,https://www.indeed.com/rc/clk?jk=a4f3e98fa98cb...
2,Data Scientist,"Pleasanton, CA 94566",Sajix,https://www.indeed.com/viewjob?jk=cdef01234567...
3,Data Scientist (Tapestry),"Mountain View, CA",Loon,https://www.indeed.com/rc/clk?jk=8bb8e7ae29b53...
4,"Research Scientist, Foundation Model (LLM)","San Jose, CA",ByteDance,https://www.indeed.com/rc/clk?jk=780362905ef9d...
5,Data Scientist,"Santa Clara, CA",NutaNXT Technologies,https://www.indeed.com/rc/clk?jk=2ad5b66e52509...
6,"Data Scientist (Dublin, CA)","Hybrid work in Dublin, CA 94568",SavvyMoney,https://www.indeed.com/rc/clk?jk=f33e369c31172...
7,Machine Learning Engineer,"Hybrid work in Santa Clara, CA 95050",Edurech Technoogy,https://www.indeed.com/rc/clk?jk=54d0e2fb38ac1...
8,Data Scientist,"Hybrid work in Pleasanton, CA 94566",Flex Employee Services,https://www.indeed.com/rc/clk?jk=18dd050faef8e...
9,Machine Learning Engineer,"Santa Clara, CA",Amiri Recruiting,https://www.indeed.com/rc/clk?jk=674087e716231...


Find more detailed job descriptions by visiting the individual links collected above.

In [None]:
import requests
import time
from bs4 import BeautifulSoup
import pandas as pd # Assuming you're using this

# Browser-like headers for the detail-page requests.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
    "Referer": URL,  # the search-results URL built in the query cell
    "Accept-Language": "en-US,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
}

# One Session for all detail requests, so the headers are sent every time.
session = requests.Session()
session.headers.update(HEADERS)

# Counters for the summary printed at the end.
successful_details = 0
failed_details = 0

# Fixed pause (seconds) after every request to reduce rate-limiting risk.
delay = 3

for i, job in enumerate(all_jobs):
    url = job.get("Link")
    job_title = job.get("Title", "Unknown Job")

    print(f"\n--- Processing Job {i+1}: {job_title} ---")
    print(f"URL Type: {'viewjob' if '/viewjob?' in url else 'redirect'}")

    try:
        # Stored under its own name so the search-results `response` from the
        # query cell is not overwritten (the parsing cell reads it on re-run).
        detail_response = session.get(url, timeout=15)
        detail_response.raise_for_status()  # Raise exception for 4xx or 5xx status codes

        # Parse once; the same soup serves the block check and the extraction.
        detail_soup = BeautifulSoup(detail_response.text, 'html.parser')

        # .title is None for a page without <title>; treat that as an empty title.
        page_title = detail_soup.title.text.lower() if detail_soup.title is not None else ""

        # Anti-scraping check
        if "captcha" in detail_response.text.lower() or "not found" in page_title:
            print("CAPTCHA/Block detected or Job Not Found. Skipping details extraction.")
            failed_details += 1
            job['Description'] = 'BLOCKED/NOT FOUND'
            continue

        # Container expected to hold the job description on a detail page.
        description_container = detail_soup.find('div', class_='jobsearch-JobComponent-description')

        if description_container:
            full_description = description_container.get_text('\n', strip=True)
            job['Description'] = full_description
            successful_details += 1
            print(f"Successfully extracted details for {job_title}. Description length: {len(full_description)} chars.")
        else:
            print("Description container not found on the detail page (likely a complex redirect or missing content).")
            failed_details += 1
            job['Description'] = 'CONTAINER MISSING'

    except requests.exceptions.RequestException as e:
        # Covers connection problems, timeouts, invalid URLs and 4xx/5xx answers.
        print(f"Request Failed for {job_title}: {e}")
        failed_details += 1
        job['Description'] = f'REQUEST ERROR: {e}'

    finally:
        # Runs on success, on failure and on the `continue` above.
        print(f"Waiting for {delay} seconds before next request...")
        time.sleep(delay)

print(f"\n\n--- Summary ---")
print(f"Total jobs processed: {len(all_jobs)}")
print(f"Successful details extracted: {successful_details}")
print(f"Failed/Blocked requests: {failed_details}")


--- Processing Job 1: Data Scientist ---
URL Type: redirect
Successfully extracted details for Data Scientist. Description length: 2885 chars.
Waiting for 3 seconds before next request...

--- Processing Job 2: Data Scientist ---
URL Type: redirect
Successfully extracted details for Data Scientist. Description length: 2892 chars.
Waiting for 3 seconds before next request...

--- Processing Job 3: Data Scientist ---
URL Type: viewjob
Request Failed for Data Scientist: 404 Client Error: Not Found for url: https://www.indeed.com/viewjob?jk=cdef0123456789ab
Waiting for 3 seconds before next request...

--- Processing Job 4: Data Scientist (Tapestry) ---
URL Type: redirect
Successfully extracted details for Data Scientist (Tapestry). Description length: 6140 chars.
Waiting for 3 seconds before next request...

--- Processing Job 5: Research Scientist, Foundation Model (LLM) ---
URL Type: redirect
Successfully extracted details for Research Scientist, Foundation Model (LLM). Description len

Create a master dataframe that includes the new description text

```python
all_jobs = {
    Title: the job title/position,
    Company Name: the company name of the job post
    Location: the location information of the job post
    Link: the redirection link to get more details of the job
    Description: the raw text of the job description
}
```

Save the result to a JSON file so that later analysis can reuse it without sending the requests again.

In [None]:
# Master table: one row per job, including the 'Description' text added by
# the detail-page loop.
final_df = pd.DataFrame(all_jobs)

output_filename = 'indeed_job_data_sample.json'

try:
    # orient='records' with lines=False writes a single JSON array of row objects.
    final_df.to_json(
        output_filename,
        orient='records',
        lines=False,
        indent=4
    )
    # Report the size of the frame that was actually written (final_df),
    # not the earlier df_jobs, which may be stale or have a different length.
    print(f"\nSuccessfully saved {len(final_df)} records to {output_filename}")

except Exception as e:
    # Best effort: report the failure without stopping the notebook.
    print(f"\n Error saving JSON file: {e}")

final_df


Successfully saved 16 records to indeed_job_data_sample.json


Unnamed: 0,Title,Location,Company Name,Link,Description
0,Data Scientist,"Pleasanton, CA 94566",Sajix Software Solution Private Limited,https://www.indeed.com/rc/clk?jk=78e6280457103...,Profile insights\nFind out how your skills ali...
1,Data Scientist,"Pleasanton, CA 94566",Sajix,https://www.indeed.com/rc/clk?jk=a4f3e98fa98cb...,Profile insights\nFind out how your skills ali...
2,Data Scientist,"Pleasanton, CA 94566",Sajix,https://www.indeed.com/viewjob?jk=cdef01234567...,REQUEST ERROR: 404 Client Error: Not Found for...
3,Data Scientist (Tapestry),"Mountain View, CA",Loon,https://www.indeed.com/rc/clk?jk=8bb8e7ae29b53...,Profile insights\nFind out how your skills ali...
4,"Research Scientist, Foundation Model (LLM)","San Jose, CA",ByteDance,https://www.indeed.com/rc/clk?jk=780362905ef9d...,Profile insights\nFind out how your skills ali...
5,Data Scientist,"Santa Clara, CA",NutaNXT Technologies,https://www.indeed.com/rc/clk?jk=2ad5b66e52509...,Profile insights\nFind out how your skills ali...
6,"Data Scientist (Dublin, CA)","Hybrid work in Dublin, CA 94568",SavvyMoney,https://www.indeed.com/rc/clk?jk=f33e369c31172...,Profile insights\nFind out how your skills ali...
7,Machine Learning Engineer,"Hybrid work in Santa Clara, CA 95050",Edurech Technoogy,https://www.indeed.com/rc/clk?jk=54d0e2fb38ac1...,Profile insights\nFind out how your skills ali...
8,Data Scientist,"Hybrid work in Pleasanton, CA 94566",Flex Employee Services,https://www.indeed.com/rc/clk?jk=18dd050faef8e...,Profile insights\nFind out how your skills ali...
9,Machine Learning Engineer,"Santa Clara, CA",Amiri Recruiting,https://www.indeed.com/rc/clk?jk=674087e716231...,Profile insights\nFind out how your skills ali...


Extract the qualifications and responsibilities sections from each description using regular expressions.

In [None]:
import re
import pandas as pd
import time
import random

# --- List of all relevant headers to look for ---
HEADER_PHRASES = [
    'Minimum Qualifications', 'Qualifications', 'Requirements', 'What You Bring',
    'Key Responsibilities', 'Responsibilities', 'Job Duties', 'Your Role'
]

# Map multiple phrases to single standardized column names
NORMALIZED_KEYS = {
    'QUALIFICATIONS': 'Qualifications',
    'REQUIREMENTS': 'Qualifications',
    'MINIMUM QUALIFICATIONS': 'Qualifications',
    'MINIMUM REQUIREMENTS': 'Qualifications',
    'WHAT YOU BRING': 'Qualifications',
    'KEY RESPONSIBILITIES': 'Responsibilities',
    'RESPONSIBILITIES': 'Responsibilities',
    'JOB DUTIES': 'Responsibilities',
    'YOUR ROLE': 'Responsibilities',
}

# Placeholder descriptions written when a detail page could not be scraped;
# they contain no sections and are skipped outright.
_PLACEHOLDER_PREFIXES = ('BLOCKED', 'CONTAINER MISSING', 'REQUEST ERROR')

# Compiled once instead of on every call. Matches any header phrase
# (case-insensitively) followed by optional whitespace, an optional colon and
# a newline; group 1 is the header text as written in the description.
_HEADER_REGEX = re.compile(
    r'(' + '|'.join(re.escape(h) for h in HEADER_PHRASES) + r')\s*:?\s*\n',
    re.IGNORECASE,
)


def extract_key_terms_by_header(description_text, min_content_length=10):
    """
    Split a job description into its qualification/responsibility sections.

    Every occurrence of a phrase from HEADER_PHRASES that ends its line is
    treated as a section header. The text between one header and the next
    (or the end of the description) is that header's content.

    Args:
        description_text: Raw description text. Anything that is not a
            non-empty string (None, NaN, ...), the value 'N/A', and the
            placeholder descriptions starting with 'BLOCKED',
            'CONTAINER MISSING' or 'REQUEST ERROR' yield an empty result.
        min_content_length: A section is kept only when its stripped content
            is non-empty and longer than this many characters (default 10).

    Returns:
        dict mapping a normalized section name from NORMALIZED_KEYS
        ('Qualifications' or 'Responsibilities') to its text. When several
        headers normalize to the same name, their contents are joined with a
        blank line, in order of appearance.
    """
    extracted_sections = {}

    # Non-string values (e.g. NaN from a DataFrame column) have no sections.
    if not isinstance(description_text, str) or not description_text:
        return extracted_sections
    if description_text == 'N/A' or description_text.startswith(_PLACEHOLDER_PREFIXES):
        return extracted_sections

    matches = list(_HEADER_REGEX.finditer(description_text))

    for i, match in enumerate(matches):
        header_text = match.group(1).upper().strip()

        # Content runs from just after this header to the start of the next
        # header, or to the end of the text for the last header.
        start_index = match.end()
        end_index = matches[i + 1].start() if i + 1 < len(matches) else len(description_text)
        content = description_text[start_index:end_index].strip()

        # First NORMALIZED_KEYS entry whose key occurs in the header decides the name.
        normalized_key = next((v for k, v in NORMALIZED_KEYS.items() if k in header_text), None)

        if normalized_key and content and len(content) > min_content_length:
            if normalized_key in extracted_sections:
                # e.g. both "Qualifications" and "Requirements" are present
                extracted_sections[normalized_key] += "\n\n" + content
            else:
                extracted_sections[normalized_key] = content

    return extracted_sections


# Build the key-terms table: one row per job with its extracted sections.

# Section columns copied from the extraction result; a section that was not
# found becomes an empty string.
key_terms_list = []

# `all_jobs` is the list of job dictionaries carrying 'Title', 'Link' and 'Description'.
for posting in all_jobs:
    sections = extract_key_terms_by_header(posting.get('Description', ''))

    row = {
        'Title': posting.get('Title', 'N/A'),
        'Link': posting.get('Link', 'N/A'),
    }
    for column in ('Qualifications', 'Responsibilities'):
        row[column] = sections.get(column, '')

    key_terms_list.append(row)

key_terms_df = pd.DataFrame(key_terms_list)

key_terms_df

Unnamed: 0,Title,Link,Qualifications,Responsibilities
0,Data Scientist,https://www.indeed.com/rc/clk?jk=78e6280457103...,"Bachelor’s or Master’s in Computer Science, St...",Assist in cleaning and preprocessing structure...
1,Data Scientist,https://www.indeed.com/rc/clk?jk=a4f3e98fa98cb...,"Bachelor’s or Master’s in Computer Science, St...",Assist in cleaning and preprocessing structure...
2,Data Scientist,https://www.indeed.com/viewjob?jk=cdef01234567...,,
3,Data Scientist (Tapestry),https://www.indeed.com/rc/clk?jk=8bb8e7ae29b53...,,
4,"Research Scientist, Foundation Model (LLM)",https://www.indeed.com/rc/clk?jk=780362905ef9d...,"1. Excellent coding ability, data structures, ...",With a long-term vision and a strong commitmen...
5,Data Scientist,https://www.indeed.com/rc/clk?jk=2ad5b66e52509...,Self-starter with good communication skills\nP...,Extract data from multiple data sources like M...
6,"Data Scientist (Dublin, CA)",https://www.indeed.com/rc/clk?jk=f33e369c31172...,,Perform exploratory and statistical data analy...
7,Machine Learning Engineer,https://www.indeed.com/rc/clk?jk=54d0e2fb38ac1...,,"Design, develop, and implement machine learnin..."
8,Data Scientist,https://www.indeed.com/rc/clk?jk=18dd050faef8e...,"Bachelor’s or Master’s in Computer Science, Da...",Design and implement ETL/ELT pipelines using D...
9,Machine Learning Engineer,https://www.indeed.com/rc/clk?jk=674087e716231...,,
