In [4]:
%pip install firecrawl-py
%pip install python-dotenv



Note: you may need to restart the kernel to use updated packages.


## Load the Firecrawl API key

In [5]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if any) into the process environment,
# then read the Firecrawl key from it. `api_key` is None when the variable is
# not set.
load_dotenv()
api_key = os.environ.get("FIRECRAWL_API_KEY")



## Scrape the career / "join us" page links from a given URL

In [15]:
from firecrawl import FirecrawlApp
from pydantic import BaseModel, Field


def get_career_links(url: str, wait_ms: int = 2000) -> list:
    """
    Extracts career-related links from a given URL using Firecrawl.

    The page is scraped for its links, and every link containing one of the
    career keywords (case-insensitive substring match) is kept.

    The Firecrawl API key is read from the module-level ``api_key`` variable;
    it is not a parameter of this function.

    Args:
        url (str): The URL of the website to extract links from.
        wait_ms (int): Value sent as the "milliseconds" of the Firecrawl
            "wait" action. Defaults to 2000, because a wait of only a few
            milliseconds gives script-rendered links no time to appear.

    Returns:
        list: Career-related links in the order they were found, without
            duplicates. Empty when the response contains no 'links' key.
    """
    app = FirecrawlApp(api_key=api_key)

    response = app.scrape_url(url=url, params={
        'formats': ['links'],
        'onlyMainContent': False,
        'actions': [
            {
                "type": "wait",
                "milliseconds": wait_ms,
            }
        ],
    })

    # Substring matching makes "careers" and "jobs" redundant: any link that
    # contains them also contains "career" / "job".
    keywords = ("career", "job", "join")

    if not response or 'links' not in response:
        print("The 'links' key was not found in the response.")
        return []

    career_links = []
    seen = set()
    for link in response['links']:
        # Skip non-string entries and repeats; a repeated link would trigger
        # a duplicate extraction request for the same page later on.
        if not isinstance(link, str) or link in seen:
            continue
        lowered = link.lower()
        if any(keyword in lowered for keyword in keywords):
            seen.add(link)
            career_links.append(link)

    return career_links


def add_wildcard_to_links(links: list) -> list:
    """
    Appends the "/*" wildcard suffix to each URL in a list.

    Trailing slashes are removed before the suffix is added, so
    "https://example.com/careers/" becomes "https://example.com/careers/*"
    rather than "https://example.com/careers//*". URLs that already end with
    "/*" are returned unchanged.

    Args:
        links (list): A list of URL strings.

    Returns:
        list: A new list of URLs, each ending with "/*".
    """
    wildcard = "/*"
    wildcard_links = []
    for link in links:
        if link.endswith(wildcard):
            wildcard_links.append(link)
        else:
            wildcard_links.append(link.rstrip("/") + wildcard)
    return wildcard_links


# Example usage:
# Site whose links are scanned for career-related URLs.
url = "https://www.amazon.com/"

# Performs a Firecrawl scrape request using the module-level `api_key`.
career_links = get_career_links(url)

# Append the "/*" wildcard to each link; `updated_links` is reused as the
# input URL list of the job-extraction cell.
updated_links = add_wildcard_to_links(career_links)



## Extract job listings as JSON

In [16]:
from firecrawl import FirecrawlApp
from pydantic import BaseModel, Field
from typing import List
import json

# Schema of a single job posting. It is nested inside ExtractSchema, whose
# JSON schema is passed to Firecrawl's extract call, so the field names below
# are the keys of each returned job dict.
# NOTE(review): documented with comments rather than a docstring on purpose —
# pydantic is understood to copy a model's docstring into the generated JSON
# schema as its "description", which would change the schema sent to the API.
class PositionDept(BaseModel):
    position: str  # the open position being hired for
    dept: str  # department the position belongs to
    location: str
    description: str
    required_experience_in_years: str  # free text, e.g. "2+" in the sample output

# Top-level extraction schema: its JSON schema (model_json_schema()) is sent
# to Firecrawl, and the extracted postings are read back from the "jobs" key.
# Kept docstring-free so the generated JSON schema is not altered.
class ExtractSchema(BaseModel):
    jobs: List[PositionDept]  # one entry per extracted job posting

def extract_job_postings(urls: List[str]) -> List[dict]:
    """
    Extracts job postings from multiple URLs using Firecrawl and returns them.

    Each URL is sent to Firecrawl's extract endpoint in its own request,
    together with the JSON schema of ``ExtractSchema``. The Firecrawl API key
    is read from the module-level ``api_key`` variable; it is not a parameter
    of this function.

    Args:
        urls (List[str]): A list of URLs to extract job postings from.

    Returns:
        List[dict]: The job postings exactly as returned under
            ``data['jobs']`` of each response (plain dicts keyed by the
            ``PositionDept`` field names), concatenated in URL order.
            Empty when ``urls`` is empty or nothing was extracted.
    """
    # Nothing to do: avoid constructing a client for an empty URL list.
    if not urls:
        return []

    app = FirecrawlApp(api_key=api_key)
    all_jobs = []

    for url in urls:
        data = app.extract([url], {
            # The example at the end uses the schema's own key name ("dept")
            # so the prompt and the schema do not contradict each other.
            'prompt': 'Extract Every Single job posting. For each job posting, find the open position they are hiring for and the department that position is for. Both of these will be in the website data. Also include location, description and required experience for that job if its data is available. eg {position:"Data Engineer", dept:"Engineering"}',
            'schema': ExtractSchema.model_json_schema(),
        })

        # Tolerate a missing or null response / 'data' payload instead of
        # raising TypeError on `'jobs' in None`.
        payload = (data or {}).get('data') or {}
        found = payload.get('jobs') if isinstance(payload, dict) else None
        if found:
            all_jobs.extend(found)

    return all_jobs

def print_job_postings(jobs: List["PositionDept"]):
    """
    Prints job postings as indented JSON, each followed by a dashed separator.

    Args:
        jobs (List[PositionDept]): A list of job postings. Each item may be a
            plain dict (printed as-is) or a pydantic model, which is converted
            with ``model_dump()`` when available and with the older ``dict()``
            method otherwise.
    """
    separator = "-" * 20
    for job in jobs:
        if isinstance(job, dict):
            record = job
        elif hasattr(job, "model_dump"):
            # pydantic v2 API; `.dict()` is deprecated there.
            record = job.model_dump()
        else:
            record = job.dict()
        print(json.dumps(record, indent=4))
        print(separator)

# Example usage:
# `updated_links` comes from the link-scraping cell, so that cell must have
# been run first.
urls = updated_links

# One Firecrawl extract request is made per URL; `jobs` is reused by the
# analysis cell.
jobs = extract_job_postings(urls)
print_job_postings(jobs)

{
    "dept": "Operations",
    "location": "Not specified",
    "position": "Area Manager",
    "description": "In this role, you will lead a team of hourly workforce and will be responsible for engaging your team during their shifts to maintain the highest levels of safety, quality, attendance, and performance. You will also play a key role in maintaining our customer expectations to ensure customer orders are delivered at the right time, to the right location.",
    "required_experience_in_years": "2+"
}
--------------------
{
    "dept": "Fulfillment Center",
    "location": "Not specified",
    "position": "Area Manager",
    "description": "Amazon is seeking Area Managers for our Fulfillment Centers (FCs). You will focus on safety, quality, customer experience, and productivity of your department. Responsibilities include reviewing work forecasts, partnering with other Area Managers, supporting safety programs, and leading process improvement initiatives.",
    "required_experien

In [17]:
from typing import List, Dict

def analyze_jobs(jobs: List[Dict]) -> int:
    """
    Analyzes job postings to count total jobs, jobs per department, and jobs per location.

    The counts are printed; departments and locations are listed in the order
    in which they first appear in ``jobs``. Values are counted verbatim, so
    differently spelled names (e.g. "Fulfillment Center" and
    "Fulfillment Centers") are counted separately.

    Args:
        jobs (List[Dict]): A list of job postings. The 'dept' and 'location'
            keys are read from each posting; a posting where either is
            missing, None, or empty is counted under "Unknown" instead of
            raising KeyError.

    Returns:
        int: The total number of job postings.
    """
    # Imported locally so this cell does not depend on another cell's imports.
    from collections import Counter

    # Get total count
    total_jobs = len(jobs)
    print(f"Total number of jobs: {total_jobs}")

    # Counter keeps first-seen order, matching a plain dict filled in a loop.
    dept_counts = Counter((job.get('dept') or "Unknown") for job in jobs)
    location_counts = Counter((job.get('location') or "Unknown") for job in jobs)

    # Print department results
    print("\nJobs per department:")
    for dept, count in dept_counts.items():
        print(f"{dept}: {count}")

    # Print location results
    print("\nJobs per location:")
    for location, count in location_counts.items():
        print(f"{location}: {count}")

    return total_jobs



# `jobs` comes from the job-extraction cell; analyze_jobs prints the summary
# and returns the total number of postings.
total_jobs = analyze_jobs(jobs)

Total number of jobs: 82

Jobs per department:
Operations: 20
Fulfillment Center: 1
Fulfillment Centers: 1
Rapid and Rural Logistics (R2L): 1
Amazon Web Services: 1
Workplace Design & Construction: 1
Customer Engagement Technology: 5
Central Flow - Central Ops: 1
Data and Machine Learning: 1
WWPS US Federal: 1
US Federal Data and Machine Learning: 1
Logistics for Good: 1
People eXperience and Technology: 2
Business Operations, Customer Logistics Security (CLS): 1
Worldwide Operations: 2
Global Tax Services: 1
Trusted Order and Remediation Experiences: 1
Amazon AI: 1
Customer Service: 2
Amazon Vendor Services: 2
Amazon Music Search: 1
Risk Management: 1
Global Specialty Fulfillment: 1
Amazon Transportation Service (ATS): 1
Pharmacy Operations: 1
Customer Fulfilment: 1
People Experience and Technology (PXT): 1
Inclusive Experience and Technology: 1
Amazon Extra Large (AMXL): 1
Customer Fulfillment: 1
Amazon Transportation: 1
Listing Services, Core Selling Partner Experience: 1
Human Reso