<a href="https://colab.research.google.com/github/GaurRitika/LittleJobScrapping/blob/main/JobScrapper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ==========================================
# STEP 1: INSTALL OLLAMA & SETUP (RUN ONCE)
# ==========================================
import subprocess
import time
import requests
import sys
import nest_asyncio

# 1. Install Python Libraries
print("‚öôÔ∏è Installing Dependencies...")
# The leading "!" is an IPython shell escape, so this cell only runs inside a
# notebook. stdout and stderr are sent to /dev/null, so a failed install
# produces no visible message here.
!pip install crawl4ai nest_asyncio pydantic requests fake-useragent > /dev/null 2>&1

# 2. CRITICAL: Install Playwright Browsers (Required for crawl4ai)
print("üåê Installing Browsers (Chromium)...")
# First the Chromium build, then the system packages it depends on.
# Output is discarded for both commands.
!playwright install chromium > /dev/null 2>&1
!playwright install-deps > /dev/null 2>&1

# 3. Install Ollama (Linux/Colab)
print("‚öôÔ∏è Installing Ollama (AI Engine)...")
# Downloads the install script and pipes it straight into sh; the script's
# own output is discarded.
!curl -fsSL https://ollama.com/install.sh | sh > /dev/null 2>&1

# 4. Start Ollama Server in Background
print("üöÄ Starting Ollama Server...")
# Using nohup to ensure it stays running in the background
# The server's stdout/stderr are written to ollama.log in the working directory.
# Because the command ends with "&" and runs through a shell, `process` refers
# to the launching shell rather than to the server itself.
process = subprocess.Popen("nohup ollama serve > ollama.log 2>&1 &", shell=True)

# 5. Smart Wait (Health Check)
# Poll the local Ollama endpoint until it answers with HTTP 200 or the
# 60-second budget is used up.
print("‚è≥ Waiting for Ollama to start...")
start_time = time.time()
server_ready = False

while time.time() - start_time < 60:  # Wait max 60 seconds
    try:
        # A per-request timeout keeps a single stalled connection from
        # hanging the cell past the overall budget.
        response = requests.get("http://localhost:11434", timeout=5)
        if response.status_code == 200:
            server_ready = True
            print("‚úÖ Ollama Server is UP!")
            break
    except requests.exceptions.RequestException:
        # Connection refused, timeout, etc.: the server is not ready yet.
        pass
    # Pause after every unsuccessful attempt (including non-200 replies) so
    # the loop never spins at full speed.
    time.sleep(2)

if not server_ready:
    print("‚ùå Error: Ollama failed to start. Check logs.")
    sys.exit(1)

# 6. Download Model
# Tip: 'llama3.2' is smaller/faster for extracting data than full 'llama3'
# NOTE: the command below pulls 'llama3', not 'llama3.2'. The scraper cell
# sends "model": "llama3" to the Ollama API, so if a different model is pulled
# here the name must be changed there as well.
print("üß† Downloading Llama3 Model (This takes ~2-3 mins)...")
!ollama pull llama3

# 7. Apply Async Fix for Notebooks
# Presumably needed so asyncio.run() can be used while the notebook's own
# event loop is already running.
nest_asyncio.apply()
print("\nüéâ SETUP COMPLETE! You can now run the scraper.")

‚öôÔ∏è Installing Dependencies...
üåê Installing Browsers (Chromium)...
‚öôÔ∏è Installing Ollama (AI Engine)...
üöÄ Starting Ollama Server...
‚è≥ Waiting for Ollama to start...
‚úÖ Ollama Server is UP!
üß† Downloading Llama3 Model (This takes ~2-3 mins)...
[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?20

In [None]:
import asyncio

import nest_asyncio

import requests

import json

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode



nest_asyncio.apply()



# --- 1. AI EXTRACTOR (Keep this the same) ---

def extract_jobs_from_chunk(text_chunk: str, url: str) -> list:
    """Ask the local Ollama model to extract job listings from a text chunk.

    The chunk is first screened for job-related keywords; chunks without any
    are skipped without calling the model. Otherwise a prompt is posted to
    ``http://localhost:11434/api/generate`` and the ``"jobs"`` array of the
    model's JSON answer is returned.

    Args:
        text_chunk: Piece of scraped page text to analyse.
        url: Address the text came from; included in the prompt as context.

    Returns:
        The list found under ``"jobs"`` in the model's answer, or an empty
        list when the chunk is skipped, the request fails, the server replies
        with a non-200 status, or the answer is not in the expected shape.
    """
    valid_indicators = ("job", "opening", "vacancy", "responsibilities", "requirements", "apply", "location")

    # Lower-case once instead of once per keyword.
    lowered = text_chunk.lower()
    if not any(k in lowered for k in valid_indicators):
        return []

    prompt = f"""

    You are a Data Analyst. Extract GENUINE job listings.

    TEXT SOURCE: {url}

    RULES:

    1. Ignore navbars/footers.

    2. Extract only Job Title, Company, Location.

    3. Output Valid JSON.



    TEXT:

    {text_chunk}



    JSON FORMAT:

    {{

        "jobs": [

            {{ "job_title": "...", "company": "...", "location": "...", "apply_link": "..." }}

        ]

    }}

    """

    try:
        response = requests.post(
            "http://localhost:11434/api/generate",
            json={"model": "llama3", "prompt": prompt, "stream": False, "format": "json"},
            timeout=120,
        )
        if response.status_code != 200:
            return []
        payload = json.loads(response.json()["response"])
    except (requests.exceptions.RequestException, ValueError, KeyError, TypeError):
        # Best effort: network failures and malformed answers yield no jobs.
        # Only these errors are caught so that Ctrl-C / kernel interrupt still
        # stops a long-running model call.
        return []

    # The model is asked for {"jobs": [...]} but may answer with another
    # shape; only hand a real list back to the caller.
    if not isinstance(payload, dict):
        return []
    jobs = payload.get("jobs", [])
    return jobs if isinstance(jobs, list) else []



# --- 2. UPDATED CRAWLER WITH DELAY ---

async def crawl_target(url):
    """Crawl *url* in a headless browser and print the job listings found.

    The page is loaded through crawl4ai, its markdown text is cut into
    fixed-size chunks, and every chunk is handed to
    ``extract_jobs_from_chunk``. Progress messages and the combined result
    (as indented JSON) are printed; nothing is returned.

    Args:
        url: Address of the page to scrape.
    """
    print(f"üï∑Ô∏è Scraper starting for: {url}")

    # 1. Browser Config (Headless Chrome)
    browser_cfg = BrowserConfig(
        headless=True,
        verbose=True,
        headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"},
    )

    # 2. Run config: give client-side JavaScript time to render the listings.
    run_cfg = CrawlerRunConfig(
        # Wait 5 seconds for the JavaScript to finish loading jobs
        delay_before_return_html=5.0,
        # Scroll to bottom to trigger any lazy-loading elements
        js_code="""

            window.scrollTo(0, document.body.scrollHeight);

            setTimeout(() => window.scrollTo(0, 0), 1000);

            setTimeout(() => window.scrollTo(0, document.body.scrollHeight), 2000);

        """,
    )

    # Keep the browser open only for the page fetch. The model calls below are
    # slow and blocking, and do not need the browser.
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        result = await crawler.arun(url, config=run_cfg)

    if not result.success:
        print("‚ùå Failed to load page.")
        return

    # Prefer the raw markdown; fall back to the markdown value itself, and to
    # an empty string when no markdown was produced at all.
    markdown = result.markdown
    markdown_text = getattr(markdown, "raw_markdown", markdown) or ""
    print(f"üìÑ Scraped Text Length: {len(markdown_text)} characters")

    if len(markdown_text) < 1000:
        print("‚ö†Ô∏è WARNING: Page seems empty. The delay might still be too short.")

    # Fixed-size slices keep each prompt small; a listing that straddles a
    # boundary may be split between two chunks.
    chunk_size = 4000
    chunks = [markdown_text[i:i+chunk_size] for i in range(0, len(markdown_text), chunk_size)]

    print(f"üì¶ Processing {len(chunks)} chunks of text...")

    all_found_jobs = []
    for i, chunk in enumerate(chunks):
        print(f"   üëâ Analyzing chunk {i+1}/{len(chunks)}...")
        jobs = extract_jobs_from_chunk(chunk, url)
        if jobs:
            all_found_jobs.extend(jobs)
            print(f"      ‚úÖ Found {len(jobs)} jobs in this chunk.")

    print("\nüéØ GENUINE JOBS FOUND:")
    print(json.dumps(all_found_jobs, indent=2))



# --- RUN IT ---

# Page to scrape; change this to point the scraper at a different job board.
target_url = "https://wellfound.com/jobs"

# Run the crawl to completion; crawl_target prints its findings itself.
asyncio.run(crawl_target(target_url))



üï∑Ô∏è Scraper starting for: https://wellfound.com/jobs


üìÑ Scraped Text Length: 1 characters
üì¶ Processing 1 chunks of text...
   üëâ Analyzing chunk 1/1...

üéØ GENUINE JOBS FOUND:
[]
