In [1]:
pip install jobspy

Note: you may need to restart the kernel to use updated packages.


In [1]:
"""
robust_jd_pipeline.py
-----------------------------------------
Scrape, clean, LLM-parse, vector-embed & store
job descriptions that match a hiring
manager’s brief.

Dependencies  (pip install …)
--------------------------------
jobspy==0.4.*
beautifulsoup4
requests
sentence-transformers
chromadb
ollama-python       # or llama-cpp-python
tqdm
pydantic
"""

from __future__ import annotations
import re, os, json, random, logging, pathlib, datetime as dt
from typing import Sequence, Tuple
import requests
import pandas as pd
from tqdm import tqdm
from pydantic import BaseModel, ValidationError
from jobspy import scrape_jobs
import chromadb
from sentence_transformers import SentenceTransformer
# Base URL of the Ollama HTTP server used by llm_extract; taken from the
# OLLAMA_HOST environment variable, falling back to http://localhost:11434.
OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://localhost:11434")
# Model name sent with each generate request; override via OLLAMA_MODEL.
OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "phi2")            # pre-pulled via `ollama pull phi2`


ModuleNotFoundError: No module named 'jobspy'

In [None]:
def llm_extract(text: str) -> dict[str, str | int | list[str]]:
    """
    Feed a JD and get back a JSON dict with {title, company, exp_years, skills, …}.
    The pydantic schema (below) guarantees type safety.
    """
    system = (
        "You extract structured info from job ads. "
        "Return *ONLY* valid JSON matching this schema:\n"
        "{"
        '"title": str, "company": str, "experience_years": int, '
        '"location": str, "skills": [str], "education": str'
        "}"
    )
    prompt = f"{system}\n\n### JOB AD\n{text}\n\n### JSON:"
    resp = requests.post(
        f"{OLLAMA_HOST}/api/generate",
        json={"model": OLLAMA_MODEL, "prompt": prompt, "stream": False, "options": {"temperature": 0.0}},
        timeout=90,
    )
    resp.raise_for_status()
    raw = resp.json()["response"].strip()
    # guard: sometimes model wraps in markdown code-fence
    raw = raw.lstrip("`").rstrip("`")
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        logging.warning("LLM returned invalid JSON, skipping.")
        return {}

In [None]:

# Pydantic schema to validate / coerce
class JD(BaseModel):
    """One parsed job description: LLM-extracted fields plus scrape provenance."""

    # --- fields requested from the LLM (see the schema in llm_extract) ---
    title: str
    company: str
    experience_years: int | None = None   # optional: not every ad states it
    location: str | None = None
    skills: list[str] = []
    education: str | None = None
    # --- fields supplied by the pipeline, not by the LLM ---
    description: str                   # keep full text
    url: str
    posted_dt: dt.datetime
    search_term: str

# ──────────────────────────────────────────────────────────
# 2. Embedding & Vector DB (Chroma, local)
# ──────────────────────────────────────────────────────────
import chromadb
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used to vectorise job descriptions; constructed
# once when this cell runs.
EMB_MODEL = SentenceTransformer("all-MiniLM-L6-v2")
# Directory handed to Chroma's persistent client, as a forward-slash string.
CHROMA_DIR = pathlib.Path("data/jd_vectors").as_posix()
client = chromadb.PersistentClient(path=CHROMA_DIR)
# Collection the pipeline upserts into; reused if it already exists.
collection = client.get_or_create_collection(name="job_descriptions")

# ──────────────────────────────────────────────────────────
# 3. Main pipeline
# ──────────────────────────────────────────────────────────
def build_and_store_jds(
    job_title: str,
    exp_range: Tuple[int, int],
    location: str = "United States",
    sector_keywords: Sequence[str] | None = None,
    hours_old: int = 72,
    results_each: int = 50,
    extra_google: bool = True,
) -> pd.DataFrame:
    """
    • Scrape LinkedIn (JobSpy)  [+ optional Google Jobs]
    • Clean & de-dup
    • LLM-parse → JD schema
    • Filter by experience range & sector
    • Embed + upsert into Chroma
    Returns final DataFrame of stored docs
    """
    # 1  SCRAPE  –––––––––––––––––––––––––––––––––––
    frames: list[pd.DataFrame] = []
    print("Scraping LinkedIn via JobSpy…")
    df_ln = scrape_jobs(
        site_name="linkedin",
        search_term=job_title,
        location=location,
        job_type="fulltime",
        hours_old=hours_old,
        results_wanted=results_each,
        linkedin_fetch_description=True,
        description_format="markdown",
    )
    frames.append(pd.DataFrame(df_ln))

    if extra_google:
        print("Scraping Google Jobs via JobSpy…")
        df_gg = scrape_jobs(
            site_name="google",
            search_term=job_title,
            location=location,
            hours_old=hours_old,
            results_wanted=results_each,
            description_format="markdown",
        )
        frames.append(pd.DataFrame(df_gg))

    raw = pd.concat(frames, ignore_index=True)
    raw["posted_dt"] = pd.to_datetime(raw["posted_time"])
    raw["description"] = raw["description"].fillna("")

    # 2  DE-DUP  –––––––––––––––––––––––––––––––––––
    raw = (
        raw.sort_values("posted_dt", ascending=False)
        .drop_duplicates(subset="job_url", keep="first")
        .reset_index(drop=True)
    )
    print(f"{len(raw):,} unique ads after URL dedup.")

    # 3  LLM PARSE  ––––––––––––––––––––––––––––––––
    records: list[dict] = []
    for _, row in tqdm(raw.iterrows(), total=len(raw), desc="LLM parse"):
        info = llm_extract(row["description"])
        if not info:
            continue
        try:
            jd = JD(
                **info,
                description=row["description"],
                url=row["job_url"],
                posted_dt=row["posted_dt"],
                search_term=row["search_term"],
            )
            records.append(jd.model_dump())
        except ValidationError:
            continue

    df = pd.DataFrame(records)
    print(f"{len(df):,} ads passed JSON validation.")

    # 4  FILTER  (by exp & sector keywords) –––––––
    low, high = exp_range
    if df.empty:
        print("No valid JDs to filter.")
        return df

    df = df[df["experience_years"].between(low, high, inclusive="both")]
    if sector_keywords:
        pat = re.compile("|".join(map(re.escape, sector_keywords)), re.I)
        df = df[
            df["description"].str.contains(pat)
            | df["company"].str.contains(pat, case=False, na=False)
        ]
    print(f"{len(df):,} ads after exp & sector filters.")

    if df.empty:
        return df

    # 5  EMBED & UPSERT  –––––––––––––––––––––––––––
    embeds = EMB_MODEL.encode(df["description"].tolist(), show_progress_bar=True)
    # ensure deterministic ids
    ids = [f"{hash(url)%2**63}" for url in df["url"]]
    metadata = df.drop(columns=["description"]).to_dict(orient="records")

    collection.upsert(
        ids=ids,
        embeddings=embeds,
        metadatas=metadata,
        documents=df["description"].tolist(),
    )
    print(f"Upserted {len(ids)} vectors into Chroma → {CHROMA_DIR}")

    return df


# ── Example call ────────────────────────────────────────────
if __name__ == "__main__":
    # Demo brief: senior procurement roles in the automotive sector.
    df_final = build_and_store_jds(
        "Procurement Manager",
        (12, 15),
        sector_keywords=["automotive", "OEM", "vehicle", "tier-1"],
        location="United States",
        results_each=60,
        hours_old=168,  # look back 7 days because the role is niche
    )
    print(df_final.head(3))


In [None]:
# `df` exists only inside build_and_store_jds; inspect the frame that the
# example call actually bound at notebook scope.
df_final.columns

In [None]:
# Persist the example-call result (`df` is not defined at notebook scope).
# NOTE(review): the file name is spelled 'scrpaed'; left unchanged in case
# something else reads this path.
df_final.to_csv('scrpaed_jobs.csv',index=False)

In [None]:
import os
import pandas as pd
import openai
import json
from docx import Document
from tqdm import tqdm
from dotenv import load_dotenv
# Raw string: in a normal literal "\O" and "\c" are invalid escape sequences,
# which newer Python versions warn about. The resulting path is unchanged.
env_path = r'C:\Outlook_Email_Generator\config.env'
load_dotenv(dotenv_path=env_path)

# Read the OpenAI API key from the environment (populated by the .env file above)
api_key = os.getenv("OPENAI_API_KEY")

# Resume folder -> title keywords that route a job to it. Order is significant:
# resolve_resume_folder returns the folder of the FIRST keyword found in the
# title, so groups and keywords are listed in lookup priority.
_KEYWORDS_BY_FOLDER = (
    # Data Science
    ("Data_Scientist", (
        "data scientist",
        "data science",
        "research scientist",
        "applied scientist",
        "scientist",
    )),
    # Data Engineering
    ("Data_Engineer", (
        "data engineer",
        "data engineering",
        "etl engineer",
        "big data",
        "pipeline engineer",
        "sql engineer",
        "database engineer",
    )),
    # Machine Learning
    ("ML_Engineer", (
        "ml engineer",
        "machine learning",
        "ai engineer",
        "artificial intelligence",
        "generative ai",
        "gen ai",
        "deep learning",
        "computer vision",
        "llm",
        "nlp",
        "mlops",
        "ml researcher",
    )),
    # Data Analytics / BI
    ("Data_Analyst", (
        "data analyst",
        "business analyst",
        "bi analyst",
        "business intelligence",
        "data analytics",
        "analytics",
        "analyst",
        "reporting analyst",
        "sql analyst",
        "marketing analyst",
        "insights analyst",
    )),
)

# Flat keyword -> folder lookup, built in the priority order declared above.
job_title_folder_map = {
    keyword: folder
    for folder, keywords in _KEYWORDS_BY_FOLDER
    for keyword in keywords
}

# --- Step 1: Extract resume text ---
def extract_text_from_docx(docx_path):
    """Return the non-blank paragraphs of a .docx file, stripped and joined by newlines."""
    kept_lines = []
    for paragraph in Document(docx_path).paragraphs:
        stripped = paragraph.text.strip()
        if stripped:
            kept_lines.append(stripped)
    return "\n".join(kept_lines)

def load_resumes(folder_path):
    """
    Load every Word resume found directly inside ``folder_path``.

    Files are visited in sorted name order so the "Resume N" numbering used
    downstream is the same on every run. The ``.docx`` extension is matched
    case-insensitively, and Word's ``~$name.docx`` lock files (present while
    a document is open in Word) are skipped because they are not documents.

    Returns
    -------
    list[dict]
        One ``{"resume_path": <path>, "resume_text": <text>}`` per resume.
    """
    resumes = []
    for file in sorted(os.listdir(folder_path)):
        if file.startswith("~$") or not file.lower().endswith(".docx"):
            continue
        full_path = os.path.join(folder_path, file)
        if not os.path.isfile(full_path):
            continue  # e.g. a directory whose name happens to end in .docx
        text = extract_text_from_docx(full_path)
        resumes.append({"resume_path": full_path, "resume_text": text})
    return resumes

def resolve_resume_folder(base_folder, job_title):
    """
    Pick the resume sub-folder for a job title.

    The title is lower-cased and checked against the keywords of
    ``job_title_folder_map`` in the map's own order; the first keyword that
    occurs in the title wins.

    Returns ``(<base_folder>/<folder>, <folder>)`` on a match, otherwise
    ``(None, None)``.
    """
    lowered_title = job_title.lower()
    matched_folder = next(
        (
            folder
            for keyword, folder in job_title_folder_map.items()
            if keyword in lowered_title
        ),
        None,
    )
    if matched_folder is None:
        return None, None
    return os.path.join(base_folder, matched_folder), matched_folder

# --- Step 2: GPT-4 Prompt ---
def find_best_resume_and_experience(job_title, company, job_description, resumes):
    """
    Ask the OpenAI model to pick the best resume for a job and to extract the
    explicitly stated experience requirement.

    Parameters
    ----------
    job_title, company, job_description : str
        Job details interpolated into the prompt.
    resumes : list[dict]
        Items with ``"resume_path"`` and ``"resume_text"`` keys
        (the shape produced by ``load_resumes``).

    Returns
    -------
    tuple
        ``(best_resume_path, required_experience_years, experience_source)``.
        ``best_resume_path`` is always one of the supplied ``resume_path``
        values, or None when the model's answer cannot be matched to one.
        ``(None, None, None)`` is returned when there are no resumes, the API
        call fails, or the reply is not a JSON object.
    """
    if not resumes:
        return None, None, None

    # Show paths with forward slashes: a Windows path echoed back inside JSON
    # would contain backslashes, which are invalid JSON escapes. The mapping
    # restores the caller's original path afterwards.
    shown_paths = [str(r["resume_path"]).replace("\\", "/") for r in resumes]
    real_by_shown = dict(zip(shown_paths, (r["resume_path"] for r in resumes)))
    real_by_name = {}
    for shown, real in real_by_shown.items():
        real_by_name.setdefault(shown.rsplit("/", 1)[-1], []).append(real)

    def _resolve_path(candidate):
        """Map the model's answer back to a supplied resume path, else None."""
        if not isinstance(candidate, str):
            return None
        shown = candidate.strip().replace("\\", "/")
        if shown in real_by_shown:
            return real_by_shown[shown]
        # Tolerate an answer that kept only the file name, if unambiguous.
        same_name = real_by_name.get(shown.rsplit("/", 1)[-1], [])
        return same_name[0] if len(same_name) == 1 else None

    resume_chunks = "\n\n".join([
        f"Resume {i+1}:\nPath: {shown}\nText: {r['resume_text']}"
        for i, (shown, r) in enumerate(zip(shown_paths, resumes))
    ])
    prompt = f"""
        You are an experienced technical recruiter.

        You have two tasks:

        **Task 1: Resume Match**
        Evaluate the following {len(resumes)} candidate resumes and select the ONE that is most suitable for the job described below.

        **Task 2: Experience Extraction**
        Carefully read the job description and extract the required years of experience **only if it is explicitly stated**.

        The experience must be formatted in one of the following ways:
        - A single integer, e.g., `"3"` → means exactly 3 years
        - A range in the format `"0-3"` → means 0 to 3 years
        - A minimum threshold in the format `"3+"` → means 3 or more years

        If no such requirement is mentioned clearly in the job description, return `null`.


        ---

        **Job Title:** {job_title}
        **Company:** {company}
        **Job Description:**
        {job_description}

        ---

        **Candidate Resumes:**
        {resume_chunks}

        ---

        **Return your answer in the following JSON format:**
        {{ "best_resume_path": "<resume_path>", "required_experience_years": "<3 | 0-3 | 3+ | null>", "experience_source": "extracted" | "none" }}

        - Take your time and think like a technical  recruiter before deciding the best resume path
        - Only use `"extracted"` if the experience is clearly stated in the job description.
        - Use `"none"` if no specific requirement is mentioned.
        - Do not guess based on job title or seniority.

        Return only the JSON response, nothing else.
        """
    content = ""
    try:
        # Built inside the try so a missing API key yields the same
        # (None, None, None) result as any other failed call.
        client = openai.OpenAI(api_key=api_key)
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "user", "content": prompt}
            ],
            temperature=0.3,
            # JSON mode: request a bare JSON object rather than fenced text.
            response_format={"type": "json_object"},
        )
        content = (response.choices[0].message.content or "").strip()
        # Fallback for fenced or chatty replies: keep only the outermost {...}.
        start, end = content.find("{"), content.rfind("}")
        if start != -1 and end > start:
            content = content[start:end + 1]

        # Print the raw content for debugging (optional)
        print("🔍 Cleaned GPT Output:\n", content)

        # Try parsing JSON
        result = json.loads(content)

    except json.JSONDecodeError as jde:
        print("❌ JSON parse error:", jde)
        print("⚠️ GPT response was:", content)
        return None, None, None

    except Exception as e:
        print(f"❌ GPT call failed: {e}")
        return None, None, None

    if not isinstance(result, dict):
        print("⚠️ GPT response was:", content)
        return None, None, None

    exp_years = result.get("required_experience_years")
    # The format template quotes the value, so "null" may arrive as a string.
    if isinstance(exp_years, str) and exp_years.strip().lower() in {"", "null", "none"}:
        exp_years = None

    return _resolve_path(result.get("best_resume_path")), exp_years, result.get("experience_source")

# --- Step 3: Annotate jobs with best resume ---
# Master function to attach resume and experience info to job DataFrame
def match_resumes_to_jobs(job_df, base_resume_folder):
    """
    Annotate each job with its best-matching resume and experience requirement.

    For every row the title selects a resume sub-folder
    (``resolve_resume_folder``), the folder's resumes are loaded, and
    ``find_best_resume_and_experience`` chooses among them. Rows with a
    missing title/company/description, no matching or existing folder, or an
    empty folder get None in all three new columns.

    Parameters
    ----------
    job_df : pandas.DataFrame
        Jobs with ``title``, ``company`` and ``description`` columns.
        It is not modified.
    base_resume_folder : str
        Directory containing one sub-folder of resumes per role family.

    Returns
    -------
    pandas.DataFrame
        A copy of ``job_df`` with the added columns ``best_resume_path``,
        ``required_experience_years`` and ``experience_source``.
    """
    no_match = (None, None, None)
    resumes_by_folder = {}   # each folder is read from disk once, not once per job
    matches = []

    for index, row in tqdm(job_df.iterrows(), total=len(job_df), desc="Matching resumes"):
        job_title = row.get("title", "")
        company = row.get("company", "")
        description = row.get("description", "")

        # pd.isna is tested first so NaN / pd.NA never reach a truthiness check
        if any(pd.isna(value) or not value for value in (job_title, company, description)):
            matches.append(no_match)
            continue

        resume_folder, _folder_name = resolve_resume_folder(base_resume_folder, job_title)

        if not resume_folder or not os.path.exists(resume_folder):
            print(f"⚠️ Skipping row {index} — no folder match for: {job_title}")
            matches.append(no_match)
            continue

        if resume_folder not in resumes_by_folder:
            resumes_by_folder[resume_folder] = load_resumes(resume_folder)
        resumes = resumes_by_folder[resume_folder]

        if not resumes:
            matches.append(no_match)
            continue

        matches.append(
            tuple(find_best_resume_and_experience(job_title, company, description, resumes))
        )

    # Annotate a copy: the input may be a slice of another frame (e.g. the
    # result of .head()), and callers should keep their original untouched.
    annotated = job_df.copy()
    annotated["best_resume_path"] = [match[0] for match in matches]
    annotated["required_experience_years"] = [match[1] for match in matches]
    annotated["experience_source"] = [match[2] for match in matches]

    return annotated

In [None]:
# Two-row sample for a cheap trial run of the resume matcher. `df` is not
# defined at notebook scope, so sample the frame returned by the example call;
# .copy() makes the sample independent of that frame.
jobs_df = df_final.head(2).copy()

In [None]:
 # From your jobspy scraping code
# Hand the matcher the copy (it was created but never used), so the sample
# frame itself is not written to.
job_df = jobs_df.copy()
resumes_folder = "Resumes"
updated_df = match_resumes_to_jobs(job_df, resumes_folder)
#updated_df.to_csv("jobs_with_best_resumes.csv", index=False)

In [None]:
# Spot-check: show the full description text of the row labelled 0.
updated_df.loc[0,'description']

In [None]:
# NOTE(review): in this notebook `prompt` is assigned only inside functions
# (llm_extract, find_best_resume_and_experience), so this cell presumably
# raises NameError on a fresh kernel — confirm and remove if it is leftover debugging.
prompt