<a href="https://colab.research.google.com/github/KevinJayne2023/agent_recruiter_screening_langgraph/blob/main/agent_recruiter_screening_langgraph.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Recruiter Screening Notes — LangChain + LangGraph

Paste raw screening-call notes → get polished **Screening Notes** (Markdown) and **5 job title suggestions** with rationales.

**What this notebook does**
- Extracts structured data from messy notes (name, experience, skills, comp, risks...)
- Normalizes skills
- Suggests 5 job titles with brief rationales
- Produces a final Markdown summary and a JSON payload you can paste into your ATS/CRM

**Stack**: LangChain, LangGraph, OpenAI (or swap model), Pydantic, RapidFuzz


## 1) Install Dependencies

In [None]:
!pip -q install "langchain>=0.2.10" "langgraph>=0.2.20" "langchain-openai>=0.1.20" "pydantic>=2.7.0" "rapidfuzz>=3.7.0" "python-dotenv>=1.0.1"

## 2) Configure API key

In [None]:
import os, getpass
# Prompt for the key only when the environment does not already provide one.
if not os.environ.get("OPENAI_API_KEY"):
    try:
        # Trim stray whitespace that tends to come along when a key is pasted;
        # a key with a leading/trailing blank is rejected by the API.
        os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter OPENAI_API_KEY (input hidden): ").strip()
    except Exception:
        # Best effort: if the hidden prompt cannot be shown, point the user at
        # the manual route instead of failing the cell.
        print("You can also set it manually: os.environ['OPENAI_API_KEY'] = 'sk-...' ")
# Report only whether a non-empty key is present; never echo the key itself.
print("API key set:", bool(os.environ.get("OPENAI_API_KEY")))

## 3) Define models, helpers, and ontology

In [None]:
from __future__ import annotations
from typing import List, Optional, Dict, Any, TypedDict
from dataclasses import dataclass
from pydantic import BaseModel, Field
import os, re, json
from rapidfuzz import fuzz
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langgraph.graph import StateGraph, END
from IPython.display import Markdown, display

class ExperienceItem(BaseModel):
    # One entry of ScreeningExtraction.experience (a single role held by the
    # candidate). Every scalar field defaults to None so a partially described
    # role still validates.
    title: Optional[str] = None
    company: Optional[str] = None
    # Start/end are free-form strings (no date parsing is done in this file);
    # format_markdown_node renders them as "(start–end)".
    start: Optional[str] = None
    end: Optional[str] = None
    # Bullet points rendered under the role in the Markdown output.
    highlights: List[str] = Field(default_factory=list)

class ScreeningExtraction(BaseModel):
    # Structured view of one screening call. extract_node asks the chat model
    # to return an instance of this class; all fields are optional or default
    # to an empty list so missing information never fails validation.

    # --- Snapshot fields (rendered in the "Snapshot" section) ---
    candidate_name: Optional[str] = None
    current_title: Optional[str] = None
    current_company: Optional[str] = None
    years_experience: Optional[float] = None
    location: Optional[str] = None
    work_authorization: Optional[str] = None
    remote_on_site_pref: Optional[str] = None
    # Compensation and availability are kept as free text, not parsed numbers/dates.
    comp_current: Optional[str] = None
    comp_expected: Optional[str] = None
    notice_period: Optional[str] = None
    earliest_start_date: Optional[str] = None
    # Also fed into the text blob used for title scoring in suggest_titles_node.
    role_preferences: List[str] = Field(default_factory=list)
    # top_skills and tools_stack are merged and canonicalized by normalize_skills_node.
    top_skills: List[str] = Field(default_factory=list)
    tools_stack: List[str] = Field(default_factory=list)
    industries: List[str] = Field(default_factory=list)
    # Rendered as "Risks / Concerns" and "Notable Quotes" when non-empty.
    risk_flags: List[str] = Field(default_factory=list)
    notable_quotes: List[str] = Field(default_factory=list)
    experience: List[ExperienceItem] = Field(default_factory=list)
    # Rendered as the "Overview" section of the Markdown output.
    summary_paragraph: Optional[str] = None

class TitleSuggestion(BaseModel):
    # One suggested job title with the reason it fits the candidate.
    title: str
    rationale: str
    # Validation rejects values outside the closed interval [0, 1].
    confidence: float = Field(ge=0, le=1)

class ScreenState(TypedDict, total=False):
    """State dictionary passed between the graph nodes.

    ``total=False`` makes every key optional, so each node returns only the
    keys it produces.
    """
    # Input supplied by the caller when invoking the graph.
    raw_notes: str
    # Written by preprocess_node.
    cleaned_notes: str
    # Written by extract_node.
    extracted: ScreeningExtraction
    # Written by normalize_skills_node.
    normalized_skills: List[str]
    # Written by suggest_titles_node.
    title_suggestions: List[TitleSuggestion]
    # Both written by format_markdown_node.
    formatted_markdown: str
    json_payload: Dict[str, Any]

def make_llm(model: str = "gpt-4o-mini", temperature: float = 0.1):
    """Create the chat model client used by the graph nodes.

    Args:
        model: Model name passed to ``ChatOpenAI``.
        temperature: Sampling temperature passed to ``ChatOpenAI``.

    Returns:
        A ``ChatOpenAI`` instance built with the given settings.
    """
    settings = {"model": model, "temperature": temperature}
    return ChatOpenAI(**settings)

def normalize_text(s: str) -> str:
    """Collapse every run of whitespace in *s* to one space and trim the ends.

    Newlines and tabs count as whitespace, so multi-line text comes back as a
    single line.
    """
    # Splitting on whitespace runs leaves empty strings at the edges when the
    # text starts or ends with whitespace; dropping them trims the result.
    pieces = [piece for piece in re.split(r"\s+", s) if piece]
    return " ".join(pieces)

# --- Canonical skills ---
# Maps each canonical skill name (key) to the aliases that should collapse
# into it (value). normalize_skills lowercases its input before comparing, so
# every key and alias here must be lowercase to be matchable.
# Insertion order matters: when several entries match equally well,
# normalize_skills keeps the first one it encounters.
CANONICAL_SKILLS = {
    "python": ["py", "python3"],
    "java": [],
    "javascript": ["js", "ecmascript"],
    "typescript": ["ts"],
    "react": ["reactjs", "react.js"],
    "node": ["nodejs", "node.js"],
    "sql": [],
    "aws": ["amazon web services"],
    "gcp": ["google cloud"],
    "azure": [],
    "docker": [],
    "kubernetes": ["k8s", "kube"],
    "spark": ["pyspark"],
    "hadoop": [],
    "airflow": [],
    "terraform": [],
    "ci/cd": ["cicd", "ci cd"],
    # data/ml
    "pandas": [],
    "numpy": [],
    "pytorch": [],
    "tensorflow": ["tf", "tfx"],
    "sklearn": ["scikit-learn"],
    # Specific platforms are folded into the umbrella skill "mlops".
    "mlops": ["sagemaker", "vertex ai", "mlflow"],
    "nlp": ["natural language processing"],
    "computer vision": ["cv"],
    "llm": ["large language model", "gpt", "llama"],
    # RAG / LLM tooling
    "langchain": [],
    "langgraph": [],
    "prompt engineering": ["prompting", "system prompt", "few-shot"],
    "rag": ["retrieval-augmented generation", "retrieval"],
    "faiss": [],
    "pgvector": [],
    "pinecone": [],
    "openai": [],
    "anthropic": [],
    # analytics / data tools
    "dbt": [],
    # sre/devops
    "prometheus": [],
    "grafana": [],
    # qa/security
    "selenium": [],
    "cypress": [],
    "pytest": [],
    "security": [],
    "iam": [],
    "kms": [],
    "siem": [],
}

# Job titles that score_titles evaluates. score_titles sorts by score with a
# stable sort, so titles with equal scores keep the order listed here.
# A title with no entry in TITLE_KEYWORDS is still scored, but always gets 0.
TITLE_ONTOLOGY = [
    "Software Engineer (Backend)",
    "Software Engineer (Frontend)",
    "Full-Stack Engineer",
    "Data Engineer",
    "Machine Learning Engineer",
    "MLOps Engineer",
    "Data Scientist",
    "Analytics Engineer",
    "DevOps Engineer",
    "Site Reliability Engineer (SRE)",
    "QA Automation Engineer",
    "Security Engineer",
    "Solutions Architect",
    "Product Manager (Technical)",
    "AI Engineer",
    "Prompt Engineer",
]

# Keywords that count as evidence for each job title. score_titles compares
# them against lowercased skills and lowercased free text, so every keyword
# MUST be lowercase: an uppercase entry (previously "A/B") can never match.
TITLE_KEYWORDS = {
    "Software Engineer (Backend)": ["python", "java", "node", "sql", "aws", "gcp", "docker", "kubernetes"],
    "Software Engineer (Frontend)": ["javascript", "typescript", "react"],
    "Full-Stack Engineer": ["react", "node", "python", "typescript"],
    "Data Engineer": ["spark", "airflow", "python", "sql", "hadoop", "aws"],
    "Machine Learning Engineer": ["pytorch", "tensorflow", "sklearn", "mlops", "llm", "nlp", "computer vision"],
    "MLOps Engineer": ["mlops", "kubernetes", "docker", "aws", "gcp", "mlflow", "sagemaker", "vertex ai"],
    "Data Scientist": ["python", "pandas", "numpy", "sklearn", "nlp", "llm"],
    "Analytics Engineer": ["sql", "dbt", "warehouse", "etl", "airflow"],
    "DevOps Engineer": ["aws", "gcp", "azure", "kubernetes", "docker", "terraform", "ci/cd"],
    "Site Reliability Engineer (SRE)": ["kubernetes", "observability", "prometheus", "grafana", "incident"],
    "QA Automation Engineer": ["selenium", "cypress", "pytest"],
    "Security Engineer": ["security", "iam", "kms", "siem"],
    "Solutions Architect": ["pre-sales", "solutions", "customer", "cloud", "architecture"],
    "Product Manager (Technical)": ["product", "requirements", "backlog", "roadmap", "stakeholder"],
    "AI Engineer": [
        "llm", "rag", "langchain", "langgraph", "embeddings", "vector", "faiss", "pgvector",
        "pinecone", "openai", "anthropic", "function calling", "tools", "agents", "retrieval",
        "guardrails", "evaluation", "observability", "python", "pydantic"
    ],
    "Prompt Engineer": [
        "prompt", "few-shot", "system prompt", "templates", "structured output", "json",
        "function calling", "tools", "eval", "evaluation", "guardrail", "jailbreak",
        "injection", "rag", "retrieval", "langchain", "langgraph", "rubrics", "a/b"
    ],
}

def normalize_skills(skills: List[str]) -> List[str]:
    """Map free-form skill strings onto the canonical skill vocabulary.

    Each entry is lowercased and trimmed, then resolved by the first rule
    that applies:

    1. Exact match against a canonical name or one of its aliases.
    2. A canonical name or alias occurring inside the entry as a whole word
       or phrase (e.g. "AWS Lambda" -> "aws"); the longest such term wins.
    3. Typo tolerance: whole-string similarity (``fuzz.ratio``) of at least
       85 against a canonical name or alias.

    Entries that match nothing are kept (trimmed, original casing). Empty and
    whitespace-only entries are dropped.

    Whole-word and whole-string comparison is used instead of
    ``fuzz.partial_ratio``: that function scores 100 whenever the shorter
    string occurs anywhere in the longer one, which mapped "go" to "gcp"
    (inside "google cloud"), "PostgreSQL" to "sql" and "platform" to
    "tensorflow" (alias "tf").

    Args:
        skills: Raw skill strings; falsy entries are ignored.

    Returns:
        The unique resolved skill names, sorted.
    """
    resolved = set()
    for raw in skills:
        if not raw:
            continue
        low = raw.lower().strip()
        if not low:
            # Whitespace-only entries carry no information.
            continue

        exact = None
        contained, contained_len = None, 0
        closest, closest_score = None, 0
        for key, aliases in CANONICAL_SKILLS.items():
            terms = [key, *aliases]
            if low in terms:
                exact = key
                break
            for term in terms:
                # Whole-word containment: the term must not be glued to other
                # letters/digits, so "tf" does not fire inside "platform".
                if len(term) > contained_len and re.search(
                    r"(?<![a-z0-9])" + re.escape(term) + r"(?![a-z0-9])", low
                ):
                    contained, contained_len = key, len(term)
                # Strict '>' keeps the first key seen when scores tie.
                score = fuzz.ratio(low, term)
                if score > closest_score:
                    closest, closest_score = key, score

        if exact is not None:
            resolved.add(exact)
        elif contained is not None:
            resolved.add(contained)
        elif closest is not None and closest_score >= 85:
            resolved.add(closest)
        else:
            # Unknown skill: keep it rather than lose information.
            resolved.add(raw.strip())
    return sorted(resolved)

def score_titles(
    skills: List[str],
    text_blob: str,
    titles: Optional[List[str]] = None,
    keywords: Optional[Dict[str, List[str]]] = None,
) -> List[tuple[str, float]]:
    """Score every job title against the candidate's skills and free text.

    A title earns 1.0 for each of its keywords that equals one of ``skills``
    and 0.5 for each keyword that occurs as a substring of ``text_blob``.
    All comparisons are case-insensitive; keywords are case-folded here as
    well, so a keyword written with capitals still matches.

    Args:
        skills: Skill names (typically already normalized).
        text_blob: Free text searched for keyword substrings.
        titles: Titles to score; defaults to ``TITLE_ONTOLOGY``.
        keywords: Mapping of title to keyword list; defaults to
            ``TITLE_KEYWORDS``. Titles missing from the mapping score 0.

    Returns:
        ``(title, score)`` pairs sorted by descending score. The sort is
        stable, so ties keep the order of ``titles``.
    """
    # Resolve defaults at call time so the module-level tables stay the
    # single source of truth when no override is given.
    candidate_titles = TITLE_ONTOLOGY if titles is None else titles
    keyword_map = TITLE_KEYWORDS if keywords is None else keywords

    low_text = text_blob.lower()
    skillset = {s.lower() for s in skills}
    scores = []
    for title in candidate_titles:
        kws = [k.lower() for k in keyword_map.get(title, [])]
        hit_skill = len(skillset.intersection(kws))
        hit_text = sum(1 for k in kws if k in low_text)
        scores.append((title, hit_skill * 1.0 + hit_text * 0.5))
    scores.sort(key=lambda pair: pair[1], reverse=True)
    return scores


## 4) Build the LangGraph and nodes

In [None]:
def preprocess_node(state: ScreenState) -> ScreenState:
    """Graph node: clean up the raw notes.

    Reads ``raw_notes`` (empty string when the key is absent), passes it
    through ``normalize_text`` and returns a partial state update holding the
    result under ``cleaned_notes``.
    """
    return {"cleaned_notes": normalize_text(state.get("raw_notes", ""))}

def extract_node(state: ScreenState) -> ScreenState:
    """Graph node: extract a ``ScreeningExtraction`` from the cleaned notes.

    The JSON schema is passed to the prompt as a template *variable* rather
    than being interpolated into the template text. The schema is full of
    ``{`` and ``}``, which the prompt template would otherwise try to parse
    as placeholders.

    Args:
        state: Must contain ``cleaned_notes``.

    Returns:
        Partial state update with ``extracted``.

    Raises:
        KeyError: If ``cleaned_notes`` is missing from ``state``.
    """
    llm = make_llm()
    prompt = ChatPromptTemplate.from_messages([
        ("system",
         "You are a meticulous recruiter’s assistant. Extract structured fields from the call notes. "
         "If a field is unknown, leave it null or empty. Do not invent facts."),
        ("human",
         "Call notes:\n\n{notes}\n\n"
         "Return ONLY a JSON object matching this Pydantic schema:\n"
         "{schema}")
    ])
    chain = prompt | llm.with_structured_output(ScreeningExtraction)
    extracted: ScreeningExtraction = chain.invoke({
        "notes": state["cleaned_notes"],
        # Pydantic v2 API; replaces the deprecated schema_json(indent=2).
        "schema": json.dumps(ScreeningExtraction.model_json_schema(), indent=2),
    })
    return {"extracted": extracted}

def normalize_skills_node(state: ScreenState) -> ScreenState:
    """Graph node: merge and canonicalize the extracted skills.

    Combines ``top_skills`` and ``tools_stack`` from the extraction, removes
    duplicates, and stores the output of ``normalize_skills`` under
    ``normalized_skills``.
    """
    profile: ScreeningExtraction = state["extracted"]
    unique_raw = set(profile.top_skills or [])
    unique_raw.update(profile.tools_stack or [])
    return {"normalized_skills": normalize_skills(list(unique_raw))}

def suggest_titles_node(state: ScreenState) -> ScreenState:
    """Graph node: propose up to five job titles with rationales.

    Titles are first ranked with ``score_titles``; the best eight are offered
    to the chat model, which is asked for a JSON array of
    ``{title, rationale, confidence}`` objects. If the reply cannot be parsed
    into at least one valid ``TitleSuggestion``, the five best keyword-ranked
    titles are returned with a generic rationale instead.

    All dynamic content (candidate JSON, skills, title list) is supplied as
    template variables and the literal braces in the instructions are
    escaped; otherwise the prompt template would treat them as placeholders
    and fail when invoked.

    Args:
        state: Must contain ``extracted``; ``normalized_skills`` and
            ``cleaned_notes`` are optional.

    Returns:
        Partial state update with ``title_suggestions``.
    """
    extracted: ScreeningExtraction = state["extracted"]
    skills = state.get("normalized_skills", [])
    text_blob = " ".join([
        state.get("cleaned_notes", ""),
        " ".join(skills),
        extracted.summary_paragraph or "",
        (extracted.current_title or "") + " " + " ".join(extracted.role_preferences or [])
    ])
    base_scores = score_titles(skills, text_blob)
    # Prefer titles with some evidence; if nothing scored, offer the first eight.
    top8 = [t for t, s in base_scores[:8] if s > 0] or [t for t, _ in base_scores[:8]]

    llm = make_llm(temperature=0.2)
    prompt = ChatPromptTemplate.from_messages([
        ("system", "You propose 5 job titles a candidate fits and explain why. Calibrate confidence 0.6–0.95."),
        ("human",
         "Candidate context (JSON):\n"
         "{candidate_json}\n\n"
         "Normalized skills: {skills}\n"
         "Title candidates to choose from: {title_candidates}\n\n"
         "Return a JSON array of objects {{title, rationale, confidence}} (5 items).")
    ])
    raw = (prompt | llm | StrOutputParser()).invoke({
        "candidate_json": extracted.model_dump_json(),
        "skills": skills,
        "title_candidates": top8,
    })

    # Chat models often wrap JSON in a Markdown code fence; unwrap it first.
    text = raw.strip()
    fenced = re.match(r"^```(?:json)?\s*(.*?)\s*```$", text, re.DOTALL | re.IGNORECASE)
    if fenced:
        text = fenced.group(1)

    try:
        parsed = json.loads(text)
        suggestions = [TitleSuggestion(**d) for d in parsed][:5]
    except (TypeError, ValueError):
        # ValueError covers invalid JSON and Pydantic validation errors;
        # TypeError covers a reply that is not a list of objects.
        suggestions = []

    if not suggestions:
        # Best-effort fallback so the pipeline always yields suggestions.
        suggestions = [
            TitleSuggestion(title=t, rationale="Matches skill keywords and experience signals.", confidence=0.7)
            for t, _ in base_scores[:5]
        ]
    return {"title_suggestions": suggestions}

def format_markdown_node(state: ScreenState) -> ScreenState:
    """Graph node: render the final Markdown summary and JSON payload.

    Missing scalar values are shown as an em dash. The experience, risk and
    quote sections are emitted only when they have content. Each notable
    quote is followed by a blank line so that several quotes render as
    separate blockquotes instead of merging into one paragraph.

    Args:
        state: Must contain ``extracted`` and ``title_suggestions``;
            ``normalized_skills`` is optional.

    Returns:
        Partial state update with ``formatted_markdown`` (str) and
        ``json_payload`` (dict with ``extracted``, ``normalized_skills`` and
        ``title_suggestions``, at most five of the latter).
    """
    e: ScreeningExtraction = state["extracted"]
    titles: List[TitleSuggestion] = state["title_suggestions"]
    top_titles = titles[:5]
    normalized = state.get("normalized_skills", [])
    skills = ", ".join(normalized)

    md = []
    md.append(f"# Screening Notes — {e.candidate_name or 'Candidate'}")
    md.append("")
    md.append("## Overview")
    md.append(e.summary_paragraph or "—")
    md.append("")
    md.append("## Snapshot")
    md.append(f"- **Current Title/Company:** {e.current_title or '—'} @ {e.current_company or '—'}")
    md.append(f"- **Location:** {e.location or '—'}")
    # Explicit None check: 0 years is a real value and must not become a dash.
    md.append(f"- **Years Experience:** {e.years_experience if e.years_experience is not None else '—'}")
    md.append(f"- **Work Authorization:** {e.work_authorization or '—'}")
    md.append(f"- **Work Preference:** {e.remote_on_site_pref or '—'}")
    md.append(f"- **Comp (Current):** {e.comp_current or '—'}")
    md.append(f"- **Comp (Expected):** {e.comp_expected or '—'}")
    md.append(f"- **Notice / Availability:** {e.notice_period or '—'} / {e.earliest_start_date or '—'}")
    md.append("")
    md.append("## Role Preferences")
    md.append(("; ".join(e.role_preferences)) if e.role_preferences else "—")
    md.append("")
    md.append("## Skills & Stack")
    md.append(skills or "—")
    md.append("")
    if e.experience:
        md.append("## Experience Highlights")
        for item in e.experience:
            line = f"- **{item.title or '—'}**, {item.company or '—'}"
            if item.start or item.end:
                line += f" ({item.start or ''}–{item.end or ''})"
            md.append(line)
            for h in (item.highlights or []):
                md.append(f"  - {h}")
        md.append("")
    if e.risk_flags:
        md.append("## Risks / Concerns")
        for r in e.risk_flags:
            md.append(f"- {r}")
        md.append("")
    if e.notable_quotes:
        md.append("## Notable Quotes")
        for q in e.notable_quotes:
            md.append(f"> {q}")
            # Blank line ends the blockquote so the next quote starts a new one.
            md.append("")
    md.append("## Suggested Job Titles (Top 5)")
    for i, t in enumerate(top_titles, 1):
        md.append(f"{i}. **{t.title}** — {t.rationale} _(confidence: {t.confidence:.2f})_")

    # Pydantic v2 API: model_dump replaces the deprecated .json() / .dict().
    payload = {
        "extracted": e.model_dump(mode="json"),
        "normalized_skills": normalized,
        "title_suggestions": [t.model_dump() for t in top_titles],
    }
    return {"formatted_markdown": "\n".join(md), "json_payload": payload}

def build_graph():
    """Assemble and compile the linear screening pipeline.

    The nodes run in the order listed below: the first is the entry point,
    each node feeds the next, and the last one leads to ``END``.

    Returns:
        The compiled graph returned by ``StateGraph.compile()``.
    """
    pipeline = [
        ("preprocess", preprocess_node),
        ("extract", extract_node),
        ("normalize_skills", normalize_skills_node),
        ("suggest_titles", suggest_titles_node),
        ("format_markdown", format_markdown_node),
    ]
    sg = StateGraph(ScreenState)
    for node_name, node_fn in pipeline:
        sg.add_node(node_name, node_fn)

    order = [node_name for node_name, _ in pipeline]
    sg.set_entry_point(order[0])
    # Chain each node to its successor, then close the chain at END.
    for src, dst in zip(order, order[1:]):
        sg.add_edge(src, dst)
    sg.add_edge(order[-1], END)
    return sg.compile()


## 5) Paste your call notes and run
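Paste your notes into `RAW_NOTES` below. Redact personal or confidential details first, because the notes are sent to the model provider.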

In [None]:
# Replace the placeholder line with the screening-call notes to process.
RAW_NOTES = """
Add notes here
""".strip()
# Echo at most the first 1000 characters as a sanity check; the ellipsis
# signals that the preview was cut short.
print(RAW_NOTES[:1000], "..." if len(RAW_NOTES) > 1000 else "", sep="")

## 6) Run the graph -> View results (Markdown + JSON)

In [None]:
# Compile the pipeline and run it once over the pasted notes.
graph = build_graph()
out = graph.invoke({"raw_notes": RAW_NOTES})
# The last node stores both final artifacts in the state.
md, payload = out["formatted_markdown"], out["json_payload"]
display(Markdown(md))
# One call that emits the banner, a blank line, and then the pretty-printed payload.
print("\n--- JSON payload ---\n", json.dumps(payload, indent=2), sep="\n")

## 7) Save to files
Saves the Markdown summary and the JSON payload to `/content/` (the default working directory in Colab). You can then download the files or upload them to your ATS.

In [None]:
from pathlib import Path
# Make sure the target directory exists (it may not outside Colab).
Path('/content').mkdir(parents=True, exist_ok=True)
md_path = '/content/screening_notes.md'
json_path = '/content/screening_payload.json'
# Write both artifacts as UTF-8 text.
Path(md_path).write_text(md, encoding='utf-8')
Path(json_path).write_text(json.dumps(payload, indent=2), encoding='utf-8')
print('Saved:', md_path)
print('Saved:', json_path)