In [1]:
import sys
sys.path.append("..")

In [2]:
from job_agent.discover.companies import get_company_list
from job_agent.ingest.greenhouse import fetch_jobs
from job_agent.ingest.parser import parse_greenhouse_job
from job_agent.memory.store import JobStore
from job_agent.models.profile import UserProfile
from job_agent.rank.rules import passes_rules
from job_agent.rank.signals import extract_signals
from job_agent.rank.scorer import combine_scores
from job_agent.rank.text import job_text, profile_query
from job_agent.rank.lexical import LexicalScorer
from job_agent.rank.semantic import SemanticScorer
from job_agent.rank.signals import extract_signals
from job_agent.rank.scorer import combine_scores
from job_agent.enrich.fetch import fetch_job_page
from job_agent.enrich.extract import extract_job_text
from job_agent.memory.store import JobStore
from job_agent.generate.cover_letter import generate_cover_letter
from job_agent.generate.local_llm import LocalLLM
from job_agent.generate.resume import parse_resume_pdf
from job_agent.generate.openai_llm import OpenAILLM


import numpy as np
import sklearn
import torch
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
store = JobStore("job_agent/data/jobs.db")

user_companies = ["airbnb", "spotify", "adobe"]  # later comes from UI
companies = get_company_list(user_companies)

for company in companies:
    raw_jobs = fetch_jobs(company)
    if not raw_jobs:
        continue

    parsed_jobs = [parse_greenhouse_job(j, company) for j in raw_jobs]
    store.save_jobs(parsed_jobs)

[ingest] fetching jobs from https://boards-api.greenhouse.io/v1/boards/adobe/jobs
[ingest] no Greenhouse board found for 'adobe', skipping
[ingest] fetching jobs from https://boards-api.greenhouse.io/v1/boards/airbnb/jobs
[ingest] 199 jobs found for 'airbnb'
[ingest] fetching jobs from https://boards-api.greenhouse.io/v1/boards/databricks/jobs
[ingest] 666 jobs found for 'databricks'
[ingest] fetching jobs from https://boards-api.greenhouse.io/v1/boards/google/jobs
[ingest] no Greenhouse board found for 'google', skipping
[ingest] fetching jobs from https://boards-api.greenhouse.io/v1/boards/meta/jobs
[ingest] no Greenhouse board found for 'meta', skipping
[ingest] fetching jobs from https://boards-api.greenhouse.io/v1/boards/spotify/jobs
[ingest] no Greenhouse board found for 'spotify', skipping
[ingest] fetching jobs from https://boards-api.greenhouse.io/v1/boards/stripe/jobs
[ingest] 518 jobs found for 'stripe'
[ingest] fetching jobs from https://boards-api.greenhouse.io/v1/boards/u

In [5]:
for job in parsed_jobs[:100]:
    print(job)
    print("-" * 60)

JobPosting(job_id='7374073', company='stripe', title='Account Executive, AI Sales', location='SF', description='', apply_url='https://stripe.com/jobs/search?gh_jid=7374073', posted_date=datetime.date(2025, 12, 16), source='greenhouse')
------------------------------------------------------------
JobPosting(job_id='7374078', company='stripe', title='Account Executive, AI Sales', location='SF', description='', apply_url='https://stripe.com/jobs/search?gh_jid=7374078', posted_date=datetime.date(2025, 12, 16), source='greenhouse')
------------------------------------------------------------
JobPosting(job_id='7403230', company='stripe', title='Account Executive, Benelux & Nordics - Existing Business', location='Dublin', description='', apply_url='https://stripe.com/jobs/search?gh_jid=7403230', posted_date=datetime.date(2025, 12, 16), source='greenhouse')
------------------------------------------------------------
JobPosting(job_id='7423968', company='stripe', title='Account Executive, Ben

In [6]:
new_count = store.save_jobs(parsed_jobs)
print(f"Inserted {new_count} new jobs")

all_jobs = store.load_all_jobs()
print(f"Total jobs in DB: {len(all_jobs)}")

Inserted 0 new jobs
Total jobs in DB: 1391


In [7]:
profile = UserProfile(
    target_roles=["ml engineer", "data scientist", "data engineer"],
    core_skills=[
        "machine learning", "deep learning",
        "evaluation", "pytorch", "statistics"
    ],
    location=["US"]
)

## Hard Filtering >> Lexical and Semantic Scoring

In [8]:
import pandas as pd

df = pd.read_csv("../uscities.csv")
print(df.shape)

(31254, 17)


In [9]:
filtered_jobs = [
    job for job in all_jobs
    if passes_rules(job, profile)
]

print(f"Jobs before hard rules: {len(all_jobs)}")
print(f"Jobs after hard rules: {len(filtered_jobs)}")

Jobs before hard rules: 1391
Jobs after hard rules: 249


In [10]:
job_texts = [job_text(j) for j in filtered_jobs]
job_ids = [j.job_id for j in filtered_jobs]
query_text = profile_query(profile)

In [11]:
lex = LexicalScorer()
lex.fit(job_texts, job_ids)
lex_scores = lex.score_query(query_text)

In [12]:
sem = SemanticScorer("sentence-transformers/all-MiniLM-L6-v2")
query_emb = sem.embed([query_text])[0]
job_embs = sem.embed(job_texts)

In [13]:
rows = []

for job, emb in zip(filtered_jobs, job_embs):
    s_sem = sem.similarity(query_emb, emb)
    s_lex = lex_scores.get(job.job_id, 0.0)
    sigs = extract_signals(job, profile)

    rows.append({
        "job": job,
        "semantic": s_sem,
        "lexical": s_lex,
        "signals": sigs,
    })

In [14]:
top_semantic = sorted(
    rows,
    key=lambda r: r["semantic"],
    reverse=True
)[:10]

print("\n=== TOP 10 BY SEMANTIC SIMILARITY ===\n")

for r in top_semantic:
    job = r["job"]
    print(f"sem={r['semantic']:.3f}")
    print(f"{job.title} @ {job.company}")
    print("-" * 80)


=== TOP 10 BY SEMANTIC SIMILARITY ===

sem=0.407
Staff Machine Learning Engineer, Listings and Host Tools Data and AI @ airbnb
--------------------------------------------------------------------------------
sem=0.401
Staff Machine Learning Engineer - Community Support Engineering @ airbnb
--------------------------------------------------------------------------------
sem=0.359
Staff Machine Learning Engineer, Communication & Connectivity @ airbnb
--------------------------------------------------------------------------------
sem=0.353
Senior Staff Machine Learning Engineer, Communication & Connectivity @ airbnb
--------------------------------------------------------------------------------
sem=0.333
Senior Software Engineer - Fullstack (NYC) @ databricks
--------------------------------------------------------------------------------
sem=0.328
Manager, Field Engineering (Pre-Sales) @ databricks
--------------------------------------------------------------------------------
sem=0.

In [15]:
top_lexical = sorted(
    rows,
    key=lambda r: r["lexical"],
    reverse=True
)[:10]

print("\n=== TOP 10 BY LEXICAL MATCH ===\n")

for r in top_lexical:
    job = r["job"]
    print(f"lex={r['lexical']:.3f}")
    print(f"{job.title} @ {job.company}")
    print("-" * 80)


=== TOP 10 BY LEXICAL MATCH ===

lex=0.423
Staff Machine Learning Engineer - Community Support Engineering @ airbnb
--------------------------------------------------------------------------------
lex=0.407
Staff Machine Learning Engineer, Listings and Host Tools Data and AI @ airbnb
--------------------------------------------------------------------------------
lex=0.369
Staff Machine Learning Engineer, Communication & Connectivity @ airbnb
--------------------------------------------------------------------------------
lex=0.357
Senior Staff Machine Learning Engineer, Communication & Connectivity @ airbnb
--------------------------------------------------------------------------------
lex=0.340
Senior Machine Learning Engineer, Stripe Assistant @ stripe
--------------------------------------------------------------------------------
lex=0.268
Senior Data Scientist - Inference, Global Markets @ airbnb
--------------------------------------------------------------------------------
l

In [16]:
sem_vals = np.array([r["semantic"] for r in rows])
lex_vals = np.array([r["lexical"] for r in rows])

mu_s, sd_s = sem_vals.mean(), sem_vals.std() + 1e-9
mu_l, sd_l = lex_vals.mean(), lex_vals.std() + 1e-9

In [17]:
# RRF reciprocal rank fusion

def rrf_fusion(sem_rank, lex_rank, k=60):
    return 1/(k + sem_rank) + 1/(k + lex_rank)


# rank separately
sem_sorted = sorted(rows, key=lambda r: r["semantic"], reverse=True)
lex_sorted = sorted(rows, key=lambda r: r["lexical"], reverse=True)

sem_rank = {id(r): i+1 for i, r in enumerate(sem_sorted)}
lex_rank = {id(r): i+1 for i, r in enumerate(lex_sorted)}

rrf_scored = []
for r in rows:
    score = rrf_fusion(sem_rank[id(r)], lex_rank[id(r)])
    rrf_scored.append((score, r))

rrf_scored.sort(reverse=True, key=lambda x: x[0])

print("\n=== TOP 10: RRF FUSION ===")
for s, r in rrf_scored[:10]:
    j = r["job"]
    print(f"{s:.4f} | {j.title} @ {j.company}")


=== TOP 10: RRF FUSION ===
0.0325 | Staff Machine Learning Engineer - Community Support Engineering @ airbnb
0.0325 | Staff Machine Learning Engineer, Listings and Host Tools Data and AI @ airbnb
0.0317 | Staff Machine Learning Engineer, Communication & Connectivity @ airbnb
0.0312 | Senior Staff Machine Learning Engineer, Communication & Connectivity @ airbnb
0.0282 | Staff Software Engineer, Data Warehouse Compute @ airbnb
0.0280 | Senior Machine Learning Engineer, Stripe Assistant @ stripe
0.0280 | Senior Solutions Engineer @ databricks
0.0279 | Staff Software Engineer, Experimentation Data @ airbnb
0.0279 | Director, Engineering - Databricks Mosaic AI @ databricks
0.0277 | Senior Software Engineer - Fullstack (NYC) @ databricks


In [18]:
# Product of Experts (PoE) — “semantic with lexical a-priori”

def zsig(x, mu, sd):
    return 1 / (1 + np.exp(-(x - mu)/sd))


def poe_score(sem, lex, a=1.0, b=1.0, eps=1e-6):
    ps = zsig(sem, mu_s, sd_s)
    pl = zsig(lex, mu_l, sd_l)
    return a*np.log(ps + eps) + b*np.log(pl + eps)

poe_scored = []
for r in rows:
    s = poe_score(r["semantic"], r["lexical"], a=1.0, b=1.0)
    poe_scored.append((s, r))

poe_scored.sort(reverse=True, key=lambda x: x[0])

print("\n=== TOP 10: PRODUCT OF EXPERTS ===")
for s, r in poe_scored[:10]:
    j = r["job"]
    print(f"{s:.3f} | sem={r['semantic']:.3f} lex={r['lexical']:.3f} | {j.title}")


=== TOP 10: PRODUCT OF EXPERTS ===
-0.028 | sem=0.407 lex=0.407 | Staff Machine Learning Engineer, Listings and Host Tools Data and AI
-0.029 | sem=0.401 lex=0.423 | Staff Machine Learning Engineer - Community Support Engineering
-0.052 | sem=0.359 lex=0.369 | Staff Machine Learning Engineer, Communication & Connectivity
-0.058 | sem=0.353 lex=0.357 | Senior Staff Machine Learning Engineer, Communication & Connectivity
-0.214 | sem=0.239 lex=0.340 | Senior Machine Learning Engineer, Stripe Assistant
-0.275 | sem=0.245 lex=0.207 | Staff Software Engineer, Experimentation Data
-0.317 | sem=0.211 lex=0.268 | Senior Data Scientist - Inference, Global Markets
-0.342 | sem=0.308 lex=0.130 | Senior Solutions Engineer
-0.346 | sem=0.248 lex=0.162 | Staff Software Engineer, Data Warehouse Compute
-0.346 | sem=0.309 lex=0.128 | Director, Engineering - Databricks Mosaic AI


In [19]:
print("\n=== SIDE-BY-SIDE COMPARISON (TOP 10 EACH) ===")
for i in range(10):
    print(f"\nRank {i+1}")
    print("SEM :", sem_sorted[i]["job"].title)
    print("LEX :", lex_sorted[i]["job"].title)
    print("RRF :", rrf_scored[i][1]["job"].title)
    print("POE :", poe_scored[i][1]["job"].title)



=== SIDE-BY-SIDE COMPARISON (TOP 10 EACH) ===

Rank 1
SEM : Staff Machine Learning Engineer, Listings and Host Tools Data and AI
LEX : Staff Machine Learning Engineer - Community Support Engineering
RRF : Staff Machine Learning Engineer - Community Support Engineering
POE : Staff Machine Learning Engineer, Listings and Host Tools Data and AI

Rank 2
SEM : Staff Machine Learning Engineer - Community Support Engineering
LEX : Staff Machine Learning Engineer, Listings and Host Tools Data and AI
RRF : Staff Machine Learning Engineer, Listings and Host Tools Data and AI
POE : Staff Machine Learning Engineer - Community Support Engineering

Rank 3
SEM : Staff Machine Learning Engineer, Communication & Connectivity
LEX : Staff Machine Learning Engineer, Communication & Connectivity
RRF : Staff Machine Learning Engineer, Communication & Connectivity
POE : Staff Machine Learning Engineer, Communication & Connectivity

Rank 4
SEM : Senior Staff Machine Learning Engineer, Communication & Connect

## Extracting Details of filtered Jobs

In [20]:
TOP_N = 10
enriched = []

for poe, r in poe_scored[:TOP_N]:
    job = r["job"]
    sem = r["semantic"]
    lex = r["lexical"]

    # 1. check cache
    full_text = store.load_job_detail(job.job_id)

    # 2. fetch + extract if missing
    if full_text is None:
        print(f"[enrich] fetching {job.job_id}")
        html = fetch_job_page(job.apply_url)
        full_text = extract_job_text(html)
        store.save_job_detail(job.job_id, full_text)

    enriched.append({
        "poe": poe,
        "semantic": sem,
        "lexical": lex,
        "job": job,
        "full_text": full_text,
    })


In [21]:
for e in enriched:
    j = e["job"]
    print(
        f"{e['poe']:+.3f} | "
        f"sem={e['semantic']:.3f} "
        f"lex={e['lexical']:.3f} | "
        f"{j.title} @ {j.company}"
    )

-0.028 | sem=0.407 lex=0.407 | Staff Machine Learning Engineer, Listings and Host Tools Data and AI @ airbnb
-0.029 | sem=0.401 lex=0.423 | Staff Machine Learning Engineer - Community Support Engineering @ airbnb
-0.052 | sem=0.359 lex=0.369 | Staff Machine Learning Engineer, Communication & Connectivity @ airbnb
-0.058 | sem=0.353 lex=0.357 | Senior Staff Machine Learning Engineer, Communication & Connectivity @ airbnb
-0.214 | sem=0.239 lex=0.340 | Senior Machine Learning Engineer, Stripe Assistant @ stripe
-0.275 | sem=0.245 lex=0.207 | Staff Software Engineer, Experimentation Data @ airbnb
-0.317 | sem=0.211 lex=0.268 | Senior Data Scientist - Inference, Global Markets @ airbnb
-0.342 | sem=0.308 lex=0.130 | Senior Solutions Engineer @ databricks
-0.346 | sem=0.248 lex=0.162 | Staff Software Engineer, Data Warehouse Compute @ airbnb
-0.346 | sem=0.309 lex=0.128 | Director, Engineering - Databricks Mosaic AI @ databricks


## Generate Cover Letter 

In [22]:
resume_text = parse_resume_pdf("../resume.pdf")

print(len(resume_text))
print(resume_text[:800])

5253
Manush Kalwari
manushkalwari141@gmail.com | LinkedIn:manush-kalwari | GitHub:ManushKalwari | Portfolio Website
Technologies
Machine Learning & Generative AI: PyTorch, TensorFlow, HuggingFace, scikit-learn, LangChain, LangGraph, Optimizations (LoRA, GRPO,
quantization, FlashAttention), Docker, CI/CD (GitHub Actions, GitLab), Comet ML, Weights & Biases
Cloud: AWS (S3, SageMaker, Lambda, EC2), GCP (Vertex AI, BigQuery , Cloud Run, GKE, Composer, Dataflow)
High Performance ML: CUDA, Sharding, multi-GPU training, profiling, distributed training
Education
Columbia University, MS in Electrical Engineering (Specialization in ML) Sept 2024 – Dec 2025 (Exp.)
• Coursework: Advanced Deep Learning, Scaling LLM Systems, High-Performance ML, Generative AI, Mathematics of ML
PROJECTS
ScaleRAG - Multi-Moda


In [24]:
#my_llm = LocalLLM()
llm = OpenAILLM()

OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

In [None]:
cover_letters = []

for i, e in enumerate(enriched[:1], start=1):
    job = e["job"]
    job_text = e["full_text"]

    print(f"\n[generate {i}/10] {job.title} @ {job.company}")

    cl = generate_cover_letter(
        resume_text=resume_text,
        job_text=job_text,
        company=job.company,
        role=job.title,
        llm=llm,
    )

    cover_letters.append({
        "job_id": job.job_id,
        "company": job.company,
        "title": job.title,
        "poe": e["poe"],
        "cover_letter": cl,
    })


[generate 1/10] Staff Machine Learning Engineer, Listings and Host Tools Data and AI @ airbnb


In [42]:
job_text

"Skip to main content\nCareers\nLife at Airbnb\nJob Search\nConnect Engineering Apprenticeship\nInternship Programs\nContractor Roles\nLife at Airbnb\nJob Search\nConnect Engineering Apprenticeship\nInternship Programs\nContractor Roles\nStaff Machine Learning Engineer, Listings and Host Tools Data and AI\nUnited States\nRole overview\nApplication\nAirbnb was born in 2007 when two hosts welcomed three guests to their San Francisco home, and has since grown to over 5 million hosts who have welcomed over 2 billion guest arrivals in almost every country across the globe. Every day, hosts offer unique stays and experiences that make it possible for guests to connect with communities in a more authentic way.\nThe Community You Will Join:\nUser Listing Marketplace Intelligence Machine Learning (ULM-ML) team: The ULM-ML team supports host personalization products and provides data driven solutions to achieve superior host experience on Airbnb. These products include but are not limited to man

In [41]:
for c in cover_letters:
    print("=" * 100)
    print(f"{c['title']} @ {c['company']}  |  PoE={c['poe']:.3f}")
    print("-" * 100)
    print(c["cover_letter"])

Staff Machine Learning Engineer, Listings and Host Tools Data and AI @ airbnb  |  PoE=-0.028
----------------------------------------------------------------------------------------------------
 And Diversity
At Airbnb, we believe that diversity is essential to our success as a company and to the creation of exceptional user experiences for all people. We strive to create a workplace environment that is welcoming, respectful, and free from discrimination and harassment. Our commitment extends beyond just recruiting diverse talent. We also actively promote inclusion through employee resource groups, community outreach initiatives, and ongoing efforts to foster a culture of respect and belonging.
We encourage individuals from all backgrounds to apply and welcome feedback about how we can continue to improve our processes and policies related to diversity and inclusion. Your input is valuable to us and helps ensure that we remain committed to creating a truly inclusive organization.
Pleas

In [None]:
from pathlib import Path

out_dir = Path("final_cover_letters")
out_dir.mkdir(exist_ok=True)

for c in cover_letters:
    fname = f"{c['company']}_{c['title']}.txt".replace(" ", "_").lower()
    (out_dir / fname).write_text(c["cover_letter"], encoding="utf-8")